diff --git a/metallama3_8b/limo/README.md b/metallama3_8b/limo/README.md deleted file mode 100644 index dcffddb30c6519d3290d3cd5e61bc41d3d99d917..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/README.md +++ /dev/null @@ -1,59 +0,0 @@ ---- -library_name: transformers -license: other -base_model: meta-llama/Meta-Llama-3-8B-Instruct -tags: -- llama-factory -- full -- generated_from_trainer -model-index: -- name: limo - results: [] ---- - - - -# limo - -This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) on the limo dataset. - -## Model description - -More information needed - -## Intended uses & limitations - -More information needed - -## Training and evaluation data - -More information needed - -## Training procedure - -### Training hyperparameters - -The following hyperparameters were used during training: -- learning_rate: 5e-06 -- train_batch_size: 1 -- eval_batch_size: 8 -- seed: 42 -- distributed_type: multi-GPU -- num_devices: 4 -- total_train_batch_size: 4 -- total_eval_batch_size: 32 -- optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments -- lr_scheduler_type: cosine -- num_epochs: 10 - -### Training results - - - -### Framework versions - -- Transformers 4.55.0 -- Pytorch 2.5.1+cu124 -- Datasets 3.6.0 -- Tokenizers 0.21.1 diff --git a/metallama3_8b/limo/all_results.json b/metallama3_8b/limo/all_results.json deleted file mode 100644 index e90c777551753b9357fb0a16c66657ba945482d0..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/all_results.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "epoch": 10.0, - "total_flos": 5.892331269877924e+17, - "train_loss": 0.2632100873960966, - "train_runtime": 9760.8256, - "train_samples_per_second": 0.837, - "train_steps_per_second": 0.21 -} \ No newline at end of file diff --git a/metallama3_8b/limo/chat_template.jinja b/metallama3_8b/limo/chat_template.jinja deleted file mode 100644 index 39bd0c9f7fe30aea14eda194fee17703da4a4dbf..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/chat_template.jinja +++ /dev/null @@ -1,5 +0,0 @@ -{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> - -'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> - -' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo/checkpoint-1025/chat_template.jinja b/metallama3_8b/limo/checkpoint-1025/chat_template.jinja deleted file mode 100644 index 39bd0c9f7fe30aea14eda194fee17703da4a4dbf..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1025/chat_template.jinja +++ /dev/null @@ -1,5 +0,0 @@ -{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> - -'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> - -' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo/checkpoint-1025/config.json b/metallama3_8b/limo/checkpoint-1025/config.json deleted file mode 100644 index ec5612543540085e09eed37e81b17ae51d1a6973..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1025/config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 128000, - "eos_token_id": 128009, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 8192, - "mlp_bias": false, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 500000.0, - "tie_word_embeddings": false, - "torch_dtype": "float32", - "transformers_version": "4.55.0", - "use_cache": false, - "vocab_size": 128256 -} diff --git a/metallama3_8b/limo/checkpoint-1025/generation_config.json b/metallama3_8b/limo/checkpoint-1025/generation_config.json deleted file mode 100644 index f53ccb516e57388491adda6b9950bcfa872e93ae..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1025/generation_config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 128000, - "eos_token_id": 128009, - "transformers_version": "4.55.0", - "use_cache": false -} diff --git a/metallama3_8b/limo/checkpoint-1025/model-00001-of-00007.safetensors b/metallama3_8b/limo/checkpoint-1025/model-00001-of-00007.safetensors deleted file mode 100644 index b8c92fbb574e095889d6215d76f0ada19317acee..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1025/model-00001-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a7f65d34b3cf46a6592007a589026b2dbab3189d6e2318b5ae28ec891fe5d13e -size 4886466168 diff --git a/metallama3_8b/limo/checkpoint-1025/model-00002-of-00007.safetensors b/metallama3_8b/limo/checkpoint-1025/model-00002-of-00007.safetensors deleted file mode 100644 index 37578ea5f76c73c3cf361bbd0d92c1a18348723a..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1025/model-00002-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:75553d12f8390c2a81a05fd8af43b2ca340a29b189a7bf304865fcf349ba7557 -size 4832007448 diff --git a/metallama3_8b/limo/checkpoint-1025/model-00003-of-00007.safetensors b/metallama3_8b/limo/checkpoint-1025/model-00003-of-00007.safetensors deleted file mode 100644 index 11d44f3215c0ee2fbe72a31130853482ec5ab6ea..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1025/model-00003-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e94289fc3c9d7ac8e72adbb345bef6f068bc445e40f54a3e8807d9926af3b83d -size 4999813112 diff --git a/metallama3_8b/limo/checkpoint-1025/model-00004-of-00007.safetensors b/metallama3_8b/limo/checkpoint-1025/model-00004-of-00007.safetensors deleted file mode 100644 index a5c9355cf82ca790ecb1b04da4d93b5b03c61d4c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1025/model-00004-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:325301dc3089b2fb13962fa66115a1e0a3bf5519b6dd0f85a6fb909e540dba74 -size 4999813128 diff --git a/metallama3_8b/limo/checkpoint-1025/model-00005-of-00007.safetensors b/metallama3_8b/limo/checkpoint-1025/model-00005-of-00007.safetensors deleted file mode 100644 index b586bd94fe1dac272ea933a93839afb2e1618742..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1025/model-00005-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9c3c79bebf46b04ce88c59c82de366a45e135ed595aef444a0e0192da74a425c -size 4832007496 diff --git a/metallama3_8b/limo/checkpoint-1025/model-00006-of-00007.safetensors b/metallama3_8b/limo/checkpoint-1025/model-00006-of-00007.safetensors deleted file mode 100644 index 278b40d6283e58076dce4c10ae25107bcf886309..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1025/model-00006-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9b5ffde91b39b7d4a8ef427b0713e5c279b1cce9b74beb2a899a34f4bd186fd2 -size 4999813120 diff --git a/metallama3_8b/limo/checkpoint-1025/model-00007-of-00007.safetensors b/metallama3_8b/limo/checkpoint-1025/model-00007-of-00007.safetensors deleted file mode 100644 index b3c1b76cb1a931a06b2ba50534caedf94cfb6563..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1025/model-00007-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5c9548ea43461b5f9816535a812c388f6d42794893c1820466e77856774439fd -size 2571158184 diff --git a/metallama3_8b/limo/checkpoint-1025/model.safetensors.index.json b/metallama3_8b/limo/checkpoint-1025/model.safetensors.index.json deleted file mode 100644 index 30d31d54f352f0c71ad48745af612a088822fa48..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1025/model.safetensors.index.json +++ /dev/null @@ -1,299 +0,0 @@ -{ - "metadata": { - "total_parameters": 2007565312, - "total_size": 32121044992 - }, - "weight_map": { - "lm_head.weight": "model-00007-of-00007.safetensors", - "model.embed_tokens.weight": "model-00001-of-00007.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.norm.weight": "model-00007-of-00007.safetensors" - } -} diff --git a/metallama3_8b/limo/checkpoint-1025/rng_state_0.pth b/metallama3_8b/limo/checkpoint-1025/rng_state_0.pth deleted file mode 100644 index 3fb9a88bbbee1d828823dc0792895d385b4be47e..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1025/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5c5e18f922d0af74d820247ae97bee506ab412554a58345ddf2558abc94ee3e3 -size 15024 diff --git a/metallama3_8b/limo/checkpoint-1025/rng_state_1.pth b/metallama3_8b/limo/checkpoint-1025/rng_state_1.pth deleted file mode 100644 index cc3d4a3c6ff4b588e0b24552f5cc78610d1a3f42..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1025/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2a2dcca6d9741f46592359768ea2212b9321da6408d1fd7d3a80b017bf37f434 -size 15024 diff --git a/metallama3_8b/limo/checkpoint-1025/rng_state_2.pth b/metallama3_8b/limo/checkpoint-1025/rng_state_2.pth deleted file mode 100644 index 0ea7e83be3a9fc39999b7084bcf14ba0f491317b..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1025/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:69420ece2c255923c5cbb3c6c9c4a6b9cb38fb57e5d3033c8b7d436a1faf6f13 -size 15024 diff --git a/metallama3_8b/limo/checkpoint-1025/rng_state_3.pth b/metallama3_8b/limo/checkpoint-1025/rng_state_3.pth deleted file mode 100644 index 88e70a1e21ef6d40a7016a6221703385b6c1cdc6..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1025/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:66f278b40a1e23b88a657c4e5d03afa8dbbbe14dfeb16f6b4beedaece6cdd0b9 -size 15024 diff --git a/metallama3_8b/limo/checkpoint-1025/scheduler.pt b/metallama3_8b/limo/checkpoint-1025/scheduler.pt deleted file mode 100644 index 221979567912f868d7f97f636721e0bda942ae7a..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1025/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f248cf8d7232a4d32a1c7a253ea7a98e8d245b7a745ec212e824a40ae88102c3 -size 1064 diff --git a/metallama3_8b/limo/checkpoint-1025/special_tokens_map.json b/metallama3_8b/limo/checkpoint-1025/special_tokens_map.json deleted file mode 100644 index 14daf4588e61b4e4983af0fccaba4d5500c0977c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1025/special_tokens_map.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "additional_special_tokens": [ - { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } - ], - "bos_token": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": "<|eot_id|>" -} diff --git a/metallama3_8b/limo/checkpoint-1025/tokenizer.json b/metallama3_8b/limo/checkpoint-1025/tokenizer.json deleted file mode 100644 index 172311123ab62378f1f6d90f3068a676b7d939ed..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1025/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c1dcab308e7cf5970ea38815e0a62887d705c5b436f869ca27a5dcdd40c36a6 -size 17210148 diff --git a/metallama3_8b/limo/checkpoint-1025/tokenizer_config.json b/metallama3_8b/limo/checkpoint-1025/tokenizer_config.json deleted file mode 100644 index 6739fcd129e717b71b64001dcb25a03c143d66f5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1025/tokenizer_config.json +++ /dev/null @@ -1,2076 +0,0 @@ -{ - "added_tokens_decoder": { - "128000": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128001": { - "content": "<|end_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128002": { - "content": "<|reserved_special_token_0|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128003": { - "content": "<|reserved_special_token_1|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128004": { - "content": "<|reserved_special_token_2|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128005": { - "content": "<|reserved_special_token_3|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128006": { - "content": "<|start_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128007": { - "content": "<|end_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128008": { - "content": "<|reserved_special_token_4|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128009": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128010": { - "content": "<|reserved_special_token_5|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128011": { - "content": "<|reserved_special_token_6|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128012": { - "content": "<|reserved_special_token_7|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128013": { - "content": "<|reserved_special_token_8|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128014": { - "content": "<|reserved_special_token_9|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128015": { - "content": "<|reserved_special_token_10|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128016": { - "content": "<|reserved_special_token_11|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128017": { - "content": "<|reserved_special_token_12|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128018": { - "content": "<|reserved_special_token_13|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128019": { - "content": "<|reserved_special_token_14|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128020": { - "content": "<|reserved_special_token_15|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128021": { - "content": "<|reserved_special_token_16|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128022": { - "content": "<|reserved_special_token_17|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128023": { - "content": "<|reserved_special_token_18|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128024": { - "content": "<|reserved_special_token_19|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128025": { - "content": "<|reserved_special_token_20|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128026": { - "content": "<|reserved_special_token_21|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128027": { - "content": "<|reserved_special_token_22|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128028": { - "content": "<|reserved_special_token_23|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128029": { - "content": "<|reserved_special_token_24|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128030": { - "content": "<|reserved_special_token_25|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128031": { - "content": "<|reserved_special_token_26|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128032": { - "content": "<|reserved_special_token_27|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128033": { - "content": "<|reserved_special_token_28|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128034": { - "content": "<|reserved_special_token_29|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128035": { - "content": "<|reserved_special_token_30|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128036": { - "content": "<|reserved_special_token_31|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128037": { - "content": "<|reserved_special_token_32|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128038": { - "content": "<|reserved_special_token_33|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128039": { - "content": "<|reserved_special_token_34|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128040": { - "content": "<|reserved_special_token_35|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128041": { - "content": "<|reserved_special_token_36|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128042": { - "content": "<|reserved_special_token_37|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128043": { - "content": "<|reserved_special_token_38|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128044": { - "content": "<|reserved_special_token_39|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128045": { - "content": "<|reserved_special_token_40|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128046": { - "content": "<|reserved_special_token_41|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128047": { - "content": "<|reserved_special_token_42|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128048": { - "content": "<|reserved_special_token_43|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128049": { - "content": "<|reserved_special_token_44|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128050": { - "content": "<|reserved_special_token_45|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128051": { - "content": "<|reserved_special_token_46|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128052": { - "content": "<|reserved_special_token_47|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128053": { - "content": "<|reserved_special_token_48|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128054": { - "content": "<|reserved_special_token_49|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128055": { - "content": "<|reserved_special_token_50|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128056": { - "content": "<|reserved_special_token_51|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128057": { - "content": "<|reserved_special_token_52|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128058": { - "content": "<|reserved_special_token_53|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128059": { - "content": "<|reserved_special_token_54|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128060": { - "content": "<|reserved_special_token_55|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128061": { - "content": "<|reserved_special_token_56|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128062": { - "content": "<|reserved_special_token_57|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128063": { - "content": "<|reserved_special_token_58|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128064": { - "content": "<|reserved_special_token_59|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128065": { - "content": "<|reserved_special_token_60|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128066": { - "content": "<|reserved_special_token_61|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128067": { - "content": "<|reserved_special_token_62|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128068": { - "content": "<|reserved_special_token_63|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128069": { - "content": "<|reserved_special_token_64|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128070": { - "content": "<|reserved_special_token_65|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128071": { - "content": "<|reserved_special_token_66|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128072": { - "content": "<|reserved_special_token_67|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128073": { - "content": "<|reserved_special_token_68|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128074": { - "content": "<|reserved_special_token_69|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128075": { - "content": "<|reserved_special_token_70|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128076": { - "content": "<|reserved_special_token_71|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128077": { - "content": "<|reserved_special_token_72|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128078": { - "content": "<|reserved_special_token_73|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128079": { - "content": "<|reserved_special_token_74|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128080": { - "content": "<|reserved_special_token_75|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128081": { - "content": "<|reserved_special_token_76|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128082": { - "content": "<|reserved_special_token_77|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128083": { - "content": "<|reserved_special_token_78|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128084": { - "content": "<|reserved_special_token_79|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128085": { - "content": "<|reserved_special_token_80|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128086": { - "content": "<|reserved_special_token_81|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128087": { - "content": "<|reserved_special_token_82|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128088": { - "content": "<|reserved_special_token_83|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128089": { - "content": "<|reserved_special_token_84|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128090": { - "content": "<|reserved_special_token_85|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128091": { - "content": "<|reserved_special_token_86|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128092": { - "content": "<|reserved_special_token_87|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128093": { - "content": "<|reserved_special_token_88|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128094": { - "content": "<|reserved_special_token_89|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128095": { - "content": "<|reserved_special_token_90|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128096": { - "content": "<|reserved_special_token_91|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128097": { - "content": "<|reserved_special_token_92|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128098": { - "content": "<|reserved_special_token_93|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128099": { - "content": "<|reserved_special_token_94|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128100": { - "content": "<|reserved_special_token_95|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128101": { - "content": "<|reserved_special_token_96|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128102": { - "content": "<|reserved_special_token_97|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128103": { - "content": "<|reserved_special_token_98|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128104": { - "content": "<|reserved_special_token_99|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128105": { - "content": "<|reserved_special_token_100|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128106": { - "content": "<|reserved_special_token_101|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128107": { - "content": "<|reserved_special_token_102|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128108": { - "content": "<|reserved_special_token_103|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128109": { - "content": "<|reserved_special_token_104|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128110": { - "content": "<|reserved_special_token_105|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128111": { - "content": "<|reserved_special_token_106|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128112": { - "content": "<|reserved_special_token_107|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128113": { - "content": "<|reserved_special_token_108|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128114": { - "content": "<|reserved_special_token_109|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128115": { - "content": "<|reserved_special_token_110|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128116": { - "content": "<|reserved_special_token_111|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128117": { - "content": "<|reserved_special_token_112|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128118": { - "content": "<|reserved_special_token_113|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128119": { - "content": "<|reserved_special_token_114|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128120": { - "content": "<|reserved_special_token_115|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128121": { - "content": "<|reserved_special_token_116|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128122": { - "content": "<|reserved_special_token_117|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128123": { - "content": "<|reserved_special_token_118|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128124": { - "content": "<|reserved_special_token_119|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128125": { - "content": "<|reserved_special_token_120|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128126": { - "content": "<|reserved_special_token_121|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128127": { - "content": "<|reserved_special_token_122|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128128": { - "content": "<|reserved_special_token_123|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128129": { - "content": "<|reserved_special_token_124|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128130": { - "content": "<|reserved_special_token_125|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128131": { - "content": "<|reserved_special_token_126|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128132": { - "content": "<|reserved_special_token_127|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128133": { - "content": "<|reserved_special_token_128|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128134": { - "content": "<|reserved_special_token_129|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128135": { - "content": "<|reserved_special_token_130|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128136": { - "content": "<|reserved_special_token_131|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128137": { - "content": "<|reserved_special_token_132|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128138": { - "content": "<|reserved_special_token_133|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128139": { - "content": "<|reserved_special_token_134|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128140": { - "content": "<|reserved_special_token_135|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128141": { - "content": "<|reserved_special_token_136|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128142": { - "content": "<|reserved_special_token_137|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128143": { - "content": "<|reserved_special_token_138|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128144": { - "content": "<|reserved_special_token_139|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128145": { - "content": "<|reserved_special_token_140|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128146": { - "content": "<|reserved_special_token_141|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128147": { - "content": "<|reserved_special_token_142|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128148": { - "content": "<|reserved_special_token_143|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128149": { - "content": "<|reserved_special_token_144|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128150": { - "content": "<|reserved_special_token_145|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128151": { - "content": "<|reserved_special_token_146|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128152": { - "content": "<|reserved_special_token_147|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128153": { - "content": "<|reserved_special_token_148|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128154": { - "content": "<|reserved_special_token_149|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128155": { - "content": "<|reserved_special_token_150|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128156": { - "content": "<|reserved_special_token_151|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128157": { - "content": "<|reserved_special_token_152|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128158": { - "content": "<|reserved_special_token_153|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128159": { - "content": "<|reserved_special_token_154|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128160": { - "content": "<|reserved_special_token_155|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128161": { - "content": "<|reserved_special_token_156|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128162": { - "content": "<|reserved_special_token_157|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128163": { - "content": "<|reserved_special_token_158|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128164": { - "content": "<|reserved_special_token_159|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128165": { - "content": "<|reserved_special_token_160|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128166": { - "content": "<|reserved_special_token_161|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128167": { - "content": "<|reserved_special_token_162|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128168": { - "content": "<|reserved_special_token_163|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128169": { - "content": "<|reserved_special_token_164|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128170": { - "content": "<|reserved_special_token_165|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128171": { - "content": "<|reserved_special_token_166|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128172": { - "content": "<|reserved_special_token_167|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128173": { - "content": "<|reserved_special_token_168|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128174": { - "content": "<|reserved_special_token_169|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128175": { - "content": "<|reserved_special_token_170|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128176": { - "content": "<|reserved_special_token_171|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128177": { - "content": "<|reserved_special_token_172|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128178": { - "content": "<|reserved_special_token_173|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128179": { - "content": "<|reserved_special_token_174|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128180": { - "content": "<|reserved_special_token_175|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128181": { - "content": "<|reserved_special_token_176|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128182": { - "content": "<|reserved_special_token_177|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128183": { - "content": "<|reserved_special_token_178|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128184": { - "content": "<|reserved_special_token_179|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128185": { - "content": "<|reserved_special_token_180|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128186": { - "content": "<|reserved_special_token_181|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128187": { - "content": "<|reserved_special_token_182|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128188": { - "content": "<|reserved_special_token_183|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128189": { - "content": "<|reserved_special_token_184|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128190": { - "content": "<|reserved_special_token_185|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128191": { - "content": "<|reserved_special_token_186|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128192": { - "content": "<|reserved_special_token_187|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128193": { - "content": "<|reserved_special_token_188|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128194": { - "content": "<|reserved_special_token_189|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128195": { - "content": "<|reserved_special_token_190|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128196": { - "content": "<|reserved_special_token_191|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128197": { - "content": "<|reserved_special_token_192|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128198": { - "content": "<|reserved_special_token_193|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128199": { - "content": "<|reserved_special_token_194|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128200": { - "content": "<|reserved_special_token_195|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128201": { - "content": "<|reserved_special_token_196|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128202": { - "content": "<|reserved_special_token_197|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128203": { - "content": "<|reserved_special_token_198|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128204": { - "content": "<|reserved_special_token_199|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128205": { - "content": "<|reserved_special_token_200|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128206": { - "content": "<|reserved_special_token_201|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128207": { - "content": "<|reserved_special_token_202|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128208": { - "content": "<|reserved_special_token_203|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128209": { - "content": "<|reserved_special_token_204|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128210": { - "content": "<|reserved_special_token_205|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128211": { - "content": "<|reserved_special_token_206|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128212": { - "content": "<|reserved_special_token_207|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128213": { - "content": "<|reserved_special_token_208|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128214": { - "content": "<|reserved_special_token_209|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128215": { - "content": "<|reserved_special_token_210|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128216": { - "content": "<|reserved_special_token_211|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128217": { - "content": "<|reserved_special_token_212|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128218": { - "content": "<|reserved_special_token_213|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128219": { - "content": "<|reserved_special_token_214|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128220": { - "content": "<|reserved_special_token_215|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128221": { - "content": "<|reserved_special_token_216|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128222": { - "content": "<|reserved_special_token_217|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128223": { - "content": "<|reserved_special_token_218|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128224": { - "content": "<|reserved_special_token_219|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128225": { - "content": "<|reserved_special_token_220|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128226": { - "content": "<|reserved_special_token_221|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128227": { - "content": "<|reserved_special_token_222|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128228": { - "content": "<|reserved_special_token_223|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128229": { - "content": "<|reserved_special_token_224|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128230": { - "content": "<|reserved_special_token_225|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128231": { - "content": "<|reserved_special_token_226|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128232": { - "content": "<|reserved_special_token_227|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128233": { - "content": "<|reserved_special_token_228|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128234": { - "content": "<|reserved_special_token_229|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128235": { - "content": "<|reserved_special_token_230|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128236": { - "content": "<|reserved_special_token_231|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128237": { - "content": "<|reserved_special_token_232|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128238": { - "content": "<|reserved_special_token_233|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128239": { - "content": "<|reserved_special_token_234|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128240": { - "content": "<|reserved_special_token_235|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128241": { - "content": "<|reserved_special_token_236|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128242": { - "content": "<|reserved_special_token_237|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128243": { - "content": "<|reserved_special_token_238|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128244": { - "content": "<|reserved_special_token_239|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128245": { - "content": "<|reserved_special_token_240|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128246": { - "content": "<|reserved_special_token_241|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128247": { - "content": "<|reserved_special_token_242|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128248": { - "content": "<|reserved_special_token_243|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128249": { - "content": "<|reserved_special_token_244|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128250": { - "content": "<|reserved_special_token_245|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128251": { - "content": "<|reserved_special_token_246|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128252": { - "content": "<|reserved_special_token_247|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128253": { - "content": "<|reserved_special_token_248|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128254": { - "content": "<|reserved_special_token_249|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128255": { - "content": "<|reserved_special_token_250|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128256": { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - } - }, - "additional_special_tokens": [ - "<|eom_id|>" - ], - "bos_token": "<|begin_of_text|>", - "clean_up_tokenization_spaces": true, - "eos_token": "<|eot_id|>", - "extra_special_tokens": {}, - "model_input_names": [ - "input_ids", - "attention_mask" - ], - "model_max_length": 1000000000000000019884624838656, - "pad_token": "<|eot_id|>", - "padding_side": "right", - "split_special_tokens": false, - "tokenizer_class": "PreTrainedTokenizerFast" -} diff --git a/metallama3_8b/limo/checkpoint-1025/trainer_state.json b/metallama3_8b/limo/checkpoint-1025/trainer_state.json deleted file mode 100644 index 25217fd6069a982d36ceb38bfb3b1d29639da00b..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1025/trainer_state.json +++ /dev/null @@ -1,7209 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 5.0, - "eval_steps": 500, - "global_step": 1025, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.004878048780487805, - "grad_norm": 27.79998016357422, - "learning_rate": 5e-06, - "loss": 1.4179, - "step": 1 - }, - { - "epoch": 0.00975609756097561, - "grad_norm": 4.086409091949463, - "learning_rate": 4.999997064365715e-06, - "loss": 1.1405, - "step": 2 - }, - { - "epoch": 0.014634146341463415, - "grad_norm": 4.499151229858398, - "learning_rate": 4.999988257469751e-06, - "loss": 0.8682, - "step": 3 - }, - { - "epoch": 0.01951219512195122, - "grad_norm": 4.555822849273682, - "learning_rate": 4.999973579332793e-06, - "loss": 0.9961, - "step": 4 - }, - { - "epoch": 0.024390243902439025, - "grad_norm": 5.6235246658325195, - "learning_rate": 4.999953029989312e-06, - "loss": 1.0173, - "step": 5 - }, - { - "epoch": 0.02926829268292683, - "grad_norm": 3.9943182468414307, - "learning_rate": 4.999926609487568e-06, - "loss": 1.1083, - "step": 6 - }, - { - "epoch": 0.03414634146341464, - "grad_norm": 5.685941219329834, - "learning_rate": 4.9998943178896106e-06, - "loss": 1.1109, - "step": 7 - }, - { - "epoch": 0.03902439024390244, - "grad_norm": 15.914257049560547, - "learning_rate": 4.999856155271276e-06, - "loss": 1.821, - "step": 8 - }, - { - "epoch": 0.04390243902439024, - "grad_norm": 4.147185325622559, - "learning_rate": 4.999812121722191e-06, - "loss": 1.0417, - "step": 9 - }, - { - "epoch": 0.04878048780487805, - "grad_norm": 11.123332977294922, - "learning_rate": 4.999762217345766e-06, - "loss": 1.5672, - "step": 10 - }, - { - "epoch": 0.05365853658536585, - "grad_norm": 2.842331886291504, - "learning_rate": 4.999706442259205e-06, - "loss": 0.7297, - "step": 11 - }, - { - "epoch": 0.05853658536585366, - "grad_norm": 37.685062408447266, - "learning_rate": 4.999644796593492e-06, - "loss": 0.9112, - "step": 12 - }, - { - "epoch": 0.06341463414634146, - "grad_norm": 11.214252471923828, - "learning_rate": 4.999577280493407e-06, - "loss": 0.7854, - "step": 13 - }, - { - "epoch": 0.06829268292682927, - "grad_norm": 5.10387659072876, - "learning_rate": 4.99950389411751e-06, - "loss": 1.1317, - "step": 14 - }, - { - "epoch": 0.07317073170731707, - "grad_norm": 3.685403347015381, - "learning_rate": 4.999424637638148e-06, - "loss": 0.7864, - "step": 15 - }, - { - "epoch": 0.07804878048780488, - "grad_norm": 2.9567184448242188, - "learning_rate": 4.999339511241458e-06, - "loss": 0.8494, - "step": 16 - }, - { - "epoch": 0.08292682926829269, - "grad_norm": 11.396956443786621, - "learning_rate": 4.9992485151273584e-06, - "loss": 1.2189, - "step": 17 - }, - { - "epoch": 0.08780487804878048, - "grad_norm": 7.007385730743408, - "learning_rate": 4.999151649509554e-06, - "loss": 1.0532, - "step": 18 - }, - { - "epoch": 0.09268292682926829, - "grad_norm": 3.4347329139709473, - "learning_rate": 4.9990489146155356e-06, - "loss": 1.088, - "step": 19 - }, - { - "epoch": 0.0975609756097561, - "grad_norm": 3.1865031719207764, - "learning_rate": 4.9989403106865765e-06, - "loss": 1.0414, - "step": 20 - }, - { - "epoch": 0.1024390243902439, - "grad_norm": 3.4605791568756104, - "learning_rate": 4.9988258379777334e-06, - "loss": 0.8878, - "step": 21 - }, - { - "epoch": 0.1073170731707317, - "grad_norm": 2.860478639602661, - "learning_rate": 4.998705496757846e-06, - "loss": 0.9151, - "step": 22 - }, - { - "epoch": 0.11219512195121951, - "grad_norm": 9.101946830749512, - "learning_rate": 4.998579287309538e-06, - "loss": 1.4304, - "step": 23 - }, - { - "epoch": 0.11707317073170732, - "grad_norm": 24.21122169494629, - "learning_rate": 4.998447209929211e-06, - "loss": 1.0858, - "step": 24 - }, - { - "epoch": 0.12195121951219512, - "grad_norm": 3.286980152130127, - "learning_rate": 4.998309264927053e-06, - "loss": 0.6571, - "step": 25 - }, - { - "epoch": 0.12682926829268293, - "grad_norm": 4.0232062339782715, - "learning_rate": 4.998165452627025e-06, - "loss": 0.8493, - "step": 26 - }, - { - "epoch": 0.13170731707317074, - "grad_norm": 3.7688663005828857, - "learning_rate": 4.998015773366874e-06, - "loss": 0.9224, - "step": 27 - }, - { - "epoch": 0.13658536585365855, - "grad_norm": 2.9382026195526123, - "learning_rate": 4.997860227498122e-06, - "loss": 0.7588, - "step": 28 - }, - { - "epoch": 0.14146341463414633, - "grad_norm": 4.327457904815674, - "learning_rate": 4.99769881538607e-06, - "loss": 1.1817, - "step": 29 - }, - { - "epoch": 0.14634146341463414, - "grad_norm": 3.47487735748291, - "learning_rate": 4.997531537409794e-06, - "loss": 1.0737, - "step": 30 - }, - { - "epoch": 0.15121951219512195, - "grad_norm": 3.0616214275360107, - "learning_rate": 4.99735839396215e-06, - "loss": 0.7899, - "step": 31 - }, - { - "epoch": 0.15609756097560976, - "grad_norm": 3.065070152282715, - "learning_rate": 4.9971793854497655e-06, - "loss": 0.7745, - "step": 32 - }, - { - "epoch": 0.16097560975609757, - "grad_norm": 3.5202279090881348, - "learning_rate": 4.996994512293042e-06, - "loss": 0.984, - "step": 33 - }, - { - "epoch": 0.16585365853658537, - "grad_norm": 3.421769142150879, - "learning_rate": 4.996803774926157e-06, - "loss": 0.8235, - "step": 34 - }, - { - "epoch": 0.17073170731707318, - "grad_norm": 4.6582207679748535, - "learning_rate": 4.996607173797059e-06, - "loss": 1.3227, - "step": 35 - }, - { - "epoch": 0.17560975609756097, - "grad_norm": 2.9829282760620117, - "learning_rate": 4.996404709367466e-06, - "loss": 0.8854, - "step": 36 - }, - { - "epoch": 0.18048780487804877, - "grad_norm": 2.5982632637023926, - "learning_rate": 4.996196382112868e-06, - "loss": 0.6786, - "step": 37 - }, - { - "epoch": 0.18536585365853658, - "grad_norm": 2.9807393550872803, - "learning_rate": 4.9959821925225235e-06, - "loss": 0.9344, - "step": 38 - }, - { - "epoch": 0.1902439024390244, - "grad_norm": 2.7364351749420166, - "learning_rate": 4.995762141099456e-06, - "loss": 0.814, - "step": 39 - }, - { - "epoch": 0.1951219512195122, - "grad_norm": 3.4324638843536377, - "learning_rate": 4.995536228360461e-06, - "loss": 1.0276, - "step": 40 - }, - { - "epoch": 0.2, - "grad_norm": 2.911834716796875, - "learning_rate": 4.995304454836095e-06, - "loss": 0.9291, - "step": 41 - }, - { - "epoch": 0.2048780487804878, - "grad_norm": 3.0294723510742188, - "learning_rate": 4.9950668210706795e-06, - "loss": 0.8145, - "step": 42 - }, - { - "epoch": 0.2097560975609756, - "grad_norm": 4.681829452514648, - "learning_rate": 4.994823327622299e-06, - "loss": 0.8779, - "step": 43 - }, - { - "epoch": 0.2146341463414634, - "grad_norm": 3.643914222717285, - "learning_rate": 4.9945739750628e-06, - "loss": 0.8196, - "step": 44 - }, - { - "epoch": 0.21951219512195122, - "grad_norm": 2.7542076110839844, - "learning_rate": 4.994318763977789e-06, - "loss": 0.8443, - "step": 45 - }, - { - "epoch": 0.22439024390243903, - "grad_norm": 6.873605728149414, - "learning_rate": 4.994057694966632e-06, - "loss": 1.0328, - "step": 46 - }, - { - "epoch": 0.22926829268292684, - "grad_norm": 3.11810040473938, - "learning_rate": 4.993790768642449e-06, - "loss": 1.0673, - "step": 47 - }, - { - "epoch": 0.23414634146341465, - "grad_norm": 4.360548496246338, - "learning_rate": 4.99351798563212e-06, - "loss": 1.3198, - "step": 48 - }, - { - "epoch": 0.23902439024390243, - "grad_norm": 2.6894314289093018, - "learning_rate": 4.993239346576278e-06, - "loss": 0.8743, - "step": 49 - }, - { - "epoch": 0.24390243902439024, - "grad_norm": 3.2640421390533447, - "learning_rate": 4.99295485212931e-06, - "loss": 1.109, - "step": 50 - }, - { - "epoch": 0.24878048780487805, - "grad_norm": 3.1565866470336914, - "learning_rate": 4.992664502959351e-06, - "loss": 0.9291, - "step": 51 - }, - { - "epoch": 0.25365853658536586, - "grad_norm": 3.4829447269439697, - "learning_rate": 4.99236829974829e-06, - "loss": 0.8159, - "step": 52 - }, - { - "epoch": 0.25853658536585367, - "grad_norm": 2.7535626888275146, - "learning_rate": 4.992066243191762e-06, - "loss": 1.0359, - "step": 53 - }, - { - "epoch": 0.2634146341463415, - "grad_norm": 2.482935905456543, - "learning_rate": 4.991758333999148e-06, - "loss": 0.8091, - "step": 54 - }, - { - "epoch": 0.2682926829268293, - "grad_norm": 2.917445659637451, - "learning_rate": 4.991444572893575e-06, - "loss": 0.6925, - "step": 55 - }, - { - "epoch": 0.2731707317073171, - "grad_norm": 2.9802236557006836, - "learning_rate": 4.991124960611916e-06, - "loss": 0.6329, - "step": 56 - }, - { - "epoch": 0.2780487804878049, - "grad_norm": 2.9677224159240723, - "learning_rate": 4.99079949790478e-06, - "loss": 0.8069, - "step": 57 - }, - { - "epoch": 0.28292682926829266, - "grad_norm": 2.8304293155670166, - "learning_rate": 4.99046818553652e-06, - "loss": 0.8682, - "step": 58 - }, - { - "epoch": 0.28780487804878047, - "grad_norm": 5.253443717956543, - "learning_rate": 4.9901310242852246e-06, - "loss": 1.1069, - "step": 59 - }, - { - "epoch": 0.2926829268292683, - "grad_norm": 3.686016082763672, - "learning_rate": 4.9897880149427206e-06, - "loss": 0.9465, - "step": 60 - }, - { - "epoch": 0.2975609756097561, - "grad_norm": 3.6372263431549072, - "learning_rate": 4.989439158314566e-06, - "loss": 0.9738, - "step": 61 - }, - { - "epoch": 0.3024390243902439, - "grad_norm": 3.0756819248199463, - "learning_rate": 4.989084455220056e-06, - "loss": 0.6417, - "step": 62 - }, - { - "epoch": 0.3073170731707317, - "grad_norm": 3.379222869873047, - "learning_rate": 4.988723906492212e-06, - "loss": 1.0092, - "step": 63 - }, - { - "epoch": 0.3121951219512195, - "grad_norm": 3.4571032524108887, - "learning_rate": 4.988357512977785e-06, - "loss": 0.6691, - "step": 64 - }, - { - "epoch": 0.3170731707317073, - "grad_norm": 3.1982104778289795, - "learning_rate": 4.987985275537252e-06, - "loss": 0.6651, - "step": 65 - }, - { - "epoch": 0.32195121951219513, - "grad_norm": 2.9723124504089355, - "learning_rate": 4.9876071950448185e-06, - "loss": 0.9227, - "step": 66 - }, - { - "epoch": 0.32682926829268294, - "grad_norm": 2.5521399974823, - "learning_rate": 4.987223272388407e-06, - "loss": 0.6664, - "step": 67 - }, - { - "epoch": 0.33170731707317075, - "grad_norm": 2.8934121131896973, - "learning_rate": 4.986833508469663e-06, - "loss": 0.997, - "step": 68 - }, - { - "epoch": 0.33658536585365856, - "grad_norm": 4.7546586990356445, - "learning_rate": 4.98643790420395e-06, - "loss": 0.8551, - "step": 69 - }, - { - "epoch": 0.34146341463414637, - "grad_norm": 3.091616153717041, - "learning_rate": 4.986036460520348e-06, - "loss": 0.8874, - "step": 70 - }, - { - "epoch": 0.3463414634146341, - "grad_norm": 4.1724677085876465, - "learning_rate": 4.98562917836165e-06, - "loss": 1.1393, - "step": 71 - }, - { - "epoch": 0.35121951219512193, - "grad_norm": 2.6568572521209717, - "learning_rate": 4.985216058684362e-06, - "loss": 0.6379, - "step": 72 - }, - { - "epoch": 0.35609756097560974, - "grad_norm": 2.396416187286377, - "learning_rate": 4.984797102458697e-06, - "loss": 1.0292, - "step": 73 - }, - { - "epoch": 0.36097560975609755, - "grad_norm": 3.0667319297790527, - "learning_rate": 4.984372310668579e-06, - "loss": 0.7048, - "step": 74 - }, - { - "epoch": 0.36585365853658536, - "grad_norm": 2.4820518493652344, - "learning_rate": 4.983941684311633e-06, - "loss": 1.2353, - "step": 75 - }, - { - "epoch": 0.37073170731707317, - "grad_norm": 4.062836647033691, - "learning_rate": 4.983505224399188e-06, - "loss": 0.8933, - "step": 76 - }, - { - "epoch": 0.375609756097561, - "grad_norm": 2.4480767250061035, - "learning_rate": 4.983062931956275e-06, - "loss": 0.8221, - "step": 77 - }, - { - "epoch": 0.3804878048780488, - "grad_norm": 3.134138822555542, - "learning_rate": 4.9826148080216195e-06, - "loss": 0.8899, - "step": 78 - }, - { - "epoch": 0.3853658536585366, - "grad_norm": 2.8165836334228516, - "learning_rate": 4.9821608536476445e-06, - "loss": 1.2451, - "step": 79 - }, - { - "epoch": 0.3902439024390244, - "grad_norm": 3.734433650970459, - "learning_rate": 4.981701069900465e-06, - "loss": 0.8536, - "step": 80 - }, - { - "epoch": 0.3951219512195122, - "grad_norm": 2.853421449661255, - "learning_rate": 4.9812354578598876e-06, - "loss": 0.7857, - "step": 81 - }, - { - "epoch": 0.4, - "grad_norm": 2.541687250137329, - "learning_rate": 4.980764018619405e-06, - "loss": 0.8332, - "step": 82 - }, - { - "epoch": 0.40487804878048783, - "grad_norm": 4.405911445617676, - "learning_rate": 4.980286753286196e-06, - "loss": 0.9927, - "step": 83 - }, - { - "epoch": 0.4097560975609756, - "grad_norm": 3.3034985065460205, - "learning_rate": 4.97980366298112e-06, - "loss": 0.8161, - "step": 84 - }, - { - "epoch": 0.4146341463414634, - "grad_norm": 2.6678085327148438, - "learning_rate": 4.97931474883872e-06, - "loss": 0.8017, - "step": 85 - }, - { - "epoch": 0.4195121951219512, - "grad_norm": 2.58524227142334, - "learning_rate": 4.978820012007213e-06, - "loss": 0.8811, - "step": 86 - }, - { - "epoch": 0.424390243902439, - "grad_norm": 2.482597827911377, - "learning_rate": 4.978319453648495e-06, - "loss": 0.9461, - "step": 87 - }, - { - "epoch": 0.4292682926829268, - "grad_norm": 2.5731301307678223, - "learning_rate": 4.977813074938128e-06, - "loss": 0.8835, - "step": 88 - }, - { - "epoch": 0.43414634146341463, - "grad_norm": 2.7914488315582275, - "learning_rate": 4.977300877065347e-06, - "loss": 0.8466, - "step": 89 - }, - { - "epoch": 0.43902439024390244, - "grad_norm": 2.416043758392334, - "learning_rate": 4.976782861233053e-06, - "loss": 0.7132, - "step": 90 - }, - { - "epoch": 0.44390243902439025, - "grad_norm": 3.7616264820098877, - "learning_rate": 4.976259028657812e-06, - "loss": 0.7639, - "step": 91 - }, - { - "epoch": 0.44878048780487806, - "grad_norm": 2.6081621646881104, - "learning_rate": 4.975729380569845e-06, - "loss": 0.8055, - "step": 92 - }, - { - "epoch": 0.45365853658536587, - "grad_norm": 3.3343570232391357, - "learning_rate": 4.975193918213035e-06, - "loss": 0.6042, - "step": 93 - }, - { - "epoch": 0.4585365853658537, - "grad_norm": 2.517544746398926, - "learning_rate": 4.974652642844921e-06, - "loss": 0.7672, - "step": 94 - }, - { - "epoch": 0.4634146341463415, - "grad_norm": 4.173468589782715, - "learning_rate": 4.974105555736693e-06, - "loss": 1.0682, - "step": 95 - }, - { - "epoch": 0.4682926829268293, - "grad_norm": 2.8422317504882812, - "learning_rate": 4.973552658173186e-06, - "loss": 0.7841, - "step": 96 - }, - { - "epoch": 0.47317073170731705, - "grad_norm": 5.042182445526123, - "learning_rate": 4.972993951452887e-06, - "loss": 0.8851, - "step": 97 - }, - { - "epoch": 0.47804878048780486, - "grad_norm": 5.977590560913086, - "learning_rate": 4.9724294368879214e-06, - "loss": 0.9059, - "step": 98 - }, - { - "epoch": 0.48292682926829267, - "grad_norm": 4.227641582489014, - "learning_rate": 4.971859115804055e-06, - "loss": 1.0152, - "step": 99 - }, - { - "epoch": 0.4878048780487805, - "grad_norm": 3.180952548980713, - "learning_rate": 4.9712829895406935e-06, - "loss": 0.8092, - "step": 100 - }, - { - "epoch": 0.4926829268292683, - "grad_norm": 11.220394134521484, - "learning_rate": 4.970701059450872e-06, - "loss": 0.8239, - "step": 101 - }, - { - "epoch": 0.4975609756097561, - "grad_norm": 2.346975088119507, - "learning_rate": 4.970113326901258e-06, - "loss": 0.9283, - "step": 102 - }, - { - "epoch": 0.5024390243902439, - "grad_norm": 2.9470982551574707, - "learning_rate": 4.9695197932721455e-06, - "loss": 0.9429, - "step": 103 - }, - { - "epoch": 0.5073170731707317, - "grad_norm": 3.6048219203948975, - "learning_rate": 4.968920459957453e-06, - "loss": 0.9231, - "step": 104 - }, - { - "epoch": 0.5121951219512195, - "grad_norm": 2.8181886672973633, - "learning_rate": 4.968315328364719e-06, - "loss": 1.0005, - "step": 105 - }, - { - "epoch": 0.5170731707317073, - "grad_norm": 3.114147424697876, - "learning_rate": 4.9677043999151e-06, - "loss": 1.1326, - "step": 106 - }, - { - "epoch": 0.5219512195121951, - "grad_norm": 2.965885639190674, - "learning_rate": 4.967087676043366e-06, - "loss": 0.541, - "step": 107 - }, - { - "epoch": 0.526829268292683, - "grad_norm": 3.098677635192871, - "learning_rate": 4.966465158197897e-06, - "loss": 0.9473, - "step": 108 - }, - { - "epoch": 0.5317073170731708, - "grad_norm": 2.8640191555023193, - "learning_rate": 4.965836847840681e-06, - "loss": 0.6678, - "step": 109 - }, - { - "epoch": 0.5365853658536586, - "grad_norm": 3.0950934886932373, - "learning_rate": 4.96520274644731e-06, - "loss": 0.9251, - "step": 110 - }, - { - "epoch": 0.5414634146341464, - "grad_norm": 2.99444317817688, - "learning_rate": 4.964562855506976e-06, - "loss": 0.7807, - "step": 111 - }, - { - "epoch": 0.5463414634146342, - "grad_norm": 2.348639726638794, - "learning_rate": 4.963917176522466e-06, - "loss": 0.6395, - "step": 112 - }, - { - "epoch": 0.551219512195122, - "grad_norm": 3.5988354682922363, - "learning_rate": 4.963265711010164e-06, - "loss": 1.0658, - "step": 113 - }, - { - "epoch": 0.5560975609756098, - "grad_norm": 3.3423564434051514, - "learning_rate": 4.9626084605000395e-06, - "loss": 0.8974, - "step": 114 - }, - { - "epoch": 0.5609756097560976, - "grad_norm": 2.8353331089019775, - "learning_rate": 4.961945426535652e-06, - "loss": 0.6144, - "step": 115 - }, - { - "epoch": 0.5658536585365853, - "grad_norm": 2.752387046813965, - "learning_rate": 4.961276610674141e-06, - "loss": 0.9083, - "step": 116 - }, - { - "epoch": 0.5707317073170731, - "grad_norm": 2.2654404640197754, - "learning_rate": 4.960602014486225e-06, - "loss": 1.0101, - "step": 117 - }, - { - "epoch": 0.5756097560975609, - "grad_norm": 3.344377040863037, - "learning_rate": 4.959921639556199e-06, - "loss": 0.8391, - "step": 118 - }, - { - "epoch": 0.5804878048780487, - "grad_norm": 3.1620500087738037, - "learning_rate": 4.959235487481928e-06, - "loss": 1.0431, - "step": 119 - }, - { - "epoch": 0.5853658536585366, - "grad_norm": 2.857048273086548, - "learning_rate": 4.958543559874846e-06, - "loss": 0.5864, - "step": 120 - }, - { - "epoch": 0.5902439024390244, - "grad_norm": 3.1736063957214355, - "learning_rate": 4.9578458583599495e-06, - "loss": 0.7868, - "step": 121 - }, - { - "epoch": 0.5951219512195122, - "grad_norm": 3.5520827770233154, - "learning_rate": 4.957142384575795e-06, - "loss": 0.7901, - "step": 122 - }, - { - "epoch": 0.6, - "grad_norm": 3.265103578567505, - "learning_rate": 4.956433140174498e-06, - "loss": 0.9067, - "step": 123 - }, - { - "epoch": 0.6048780487804878, - "grad_norm": 3.1181187629699707, - "learning_rate": 4.9557181268217225e-06, - "loss": 0.8971, - "step": 124 - }, - { - "epoch": 0.6097560975609756, - "grad_norm": 2.4123694896698, - "learning_rate": 4.954997346196683e-06, - "loss": 1.2123, - "step": 125 - }, - { - "epoch": 0.6146341463414634, - "grad_norm": 2.9646875858306885, - "learning_rate": 4.954270799992138e-06, - "loss": 0.7696, - "step": 126 - }, - { - "epoch": 0.6195121951219512, - "grad_norm": 2.7457995414733887, - "learning_rate": 4.953538489914387e-06, - "loss": 0.7919, - "step": 127 - }, - { - "epoch": 0.624390243902439, - "grad_norm": 5.096850395202637, - "learning_rate": 4.9528004176832654e-06, - "loss": 0.6494, - "step": 128 - }, - { - "epoch": 0.6292682926829268, - "grad_norm": 3.124955177307129, - "learning_rate": 4.952056585032142e-06, - "loss": 1.0546, - "step": 129 - }, - { - "epoch": 0.6341463414634146, - "grad_norm": 2.4860167503356934, - "learning_rate": 4.951306993707913e-06, - "loss": 0.7907, - "step": 130 - }, - { - "epoch": 0.6390243902439025, - "grad_norm": 2.3380239009857178, - "learning_rate": 4.950551645470998e-06, - "loss": 0.7433, - "step": 131 - }, - { - "epoch": 0.6439024390243903, - "grad_norm": 2.8945236206054688, - "learning_rate": 4.9497905420953406e-06, - "loss": 0.7682, - "step": 132 - }, - { - "epoch": 0.6487804878048781, - "grad_norm": 3.429776430130005, - "learning_rate": 4.949023685368395e-06, - "loss": 0.8411, - "step": 133 - }, - { - "epoch": 0.6536585365853659, - "grad_norm": 2.8853516578674316, - "learning_rate": 4.948251077091131e-06, - "loss": 1.0792, - "step": 134 - }, - { - "epoch": 0.6585365853658537, - "grad_norm": 2.145598888397217, - "learning_rate": 4.947472719078025e-06, - "loss": 0.8033, - "step": 135 - }, - { - "epoch": 0.6634146341463415, - "grad_norm": 2.5064377784729004, - "learning_rate": 4.9466886131570565e-06, - "loss": 0.939, - "step": 136 - }, - { - "epoch": 0.6682926829268293, - "grad_norm": 2.5700225830078125, - "learning_rate": 4.945898761169704e-06, - "loss": 1.0418, - "step": 137 - }, - { - "epoch": 0.6731707317073171, - "grad_norm": 2.3390917778015137, - "learning_rate": 4.945103164970941e-06, - "loss": 0.6158, - "step": 138 - }, - { - "epoch": 0.6780487804878049, - "grad_norm": 2.1538751125335693, - "learning_rate": 4.9443018264292304e-06, - "loss": 0.6995, - "step": 139 - }, - { - "epoch": 0.6829268292682927, - "grad_norm": 5.255710601806641, - "learning_rate": 4.9434947474265225e-06, - "loss": 1.0382, - "step": 140 - }, - { - "epoch": 0.6878048780487804, - "grad_norm": 2.5547356605529785, - "learning_rate": 4.942681929858249e-06, - "loss": 1.037, - "step": 141 - }, - { - "epoch": 0.6926829268292682, - "grad_norm": 2.613280773162842, - "learning_rate": 4.941863375633315e-06, - "loss": 0.9071, - "step": 142 - }, - { - "epoch": 0.697560975609756, - "grad_norm": 2.9957327842712402, - "learning_rate": 4.9410390866741056e-06, - "loss": 0.7908, - "step": 143 - }, - { - "epoch": 0.7024390243902439, - "grad_norm": 2.410107374191284, - "learning_rate": 4.9402090649164655e-06, - "loss": 0.7739, - "step": 144 - }, - { - "epoch": 0.7073170731707317, - "grad_norm": 2.352013349533081, - "learning_rate": 4.9393733123097085e-06, - "loss": 0.939, - "step": 145 - }, - { - "epoch": 0.7121951219512195, - "grad_norm": 2.5164194107055664, - "learning_rate": 4.9385318308166065e-06, - "loss": 0.8729, - "step": 146 - }, - { - "epoch": 0.7170731707317073, - "grad_norm": 4.213881015777588, - "learning_rate": 4.937684622413385e-06, - "loss": 0.6124, - "step": 147 - }, - { - "epoch": 0.7219512195121951, - "grad_norm": 2.7950191497802734, - "learning_rate": 4.9368316890897185e-06, - "loss": 0.975, - "step": 148 - }, - { - "epoch": 0.7268292682926829, - "grad_norm": 2.8618874549865723, - "learning_rate": 4.9359730328487264e-06, - "loss": 0.5832, - "step": 149 - }, - { - "epoch": 0.7317073170731707, - "grad_norm": 2.6943812370300293, - "learning_rate": 4.935108655706972e-06, - "loss": 0.8124, - "step": 150 - }, - { - "epoch": 0.7365853658536585, - "grad_norm": 3.2164082527160645, - "learning_rate": 4.934238559694448e-06, - "loss": 1.1446, - "step": 151 - }, - { - "epoch": 0.7414634146341463, - "grad_norm": 3.05002498626709, - "learning_rate": 4.9333627468545845e-06, - "loss": 0.7884, - "step": 152 - }, - { - "epoch": 0.7463414634146341, - "grad_norm": 2.863351583480835, - "learning_rate": 4.932481219244231e-06, - "loss": 0.7918, - "step": 153 - }, - { - "epoch": 0.751219512195122, - "grad_norm": 2.4947102069854736, - "learning_rate": 4.931593978933666e-06, - "loss": 0.775, - "step": 154 - }, - { - "epoch": 0.7560975609756098, - "grad_norm": 2.918886184692383, - "learning_rate": 4.930701028006577e-06, - "loss": 0.993, - "step": 155 - }, - { - "epoch": 0.7609756097560976, - "grad_norm": 2.835956573486328, - "learning_rate": 4.929802368560066e-06, - "loss": 0.7911, - "step": 156 - }, - { - "epoch": 0.7658536585365854, - "grad_norm": 3.3073575496673584, - "learning_rate": 4.928898002704642e-06, - "loss": 0.9346, - "step": 157 - }, - { - "epoch": 0.7707317073170732, - "grad_norm": 3.086146354675293, - "learning_rate": 4.927987932564215e-06, - "loss": 0.817, - "step": 158 - }, - { - "epoch": 0.775609756097561, - "grad_norm": 2.5419743061065674, - "learning_rate": 4.927072160276092e-06, - "loss": 0.7918, - "step": 159 - }, - { - "epoch": 0.7804878048780488, - "grad_norm": 3.984297275543213, - "learning_rate": 4.926150687990969e-06, - "loss": 0.7153, - "step": 160 - }, - { - "epoch": 0.7853658536585366, - "grad_norm": 2.4703335762023926, - "learning_rate": 4.925223517872934e-06, - "loss": 0.8982, - "step": 161 - }, - { - "epoch": 0.7902439024390244, - "grad_norm": 2.81785249710083, - "learning_rate": 4.9242906520994484e-06, - "loss": 0.9839, - "step": 162 - }, - { - "epoch": 0.7951219512195122, - "grad_norm": 2.3304924964904785, - "learning_rate": 4.923352092861358e-06, - "loss": 0.8406, - "step": 163 - }, - { - "epoch": 0.8, - "grad_norm": 2.339498519897461, - "learning_rate": 4.922407842362875e-06, - "loss": 0.6602, - "step": 164 - }, - { - "epoch": 0.8048780487804879, - "grad_norm": 3.488255262374878, - "learning_rate": 4.921457902821578e-06, - "loss": 0.9779, - "step": 165 - }, - { - "epoch": 0.8097560975609757, - "grad_norm": 2.8528945446014404, - "learning_rate": 4.920502276468408e-06, - "loss": 0.8821, - "step": 166 - }, - { - "epoch": 0.8146341463414634, - "grad_norm": 3.4649784564971924, - "learning_rate": 4.9195409655476605e-06, - "loss": 0.7539, - "step": 167 - }, - { - "epoch": 0.8195121951219512, - "grad_norm": 2.3109042644500732, - "learning_rate": 4.918573972316982e-06, - "loss": 0.9807, - "step": 168 - }, - { - "epoch": 0.824390243902439, - "grad_norm": 2.678666353225708, - "learning_rate": 4.917601299047361e-06, - "loss": 0.8318, - "step": 169 - }, - { - "epoch": 0.8292682926829268, - "grad_norm": 2.730614185333252, - "learning_rate": 4.916622948023129e-06, - "loss": 0.7816, - "step": 170 - }, - { - "epoch": 0.8341463414634146, - "grad_norm": 2.9835665225982666, - "learning_rate": 4.915638921541952e-06, - "loss": 0.6633, - "step": 171 - }, - { - "epoch": 0.8390243902439024, - "grad_norm": 3.31217360496521, - "learning_rate": 4.914649221914822e-06, - "loss": 0.9296, - "step": 172 - }, - { - "epoch": 0.8439024390243902, - "grad_norm": 2.9021658897399902, - "learning_rate": 4.913653851466057e-06, - "loss": 0.6864, - "step": 173 - }, - { - "epoch": 0.848780487804878, - "grad_norm": 3.3672914505004883, - "learning_rate": 4.912652812533291e-06, - "loss": 0.8599, - "step": 174 - }, - { - "epoch": 0.8536585365853658, - "grad_norm": 2.4871644973754883, - "learning_rate": 4.911646107467472e-06, - "loss": 0.8949, - "step": 175 - }, - { - "epoch": 0.8585365853658536, - "grad_norm": 2.728022813796997, - "learning_rate": 4.9106337386328524e-06, - "loss": 0.9758, - "step": 176 - }, - { - "epoch": 0.8634146341463415, - "grad_norm": 2.704252243041992, - "learning_rate": 4.909615708406991e-06, - "loss": 0.8954, - "step": 177 - }, - { - "epoch": 0.8682926829268293, - "grad_norm": 2.4002223014831543, - "learning_rate": 4.908592019180738e-06, - "loss": 0.7157, - "step": 178 - }, - { - "epoch": 0.8731707317073171, - "grad_norm": 2.1927788257598877, - "learning_rate": 4.907562673358234e-06, - "loss": 0.6358, - "step": 179 - }, - { - "epoch": 0.8780487804878049, - "grad_norm": 2.458500623703003, - "learning_rate": 4.906527673356907e-06, - "loss": 0.6685, - "step": 180 - }, - { - "epoch": 0.8829268292682927, - "grad_norm": 2.5924787521362305, - "learning_rate": 4.905487021607462e-06, - "loss": 0.5686, - "step": 181 - }, - { - "epoch": 0.8878048780487805, - "grad_norm": 3.0923380851745605, - "learning_rate": 4.904440720553876e-06, - "loss": 0.8538, - "step": 182 - }, - { - "epoch": 0.8926829268292683, - "grad_norm": 2.8001527786254883, - "learning_rate": 4.903388772653396e-06, - "loss": 0.8292, - "step": 183 - }, - { - "epoch": 0.8975609756097561, - "grad_norm": 2.4344072341918945, - "learning_rate": 4.902331180376529e-06, - "loss": 0.7946, - "step": 184 - }, - { - "epoch": 0.9024390243902439, - "grad_norm": 2.6313226222991943, - "learning_rate": 4.901267946207038e-06, - "loss": 0.9269, - "step": 185 - }, - { - "epoch": 0.9073170731707317, - "grad_norm": 2.4776692390441895, - "learning_rate": 4.900199072641937e-06, - "loss": 0.7433, - "step": 186 - }, - { - "epoch": 0.9121951219512195, - "grad_norm": 2.339869260787964, - "learning_rate": 4.899124562191484e-06, - "loss": 0.6577, - "step": 187 - }, - { - "epoch": 0.9170731707317074, - "grad_norm": 3.076890468597412, - "learning_rate": 4.8980444173791735e-06, - "loss": 0.5989, - "step": 188 - }, - { - "epoch": 0.9219512195121952, - "grad_norm": 2.83957839012146, - "learning_rate": 4.896958640741735e-06, - "loss": 0.9364, - "step": 189 - }, - { - "epoch": 0.926829268292683, - "grad_norm": 2.770867347717285, - "learning_rate": 4.895867234829121e-06, - "loss": 1.0328, - "step": 190 - }, - { - "epoch": 0.9317073170731708, - "grad_norm": 2.7819619178771973, - "learning_rate": 4.894770202204509e-06, - "loss": 0.772, - "step": 191 - }, - { - "epoch": 0.9365853658536586, - "grad_norm": 3.925703763961792, - "learning_rate": 4.893667545444285e-06, - "loss": 0.8128, - "step": 192 - }, - { - "epoch": 0.9414634146341463, - "grad_norm": 3.034944534301758, - "learning_rate": 4.8925592671380495e-06, - "loss": 0.7418, - "step": 193 - }, - { - "epoch": 0.9463414634146341, - "grad_norm": 2.3350143432617188, - "learning_rate": 4.891445369888601e-06, - "loss": 0.5979, - "step": 194 - }, - { - "epoch": 0.9512195121951219, - "grad_norm": 2.6433160305023193, - "learning_rate": 4.890325856311936e-06, - "loss": 0.9664, - "step": 195 - }, - { - "epoch": 0.9560975609756097, - "grad_norm": 2.715142011642456, - "learning_rate": 4.889200729037241e-06, - "loss": 0.8482, - "step": 196 - }, - { - "epoch": 0.9609756097560975, - "grad_norm": 2.6157352924346924, - "learning_rate": 4.888069990706884e-06, - "loss": 0.7173, - "step": 197 - }, - { - "epoch": 0.9658536585365853, - "grad_norm": 3.7308952808380127, - "learning_rate": 4.886933643976414e-06, - "loss": 0.5433, - "step": 198 - }, - { - "epoch": 0.9707317073170731, - "grad_norm": 3.1134045124053955, - "learning_rate": 4.885791691514548e-06, - "loss": 0.5997, - "step": 199 - }, - { - "epoch": 0.975609756097561, - "grad_norm": 2.421365976333618, - "learning_rate": 4.884644136003172e-06, - "loss": 0.6477, - "step": 200 - }, - { - "epoch": 0.9804878048780488, - "grad_norm": 2.8676180839538574, - "learning_rate": 4.883490980137327e-06, - "loss": 1.3465, - "step": 201 - }, - { - "epoch": 0.9853658536585366, - "grad_norm": 2.236189603805542, - "learning_rate": 4.882332226625208e-06, - "loss": 0.7533, - "step": 202 - }, - { - "epoch": 0.9902439024390244, - "grad_norm": 2.2514970302581787, - "learning_rate": 4.881167878188158e-06, - "loss": 0.8555, - "step": 203 - }, - { - "epoch": 0.9951219512195122, - "grad_norm": 2.6856095790863037, - "learning_rate": 4.8799979375606565e-06, - "loss": 0.7634, - "step": 204 - }, - { - "epoch": 1.0, - "grad_norm": 2.5563852787017822, - "learning_rate": 4.878822407490319e-06, - "loss": 0.66, - "step": 205 - }, - { - "epoch": 1.0048780487804878, - "grad_norm": 4.7092814445495605, - "learning_rate": 4.8776412907378845e-06, - "loss": 0.7429, - "step": 206 - }, - { - "epoch": 1.0097560975609756, - "grad_norm": 2.9133448600769043, - "learning_rate": 4.876454590077216e-06, - "loss": 0.5735, - "step": 207 - }, - { - "epoch": 1.0146341463414634, - "grad_norm": 2.7012641429901123, - "learning_rate": 4.875262308295289e-06, - "loss": 0.8065, - "step": 208 - }, - { - "epoch": 1.0195121951219512, - "grad_norm": 3.703998327255249, - "learning_rate": 4.874064448192185e-06, - "loss": 0.7148, - "step": 209 - }, - { - "epoch": 1.024390243902439, - "grad_norm": 3.044930934906006, - "learning_rate": 4.872861012581088e-06, - "loss": 0.5606, - "step": 210 - }, - { - "epoch": 1.0292682926829269, - "grad_norm": 3.661381244659424, - "learning_rate": 4.871652004288275e-06, - "loss": 0.6492, - "step": 211 - }, - { - "epoch": 1.0341463414634147, - "grad_norm": 3.18344783782959, - "learning_rate": 4.870437426153113e-06, - "loss": 0.633, - "step": 212 - }, - { - "epoch": 1.0390243902439025, - "grad_norm": 4.596707820892334, - "learning_rate": 4.869217281028045e-06, - "loss": 0.842, - "step": 213 - }, - { - "epoch": 1.0439024390243903, - "grad_norm": 4.116331577301025, - "learning_rate": 4.867991571778592e-06, - "loss": 0.8371, - "step": 214 - }, - { - "epoch": 1.048780487804878, - "grad_norm": 3.152939558029175, - "learning_rate": 4.866760301283342e-06, - "loss": 0.4728, - "step": 215 - }, - { - "epoch": 1.053658536585366, - "grad_norm": 2.8732805252075195, - "learning_rate": 4.865523472433942e-06, - "loss": 0.651, - "step": 216 - }, - { - "epoch": 1.0585365853658537, - "grad_norm": 2.967480421066284, - "learning_rate": 4.8642810881350935e-06, - "loss": 0.6361, - "step": 217 - }, - { - "epoch": 1.0634146341463415, - "grad_norm": 2.816798210144043, - "learning_rate": 4.863033151304546e-06, - "loss": 0.6206, - "step": 218 - }, - { - "epoch": 1.0682926829268293, - "grad_norm": 3.168349027633667, - "learning_rate": 4.861779664873088e-06, - "loss": 0.7782, - "step": 219 - }, - { - "epoch": 1.0731707317073171, - "grad_norm": 3.7496471405029297, - "learning_rate": 4.8605206317845425e-06, - "loss": 0.8504, - "step": 220 - }, - { - "epoch": 1.078048780487805, - "grad_norm": 2.7087056636810303, - "learning_rate": 4.859256054995758e-06, - "loss": 0.7771, - "step": 221 - }, - { - "epoch": 1.0829268292682928, - "grad_norm": 2.803703546524048, - "learning_rate": 4.8579859374766e-06, - "loss": 0.4308, - "step": 222 - }, - { - "epoch": 1.0878048780487806, - "grad_norm": 2.4199142456054688, - "learning_rate": 4.856710282209952e-06, - "loss": 0.3739, - "step": 223 - }, - { - "epoch": 1.0926829268292684, - "grad_norm": 2.384037494659424, - "learning_rate": 4.855429092191698e-06, - "loss": 0.6548, - "step": 224 - }, - { - "epoch": 1.0975609756097562, - "grad_norm": 3.0230021476745605, - "learning_rate": 4.854142370430725e-06, - "loss": 0.6932, - "step": 225 - }, - { - "epoch": 1.102439024390244, - "grad_norm": 3.0248661041259766, - "learning_rate": 4.8528501199489045e-06, - "loss": 0.6491, - "step": 226 - }, - { - "epoch": 1.1073170731707318, - "grad_norm": 4.046666145324707, - "learning_rate": 4.851552343781099e-06, - "loss": 0.7946, - "step": 227 - }, - { - "epoch": 1.1121951219512196, - "grad_norm": 2.8751168251037598, - "learning_rate": 4.850249044975145e-06, - "loss": 0.7629, - "step": 228 - }, - { - "epoch": 1.1170731707317074, - "grad_norm": 2.8649816513061523, - "learning_rate": 4.848940226591849e-06, - "loss": 0.9114, - "step": 229 - }, - { - "epoch": 1.1219512195121952, - "grad_norm": 3.2590744495391846, - "learning_rate": 4.847625891704982e-06, - "loss": 0.535, - "step": 230 - }, - { - "epoch": 1.126829268292683, - "grad_norm": 3.230659008026123, - "learning_rate": 4.846306043401268e-06, - "loss": 0.7134, - "step": 231 - }, - { - "epoch": 1.1317073170731708, - "grad_norm": 3.5220088958740234, - "learning_rate": 4.844980684780381e-06, - "loss": 0.5375, - "step": 232 - }, - { - "epoch": 1.1365853658536587, - "grad_norm": 3.074052095413208, - "learning_rate": 4.8436498189549345e-06, - "loss": 0.5486, - "step": 233 - }, - { - "epoch": 1.1414634146341462, - "grad_norm": 2.511216163635254, - "learning_rate": 4.842313449050477e-06, - "loss": 0.5203, - "step": 234 - }, - { - "epoch": 1.146341463414634, - "grad_norm": 2.6082136631011963, - "learning_rate": 4.840971578205486e-06, - "loss": 0.4978, - "step": 235 - }, - { - "epoch": 1.1512195121951219, - "grad_norm": 2.4481778144836426, - "learning_rate": 4.839624209571352e-06, - "loss": 0.348, - "step": 236 - }, - { - "epoch": 1.1560975609756097, - "grad_norm": 2.7532148361206055, - "learning_rate": 4.838271346312381e-06, - "loss": 0.8068, - "step": 237 - }, - { - "epoch": 1.1609756097560975, - "grad_norm": 2.6562349796295166, - "learning_rate": 4.836912991605782e-06, - "loss": 0.8823, - "step": 238 - }, - { - "epoch": 1.1658536585365853, - "grad_norm": 3.032168388366699, - "learning_rate": 4.835549148641663e-06, - "loss": 0.501, - "step": 239 - }, - { - "epoch": 1.170731707317073, - "grad_norm": 3.4816956520080566, - "learning_rate": 4.834179820623018e-06, - "loss": 0.6406, - "step": 240 - }, - { - "epoch": 1.175609756097561, - "grad_norm": 2.480642318725586, - "learning_rate": 4.832805010765724e-06, - "loss": 0.537, - "step": 241 - }, - { - "epoch": 1.1804878048780487, - "grad_norm": 2.7662222385406494, - "learning_rate": 4.831424722298531e-06, - "loss": 0.6464, - "step": 242 - }, - { - "epoch": 1.1853658536585365, - "grad_norm": 3.2929866313934326, - "learning_rate": 4.830038958463061e-06, - "loss": 0.6888, - "step": 243 - }, - { - "epoch": 1.1902439024390243, - "grad_norm": 5.094089031219482, - "learning_rate": 4.828647722513785e-06, - "loss": 0.8342, - "step": 244 - }, - { - "epoch": 1.1951219512195121, - "grad_norm": 3.6679818630218506, - "learning_rate": 4.827251017718034e-06, - "loss": 0.7849, - "step": 245 - }, - { - "epoch": 1.2, - "grad_norm": 3.97290301322937, - "learning_rate": 4.8258488473559794e-06, - "loss": 0.7995, - "step": 246 - }, - { - "epoch": 1.2048780487804878, - "grad_norm": 3.3555023670196533, - "learning_rate": 4.824441214720629e-06, - "loss": 0.8718, - "step": 247 - }, - { - "epoch": 1.2097560975609756, - "grad_norm": 2.309361219406128, - "learning_rate": 4.823028123117818e-06, - "loss": 0.3731, - "step": 248 - }, - { - "epoch": 1.2146341463414634, - "grad_norm": 2.607269763946533, - "learning_rate": 4.8216095758662015e-06, - "loss": 0.7321, - "step": 249 - }, - { - "epoch": 1.2195121951219512, - "grad_norm": 2.5667428970336914, - "learning_rate": 4.82018557629725e-06, - "loss": 0.7561, - "step": 250 - }, - { - "epoch": 1.224390243902439, - "grad_norm": 2.7664871215820312, - "learning_rate": 4.8187561277552376e-06, - "loss": 0.638, - "step": 251 - }, - { - "epoch": 1.2292682926829268, - "grad_norm": 2.2880401611328125, - "learning_rate": 4.817321233597232e-06, - "loss": 0.6996, - "step": 252 - }, - { - "epoch": 1.2341463414634146, - "grad_norm": 2.7615559101104736, - "learning_rate": 4.815880897193095e-06, - "loss": 0.5432, - "step": 253 - }, - { - "epoch": 1.2390243902439024, - "grad_norm": 2.9052155017852783, - "learning_rate": 4.814435121925466e-06, - "loss": 0.781, - "step": 254 - }, - { - "epoch": 1.2439024390243902, - "grad_norm": 3.2035205364227295, - "learning_rate": 4.812983911189761e-06, - "loss": 0.6884, - "step": 255 - }, - { - "epoch": 1.248780487804878, - "grad_norm": 2.8139917850494385, - "learning_rate": 4.811527268394157e-06, - "loss": 0.4984, - "step": 256 - }, - { - "epoch": 1.2536585365853659, - "grad_norm": 2.849602699279785, - "learning_rate": 4.810065196959591e-06, - "loss": 0.553, - "step": 257 - }, - { - "epoch": 1.2585365853658537, - "grad_norm": 2.8745057582855225, - "learning_rate": 4.8085977003197496e-06, - "loss": 0.7955, - "step": 258 - }, - { - "epoch": 1.2634146341463415, - "grad_norm": 3.4053122997283936, - "learning_rate": 4.807124781921059e-06, - "loss": 0.9715, - "step": 259 - }, - { - "epoch": 1.2682926829268293, - "grad_norm": 3.1741702556610107, - "learning_rate": 4.805646445222679e-06, - "loss": 0.6306, - "step": 260 - }, - { - "epoch": 1.273170731707317, - "grad_norm": 2.5348331928253174, - "learning_rate": 4.804162693696494e-06, - "loss": 0.5192, - "step": 261 - }, - { - "epoch": 1.278048780487805, - "grad_norm": 3.2491304874420166, - "learning_rate": 4.802673530827105e-06, - "loss": 0.5369, - "step": 262 - }, - { - "epoch": 1.2829268292682927, - "grad_norm": 2.670273780822754, - "learning_rate": 4.801178960111823e-06, - "loss": 0.5864, - "step": 263 - }, - { - "epoch": 1.2878048780487805, - "grad_norm": 2.5655579566955566, - "learning_rate": 4.799678985060658e-06, - "loss": 0.7864, - "step": 264 - }, - { - "epoch": 1.2926829268292683, - "grad_norm": 2.6352531909942627, - "learning_rate": 4.798173609196314e-06, - "loss": 0.8198, - "step": 265 - }, - { - "epoch": 1.2975609756097561, - "grad_norm": 3.028343677520752, - "learning_rate": 4.796662836054176e-06, - "loss": 0.4621, - "step": 266 - }, - { - "epoch": 1.302439024390244, - "grad_norm": 2.757690191268921, - "learning_rate": 4.795146669182304e-06, - "loss": 0.6237, - "step": 267 - }, - { - "epoch": 1.3073170731707318, - "grad_norm": 2.564842462539673, - "learning_rate": 4.793625112141431e-06, - "loss": 0.4981, - "step": 268 - }, - { - "epoch": 1.3121951219512196, - "grad_norm": 2.69234299659729, - "learning_rate": 4.792098168504943e-06, - "loss": 0.5384, - "step": 269 - }, - { - "epoch": 1.3170731707317074, - "grad_norm": 2.794144868850708, - "learning_rate": 4.790565841858879e-06, - "loss": 0.5535, - "step": 270 - }, - { - "epoch": 1.3219512195121952, - "grad_norm": 2.850296974182129, - "learning_rate": 4.789028135801919e-06, - "loss": 0.7492, - "step": 271 - }, - { - "epoch": 1.326829268292683, - "grad_norm": 3.287806987762451, - "learning_rate": 4.787485053945377e-06, - "loss": 0.8367, - "step": 272 - }, - { - "epoch": 1.3317073170731708, - "grad_norm": 2.479343891143799, - "learning_rate": 4.785936599913193e-06, - "loss": 0.6875, - "step": 273 - }, - { - "epoch": 1.3365853658536586, - "grad_norm": 3.171198844909668, - "learning_rate": 4.784382777341922e-06, - "loss": 0.733, - "step": 274 - }, - { - "epoch": 1.3414634146341464, - "grad_norm": 2.866610050201416, - "learning_rate": 4.782823589880729e-06, - "loss": 0.9719, - "step": 275 - }, - { - "epoch": 1.346341463414634, - "grad_norm": 2.3714404106140137, - "learning_rate": 4.7812590411913755e-06, - "loss": 0.6979, - "step": 276 - }, - { - "epoch": 1.3512195121951218, - "grad_norm": 2.3838706016540527, - "learning_rate": 4.779689134948217e-06, - "loss": 0.9697, - "step": 277 - }, - { - "epoch": 1.3560975609756096, - "grad_norm": 3.2992005348205566, - "learning_rate": 4.77811387483819e-06, - "loss": 0.4799, - "step": 278 - }, - { - "epoch": 1.3609756097560974, - "grad_norm": 3.403024435043335, - "learning_rate": 4.776533264560804e-06, - "loss": 0.7478, - "step": 279 - }, - { - "epoch": 1.3658536585365852, - "grad_norm": 2.669820785522461, - "learning_rate": 4.774947307828134e-06, - "loss": 0.8622, - "step": 280 - }, - { - "epoch": 1.370731707317073, - "grad_norm": 2.4695041179656982, - "learning_rate": 4.773356008364812e-06, - "loss": 0.5792, - "step": 281 - }, - { - "epoch": 1.3756097560975609, - "grad_norm": 3.1744325160980225, - "learning_rate": 4.771759369908017e-06, - "loss": 0.4368, - "step": 282 - }, - { - "epoch": 1.3804878048780487, - "grad_norm": 2.8564929962158203, - "learning_rate": 4.7701573962074635e-06, - "loss": 0.6337, - "step": 283 - }, - { - "epoch": 1.3853658536585365, - "grad_norm": 2.4109890460968018, - "learning_rate": 4.7685500910254015e-06, - "loss": 0.5042, - "step": 284 - }, - { - "epoch": 1.3902439024390243, - "grad_norm": 2.389765977859497, - "learning_rate": 4.766937458136598e-06, - "loss": 0.7427, - "step": 285 - }, - { - "epoch": 1.395121951219512, - "grad_norm": 2.412153720855713, - "learning_rate": 4.765319501328332e-06, - "loss": 0.6956, - "step": 286 - }, - { - "epoch": 1.4, - "grad_norm": 2.6756227016448975, - "learning_rate": 4.763696224400391e-06, - "loss": 0.5152, - "step": 287 - }, - { - "epoch": 1.4048780487804877, - "grad_norm": 2.4644389152526855, - "learning_rate": 4.762067631165049e-06, - "loss": 0.5583, - "step": 288 - }, - { - "epoch": 1.4097560975609755, - "grad_norm": 2.6496896743774414, - "learning_rate": 4.760433725447071e-06, - "loss": 0.6824, - "step": 289 - }, - { - "epoch": 1.4146341463414633, - "grad_norm": 2.9843268394470215, - "learning_rate": 4.758794511083697e-06, - "loss": 0.7914, - "step": 290 - }, - { - "epoch": 1.4195121951219511, - "grad_norm": 3.639101266860962, - "learning_rate": 4.757149991924633e-06, - "loss": 0.6827, - "step": 291 - }, - { - "epoch": 1.424390243902439, - "grad_norm": 3.2047319412231445, - "learning_rate": 4.755500171832045e-06, - "loss": 0.5908, - "step": 292 - }, - { - "epoch": 1.4292682926829268, - "grad_norm": 2.463202953338623, - "learning_rate": 4.753845054680548e-06, - "loss": 0.6469, - "step": 293 - }, - { - "epoch": 1.4341463414634146, - "grad_norm": 2.711195945739746, - "learning_rate": 4.752184644357197e-06, - "loss": 0.5412, - "step": 294 - }, - { - "epoch": 1.4390243902439024, - "grad_norm": 2.239082098007202, - "learning_rate": 4.750518944761477e-06, - "loss": 0.5324, - "step": 295 - }, - { - "epoch": 1.4439024390243902, - "grad_norm": 2.711050271987915, - "learning_rate": 4.748847959805297e-06, - "loss": 0.5317, - "step": 296 - }, - { - "epoch": 1.448780487804878, - "grad_norm": 2.4389946460723877, - "learning_rate": 4.7471716934129774e-06, - "loss": 0.5199, - "step": 297 - }, - { - "epoch": 1.4536585365853658, - "grad_norm": 2.6532390117645264, - "learning_rate": 4.745490149521242e-06, - "loss": 0.4874, - "step": 298 - }, - { - "epoch": 1.4585365853658536, - "grad_norm": 2.2970616817474365, - "learning_rate": 4.743803332079209e-06, - "loss": 0.5416, - "step": 299 - }, - { - "epoch": 1.4634146341463414, - "grad_norm": 2.4206762313842773, - "learning_rate": 4.742111245048382e-06, - "loss": 0.5628, - "step": 300 - }, - { - "epoch": 1.4682926829268292, - "grad_norm": 2.7086844444274902, - "learning_rate": 4.740413892402639e-06, - "loss": 0.5847, - "step": 301 - }, - { - "epoch": 1.473170731707317, - "grad_norm": 2.848602771759033, - "learning_rate": 4.738711278128228e-06, - "loss": 0.5889, - "step": 302 - }, - { - "epoch": 1.4780487804878049, - "grad_norm": 3.5257909297943115, - "learning_rate": 4.7370034062237476e-06, - "loss": 0.3917, - "step": 303 - }, - { - "epoch": 1.4829268292682927, - "grad_norm": 6.47664213180542, - "learning_rate": 4.73529028070015e-06, - "loss": 0.5592, - "step": 304 - }, - { - "epoch": 1.4878048780487805, - "grad_norm": 2.8833930492401123, - "learning_rate": 4.733571905580723e-06, - "loss": 0.843, - "step": 305 - }, - { - "epoch": 1.4926829268292683, - "grad_norm": 2.9924156665802, - "learning_rate": 4.731848284901082e-06, - "loss": 0.7041, - "step": 306 - }, - { - "epoch": 1.497560975609756, - "grad_norm": 2.9858405590057373, - "learning_rate": 4.730119422709165e-06, - "loss": 0.4914, - "step": 307 - }, - { - "epoch": 1.502439024390244, - "grad_norm": 3.4032366275787354, - "learning_rate": 4.728385323065215e-06, - "loss": 0.644, - "step": 308 - }, - { - "epoch": 1.5073170731707317, - "grad_norm": 2.86360502243042, - "learning_rate": 4.7266459900417815e-06, - "loss": 0.5335, - "step": 309 - }, - { - "epoch": 1.5121951219512195, - "grad_norm": 3.183012008666992, - "learning_rate": 4.724901427723698e-06, - "loss": 0.8275, - "step": 310 - }, - { - "epoch": 1.5170731707317073, - "grad_norm": 3.4128706455230713, - "learning_rate": 4.723151640208084e-06, - "loss": 0.4091, - "step": 311 - }, - { - "epoch": 1.5219512195121951, - "grad_norm": 2.765897512435913, - "learning_rate": 4.721396631604327e-06, - "loss": 0.4414, - "step": 312 - }, - { - "epoch": 1.526829268292683, - "grad_norm": 3.2348268032073975, - "learning_rate": 4.7196364060340785e-06, - "loss": 0.5423, - "step": 313 - }, - { - "epoch": 1.5317073170731708, - "grad_norm": 2.7270045280456543, - "learning_rate": 4.7178709676312416e-06, - "loss": 0.8072, - "step": 314 - }, - { - "epoch": 1.5365853658536586, - "grad_norm": 2.525298833847046, - "learning_rate": 4.716100320541961e-06, - "loss": 1.0254, - "step": 315 - }, - { - "epoch": 1.5414634146341464, - "grad_norm": 2.371321678161621, - "learning_rate": 4.714324468924614e-06, - "loss": 0.6541, - "step": 316 - }, - { - "epoch": 1.5463414634146342, - "grad_norm": 3.0820438861846924, - "learning_rate": 4.712543416949803e-06, - "loss": 0.7519, - "step": 317 - }, - { - "epoch": 1.551219512195122, - "grad_norm": 2.710369348526001, - "learning_rate": 4.71075716880034e-06, - "loss": 0.7232, - "step": 318 - }, - { - "epoch": 1.5560975609756098, - "grad_norm": 2.4568352699279785, - "learning_rate": 4.708965728671243e-06, - "loss": 0.8059, - "step": 319 - }, - { - "epoch": 1.5609756097560976, - "grad_norm": 2.7511191368103027, - "learning_rate": 4.7071691007697214e-06, - "loss": 0.6579, - "step": 320 - }, - { - "epoch": 1.5658536585365854, - "grad_norm": 2.6519858837127686, - "learning_rate": 4.705367289315172e-06, - "loss": 0.6989, - "step": 321 - }, - { - "epoch": 1.5707317073170732, - "grad_norm": 2.763019323348999, - "learning_rate": 4.703560298539158e-06, - "loss": 0.4916, - "step": 322 - }, - { - "epoch": 1.575609756097561, - "grad_norm": 2.6480252742767334, - "learning_rate": 4.701748132685415e-06, - "loss": 0.5076, - "step": 323 - }, - { - "epoch": 1.5804878048780489, - "grad_norm": 2.4289543628692627, - "learning_rate": 4.699930796009825e-06, - "loss": 0.559, - "step": 324 - }, - { - "epoch": 1.5853658536585367, - "grad_norm": 4.0515899658203125, - "learning_rate": 4.698108292780418e-06, - "loss": 0.7388, - "step": 325 - }, - { - "epoch": 1.5902439024390245, - "grad_norm": 2.5959129333496094, - "learning_rate": 4.696280627277356e-06, - "loss": 0.5469, - "step": 326 - }, - { - "epoch": 1.5951219512195123, - "grad_norm": 2.3453526496887207, - "learning_rate": 4.6944478037929255e-06, - "loss": 0.5494, - "step": 327 - }, - { - "epoch": 1.6, - "grad_norm": 3.7527170181274414, - "learning_rate": 4.692609826631525e-06, - "loss": 0.7536, - "step": 328 - }, - { - "epoch": 1.604878048780488, - "grad_norm": 3.423588275909424, - "learning_rate": 4.690766700109659e-06, - "loss": 0.4586, - "step": 329 - }, - { - "epoch": 1.6097560975609757, - "grad_norm": 2.620429754257202, - "learning_rate": 4.6889184285559234e-06, - "loss": 0.4799, - "step": 330 - }, - { - "epoch": 1.6146341463414635, - "grad_norm": 6.416718006134033, - "learning_rate": 4.687065016310996e-06, - "loss": 0.7502, - "step": 331 - }, - { - "epoch": 1.6195121951219513, - "grad_norm": 2.7324717044830322, - "learning_rate": 4.685206467727631e-06, - "loss": 0.5923, - "step": 332 - }, - { - "epoch": 1.6243902439024391, - "grad_norm": 2.582935333251953, - "learning_rate": 4.683342787170644e-06, - "loss": 0.5619, - "step": 333 - }, - { - "epoch": 1.629268292682927, - "grad_norm": 2.8339877128601074, - "learning_rate": 4.6814739790169006e-06, - "loss": 0.55, - "step": 334 - }, - { - "epoch": 1.6341463414634148, - "grad_norm": 2.733982563018799, - "learning_rate": 4.679600047655313e-06, - "loss": 0.7243, - "step": 335 - }, - { - "epoch": 1.6390243902439026, - "grad_norm": 3.192747116088867, - "learning_rate": 4.6777209974868194e-06, - "loss": 1.132, - "step": 336 - }, - { - "epoch": 1.6439024390243904, - "grad_norm": 2.5185582637786865, - "learning_rate": 4.675836832924387e-06, - "loss": 0.55, - "step": 337 - }, - { - "epoch": 1.6487804878048782, - "grad_norm": 2.7306225299835205, - "learning_rate": 4.673947558392989e-06, - "loss": 0.4418, - "step": 338 - }, - { - "epoch": 1.653658536585366, - "grad_norm": 2.7026166915893555, - "learning_rate": 4.6720531783296e-06, - "loss": 0.5897, - "step": 339 - }, - { - "epoch": 1.6585365853658538, - "grad_norm": 2.5981674194335938, - "learning_rate": 4.670153697183185e-06, - "loss": 0.5889, - "step": 340 - }, - { - "epoch": 1.6634146341463416, - "grad_norm": 3.0985405445098877, - "learning_rate": 4.668249119414692e-06, - "loss": 0.5607, - "step": 341 - }, - { - "epoch": 1.6682926829268294, - "grad_norm": 2.7609124183654785, - "learning_rate": 4.666339449497033e-06, - "loss": 0.6284, - "step": 342 - }, - { - "epoch": 1.6731707317073172, - "grad_norm": 3.186077356338501, - "learning_rate": 4.664424691915084e-06, - "loss": 0.5751, - "step": 343 - }, - { - "epoch": 1.678048780487805, - "grad_norm": 3.644227981567383, - "learning_rate": 4.6625048511656675e-06, - "loss": 0.586, - "step": 344 - }, - { - "epoch": 1.6829268292682928, - "grad_norm": 3.196373462677002, - "learning_rate": 4.660579931757543e-06, - "loss": 0.5086, - "step": 345 - }, - { - "epoch": 1.6878048780487804, - "grad_norm": 2.7773900032043457, - "learning_rate": 4.6586499382113985e-06, - "loss": 0.5934, - "step": 346 - }, - { - "epoch": 1.6926829268292682, - "grad_norm": 2.3397631645202637, - "learning_rate": 4.6567148750598375e-06, - "loss": 0.7654, - "step": 347 - }, - { - "epoch": 1.697560975609756, - "grad_norm": 2.5567805767059326, - "learning_rate": 4.6547747468473705e-06, - "loss": 0.8908, - "step": 348 - }, - { - "epoch": 1.7024390243902439, - "grad_norm": 2.9218900203704834, - "learning_rate": 4.652829558130404e-06, - "loss": 0.4383, - "step": 349 - }, - { - "epoch": 1.7073170731707317, - "grad_norm": 2.962965250015259, - "learning_rate": 4.6508793134772265e-06, - "loss": 0.6031, - "step": 350 - }, - { - "epoch": 1.7121951219512195, - "grad_norm": 2.487739324569702, - "learning_rate": 4.648924017468003e-06, - "loss": 0.533, - "step": 351 - }, - { - "epoch": 1.7170731707317073, - "grad_norm": 2.769474506378174, - "learning_rate": 4.646963674694761e-06, - "loss": 0.8125, - "step": 352 - }, - { - "epoch": 1.721951219512195, - "grad_norm": 2.678243398666382, - "learning_rate": 4.64499828976138e-06, - "loss": 0.386, - "step": 353 - }, - { - "epoch": 1.726829268292683, - "grad_norm": 3.2764477729797363, - "learning_rate": 4.64302786728358e-06, - "loss": 0.4792, - "step": 354 - }, - { - "epoch": 1.7317073170731707, - "grad_norm": 2.6092708110809326, - "learning_rate": 4.641052411888913e-06, - "loss": 0.5031, - "step": 355 - }, - { - "epoch": 1.7365853658536585, - "grad_norm": 3.4002952575683594, - "learning_rate": 4.6390719282167515e-06, - "loss": 0.4726, - "step": 356 - }, - { - "epoch": 1.7414634146341463, - "grad_norm": 2.7558157444000244, - "learning_rate": 4.637086420918276e-06, - "loss": 0.7794, - "step": 357 - }, - { - "epoch": 1.7463414634146341, - "grad_norm": 2.239021062850952, - "learning_rate": 4.635095894656465e-06, - "loss": 0.6202, - "step": 358 - }, - { - "epoch": 1.751219512195122, - "grad_norm": 2.0502119064331055, - "learning_rate": 4.633100354106085e-06, - "loss": 0.3743, - "step": 359 - }, - { - "epoch": 1.7560975609756098, - "grad_norm": 2.842203140258789, - "learning_rate": 4.631099803953677e-06, - "loss": 0.8143, - "step": 360 - }, - { - "epoch": 1.7609756097560976, - "grad_norm": 2.8408772945404053, - "learning_rate": 4.629094248897546e-06, - "loss": 0.4986, - "step": 361 - }, - { - "epoch": 1.7658536585365854, - "grad_norm": 2.755530595779419, - "learning_rate": 4.627083693647757e-06, - "loss": 0.5833, - "step": 362 - }, - { - "epoch": 1.7707317073170732, - "grad_norm": 2.717116355895996, - "learning_rate": 4.625068142926111e-06, - "loss": 0.885, - "step": 363 - }, - { - "epoch": 1.775609756097561, - "grad_norm": 2.2784435749053955, - "learning_rate": 4.623047601466144e-06, - "loss": 0.7351, - "step": 364 - }, - { - "epoch": 1.7804878048780488, - "grad_norm": 2.3133914470672607, - "learning_rate": 4.621022074013114e-06, - "loss": 0.6426, - "step": 365 - }, - { - "epoch": 1.7853658536585366, - "grad_norm": 3.13562273979187, - "learning_rate": 4.618991565323987e-06, - "loss": 0.5588, - "step": 366 - }, - { - "epoch": 1.7902439024390244, - "grad_norm": 2.458186388015747, - "learning_rate": 4.616956080167426e-06, - "loss": 0.5424, - "step": 367 - }, - { - "epoch": 1.7951219512195122, - "grad_norm": 2.4780080318450928, - "learning_rate": 4.614915623323786e-06, - "loss": 0.8664, - "step": 368 - }, - { - "epoch": 1.8, - "grad_norm": 2.623966932296753, - "learning_rate": 4.612870199585092e-06, - "loss": 0.4495, - "step": 369 - }, - { - "epoch": 1.8048780487804879, - "grad_norm": 2.7326242923736572, - "learning_rate": 4.610819813755038e-06, - "loss": 0.5099, - "step": 370 - }, - { - "epoch": 1.8097560975609757, - "grad_norm": 2.951014757156372, - "learning_rate": 4.608764470648971e-06, - "loss": 0.4322, - "step": 371 - }, - { - "epoch": 1.8146341463414632, - "grad_norm": 2.869870185852051, - "learning_rate": 4.606704175093879e-06, - "loss": 0.4744, - "step": 372 - }, - { - "epoch": 1.819512195121951, - "grad_norm": 2.686054229736328, - "learning_rate": 4.604638931928383e-06, - "loss": 0.797, - "step": 373 - }, - { - "epoch": 1.8243902439024389, - "grad_norm": 2.6421749591827393, - "learning_rate": 4.602568746002718e-06, - "loss": 0.4904, - "step": 374 - }, - { - "epoch": 1.8292682926829267, - "grad_norm": 2.949144124984741, - "learning_rate": 4.600493622178734e-06, - "loss": 0.8682, - "step": 375 - }, - { - "epoch": 1.8341463414634145, - "grad_norm": 2.554733991622925, - "learning_rate": 4.598413565329876e-06, - "loss": 0.5426, - "step": 376 - }, - { - "epoch": 1.8390243902439023, - "grad_norm": 2.3334367275238037, - "learning_rate": 4.596328580341169e-06, - "loss": 0.5628, - "step": 377 - }, - { - "epoch": 1.84390243902439, - "grad_norm": 2.577664613723755, - "learning_rate": 4.5942386721092195e-06, - "loss": 0.7073, - "step": 378 - }, - { - "epoch": 1.848780487804878, - "grad_norm": 3.1247141361236572, - "learning_rate": 4.592143845542189e-06, - "loss": 0.6526, - "step": 379 - }, - { - "epoch": 1.8536585365853657, - "grad_norm": 2.7015256881713867, - "learning_rate": 4.590044105559797e-06, - "loss": 0.8377, - "step": 380 - }, - { - "epoch": 1.8585365853658535, - "grad_norm": 2.573819398880005, - "learning_rate": 4.587939457093296e-06, - "loss": 0.5485, - "step": 381 - }, - { - "epoch": 1.8634146341463413, - "grad_norm": 2.8607687950134277, - "learning_rate": 4.585829905085468e-06, - "loss": 0.6065, - "step": 382 - }, - { - "epoch": 1.8682926829268292, - "grad_norm": 2.526625394821167, - "learning_rate": 4.5837154544906135e-06, - "loss": 0.7812, - "step": 383 - }, - { - "epoch": 1.873170731707317, - "grad_norm": 2.4161314964294434, - "learning_rate": 4.581596110274535e-06, - "loss": 0.7061, - "step": 384 - }, - { - "epoch": 1.8780487804878048, - "grad_norm": 2.34195876121521, - "learning_rate": 4.579471877414527e-06, - "loss": 0.9446, - "step": 385 - }, - { - "epoch": 1.8829268292682926, - "grad_norm": 3.7710156440734863, - "learning_rate": 4.577342760899368e-06, - "loss": 0.78, - "step": 386 - }, - { - "epoch": 1.8878048780487804, - "grad_norm": 2.5192313194274902, - "learning_rate": 4.575208765729302e-06, - "loss": 0.5205, - "step": 387 - }, - { - "epoch": 1.8926829268292682, - "grad_norm": 2.467484951019287, - "learning_rate": 4.573069896916035e-06, - "loss": 0.7827, - "step": 388 - }, - { - "epoch": 1.897560975609756, - "grad_norm": 2.640676259994507, - "learning_rate": 4.5709261594827125e-06, - "loss": 0.6512, - "step": 389 - }, - { - "epoch": 1.9024390243902438, - "grad_norm": 2.976623296737671, - "learning_rate": 4.568777558463922e-06, - "loss": 0.5548, - "step": 390 - }, - { - "epoch": 1.9073170731707316, - "grad_norm": 2.289722442626953, - "learning_rate": 4.566624098905665e-06, - "loss": 0.7038, - "step": 391 - }, - { - "epoch": 1.9121951219512194, - "grad_norm": 2.9512040615081787, - "learning_rate": 4.564465785865359e-06, - "loss": 0.5416, - "step": 392 - }, - { - "epoch": 1.9170731707317072, - "grad_norm": 2.394874095916748, - "learning_rate": 4.56230262441182e-06, - "loss": 0.4068, - "step": 393 - }, - { - "epoch": 1.921951219512195, - "grad_norm": 6.885486602783203, - "learning_rate": 4.560134619625247e-06, - "loss": 0.6197, - "step": 394 - }, - { - "epoch": 1.9268292682926829, - "grad_norm": 2.311272144317627, - "learning_rate": 4.5579617765972155e-06, - "loss": 0.5692, - "step": 395 - }, - { - "epoch": 1.9317073170731707, - "grad_norm": 2.4662933349609375, - "learning_rate": 4.555784100430662e-06, - "loss": 0.4836, - "step": 396 - }, - { - "epoch": 1.9365853658536585, - "grad_norm": 2.602741241455078, - "learning_rate": 4.553601596239877e-06, - "loss": 0.4594, - "step": 397 - }, - { - "epoch": 1.9414634146341463, - "grad_norm": 3.443909168243408, - "learning_rate": 4.551414269150489e-06, - "loss": 0.6053, - "step": 398 - }, - { - "epoch": 1.946341463414634, - "grad_norm": 2.5391502380371094, - "learning_rate": 4.54922212429945e-06, - "loss": 0.5133, - "step": 399 - }, - { - "epoch": 1.951219512195122, - "grad_norm": 2.7105700969696045, - "learning_rate": 4.547025166835027e-06, - "loss": 0.6984, - "step": 400 - }, - { - "epoch": 1.9560975609756097, - "grad_norm": 2.6098098754882812, - "learning_rate": 4.544823401916794e-06, - "loss": 0.7944, - "step": 401 - }, - { - "epoch": 1.9609756097560975, - "grad_norm": 2.7527425289154053, - "learning_rate": 4.542616834715612e-06, - "loss": 0.639, - "step": 402 - }, - { - "epoch": 1.9658536585365853, - "grad_norm": 2.760303258895874, - "learning_rate": 4.540405470413618e-06, - "loss": 0.4229, - "step": 403 - }, - { - "epoch": 1.9707317073170731, - "grad_norm": 2.4989006519317627, - "learning_rate": 4.53818931420422e-06, - "loss": 0.7482, - "step": 404 - }, - { - "epoch": 1.975609756097561, - "grad_norm": 2.3687169551849365, - "learning_rate": 4.535968371292076e-06, - "loss": 0.6146, - "step": 405 - }, - { - "epoch": 1.9804878048780488, - "grad_norm": 2.4285244941711426, - "learning_rate": 4.533742646893086e-06, - "loss": 0.6964, - "step": 406 - }, - { - "epoch": 1.9853658536585366, - "grad_norm": 2.337266206741333, - "learning_rate": 4.531512146234383e-06, - "loss": 0.6248, - "step": 407 - }, - { - "epoch": 1.9902439024390244, - "grad_norm": 2.704972743988037, - "learning_rate": 4.529276874554312e-06, - "loss": 0.8715, - "step": 408 - }, - { - "epoch": 1.9951219512195122, - "grad_norm": 2.2151944637298584, - "learning_rate": 4.527036837102426e-06, - "loss": 0.4945, - "step": 409 - }, - { - "epoch": 2.0, - "grad_norm": 2.691330671310425, - "learning_rate": 4.524792039139471e-06, - "loss": 0.7085, - "step": 410 - }, - { - "epoch": 2.004878048780488, - "grad_norm": 2.9423086643218994, - "learning_rate": 4.522542485937369e-06, - "loss": 0.3178, - "step": 411 - }, - { - "epoch": 2.0097560975609756, - "grad_norm": 2.860677719116211, - "learning_rate": 4.520288182779214e-06, - "loss": 0.5092, - "step": 412 - }, - { - "epoch": 2.0146341463414634, - "grad_norm": 2.7503843307495117, - "learning_rate": 4.518029134959253e-06, - "loss": 0.314, - "step": 413 - }, - { - "epoch": 2.0195121951219512, - "grad_norm": 4.541809558868408, - "learning_rate": 4.515765347782878e-06, - "loss": 0.5287, - "step": 414 - }, - { - "epoch": 2.024390243902439, - "grad_norm": 9.126826286315918, - "learning_rate": 4.5134968265666085e-06, - "loss": 0.8221, - "step": 415 - }, - { - "epoch": 2.029268292682927, - "grad_norm": 4.4358229637146, - "learning_rate": 4.511223576638084e-06, - "loss": 0.5402, - "step": 416 - }, - { - "epoch": 2.0341463414634147, - "grad_norm": 3.1090731620788574, - "learning_rate": 4.508945603336049e-06, - "loss": 0.617, - "step": 417 - }, - { - "epoch": 2.0390243902439025, - "grad_norm": 2.6933369636535645, - "learning_rate": 4.50666291201034e-06, - "loss": 0.3541, - "step": 418 - }, - { - "epoch": 2.0439024390243903, - "grad_norm": 5.898099899291992, - "learning_rate": 4.504375508021876e-06, - "loss": 0.4842, - "step": 419 - }, - { - "epoch": 2.048780487804878, - "grad_norm": 2.950939178466797, - "learning_rate": 4.50208339674264e-06, - "loss": 0.6168, - "step": 420 - }, - { - "epoch": 2.053658536585366, - "grad_norm": 3.2513322830200195, - "learning_rate": 4.499786583555675e-06, - "loss": 0.6425, - "step": 421 - }, - { - "epoch": 2.0585365853658537, - "grad_norm": 2.911562442779541, - "learning_rate": 4.497485073855061e-06, - "loss": 0.364, - "step": 422 - }, - { - "epoch": 2.0634146341463415, - "grad_norm": 4.2179274559021, - "learning_rate": 4.495178873045913e-06, - "loss": 0.3687, - "step": 423 - }, - { - "epoch": 2.0682926829268293, - "grad_norm": 3.2010395526885986, - "learning_rate": 4.4928679865443605e-06, - "loss": 0.4068, - "step": 424 - }, - { - "epoch": 2.073170731707317, - "grad_norm": 3.2425589561462402, - "learning_rate": 4.4905524197775366e-06, - "loss": 0.4759, - "step": 425 - }, - { - "epoch": 2.078048780487805, - "grad_norm": 2.9252519607543945, - "learning_rate": 4.4882321781835666e-06, - "loss": 0.4197, - "step": 426 - }, - { - "epoch": 2.0829268292682928, - "grad_norm": 2.7859911918640137, - "learning_rate": 4.4859072672115565e-06, - "loss": 0.2294, - "step": 427 - }, - { - "epoch": 2.0878048780487806, - "grad_norm": 3.138796091079712, - "learning_rate": 4.483577692321577e-06, - "loss": 0.7572, - "step": 428 - }, - { - "epoch": 2.0926829268292684, - "grad_norm": 3.1447339057922363, - "learning_rate": 4.481243458984651e-06, - "loss": 0.4035, - "step": 429 - }, - { - "epoch": 2.097560975609756, - "grad_norm": 3.1876862049102783, - "learning_rate": 4.478904572682743e-06, - "loss": 0.5776, - "step": 430 - }, - { - "epoch": 2.102439024390244, - "grad_norm": 2.934257745742798, - "learning_rate": 4.476561038908745e-06, - "loss": 0.4005, - "step": 431 - }, - { - "epoch": 2.107317073170732, - "grad_norm": 2.904954433441162, - "learning_rate": 4.474212863166464e-06, - "loss": 0.5689, - "step": 432 - }, - { - "epoch": 2.1121951219512196, - "grad_norm": 3.6023731231689453, - "learning_rate": 4.471860050970608e-06, - "loss": 0.5068, - "step": 433 - }, - { - "epoch": 2.1170731707317074, - "grad_norm": 4.073422431945801, - "learning_rate": 4.469502607846774e-06, - "loss": 0.8349, - "step": 434 - }, - { - "epoch": 2.1219512195121952, - "grad_norm": 2.813789129257202, - "learning_rate": 4.467140539331434e-06, - "loss": 0.3641, - "step": 435 - }, - { - "epoch": 2.126829268292683, - "grad_norm": 3.874516248703003, - "learning_rate": 4.464773850971924e-06, - "loss": 0.222, - "step": 436 - }, - { - "epoch": 2.131707317073171, - "grad_norm": 3.1221084594726562, - "learning_rate": 4.46240254832643e-06, - "loss": 0.3799, - "step": 437 - }, - { - "epoch": 2.1365853658536587, - "grad_norm": 3.298933267593384, - "learning_rate": 4.460026636963971e-06, - "loss": 0.4759, - "step": 438 - }, - { - "epoch": 2.1414634146341465, - "grad_norm": 2.456233024597168, - "learning_rate": 4.4576461224643965e-06, - "loss": 0.384, - "step": 439 - }, - { - "epoch": 2.1463414634146343, - "grad_norm": 2.8427460193634033, - "learning_rate": 4.455261010418359e-06, - "loss": 0.391, - "step": 440 - }, - { - "epoch": 2.151219512195122, - "grad_norm": 3.0267624855041504, - "learning_rate": 4.452871306427314e-06, - "loss": 0.6177, - "step": 441 - }, - { - "epoch": 2.15609756097561, - "grad_norm": 3.437302827835083, - "learning_rate": 4.450477016103498e-06, - "loss": 0.5143, - "step": 442 - }, - { - "epoch": 2.1609756097560977, - "grad_norm": 3.152210235595703, - "learning_rate": 4.4480781450699205e-06, - "loss": 0.3783, - "step": 443 - }, - { - "epoch": 2.1658536585365855, - "grad_norm": 3.507753372192383, - "learning_rate": 4.4456746989603464e-06, - "loss": 0.3574, - "step": 444 - }, - { - "epoch": 2.1707317073170733, - "grad_norm": 2.8855366706848145, - "learning_rate": 4.443266683419289e-06, - "loss": 0.5088, - "step": 445 - }, - { - "epoch": 2.175609756097561, - "grad_norm": 2.7776072025299072, - "learning_rate": 4.440854104101988e-06, - "loss": 0.3773, - "step": 446 - }, - { - "epoch": 2.180487804878049, - "grad_norm": 3.019484281539917, - "learning_rate": 4.438436966674406e-06, - "loss": 0.5002, - "step": 447 - }, - { - "epoch": 2.1853658536585368, - "grad_norm": 3.6962451934814453, - "learning_rate": 4.436015276813208e-06, - "loss": 0.4601, - "step": 448 - }, - { - "epoch": 2.1902439024390246, - "grad_norm": 3.1288888454437256, - "learning_rate": 4.4335890402057505e-06, - "loss": 0.5422, - "step": 449 - }, - { - "epoch": 2.1951219512195124, - "grad_norm": 3.7083234786987305, - "learning_rate": 4.431158262550067e-06, - "loss": 0.4684, - "step": 450 - }, - { - "epoch": 2.2, - "grad_norm": 3.1714789867401123, - "learning_rate": 4.428722949554858e-06, - "loss": 0.2528, - "step": 451 - }, - { - "epoch": 2.204878048780488, - "grad_norm": 3.0773637294769287, - "learning_rate": 4.426283106939474e-06, - "loss": 0.4061, - "step": 452 - }, - { - "epoch": 2.209756097560976, - "grad_norm": 2.604093551635742, - "learning_rate": 4.423838740433903e-06, - "loss": 0.4779, - "step": 453 - }, - { - "epoch": 2.2146341463414636, - "grad_norm": 2.9293880462646484, - "learning_rate": 4.4213898557787586e-06, - "loss": 0.233, - "step": 454 - }, - { - "epoch": 2.2195121951219514, - "grad_norm": 2.9195125102996826, - "learning_rate": 4.4189364587252636e-06, - "loss": 0.7756, - "step": 455 - }, - { - "epoch": 2.2243902439024392, - "grad_norm": 3.2263920307159424, - "learning_rate": 4.416478555035241e-06, - "loss": 0.2806, - "step": 456 - }, - { - "epoch": 2.229268292682927, - "grad_norm": 2.8109211921691895, - "learning_rate": 4.4140161504810935e-06, - "loss": 0.3923, - "step": 457 - }, - { - "epoch": 2.234146341463415, - "grad_norm": 2.645853281021118, - "learning_rate": 4.4115492508457986e-06, - "loss": 0.289, - "step": 458 - }, - { - "epoch": 2.2390243902439027, - "grad_norm": 3.3712451457977295, - "learning_rate": 4.409077861922887e-06, - "loss": 0.5053, - "step": 459 - }, - { - "epoch": 2.2439024390243905, - "grad_norm": 2.6892387866973877, - "learning_rate": 4.406601989516435e-06, - "loss": 0.3363, - "step": 460 - }, - { - "epoch": 2.2487804878048783, - "grad_norm": 2.3195693492889404, - "learning_rate": 4.404121639441047e-06, - "loss": 0.2367, - "step": 461 - }, - { - "epoch": 2.253658536585366, - "grad_norm": 3.0115339756011963, - "learning_rate": 4.401636817521843e-06, - "loss": 0.4942, - "step": 462 - }, - { - "epoch": 2.258536585365854, - "grad_norm": 2.9528865814208984, - "learning_rate": 4.399147529594447e-06, - "loss": 0.3328, - "step": 463 - }, - { - "epoch": 2.2634146341463417, - "grad_norm": 3.110799551010132, - "learning_rate": 4.3966537815049686e-06, - "loss": 0.3917, - "step": 464 - }, - { - "epoch": 2.2682926829268295, - "grad_norm": 3.2973792552948, - "learning_rate": 4.394155579109994e-06, - "loss": 0.5203, - "step": 465 - }, - { - "epoch": 2.2731707317073173, - "grad_norm": 4.7184038162231445, - "learning_rate": 4.391652928276572e-06, - "loss": 0.729, - "step": 466 - }, - { - "epoch": 2.278048780487805, - "grad_norm": 3.1992053985595703, - "learning_rate": 4.389145834882195e-06, - "loss": 0.4822, - "step": 467 - }, - { - "epoch": 2.2829268292682925, - "grad_norm": 4.320055961608887, - "learning_rate": 4.386634304814789e-06, - "loss": 0.3962, - "step": 468 - }, - { - "epoch": 2.2878048780487803, - "grad_norm": 3.704524517059326, - "learning_rate": 4.384118343972704e-06, - "loss": 0.5996, - "step": 469 - }, - { - "epoch": 2.292682926829268, - "grad_norm": 2.8172974586486816, - "learning_rate": 4.381597958264692e-06, - "loss": 0.6328, - "step": 470 - }, - { - "epoch": 2.297560975609756, - "grad_norm": 2.7418763637542725, - "learning_rate": 4.379073153609896e-06, - "loss": 0.6254, - "step": 471 - }, - { - "epoch": 2.3024390243902437, - "grad_norm": 5.364504337310791, - "learning_rate": 4.37654393593784e-06, - "loss": 0.6793, - "step": 472 - }, - { - "epoch": 2.3073170731707315, - "grad_norm": 2.935291290283203, - "learning_rate": 4.3740103111884096e-06, - "loss": 0.4161, - "step": 473 - }, - { - "epoch": 2.3121951219512193, - "grad_norm": 3.085155963897705, - "learning_rate": 4.371472285311842e-06, - "loss": 0.3329, - "step": 474 - }, - { - "epoch": 2.317073170731707, - "grad_norm": 2.2218778133392334, - "learning_rate": 4.368929864268709e-06, - "loss": 0.2687, - "step": 475 - }, - { - "epoch": 2.321951219512195, - "grad_norm": 3.3985276222229004, - "learning_rate": 4.366383054029907e-06, - "loss": 0.5934, - "step": 476 - }, - { - "epoch": 2.3268292682926828, - "grad_norm": 3.0726048946380615, - "learning_rate": 4.363831860576638e-06, - "loss": 0.5033, - "step": 477 - }, - { - "epoch": 2.3317073170731706, - "grad_norm": 2.728628635406494, - "learning_rate": 4.361276289900396e-06, - "loss": 0.4492, - "step": 478 - }, - { - "epoch": 2.3365853658536584, - "grad_norm": 3.1294424533843994, - "learning_rate": 4.358716348002962e-06, - "loss": 0.619, - "step": 479 - }, - { - "epoch": 2.341463414634146, - "grad_norm": 3.5564961433410645, - "learning_rate": 4.356152040896376e-06, - "loss": 0.4018, - "step": 480 - }, - { - "epoch": 2.346341463414634, - "grad_norm": 2.9329910278320312, - "learning_rate": 4.3535833746029335e-06, - "loss": 0.3062, - "step": 481 - }, - { - "epoch": 2.351219512195122, - "grad_norm": 3.744480848312378, - "learning_rate": 4.351010355155165e-06, - "loss": 0.3387, - "step": 482 - }, - { - "epoch": 2.3560975609756096, - "grad_norm": 2.537912130355835, - "learning_rate": 4.348432988595828e-06, - "loss": 0.3103, - "step": 483 - }, - { - "epoch": 2.3609756097560974, - "grad_norm": 3.232128858566284, - "learning_rate": 4.345851280977885e-06, - "loss": 0.6782, - "step": 484 - }, - { - "epoch": 2.3658536585365852, - "grad_norm": 3.601463794708252, - "learning_rate": 4.343265238364496e-06, - "loss": 0.3195, - "step": 485 - }, - { - "epoch": 2.370731707317073, - "grad_norm": 4.05529260635376, - "learning_rate": 4.340674866829001e-06, - "loss": 0.4639, - "step": 486 - }, - { - "epoch": 2.375609756097561, - "grad_norm": 4.128161430358887, - "learning_rate": 4.338080172454908e-06, - "loss": 0.7229, - "step": 487 - }, - { - "epoch": 2.3804878048780487, - "grad_norm": 2.665430784225464, - "learning_rate": 4.335481161335875e-06, - "loss": 0.4334, - "step": 488 - }, - { - "epoch": 2.3853658536585365, - "grad_norm": 3.777899742126465, - "learning_rate": 4.332877839575699e-06, - "loss": 0.3409, - "step": 489 - }, - { - "epoch": 2.3902439024390243, - "grad_norm": 2.9942116737365723, - "learning_rate": 4.330270213288301e-06, - "loss": 0.5221, - "step": 490 - }, - { - "epoch": 2.395121951219512, - "grad_norm": 3.518601417541504, - "learning_rate": 4.32765828859771e-06, - "loss": 0.7078, - "step": 491 - }, - { - "epoch": 2.4, - "grad_norm": 3.452350378036499, - "learning_rate": 4.325042071638051e-06, - "loss": 0.5902, - "step": 492 - }, - { - "epoch": 2.4048780487804877, - "grad_norm": 3.072655200958252, - "learning_rate": 4.322421568553529e-06, - "loss": 0.3746, - "step": 493 - }, - { - "epoch": 2.4097560975609755, - "grad_norm": 2.8621394634246826, - "learning_rate": 4.319796785498416e-06, - "loss": 0.3474, - "step": 494 - }, - { - "epoch": 2.4146341463414633, - "grad_norm": 3.3891537189483643, - "learning_rate": 4.317167728637032e-06, - "loss": 0.5171, - "step": 495 - }, - { - "epoch": 2.419512195121951, - "grad_norm": 2.505720376968384, - "learning_rate": 4.314534404143738e-06, - "loss": 0.4263, - "step": 496 - }, - { - "epoch": 2.424390243902439, - "grad_norm": 2.6280455589294434, - "learning_rate": 4.3118968182029155e-06, - "loss": 0.5072, - "step": 497 - }, - { - "epoch": 2.4292682926829268, - "grad_norm": 2.703711748123169, - "learning_rate": 4.3092549770089566e-06, - "loss": 0.2742, - "step": 498 - }, - { - "epoch": 2.4341463414634146, - "grad_norm": 3.0358169078826904, - "learning_rate": 4.306608886766243e-06, - "loss": 0.4814, - "step": 499 - }, - { - "epoch": 2.4390243902439024, - "grad_norm": 3.263326406478882, - "learning_rate": 4.303958553689137e-06, - "loss": 0.4188, - "step": 500 - }, - { - "epoch": 2.44390243902439, - "grad_norm": 2.833951950073242, - "learning_rate": 4.3013039840019675e-06, - "loss": 0.6436, - "step": 501 - }, - { - "epoch": 2.448780487804878, - "grad_norm": 3.6790921688079834, - "learning_rate": 4.2986451839390105e-06, - "loss": 0.2862, - "step": 502 - }, - { - "epoch": 2.453658536585366, - "grad_norm": 2.7376418113708496, - "learning_rate": 4.295982159744476e-06, - "loss": 0.4926, - "step": 503 - }, - { - "epoch": 2.4585365853658536, - "grad_norm": 3.575244665145874, - "learning_rate": 4.293314917672498e-06, - "loss": 0.5717, - "step": 504 - }, - { - "epoch": 2.4634146341463414, - "grad_norm": 2.8722269535064697, - "learning_rate": 4.290643463987114e-06, - "loss": 0.2707, - "step": 505 - }, - { - "epoch": 2.4682926829268292, - "grad_norm": 2.8118090629577637, - "learning_rate": 4.287967804962252e-06, - "loss": 0.347, - "step": 506 - }, - { - "epoch": 2.473170731707317, - "grad_norm": 3.345698356628418, - "learning_rate": 4.285287946881718e-06, - "loss": 0.2103, - "step": 507 - }, - { - "epoch": 2.478048780487805, - "grad_norm": 3.0156590938568115, - "learning_rate": 4.282603896039178e-06, - "loss": 0.6405, - "step": 508 - }, - { - "epoch": 2.4829268292682927, - "grad_norm": 3.102205753326416, - "learning_rate": 4.279915658738145e-06, - "loss": 0.4027, - "step": 509 - }, - { - "epoch": 2.4878048780487805, - "grad_norm": 2.8665261268615723, - "learning_rate": 4.277223241291966e-06, - "loss": 0.6503, - "step": 510 - }, - { - "epoch": 2.4926829268292683, - "grad_norm": 2.5396728515625, - "learning_rate": 4.274526650023801e-06, - "loss": 0.5006, - "step": 511 - }, - { - "epoch": 2.497560975609756, - "grad_norm": 3.4846577644348145, - "learning_rate": 4.271825891266617e-06, - "loss": 0.479, - "step": 512 - }, - { - "epoch": 2.502439024390244, - "grad_norm": 4.5995612144470215, - "learning_rate": 4.269120971363164e-06, - "loss": 0.6667, - "step": 513 - }, - { - "epoch": 2.5073170731707317, - "grad_norm": 3.2117559909820557, - "learning_rate": 4.266411896665967e-06, - "loss": 0.2977, - "step": 514 - }, - { - "epoch": 2.5121951219512195, - "grad_norm": 2.798161268234253, - "learning_rate": 4.263698673537309e-06, - "loss": 0.3912, - "step": 515 - }, - { - "epoch": 2.5170731707317073, - "grad_norm": 3.593287944793701, - "learning_rate": 4.260981308349214e-06, - "loss": 0.615, - "step": 516 - }, - { - "epoch": 2.521951219512195, - "grad_norm": 3.06075119972229, - "learning_rate": 4.258259807483434e-06, - "loss": 0.4559, - "step": 517 - }, - { - "epoch": 2.526829268292683, - "grad_norm": 2.893202543258667, - "learning_rate": 4.255534177331435e-06, - "loss": 0.4993, - "step": 518 - }, - { - "epoch": 2.5317073170731708, - "grad_norm": 3.613308906555176, - "learning_rate": 4.252804424294378e-06, - "loss": 0.4581, - "step": 519 - }, - { - "epoch": 2.5365853658536586, - "grad_norm": 3.1191842555999756, - "learning_rate": 4.25007055478311e-06, - "loss": 0.5403, - "step": 520 - }, - { - "epoch": 2.5414634146341464, - "grad_norm": 3.653355836868286, - "learning_rate": 4.247332575218144e-06, - "loss": 0.3658, - "step": 521 - }, - { - "epoch": 2.546341463414634, - "grad_norm": 3.1386306285858154, - "learning_rate": 4.244590492029643e-06, - "loss": 0.6342, - "step": 522 - }, - { - "epoch": 2.551219512195122, - "grad_norm": 3.0894742012023926, - "learning_rate": 4.241844311657411e-06, - "loss": 0.3411, - "step": 523 - }, - { - "epoch": 2.55609756097561, - "grad_norm": 3.205916404724121, - "learning_rate": 4.239094040550875e-06, - "loss": 0.2829, - "step": 524 - }, - { - "epoch": 2.5609756097560976, - "grad_norm": 2.378857374191284, - "learning_rate": 4.236339685169065e-06, - "loss": 0.4749, - "step": 525 - }, - { - "epoch": 2.5658536585365854, - "grad_norm": 3.8657875061035156, - "learning_rate": 4.233581251980604e-06, - "loss": 0.2485, - "step": 526 - }, - { - "epoch": 2.5707317073170732, - "grad_norm": 3.565807580947876, - "learning_rate": 4.230818747463696e-06, - "loss": 0.4488, - "step": 527 - }, - { - "epoch": 2.575609756097561, - "grad_norm": 2.6909685134887695, - "learning_rate": 4.228052178106101e-06, - "loss": 0.4495, - "step": 528 - }, - { - "epoch": 2.580487804878049, - "grad_norm": 2.937680244445801, - "learning_rate": 4.2252815504051285e-06, - "loss": 0.2396, - "step": 529 - }, - { - "epoch": 2.5853658536585367, - "grad_norm": 5.55731201171875, - "learning_rate": 4.222506870867618e-06, - "loss": 0.6784, - "step": 530 - }, - { - "epoch": 2.5902439024390245, - "grad_norm": 2.7388782501220703, - "learning_rate": 4.2197281460099245e-06, - "loss": 0.5543, - "step": 531 - }, - { - "epoch": 2.5951219512195123, - "grad_norm": 3.311134099960327, - "learning_rate": 4.216945382357905e-06, - "loss": 0.5281, - "step": 532 - }, - { - "epoch": 2.6, - "grad_norm": 3.511232376098633, - "learning_rate": 4.214158586446901e-06, - "loss": 0.8019, - "step": 533 - }, - { - "epoch": 2.604878048780488, - "grad_norm": 4.416641712188721, - "learning_rate": 4.211367764821722e-06, - "loss": 0.7769, - "step": 534 - }, - { - "epoch": 2.6097560975609757, - "grad_norm": 2.9849908351898193, - "learning_rate": 4.208572924036634e-06, - "loss": 0.4077, - "step": 535 - }, - { - "epoch": 2.6146341463414635, - "grad_norm": 2.8512160778045654, - "learning_rate": 4.2057740706553415e-06, - "loss": 0.433, - "step": 536 - }, - { - "epoch": 2.6195121951219513, - "grad_norm": 2.6729629039764404, - "learning_rate": 4.202971211250971e-06, - "loss": 0.5957, - "step": 537 - }, - { - "epoch": 2.624390243902439, - "grad_norm": 2.4570281505584717, - "learning_rate": 4.200164352406061e-06, - "loss": 0.3013, - "step": 538 - }, - { - "epoch": 2.629268292682927, - "grad_norm": 3.3771679401397705, - "learning_rate": 4.197353500712539e-06, - "loss": 0.5646, - "step": 539 - }, - { - "epoch": 2.6341463414634148, - "grad_norm": 3.163496494293213, - "learning_rate": 4.1945386627717115e-06, - "loss": 0.4529, - "step": 540 - }, - { - "epoch": 2.6390243902439026, - "grad_norm": 8.32056713104248, - "learning_rate": 4.191719845194246e-06, - "loss": 0.6076, - "step": 541 - }, - { - "epoch": 2.6439024390243904, - "grad_norm": 2.7657363414764404, - "learning_rate": 4.188897054600156e-06, - "loss": 0.4855, - "step": 542 - }, - { - "epoch": 2.648780487804878, - "grad_norm": 3.299283504486084, - "learning_rate": 4.186070297618787e-06, - "loss": 0.5836, - "step": 543 - }, - { - "epoch": 2.653658536585366, - "grad_norm": 2.3928205966949463, - "learning_rate": 4.183239580888799e-06, - "loss": 0.6266, - "step": 544 - }, - { - "epoch": 2.658536585365854, - "grad_norm": 3.395251750946045, - "learning_rate": 4.18040491105815e-06, - "loss": 0.429, - "step": 545 - }, - { - "epoch": 2.6634146341463416, - "grad_norm": 2.690936803817749, - "learning_rate": 4.177566294784085e-06, - "loss": 0.391, - "step": 546 - }, - { - "epoch": 2.6682926829268294, - "grad_norm": 3.7687628269195557, - "learning_rate": 4.174723738733114e-06, - "loss": 0.6548, - "step": 547 - }, - { - "epoch": 2.6731707317073172, - "grad_norm": 2.7884976863861084, - "learning_rate": 4.171877249581001e-06, - "loss": 0.5188, - "step": 548 - }, - { - "epoch": 2.678048780487805, - "grad_norm": 3.0811641216278076, - "learning_rate": 4.169026834012748e-06, - "loss": 0.3494, - "step": 549 - }, - { - "epoch": 2.682926829268293, - "grad_norm": 3.090078592300415, - "learning_rate": 4.166172498722577e-06, - "loss": 0.3621, - "step": 550 - }, - { - "epoch": 2.68780487804878, - "grad_norm": 3.925424098968506, - "learning_rate": 4.163314250413913e-06, - "loss": 0.7187, - "step": 551 - }, - { - "epoch": 2.692682926829268, - "grad_norm": 3.3590312004089355, - "learning_rate": 4.160452095799378e-06, - "loss": 0.428, - "step": 552 - }, - { - "epoch": 2.697560975609756, - "grad_norm": 3.08093523979187, - "learning_rate": 4.157586041600759e-06, - "loss": 0.202, - "step": 553 - }, - { - "epoch": 2.7024390243902436, - "grad_norm": 2.9391448497772217, - "learning_rate": 4.154716094549008e-06, - "loss": 0.5238, - "step": 554 - }, - { - "epoch": 2.7073170731707314, - "grad_norm": 2.9869461059570312, - "learning_rate": 4.151842261384217e-06, - "loss": 0.3073, - "step": 555 - }, - { - "epoch": 2.7121951219512193, - "grad_norm": 3.8973608016967773, - "learning_rate": 4.148964548855603e-06, - "loss": 0.8435, - "step": 556 - }, - { - "epoch": 2.717073170731707, - "grad_norm": 2.3596479892730713, - "learning_rate": 4.146082963721496e-06, - "loss": 0.2562, - "step": 557 - }, - { - "epoch": 2.721951219512195, - "grad_norm": 3.4964873790740967, - "learning_rate": 4.143197512749322e-06, - "loss": 1.0144, - "step": 558 - }, - { - "epoch": 2.7268292682926827, - "grad_norm": 2.8925280570983887, - "learning_rate": 4.140308202715581e-06, - "loss": 0.7581, - "step": 559 - }, - { - "epoch": 2.7317073170731705, - "grad_norm": 2.622724771499634, - "learning_rate": 4.13741504040584e-06, - "loss": 0.3114, - "step": 560 - }, - { - "epoch": 2.7365853658536583, - "grad_norm": 3.775834321975708, - "learning_rate": 4.134518032614713e-06, - "loss": 0.4384, - "step": 561 - }, - { - "epoch": 2.741463414634146, - "grad_norm": 2.691236972808838, - "learning_rate": 4.1316171861458445e-06, - "loss": 0.3141, - "step": 562 - }, - { - "epoch": 2.746341463414634, - "grad_norm": 3.059152841567993, - "learning_rate": 4.128712507811893e-06, - "loss": 0.5777, - "step": 563 - }, - { - "epoch": 2.7512195121951217, - "grad_norm": 2.867432117462158, - "learning_rate": 4.125804004434517e-06, - "loss": 0.5542, - "step": 564 - }, - { - "epoch": 2.7560975609756095, - "grad_norm": 2.796438694000244, - "learning_rate": 4.12289168284436e-06, - "loss": 0.3442, - "step": 565 - }, - { - "epoch": 2.7609756097560973, - "grad_norm": 3.052199125289917, - "learning_rate": 4.119975549881029e-06, - "loss": 0.4754, - "step": 566 - }, - { - "epoch": 2.765853658536585, - "grad_norm": 2.5463602542877197, - "learning_rate": 4.1170556123930846e-06, - "loss": 0.2988, - "step": 567 - }, - { - "epoch": 2.770731707317073, - "grad_norm": 3.003124475479126, - "learning_rate": 4.114131877238021e-06, - "loss": 0.4642, - "step": 568 - }, - { - "epoch": 2.7756097560975608, - "grad_norm": 2.4988298416137695, - "learning_rate": 4.111204351282254e-06, - "loss": 0.3493, - "step": 569 - }, - { - "epoch": 2.7804878048780486, - "grad_norm": 2.7403693199157715, - "learning_rate": 4.108273041401098e-06, - "loss": 0.4007, - "step": 570 - }, - { - "epoch": 2.7853658536585364, - "grad_norm": 4.101940155029297, - "learning_rate": 4.105337954478756e-06, - "loss": 0.7815, - "step": 571 - }, - { - "epoch": 2.790243902439024, - "grad_norm": 3.229969024658203, - "learning_rate": 4.102399097408304e-06, - "loss": 0.6099, - "step": 572 - }, - { - "epoch": 2.795121951219512, - "grad_norm": 3.234693765640259, - "learning_rate": 4.099456477091667e-06, - "loss": 0.2478, - "step": 573 - }, - { - "epoch": 2.8, - "grad_norm": 2.9824702739715576, - "learning_rate": 4.096510100439611e-06, - "loss": 0.6403, - "step": 574 - }, - { - "epoch": 2.8048780487804876, - "grad_norm": 2.8012478351593018, - "learning_rate": 4.093559974371725e-06, - "loss": 0.2509, - "step": 575 - }, - { - "epoch": 2.8097560975609754, - "grad_norm": 2.915400743484497, - "learning_rate": 4.0906061058164e-06, - "loss": 0.7552, - "step": 576 - }, - { - "epoch": 2.8146341463414632, - "grad_norm": 3.467665672302246, - "learning_rate": 4.087648501710819e-06, - "loss": 0.3146, - "step": 577 - }, - { - "epoch": 2.819512195121951, - "grad_norm": 3.1628401279449463, - "learning_rate": 4.084687169000938e-06, - "loss": 0.507, - "step": 578 - }, - { - "epoch": 2.824390243902439, - "grad_norm": 2.4069066047668457, - "learning_rate": 4.081722114641469e-06, - "loss": 0.4116, - "step": 579 - }, - { - "epoch": 2.8292682926829267, - "grad_norm": 3.698174238204956, - "learning_rate": 4.0787533455958626e-06, - "loss": 0.2264, - "step": 580 - }, - { - "epoch": 2.8341463414634145, - "grad_norm": 3.0896191596984863, - "learning_rate": 4.075780868836296e-06, - "loss": 0.3197, - "step": 581 - }, - { - "epoch": 2.8390243902439023, - "grad_norm": 3.098562240600586, - "learning_rate": 4.072804691343653e-06, - "loss": 0.4045, - "step": 582 - }, - { - "epoch": 2.84390243902439, - "grad_norm": 3.9232118129730225, - "learning_rate": 4.069824820107507e-06, - "loss": 0.9564, - "step": 583 - }, - { - "epoch": 2.848780487804878, - "grad_norm": 2.7176268100738525, - "learning_rate": 4.06684126212611e-06, - "loss": 0.2703, - "step": 584 - }, - { - "epoch": 2.8536585365853657, - "grad_norm": 2.4905827045440674, - "learning_rate": 4.063854024406369e-06, - "loss": 0.4828, - "step": 585 - }, - { - "epoch": 2.8585365853658535, - "grad_norm": 2.848784923553467, - "learning_rate": 4.060863113963835e-06, - "loss": 0.4131, - "step": 586 - }, - { - "epoch": 2.8634146341463413, - "grad_norm": 2.599665403366089, - "learning_rate": 4.057868537822683e-06, - "loss": 0.4464, - "step": 587 - }, - { - "epoch": 2.868292682926829, - "grad_norm": 3.1770827770233154, - "learning_rate": 4.054870303015695e-06, - "loss": 0.2825, - "step": 588 - }, - { - "epoch": 2.873170731707317, - "grad_norm": 3.18332839012146, - "learning_rate": 4.05186841658425e-06, - "loss": 0.4438, - "step": 589 - }, - { - "epoch": 2.8780487804878048, - "grad_norm": 2.7485718727111816, - "learning_rate": 4.048862885578301e-06, - "loss": 0.4817, - "step": 590 - }, - { - "epoch": 2.8829268292682926, - "grad_norm": 2.9712934494018555, - "learning_rate": 4.045853717056358e-06, - "loss": 0.5157, - "step": 591 - }, - { - "epoch": 2.8878048780487804, - "grad_norm": 2.246858835220337, - "learning_rate": 4.0428409180854775e-06, - "loss": 0.4029, - "step": 592 - }, - { - "epoch": 2.892682926829268, - "grad_norm": 2.683434247970581, - "learning_rate": 4.039824495741238e-06, - "loss": 0.3796, - "step": 593 - }, - { - "epoch": 2.897560975609756, - "grad_norm": 2.6297569274902344, - "learning_rate": 4.036804457107733e-06, - "loss": 0.4467, - "step": 594 - }, - { - "epoch": 2.902439024390244, - "grad_norm": 5.318776607513428, - "learning_rate": 4.0337808092775435e-06, - "loss": 0.7007, - "step": 595 - }, - { - "epoch": 2.9073170731707316, - "grad_norm": 3.069889783859253, - "learning_rate": 4.030753559351728e-06, - "loss": 0.3219, - "step": 596 - }, - { - "epoch": 2.9121951219512194, - "grad_norm": 1.9730123281478882, - "learning_rate": 4.027722714439808e-06, - "loss": 0.3038, - "step": 597 - }, - { - "epoch": 2.9170731707317072, - "grad_norm": 3.7959916591644287, - "learning_rate": 4.024688281659743e-06, - "loss": 0.7768, - "step": 598 - }, - { - "epoch": 2.921951219512195, - "grad_norm": 3.900886297225952, - "learning_rate": 4.021650268137924e-06, - "loss": 0.4667, - "step": 599 - }, - { - "epoch": 2.926829268292683, - "grad_norm": 2.6155691146850586, - "learning_rate": 4.018608681009143e-06, - "loss": 0.3852, - "step": 600 - }, - { - "epoch": 2.9317073170731707, - "grad_norm": 3.2715704441070557, - "learning_rate": 4.015563527416596e-06, - "loss": 0.4804, - "step": 601 - }, - { - "epoch": 2.9365853658536585, - "grad_norm": 3.001425266265869, - "learning_rate": 4.012514814511844e-06, - "loss": 0.4152, - "step": 602 - }, - { - "epoch": 2.9414634146341463, - "grad_norm": 2.685360908508301, - "learning_rate": 4.009462549454816e-06, - "loss": 0.5029, - "step": 603 - }, - { - "epoch": 2.946341463414634, - "grad_norm": 3.4670183658599854, - "learning_rate": 4.006406739413775e-06, - "loss": 0.4857, - "step": 604 - }, - { - "epoch": 2.951219512195122, - "grad_norm": 3.0613298416137695, - "learning_rate": 4.003347391565317e-06, - "loss": 0.4449, - "step": 605 - }, - { - "epoch": 2.9560975609756097, - "grad_norm": 3.207186698913574, - "learning_rate": 4.000284513094342e-06, - "loss": 0.4808, - "step": 606 - }, - { - "epoch": 2.9609756097560975, - "grad_norm": 2.910578727722168, - "learning_rate": 3.997218111194042e-06, - "loss": 0.4395, - "step": 607 - }, - { - "epoch": 2.9658536585365853, - "grad_norm": 2.581918954849243, - "learning_rate": 3.994148193065886e-06, - "loss": 0.3264, - "step": 608 - }, - { - "epoch": 2.970731707317073, - "grad_norm": 2.6517748832702637, - "learning_rate": 3.991074765919598e-06, - "loss": 0.3285, - "step": 609 - }, - { - "epoch": 2.975609756097561, - "grad_norm": 3.509756088256836, - "learning_rate": 3.987997836973147e-06, - "loss": 0.3638, - "step": 610 - }, - { - "epoch": 2.9804878048780488, - "grad_norm": 2.7382352352142334, - "learning_rate": 3.984917413452721e-06, - "loss": 0.3853, - "step": 611 - }, - { - "epoch": 2.9853658536585366, - "grad_norm": 3.998974323272705, - "learning_rate": 3.981833502592717e-06, - "loss": 0.6411, - "step": 612 - }, - { - "epoch": 2.9902439024390244, - "grad_norm": 3.305126428604126, - "learning_rate": 3.978746111635725e-06, - "loss": 0.2759, - "step": 613 - }, - { - "epoch": 2.995121951219512, - "grad_norm": 3.137300968170166, - "learning_rate": 3.9756552478325045e-06, - "loss": 0.4566, - "step": 614 - }, - { - "epoch": 3.0, - "grad_norm": 2.617291212081909, - "learning_rate": 3.972560918441972e-06, - "loss": 0.2221, - "step": 615 - }, - { - "epoch": 3.004878048780488, - "grad_norm": 2.787429094314575, - "learning_rate": 3.969463130731183e-06, - "loss": 0.2403, - "step": 616 - }, - { - "epoch": 3.0097560975609756, - "grad_norm": 3.0412075519561768, - "learning_rate": 3.966361891975316e-06, - "loss": 0.2635, - "step": 617 - }, - { - "epoch": 3.0146341463414634, - "grad_norm": 2.9949851036071777, - "learning_rate": 3.963257209457652e-06, - "loss": 0.3294, - "step": 618 - }, - { - "epoch": 3.0195121951219512, - "grad_norm": 3.0510809421539307, - "learning_rate": 3.960149090469561e-06, - "loss": 0.1338, - "step": 619 - }, - { - "epoch": 3.024390243902439, - "grad_norm": 3.669482707977295, - "learning_rate": 3.957037542310484e-06, - "loss": 0.1469, - "step": 620 - }, - { - "epoch": 3.029268292682927, - "grad_norm": 4.677116870880127, - "learning_rate": 3.953922572287915e-06, - "loss": 0.2788, - "step": 621 - }, - { - "epoch": 3.0341463414634147, - "grad_norm": 4.33144474029541, - "learning_rate": 3.950804187717384e-06, - "loss": 0.4521, - "step": 622 - }, - { - "epoch": 3.0390243902439025, - "grad_norm": 3.466639757156372, - "learning_rate": 3.947682395922439e-06, - "loss": 0.5113, - "step": 623 - }, - { - "epoch": 3.0439024390243903, - "grad_norm": 3.2332122325897217, - "learning_rate": 3.9445572042346346e-06, - "loss": 0.0968, - "step": 624 - }, - { - "epoch": 3.048780487804878, - "grad_norm": 2.6108055114746094, - "learning_rate": 3.941428619993505e-06, - "loss": 0.2462, - "step": 625 - }, - { - "epoch": 3.053658536585366, - "grad_norm": 3.2512595653533936, - "learning_rate": 3.938296650546552e-06, - "loss": 0.1782, - "step": 626 - }, - { - "epoch": 3.0585365853658537, - "grad_norm": 3.4350366592407227, - "learning_rate": 3.935161303249231e-06, - "loss": 0.2955, - "step": 627 - }, - { - "epoch": 3.0634146341463415, - "grad_norm": 3.42012619972229, - "learning_rate": 3.932022585464928e-06, - "loss": 0.3259, - "step": 628 - }, - { - "epoch": 3.0682926829268293, - "grad_norm": 3.458043336868286, - "learning_rate": 3.928880504564943e-06, - "loss": 0.2306, - "step": 629 - }, - { - "epoch": 3.073170731707317, - "grad_norm": 2.646616220474243, - "learning_rate": 3.92573506792848e-06, - "loss": 0.2197, - "step": 630 - }, - { - "epoch": 3.078048780487805, - "grad_norm": 3.5558857917785645, - "learning_rate": 3.9225862829426184e-06, - "loss": 0.1607, - "step": 631 - }, - { - "epoch": 3.0829268292682928, - "grad_norm": 3.6011338233947754, - "learning_rate": 3.919434157002303e-06, - "loss": 0.3087, - "step": 632 - }, - { - "epoch": 3.0878048780487806, - "grad_norm": 2.339879035949707, - "learning_rate": 3.916278697510325e-06, - "loss": 0.2213, - "step": 633 - }, - { - "epoch": 3.0926829268292684, - "grad_norm": 3.268162488937378, - "learning_rate": 3.913119911877305e-06, - "loss": 0.318, - "step": 634 - }, - { - "epoch": 3.097560975609756, - "grad_norm": 4.062571048736572, - "learning_rate": 3.909957807521674e-06, - "loss": 0.1757, - "step": 635 - }, - { - "epoch": 3.102439024390244, - "grad_norm": 2.997659683227539, - "learning_rate": 3.906792391869657e-06, - "loss": 0.2391, - "step": 636 - }, - { - "epoch": 3.107317073170732, - "grad_norm": 3.7037394046783447, - "learning_rate": 3.903623672355258e-06, - "loss": 0.2548, - "step": 637 - }, - { - "epoch": 3.1121951219512196, - "grad_norm": 3.110579252243042, - "learning_rate": 3.900451656420237e-06, - "loss": 0.2389, - "step": 638 - }, - { - "epoch": 3.1170731707317074, - "grad_norm": 3.3332321643829346, - "learning_rate": 3.897276351514097e-06, - "loss": 0.1371, - "step": 639 - }, - { - "epoch": 3.1219512195121952, - "grad_norm": 3.8275935649871826, - "learning_rate": 3.894097765094065e-06, - "loss": 0.3363, - "step": 640 - }, - { - "epoch": 3.126829268292683, - "grad_norm": 2.3731374740600586, - "learning_rate": 3.890915904625075e-06, - "loss": 0.1314, - "step": 641 - }, - { - "epoch": 3.131707317073171, - "grad_norm": 3.1511282920837402, - "learning_rate": 3.887730777579751e-06, - "loss": 0.3563, - "step": 642 - }, - { - "epoch": 3.1365853658536587, - "grad_norm": 4.2254862785339355, - "learning_rate": 3.884542391438387e-06, - "loss": 0.5053, - "step": 643 - }, - { - "epoch": 3.1414634146341465, - "grad_norm": 4.579670429229736, - "learning_rate": 3.88135075368893e-06, - "loss": 0.6259, - "step": 644 - }, - { - "epoch": 3.1463414634146343, - "grad_norm": 3.2102746963500977, - "learning_rate": 3.878155871826968e-06, - "loss": 0.2599, - "step": 645 - }, - { - "epoch": 3.151219512195122, - "grad_norm": 2.5569686889648438, - "learning_rate": 3.874957753355701e-06, - "loss": 0.2075, - "step": 646 - }, - { - "epoch": 3.15609756097561, - "grad_norm": 3.588925838470459, - "learning_rate": 3.8717564057859365e-06, - "loss": 0.4577, - "step": 647 - }, - { - "epoch": 3.1609756097560977, - "grad_norm": 3.6163878440856934, - "learning_rate": 3.868551836636063e-06, - "loss": 0.4023, - "step": 648 - }, - { - "epoch": 3.1658536585365855, - "grad_norm": 3.8688390254974365, - "learning_rate": 3.865344053432035e-06, - "loss": 0.1669, - "step": 649 - }, - { - "epoch": 3.1707317073170733, - "grad_norm": 3.419734001159668, - "learning_rate": 3.862133063707353e-06, - "loss": 0.2766, - "step": 650 - }, - { - "epoch": 3.175609756097561, - "grad_norm": 2.9860243797302246, - "learning_rate": 3.858918875003053e-06, - "loss": 0.1788, - "step": 651 - }, - { - "epoch": 3.180487804878049, - "grad_norm": 3.0619022846221924, - "learning_rate": 3.855701494867679e-06, - "loss": 0.224, - "step": 652 - }, - { - "epoch": 3.1853658536585368, - "grad_norm": 3.3668978214263916, - "learning_rate": 3.852480930857275e-06, - "loss": 0.4029, - "step": 653 - }, - { - "epoch": 3.1902439024390246, - "grad_norm": 3.543147563934326, - "learning_rate": 3.849257190535356e-06, - "loss": 0.2096, - "step": 654 - }, - { - "epoch": 3.1951219512195124, - "grad_norm": 3.793619155883789, - "learning_rate": 3.846030281472902e-06, - "loss": 0.5574, - "step": 655 - }, - { - "epoch": 3.2, - "grad_norm": 3.021289110183716, - "learning_rate": 3.842800211248333e-06, - "loss": 0.2233, - "step": 656 - }, - { - "epoch": 3.204878048780488, - "grad_norm": 4.582934856414795, - "learning_rate": 3.839566987447492e-06, - "loss": 0.3871, - "step": 657 - }, - { - "epoch": 3.209756097560976, - "grad_norm": 2.996340274810791, - "learning_rate": 3.8363306176636296e-06, - "loss": 0.4325, - "step": 658 - }, - { - "epoch": 3.2146341463414636, - "grad_norm": 3.3190877437591553, - "learning_rate": 3.833091109497384e-06, - "loss": 0.5321, - "step": 659 - }, - { - "epoch": 3.2195121951219514, - "grad_norm": 3.2532856464385986, - "learning_rate": 3.829848470556765e-06, - "loss": 0.1359, - "step": 660 - }, - { - "epoch": 3.2243902439024392, - "grad_norm": 2.7875044345855713, - "learning_rate": 3.8266027084571335e-06, - "loss": 0.3145, - "step": 661 - }, - { - "epoch": 3.229268292682927, - "grad_norm": 3.748253583908081, - "learning_rate": 3.823353830821187e-06, - "loss": 0.1252, - "step": 662 - }, - { - "epoch": 3.234146341463415, - "grad_norm": 2.858293294906616, - "learning_rate": 3.820101845278937e-06, - "loss": 0.2589, - "step": 663 - }, - { - "epoch": 3.2390243902439027, - "grad_norm": 3.7470967769622803, - "learning_rate": 3.816846759467696e-06, - "loss": 0.2594, - "step": 664 - }, - { - "epoch": 3.2439024390243905, - "grad_norm": 3.676196813583374, - "learning_rate": 3.8135885810320587e-06, - "loss": 0.2998, - "step": 665 - }, - { - "epoch": 3.2487804878048783, - "grad_norm": 3.0943140983581543, - "learning_rate": 3.810327317623881e-06, - "loss": 0.2238, - "step": 666 - }, - { - "epoch": 3.253658536585366, - "grad_norm": 3.5907349586486816, - "learning_rate": 3.8070629769022628e-06, - "loss": 0.3381, - "step": 667 - }, - { - "epoch": 3.258536585365854, - "grad_norm": 3.1195285320281982, - "learning_rate": 3.8037955665335335e-06, - "loss": 0.2407, - "step": 668 - }, - { - "epoch": 3.2634146341463417, - "grad_norm": 3.422292947769165, - "learning_rate": 3.800525094191231e-06, - "loss": 0.2957, - "step": 669 - }, - { - "epoch": 3.2682926829268295, - "grad_norm": 2.5264663696289062, - "learning_rate": 3.797251567556083e-06, - "loss": 0.2493, - "step": 670 - }, - { - "epoch": 3.2731707317073173, - "grad_norm": 3.350219964981079, - "learning_rate": 3.793974994315991e-06, - "loss": 0.1186, - "step": 671 - }, - { - "epoch": 3.278048780487805, - "grad_norm": 4.175906181335449, - "learning_rate": 3.790695382166013e-06, - "loss": 0.3453, - "step": 672 - }, - { - "epoch": 3.2829268292682925, - "grad_norm": 3.006072521209717, - "learning_rate": 3.7874127388083415e-06, - "loss": 0.1981, - "step": 673 - }, - { - "epoch": 3.2878048780487803, - "grad_norm": 3.368561029434204, - "learning_rate": 3.7841270719522895e-06, - "loss": 0.2934, - "step": 674 - }, - { - "epoch": 3.292682926829268, - "grad_norm": 4.374331951141357, - "learning_rate": 3.7808383893142692e-06, - "loss": 0.1359, - "step": 675 - }, - { - "epoch": 3.297560975609756, - "grad_norm": 3.297102451324463, - "learning_rate": 3.7775466986177763e-06, - "loss": 0.2498, - "step": 676 - }, - { - "epoch": 3.3024390243902437, - "grad_norm": 2.8914761543273926, - "learning_rate": 3.774252007593371e-06, - "loss": 0.1308, - "step": 677 - }, - { - "epoch": 3.3073170731707315, - "grad_norm": 3.1550722122192383, - "learning_rate": 3.7709543239786593e-06, - "loss": 0.3915, - "step": 678 - }, - { - "epoch": 3.3121951219512193, - "grad_norm": 3.2302658557891846, - "learning_rate": 3.767653655518277e-06, - "loss": 0.2558, - "step": 679 - }, - { - "epoch": 3.317073170731707, - "grad_norm": 4.4321770668029785, - "learning_rate": 3.7643500099638673e-06, - "loss": 0.1988, - "step": 680 - }, - { - "epoch": 3.321951219512195, - "grad_norm": 2.970566749572754, - "learning_rate": 3.7610433950740667e-06, - "loss": 0.4908, - "step": 681 - }, - { - "epoch": 3.3268292682926828, - "grad_norm": 3.5516228675842285, - "learning_rate": 3.757733818614485e-06, - "loss": 0.304, - "step": 682 - }, - { - "epoch": 3.3317073170731706, - "grad_norm": 2.7555387020111084, - "learning_rate": 3.7544212883576856e-06, - "loss": 0.2533, - "step": 683 - }, - { - "epoch": 3.3365853658536584, - "grad_norm": 3.61226749420166, - "learning_rate": 3.751105812083172e-06, - "loss": 0.1771, - "step": 684 - }, - { - "epoch": 3.341463414634146, - "grad_norm": 3.0466206073760986, - "learning_rate": 3.7477873975773655e-06, - "loss": 0.4213, - "step": 685 - }, - { - "epoch": 3.346341463414634, - "grad_norm": 3.6091527938842773, - "learning_rate": 3.7444660526335853e-06, - "loss": 0.3808, - "step": 686 - }, - { - "epoch": 3.351219512195122, - "grad_norm": 3.8443002700805664, - "learning_rate": 3.741141785052036e-06, - "loss": 0.6438, - "step": 687 - }, - { - "epoch": 3.3560975609756096, - "grad_norm": 3.845909833908081, - "learning_rate": 3.737814602639784e-06, - "loss": 0.3686, - "step": 688 - }, - { - "epoch": 3.3609756097560974, - "grad_norm": 2.904892921447754, - "learning_rate": 3.7344845132107427e-06, - "loss": 0.2934, - "step": 689 - }, - { - "epoch": 3.3658536585365852, - "grad_norm": 3.4766387939453125, - "learning_rate": 3.731151524585651e-06, - "loss": 0.3299, - "step": 690 - }, - { - "epoch": 3.370731707317073, - "grad_norm": 4.236767768859863, - "learning_rate": 3.7278156445920584e-06, - "loss": 0.6303, - "step": 691 - }, - { - "epoch": 3.375609756097561, - "grad_norm": 3.1122591495513916, - "learning_rate": 3.724476881064303e-06, - "loss": 0.2432, - "step": 692 - }, - { - "epoch": 3.3804878048780487, - "grad_norm": 3.0971457958221436, - "learning_rate": 3.721135241843496e-06, - "loss": 0.3131, - "step": 693 - }, - { - "epoch": 3.3853658536585365, - "grad_norm": 3.9365804195404053, - "learning_rate": 3.7177907347775016e-06, - "loss": 0.3372, - "step": 694 - }, - { - "epoch": 3.3902439024390243, - "grad_norm": 3.760373115539551, - "learning_rate": 3.71444336772092e-06, - "loss": 0.5055, - "step": 695 - }, - { - "epoch": 3.395121951219512, - "grad_norm": 4.360848426818848, - "learning_rate": 3.711093148535068e-06, - "loss": 0.6183, - "step": 696 - }, - { - "epoch": 3.4, - "grad_norm": 3.7713537216186523, - "learning_rate": 3.707740085087959e-06, - "loss": 0.1568, - "step": 697 - }, - { - "epoch": 3.4048780487804877, - "grad_norm": 3.8532230854034424, - "learning_rate": 3.7043841852542884e-06, - "loss": 0.2826, - "step": 698 - }, - { - "epoch": 3.4097560975609755, - "grad_norm": 3.0548605918884277, - "learning_rate": 3.701025456915411e-06, - "loss": 0.1918, - "step": 699 - }, - { - "epoch": 3.4146341463414633, - "grad_norm": 3.2431821823120117, - "learning_rate": 3.697663907959327e-06, - "loss": 0.2493, - "step": 700 - }, - { - "epoch": 3.419512195121951, - "grad_norm": 3.7301864624023438, - "learning_rate": 3.6942995462806574e-06, - "loss": 0.4913, - "step": 701 - }, - { - "epoch": 3.424390243902439, - "grad_norm": 2.5468900203704834, - "learning_rate": 3.6909323797806314e-06, - "loss": 0.1788, - "step": 702 - }, - { - "epoch": 3.4292682926829268, - "grad_norm": 3.3719515800476074, - "learning_rate": 3.6875624163670635e-06, - "loss": 0.4162, - "step": 703 - }, - { - "epoch": 3.4341463414634146, - "grad_norm": 3.528010368347168, - "learning_rate": 3.6841896639543394e-06, - "loss": 0.1924, - "step": 704 - }, - { - "epoch": 3.4390243902439024, - "grad_norm": 3.3636631965637207, - "learning_rate": 3.6808141304633924e-06, - "loss": 0.3177, - "step": 705 - }, - { - "epoch": 3.44390243902439, - "grad_norm": 3.418705463409424, - "learning_rate": 3.6774358238216878e-06, - "loss": 0.2301, - "step": 706 - }, - { - "epoch": 3.448780487804878, - "grad_norm": 4.720373630523682, - "learning_rate": 3.6740547519632048e-06, - "loss": 0.1894, - "step": 707 - }, - { - "epoch": 3.453658536585366, - "grad_norm": 2.9635703563690186, - "learning_rate": 3.670670922828414e-06, - "loss": 0.2642, - "step": 708 - }, - { - "epoch": 3.4585365853658536, - "grad_norm": 4.934754371643066, - "learning_rate": 3.667284344364264e-06, - "loss": 0.2275, - "step": 709 - }, - { - "epoch": 3.4634146341463414, - "grad_norm": 3.090585231781006, - "learning_rate": 3.6638950245241604e-06, - "loss": 0.4447, - "step": 710 - }, - { - "epoch": 3.4682926829268292, - "grad_norm": 4.360495090484619, - "learning_rate": 3.660502971267945e-06, - "loss": 0.2415, - "step": 711 - }, - { - "epoch": 3.473170731707317, - "grad_norm": 3.4893476963043213, - "learning_rate": 3.65710819256188e-06, - "loss": 0.0921, - "step": 712 - }, - { - "epoch": 3.478048780487805, - "grad_norm": 3.2423770427703857, - "learning_rate": 3.65371069637863e-06, - "loss": 0.2371, - "step": 713 - }, - { - "epoch": 3.4829268292682927, - "grad_norm": 3.0775890350341797, - "learning_rate": 3.650310490697238e-06, - "loss": 0.4026, - "step": 714 - }, - { - "epoch": 3.4878048780487805, - "grad_norm": 3.906625270843506, - "learning_rate": 3.646907583503114e-06, - "loss": 0.4312, - "step": 715 - }, - { - "epoch": 3.4926829268292683, - "grad_norm": 3.2140414714813232, - "learning_rate": 3.6435019827880093e-06, - "loss": 0.2309, - "step": 716 - }, - { - "epoch": 3.497560975609756, - "grad_norm": 3.048523426055908, - "learning_rate": 3.640093696550003e-06, - "loss": 0.296, - "step": 717 - }, - { - "epoch": 3.502439024390244, - "grad_norm": 2.9669039249420166, - "learning_rate": 3.6366827327934817e-06, - "loss": 0.2723, - "step": 718 - }, - { - "epoch": 3.5073170731707317, - "grad_norm": 3.6941726207733154, - "learning_rate": 3.6332690995291176e-06, - "loss": 0.3797, - "step": 719 - }, - { - "epoch": 3.5121951219512195, - "grad_norm": 5.135766506195068, - "learning_rate": 3.6298528047738545e-06, - "loss": 0.9868, - "step": 720 - }, - { - "epoch": 3.5170731707317073, - "grad_norm": 3.2021052837371826, - "learning_rate": 3.626433856550886e-06, - "loss": 0.4069, - "step": 721 - }, - { - "epoch": 3.521951219512195, - "grad_norm": 3.094444513320923, - "learning_rate": 3.623012262889637e-06, - "loss": 0.3368, - "step": 722 - }, - { - "epoch": 3.526829268292683, - "grad_norm": 3.609285354614258, - "learning_rate": 3.6195880318257465e-06, - "loss": 0.3972, - "step": 723 - }, - { - "epoch": 3.5317073170731708, - "grad_norm": 4.236501216888428, - "learning_rate": 3.616161171401046e-06, - "loss": 0.52, - "step": 724 - }, - { - "epoch": 3.5365853658536586, - "grad_norm": 3.504526376724243, - "learning_rate": 3.612731689663542e-06, - "loss": 0.23, - "step": 725 - }, - { - "epoch": 3.5414634146341464, - "grad_norm": 3.233591079711914, - "learning_rate": 3.6092995946673996e-06, - "loss": 0.4151, - "step": 726 - }, - { - "epoch": 3.546341463414634, - "grad_norm": 3.6701886653900146, - "learning_rate": 3.605864894472918e-06, - "loss": 0.2798, - "step": 727 - }, - { - "epoch": 3.551219512195122, - "grad_norm": 3.8713181018829346, - "learning_rate": 3.602427597146516e-06, - "loss": 0.4336, - "step": 728 - }, - { - "epoch": 3.55609756097561, - "grad_norm": 5.49612283706665, - "learning_rate": 3.5989877107607134e-06, - "loss": 0.4803, - "step": 729 - }, - { - "epoch": 3.5609756097560976, - "grad_norm": 3.771005392074585, - "learning_rate": 3.5955452433941075e-06, - "loss": 0.3698, - "step": 730 - }, - { - "epoch": 3.5658536585365854, - "grad_norm": 2.970822334289551, - "learning_rate": 3.5921002031313586e-06, - "loss": 0.2373, - "step": 731 - }, - { - "epoch": 3.5707317073170732, - "grad_norm": 3.517249584197998, - "learning_rate": 3.58865259806317e-06, - "loss": 0.1908, - "step": 732 - }, - { - "epoch": 3.575609756097561, - "grad_norm": 3.6825428009033203, - "learning_rate": 3.585202436286267e-06, - "loss": 0.3993, - "step": 733 - }, - { - "epoch": 3.580487804878049, - "grad_norm": 3.387479066848755, - "learning_rate": 3.581749725903381e-06, - "loss": 0.4237, - "step": 734 - }, - { - "epoch": 3.5853658536585367, - "grad_norm": 3.5004806518554688, - "learning_rate": 3.5782944750232274e-06, - "loss": 0.3011, - "step": 735 - }, - { - "epoch": 3.5902439024390245, - "grad_norm": 3.461731433868408, - "learning_rate": 3.574836691760489e-06, - "loss": 0.0896, - "step": 736 - }, - { - "epoch": 3.5951219512195123, - "grad_norm": 3.9598381519317627, - "learning_rate": 3.571376384235795e-06, - "loss": 0.2751, - "step": 737 - }, - { - "epoch": 3.6, - "grad_norm": 4.053933143615723, - "learning_rate": 3.5679135605757035e-06, - "loss": 0.2086, - "step": 738 - }, - { - "epoch": 3.604878048780488, - "grad_norm": 2.9683544635772705, - "learning_rate": 3.564448228912682e-06, - "loss": 0.1659, - "step": 739 - }, - { - "epoch": 3.6097560975609757, - "grad_norm": 3.6598448753356934, - "learning_rate": 3.5609803973850877e-06, - "loss": 0.2469, - "step": 740 - }, - { - "epoch": 3.6146341463414635, - "grad_norm": 3.449335813522339, - "learning_rate": 3.557510074137147e-06, - "loss": 0.375, - "step": 741 - }, - { - "epoch": 3.6195121951219513, - "grad_norm": 2.7666923999786377, - "learning_rate": 3.554037267318942e-06, - "loss": 0.3133, - "step": 742 - }, - { - "epoch": 3.624390243902439, - "grad_norm": 2.8951869010925293, - "learning_rate": 3.5505619850863847e-06, - "loss": 0.2243, - "step": 743 - }, - { - "epoch": 3.629268292682927, - "grad_norm": 3.477747678756714, - "learning_rate": 3.5470842356012007e-06, - "loss": 0.1321, - "step": 744 - }, - { - "epoch": 3.6341463414634148, - "grad_norm": 3.810480833053589, - "learning_rate": 3.5436040270309113e-06, - "loss": 0.361, - "step": 745 - }, - { - "epoch": 3.6390243902439026, - "grad_norm": 3.0730793476104736, - "learning_rate": 3.540121367548811e-06, - "loss": 0.1523, - "step": 746 - }, - { - "epoch": 3.6439024390243904, - "grad_norm": 3.6878390312194824, - "learning_rate": 3.5366362653339524e-06, - "loss": 0.4898, - "step": 747 - }, - { - "epoch": 3.648780487804878, - "grad_norm": 3.6432242393493652, - "learning_rate": 3.533148728571124e-06, - "loss": 0.1397, - "step": 748 - }, - { - "epoch": 3.653658536585366, - "grad_norm": 3.7047760486602783, - "learning_rate": 3.5296587654508317e-06, - "loss": 0.323, - "step": 749 - }, - { - "epoch": 3.658536585365854, - "grad_norm": 3.777132749557495, - "learning_rate": 3.526166384169279e-06, - "loss": 0.5577, - "step": 750 - }, - { - "epoch": 3.6634146341463416, - "grad_norm": 3.7970924377441406, - "learning_rate": 3.5226715929283507e-06, - "loss": 0.245, - "step": 751 - }, - { - "epoch": 3.6682926829268294, - "grad_norm": 2.8203537464141846, - "learning_rate": 3.519174399935588e-06, - "loss": 0.1619, - "step": 752 - }, - { - "epoch": 3.6731707317073172, - "grad_norm": 3.4040987491607666, - "learning_rate": 3.5156748134041767e-06, - "loss": 0.1047, - "step": 753 - }, - { - "epoch": 3.678048780487805, - "grad_norm": 3.927960157394409, - "learning_rate": 3.5121728415529203e-06, - "loss": 0.5713, - "step": 754 - }, - { - "epoch": 3.682926829268293, - "grad_norm": 3.3833277225494385, - "learning_rate": 3.5086684926062266e-06, - "loss": 0.2174, - "step": 755 - }, - { - "epoch": 3.68780487804878, - "grad_norm": 3.989307403564453, - "learning_rate": 3.505161774794085e-06, - "loss": 0.285, - "step": 756 - }, - { - "epoch": 3.692682926829268, - "grad_norm": 2.742429494857788, - "learning_rate": 3.5016526963520474e-06, - "loss": 0.1602, - "step": 757 - }, - { - "epoch": 3.697560975609756, - "grad_norm": 3.7082698345184326, - "learning_rate": 3.498141265521212e-06, - "loss": 0.666, - "step": 758 - }, - { - "epoch": 3.7024390243902436, - "grad_norm": 3.033196210861206, - "learning_rate": 3.4946274905481997e-06, - "loss": 0.2024, - "step": 759 - }, - { - "epoch": 3.7073170731707314, - "grad_norm": 3.7145371437072754, - "learning_rate": 3.4911113796851364e-06, - "loss": 0.2719, - "step": 760 - }, - { - "epoch": 3.7121951219512193, - "grad_norm": 3.580298900604248, - "learning_rate": 3.487592941189636e-06, - "loss": 0.1537, - "step": 761 - }, - { - "epoch": 3.717073170731707, - "grad_norm": 4.753757953643799, - "learning_rate": 3.484072183324776e-06, - "loss": 0.6149, - "step": 762 - }, - { - "epoch": 3.721951219512195, - "grad_norm": 3.5575687885284424, - "learning_rate": 3.4805491143590823e-06, - "loss": 0.4241, - "step": 763 - }, - { - "epoch": 3.7268292682926827, - "grad_norm": 3.215224266052246, - "learning_rate": 3.4770237425665103e-06, - "loss": 0.3037, - "step": 764 - }, - { - "epoch": 3.7317073170731705, - "grad_norm": 2.9899685382843018, - "learning_rate": 3.4734960762264204e-06, - "loss": 0.4854, - "step": 765 - }, - { - "epoch": 3.7365853658536583, - "grad_norm": 3.5880227088928223, - "learning_rate": 3.469966123623563e-06, - "loss": 0.3849, - "step": 766 - }, - { - "epoch": 3.741463414634146, - "grad_norm": 3.472750186920166, - "learning_rate": 3.46643389304806e-06, - "loss": 0.3159, - "step": 767 - }, - { - "epoch": 3.746341463414634, - "grad_norm": 4.355650901794434, - "learning_rate": 3.4628993927953786e-06, - "loss": 0.7527, - "step": 768 - }, - { - "epoch": 3.7512195121951217, - "grad_norm": 2.94575834274292, - "learning_rate": 3.45936263116632e-06, - "loss": 0.1716, - "step": 769 - }, - { - "epoch": 3.7560975609756095, - "grad_norm": 2.991525173187256, - "learning_rate": 3.4558236164669957e-06, - "loss": 0.2061, - "step": 770 - }, - { - "epoch": 3.7609756097560973, - "grad_norm": 3.134000301361084, - "learning_rate": 3.4522823570088073e-06, - "loss": 0.1338, - "step": 771 - }, - { - "epoch": 3.765853658536585, - "grad_norm": 3.722140312194824, - "learning_rate": 3.4487388611084295e-06, - "loss": 0.2615, - "step": 772 - }, - { - "epoch": 3.770731707317073, - "grad_norm": 3.7941153049468994, - "learning_rate": 3.445193137087788e-06, - "loss": 0.1401, - "step": 773 - }, - { - "epoch": 3.7756097560975608, - "grad_norm": 2.872941732406616, - "learning_rate": 3.4416451932740424e-06, - "loss": 0.2934, - "step": 774 - }, - { - "epoch": 3.7804878048780486, - "grad_norm": 4.5019941329956055, - "learning_rate": 3.4380950379995652e-06, - "loss": 0.4579, - "step": 775 - }, - { - "epoch": 3.7853658536585364, - "grad_norm": 2.682884931564331, - "learning_rate": 3.434542679601922e-06, - "loss": 0.2979, - "step": 776 - }, - { - "epoch": 3.790243902439024, - "grad_norm": 3.3044273853302, - "learning_rate": 3.4309881264238538e-06, - "loss": 0.1196, - "step": 777 - }, - { - "epoch": 3.795121951219512, - "grad_norm": 3.102760076522827, - "learning_rate": 3.4274313868132547e-06, - "loss": 0.2026, - "step": 778 - }, - { - "epoch": 3.8, - "grad_norm": 3.3304500579833984, - "learning_rate": 3.4238724691231534e-06, - "loss": 0.2135, - "step": 779 - }, - { - "epoch": 3.8048780487804876, - "grad_norm": 3.295119047164917, - "learning_rate": 3.4203113817116955e-06, - "loss": 0.4418, - "step": 780 - }, - { - "epoch": 3.8097560975609754, - "grad_norm": 3.6655640602111816, - "learning_rate": 3.4167481329421204e-06, - "loss": 0.203, - "step": 781 - }, - { - "epoch": 3.8146341463414632, - "grad_norm": 3.387830972671509, - "learning_rate": 3.4131827311827447e-06, - "loss": 0.3225, - "step": 782 - }, - { - "epoch": 3.819512195121951, - "grad_norm": 2.621633529663086, - "learning_rate": 3.4096151848069416e-06, - "loss": 0.1704, - "step": 783 - }, - { - "epoch": 3.824390243902439, - "grad_norm": 2.974344491958618, - "learning_rate": 3.4060455021931195e-06, - "loss": 0.2785, - "step": 784 - }, - { - "epoch": 3.8292682926829267, - "grad_norm": 3.452131748199463, - "learning_rate": 3.402473691724704e-06, - "loss": 0.223, - "step": 785 - }, - { - "epoch": 3.8341463414634145, - "grad_norm": 2.6373705863952637, - "learning_rate": 3.39889976179012e-06, - "loss": 0.2368, - "step": 786 - }, - { - "epoch": 3.8390243902439023, - "grad_norm": 2.863184928894043, - "learning_rate": 3.3953237207827673e-06, - "loss": 0.3294, - "step": 787 - }, - { - "epoch": 3.84390243902439, - "grad_norm": 5.104704856872559, - "learning_rate": 3.391745577101005e-06, - "loss": 0.5431, - "step": 788 - }, - { - "epoch": 3.848780487804878, - "grad_norm": 3.951310634613037, - "learning_rate": 3.3881653391481306e-06, - "loss": 0.2546, - "step": 789 - }, - { - "epoch": 3.8536585365853657, - "grad_norm": 3.9903225898742676, - "learning_rate": 3.384583015332359e-06, - "loss": 0.3293, - "step": 790 - }, - { - "epoch": 3.8585365853658535, - "grad_norm": 3.3149220943450928, - "learning_rate": 3.380998614066805e-06, - "loss": 0.1861, - "step": 791 - }, - { - "epoch": 3.8634146341463413, - "grad_norm": 3.6755223274230957, - "learning_rate": 3.3774121437694606e-06, - "loss": 0.2498, - "step": 792 - }, - { - "epoch": 3.868292682926829, - "grad_norm": 3.192918300628662, - "learning_rate": 3.3738236128631786e-06, - "loss": 0.1525, - "step": 793 - }, - { - "epoch": 3.873170731707317, - "grad_norm": 3.5358777046203613, - "learning_rate": 3.3702330297756503e-06, - "loss": 0.3622, - "step": 794 - }, - { - "epoch": 3.8780487804878048, - "grad_norm": 3.619878053665161, - "learning_rate": 3.366640402939387e-06, - "loss": 0.1051, - "step": 795 - }, - { - "epoch": 3.8829268292682926, - "grad_norm": 7.085352420806885, - "learning_rate": 3.363045740791698e-06, - "loss": 0.4606, - "step": 796 - }, - { - "epoch": 3.8878048780487804, - "grad_norm": 2.523165464401245, - "learning_rate": 3.3594490517746774e-06, - "loss": 0.2267, - "step": 797 - }, - { - "epoch": 3.892682926829268, - "grad_norm": 2.7026922702789307, - "learning_rate": 3.3558503443351733e-06, - "loss": 0.2792, - "step": 798 - }, - { - "epoch": 3.897560975609756, - "grad_norm": 2.9232428073883057, - "learning_rate": 3.352249626924777e-06, - "loss": 0.2579, - "step": 799 - }, - { - "epoch": 3.902439024390244, - "grad_norm": 4.760788440704346, - "learning_rate": 3.348646907999801e-06, - "loss": 0.6983, - "step": 800 - }, - { - "epoch": 3.9073170731707316, - "grad_norm": 3.198249578475952, - "learning_rate": 3.345042196021257e-06, - "loss": 0.3265, - "step": 801 - }, - { - "epoch": 3.9121951219512194, - "grad_norm": 4.069286823272705, - "learning_rate": 3.3414354994548385e-06, - "loss": 0.497, - "step": 802 - }, - { - "epoch": 3.9170731707317072, - "grad_norm": 3.4435410499572754, - "learning_rate": 3.337826826770898e-06, - "loss": 0.2812, - "step": 803 - }, - { - "epoch": 3.921951219512195, - "grad_norm": 3.9805212020874023, - "learning_rate": 3.3342161864444312e-06, - "loss": 0.2277, - "step": 804 - }, - { - "epoch": 3.926829268292683, - "grad_norm": 3.348925828933716, - "learning_rate": 3.3306035869550534e-06, - "loss": 0.1614, - "step": 805 - }, - { - "epoch": 3.9317073170731707, - "grad_norm": 4.7613701820373535, - "learning_rate": 3.326989036786981e-06, - "loss": 0.3269, - "step": 806 - }, - { - "epoch": 3.9365853658536585, - "grad_norm": 3.807502508163452, - "learning_rate": 3.3233725444290126e-06, - "loss": 0.2619, - "step": 807 - }, - { - "epoch": 3.9414634146341463, - "grad_norm": 3.2690203189849854, - "learning_rate": 3.3197541183745065e-06, - "loss": 0.4334, - "step": 808 - }, - { - "epoch": 3.946341463414634, - "grad_norm": 3.396993398666382, - "learning_rate": 3.3161337671213634e-06, - "loss": 0.2738, - "step": 809 - }, - { - "epoch": 3.951219512195122, - "grad_norm": 3.086669921875, - "learning_rate": 3.312511499172006e-06, - "loss": 0.1597, - "step": 810 - }, - { - "epoch": 3.9560975609756097, - "grad_norm": 3.5688745975494385, - "learning_rate": 3.3088873230333562e-06, - "loss": 0.3195, - "step": 811 - }, - { - "epoch": 3.9609756097560975, - "grad_norm": 3.4843621253967285, - "learning_rate": 3.3052612472168193e-06, - "loss": 0.1865, - "step": 812 - }, - { - "epoch": 3.9658536585365853, - "grad_norm": 2.8479580879211426, - "learning_rate": 3.3016332802382618e-06, - "loss": 0.3108, - "step": 813 - }, - { - "epoch": 3.970731707317073, - "grad_norm": 3.3241543769836426, - "learning_rate": 3.2980034306179897e-06, - "loss": 0.2099, - "step": 814 - }, - { - "epoch": 3.975609756097561, - "grad_norm": 2.817675828933716, - "learning_rate": 3.294371706880733e-06, - "loss": 0.3073, - "step": 815 - }, - { - "epoch": 3.9804878048780488, - "grad_norm": 2.9535388946533203, - "learning_rate": 3.290738117555622e-06, - "loss": 0.2024, - "step": 816 - }, - { - "epoch": 3.9853658536585366, - "grad_norm": 5.021281719207764, - "learning_rate": 3.2871026711761666e-06, - "loss": 0.508, - "step": 817 - }, - { - "epoch": 3.9902439024390244, - "grad_norm": 3.3377649784088135, - "learning_rate": 3.2834653762802414e-06, - "loss": 0.2116, - "step": 818 - }, - { - "epoch": 3.995121951219512, - "grad_norm": 4.412073135375977, - "learning_rate": 3.2798262414100594e-06, - "loss": 0.2177, - "step": 819 - }, - { - "epoch": 4.0, - "grad_norm": 3.174323797225952, - "learning_rate": 3.2761852751121566e-06, - "loss": 0.1737, - "step": 820 - }, - { - "epoch": 4.004878048780488, - "grad_norm": 2.921494960784912, - "learning_rate": 3.272542485937369e-06, - "loss": 0.2569, - "step": 821 - }, - { - "epoch": 4.009756097560976, - "grad_norm": 2.693495512008667, - "learning_rate": 3.2688978824408136e-06, - "loss": 0.1621, - "step": 822 - }, - { - "epoch": 4.014634146341463, - "grad_norm": 2.705796718597412, - "learning_rate": 3.2652514731818698e-06, - "loss": 0.1121, - "step": 823 - }, - { - "epoch": 4.019512195121951, - "grad_norm": 3.2621448040008545, - "learning_rate": 3.2616032667241564e-06, - "loss": 0.0835, - "step": 824 - }, - { - "epoch": 4.024390243902439, - "grad_norm": 3.6205084323883057, - "learning_rate": 3.257953271635513e-06, - "loss": 0.3731, - "step": 825 - }, - { - "epoch": 4.029268292682927, - "grad_norm": 3.2600371837615967, - "learning_rate": 3.2543014964879814e-06, - "loss": 0.1051, - "step": 826 - }, - { - "epoch": 4.034146341463415, - "grad_norm": 3.865178346633911, - "learning_rate": 3.250647949857781e-06, - "loss": 0.0916, - "step": 827 - }, - { - "epoch": 4.0390243902439025, - "grad_norm": 6.9700927734375, - "learning_rate": 3.2469926403252932e-06, - "loss": 0.4037, - "step": 828 - }, - { - "epoch": 4.04390243902439, - "grad_norm": 3.658712148666382, - "learning_rate": 3.2433355764750417e-06, - "loss": 0.0523, - "step": 829 - }, - { - "epoch": 4.048780487804878, - "grad_norm": 4.911301612854004, - "learning_rate": 3.2396767668956656e-06, - "loss": 0.2616, - "step": 830 - }, - { - "epoch": 4.053658536585366, - "grad_norm": 5.019360542297363, - "learning_rate": 3.2360162201799085e-06, - "loss": 0.195, - "step": 831 - }, - { - "epoch": 4.058536585365854, - "grad_norm": 3.493767261505127, - "learning_rate": 3.2323539449245906e-06, - "loss": 0.1245, - "step": 832 - }, - { - "epoch": 4.0634146341463415, - "grad_norm": 4.246248722076416, - "learning_rate": 3.2286899497305917e-06, - "loss": 0.1147, - "step": 833 - }, - { - "epoch": 4.068292682926829, - "grad_norm": 2.993704319000244, - "learning_rate": 3.2250242432028335e-06, - "loss": 0.2189, - "step": 834 - }, - { - "epoch": 4.073170731707317, - "grad_norm": 4.695023059844971, - "learning_rate": 3.221356833950254e-06, - "loss": 0.4685, - "step": 835 - }, - { - "epoch": 4.078048780487805, - "grad_norm": 2.777644634246826, - "learning_rate": 3.21768773058579e-06, - "loss": 0.1245, - "step": 836 - }, - { - "epoch": 4.082926829268293, - "grad_norm": 3.3545901775360107, - "learning_rate": 3.21401694172636e-06, - "loss": 0.1342, - "step": 837 - }, - { - "epoch": 4.087804878048781, - "grad_norm": 2.2222652435302734, - "learning_rate": 3.2103444759928383e-06, - "loss": 0.0484, - "step": 838 - }, - { - "epoch": 4.092682926829268, - "grad_norm": 2.580345630645752, - "learning_rate": 3.2066703420100377e-06, - "loss": 0.0592, - "step": 839 - }, - { - "epoch": 4.097560975609756, - "grad_norm": 3.8652923107147217, - "learning_rate": 3.2029945484066883e-06, - "loss": 0.2536, - "step": 840 - }, - { - "epoch": 4.102439024390244, - "grad_norm": 3.0441582202911377, - "learning_rate": 3.1993171038154203e-06, - "loss": 0.1221, - "step": 841 - }, - { - "epoch": 4.107317073170732, - "grad_norm": 2.2795114517211914, - "learning_rate": 3.1956380168727385e-06, - "loss": 0.1231, - "step": 842 - }, - { - "epoch": 4.11219512195122, - "grad_norm": 3.701009750366211, - "learning_rate": 3.191957296219007e-06, - "loss": 0.2144, - "step": 843 - }, - { - "epoch": 4.117073170731707, - "grad_norm": 3.452637195587158, - "learning_rate": 3.1882749504984247e-06, - "loss": 0.1026, - "step": 844 - }, - { - "epoch": 4.121951219512195, - "grad_norm": 2.4208810329437256, - "learning_rate": 3.1845909883590076e-06, - "loss": 0.1124, - "step": 845 - }, - { - "epoch": 4.126829268292683, - "grad_norm": 4.353063583374023, - "learning_rate": 3.180905418452569e-06, - "loss": 0.2804, - "step": 846 - }, - { - "epoch": 4.131707317073171, - "grad_norm": 3.1151084899902344, - "learning_rate": 3.1772182494346963e-06, - "loss": 0.1748, - "step": 847 - }, - { - "epoch": 4.136585365853659, - "grad_norm": 3.457940101623535, - "learning_rate": 3.1735294899647344e-06, - "loss": 0.1984, - "step": 848 - }, - { - "epoch": 4.1414634146341465, - "grad_norm": 3.3556935787200928, - "learning_rate": 3.169839148705762e-06, - "loss": 0.1332, - "step": 849 - }, - { - "epoch": 4.146341463414634, - "grad_norm": 3.5510823726654053, - "learning_rate": 3.1661472343245725e-06, - "loss": 0.4788, - "step": 850 - }, - { - "epoch": 4.151219512195122, - "grad_norm": 4.036712646484375, - "learning_rate": 3.162453755491655e-06, - "loss": 0.2437, - "step": 851 - }, - { - "epoch": 4.15609756097561, - "grad_norm": 4.417062282562256, - "learning_rate": 3.158758720881171e-06, - "loss": 0.203, - "step": 852 - }, - { - "epoch": 4.160975609756098, - "grad_norm": 3.920558214187622, - "learning_rate": 3.155062139170937e-06, - "loss": 0.1462, - "step": 853 - }, - { - "epoch": 4.1658536585365855, - "grad_norm": 6.472081661224365, - "learning_rate": 3.1513640190424034e-06, - "loss": 0.0972, - "step": 854 - }, - { - "epoch": 4.170731707317073, - "grad_norm": 3.975947141647339, - "learning_rate": 3.147664369180632e-06, - "loss": 0.1092, - "step": 855 - }, - { - "epoch": 4.175609756097561, - "grad_norm": 4.977376937866211, - "learning_rate": 3.143963198274278e-06, - "loss": 0.2215, - "step": 856 - }, - { - "epoch": 4.180487804878049, - "grad_norm": 3.595460891723633, - "learning_rate": 3.140260515015569e-06, - "loss": 0.1771, - "step": 857 - }, - { - "epoch": 4.185365853658537, - "grad_norm": 3.1085658073425293, - "learning_rate": 3.136556328100284e-06, - "loss": 0.1995, - "step": 858 - }, - { - "epoch": 4.190243902439025, - "grad_norm": 4.355626583099365, - "learning_rate": 3.132850646227734e-06, - "loss": 0.4048, - "step": 859 - }, - { - "epoch": 4.195121951219512, - "grad_norm": 3.8079614639282227, - "learning_rate": 3.12914347810074e-06, - "loss": 0.1914, - "step": 860 - }, - { - "epoch": 4.2, - "grad_norm": 3.725804328918457, - "learning_rate": 3.125434832425613e-06, - "loss": 0.1579, - "step": 861 - }, - { - "epoch": 4.204878048780488, - "grad_norm": 2.974649667739868, - "learning_rate": 3.121724717912138e-06, - "loss": 0.1814, - "step": 862 - }, - { - "epoch": 4.209756097560976, - "grad_norm": 3.6391279697418213, - "learning_rate": 3.118013143273542e-06, - "loss": 0.1481, - "step": 863 - }, - { - "epoch": 4.214634146341464, - "grad_norm": 3.216643810272217, - "learning_rate": 3.1143001172264893e-06, - "loss": 0.113, - "step": 864 - }, - { - "epoch": 4.219512195121951, - "grad_norm": 3.605855941772461, - "learning_rate": 3.1105856484910474e-06, - "loss": 0.1405, - "step": 865 - }, - { - "epoch": 4.224390243902439, - "grad_norm": 2.7186765670776367, - "learning_rate": 3.1068697457906736e-06, - "loss": 0.097, - "step": 866 - }, - { - "epoch": 4.229268292682927, - "grad_norm": 3.980973243713379, - "learning_rate": 3.1031524178521938e-06, - "loss": 0.2207, - "step": 867 - }, - { - "epoch": 4.234146341463415, - "grad_norm": 3.4623806476593018, - "learning_rate": 3.0994336734057804e-06, - "loss": 0.0552, - "step": 868 - }, - { - "epoch": 4.239024390243903, - "grad_norm": 3.7556748390197754, - "learning_rate": 3.0957135211849315e-06, - "loss": 0.1743, - "step": 869 - }, - { - "epoch": 4.2439024390243905, - "grad_norm": 3.3547914028167725, - "learning_rate": 3.0919919699264535e-06, - "loss": 0.1195, - "step": 870 - }, - { - "epoch": 4.248780487804878, - "grad_norm": 4.392014503479004, - "learning_rate": 3.0882690283704355e-06, - "loss": 0.6174, - "step": 871 - }, - { - "epoch": 4.253658536585366, - "grad_norm": 2.7031409740448, - "learning_rate": 3.084544705260234e-06, - "loss": 0.1359, - "step": 872 - }, - { - "epoch": 4.258536585365854, - "grad_norm": 2.3518481254577637, - "learning_rate": 3.080819009342451e-06, - "loss": 0.0786, - "step": 873 - }, - { - "epoch": 4.263414634146342, - "grad_norm": 2.636204481124878, - "learning_rate": 3.077091949366908e-06, - "loss": 0.0677, - "step": 874 - }, - { - "epoch": 4.2682926829268295, - "grad_norm": 2.8670942783355713, - "learning_rate": 3.073363534086636e-06, - "loss": 0.1084, - "step": 875 - }, - { - "epoch": 4.273170731707317, - "grad_norm": 2.7044737339019775, - "learning_rate": 3.0696337722578444e-06, - "loss": 0.0681, - "step": 876 - }, - { - "epoch": 4.278048780487805, - "grad_norm": 3.481539487838745, - "learning_rate": 3.0659026726399072e-06, - "loss": 0.2262, - "step": 877 - }, - { - "epoch": 4.282926829268293, - "grad_norm": 3.7746224403381348, - "learning_rate": 3.0621702439953393e-06, - "loss": 0.2169, - "step": 878 - }, - { - "epoch": 4.287804878048781, - "grad_norm": 3.6386263370513916, - "learning_rate": 3.0584364950897768e-06, - "loss": 0.0581, - "step": 879 - }, - { - "epoch": 4.2926829268292686, - "grad_norm": 3.389408588409424, - "learning_rate": 3.0547014346919574e-06, - "loss": 0.1687, - "step": 880 - }, - { - "epoch": 4.297560975609756, - "grad_norm": 3.6510157585144043, - "learning_rate": 3.0509650715736977e-06, - "loss": 0.1362, - "step": 881 - }, - { - "epoch": 4.302439024390244, - "grad_norm": 3.334210157394409, - "learning_rate": 3.0472274145098744e-06, - "loss": 0.1865, - "step": 882 - }, - { - "epoch": 4.307317073170732, - "grad_norm": 4.747341632843018, - "learning_rate": 3.0434884722784026e-06, - "loss": 0.2385, - "step": 883 - }, - { - "epoch": 4.31219512195122, - "grad_norm": 3.9266858100891113, - "learning_rate": 3.0397482536602168e-06, - "loss": 0.1004, - "step": 884 - }, - { - "epoch": 4.317073170731708, - "grad_norm": 2.984821081161499, - "learning_rate": 3.0360067674392475e-06, - "loss": 0.1469, - "step": 885 - }, - { - "epoch": 4.321951219512195, - "grad_norm": 2.6379380226135254, - "learning_rate": 3.0322640224024024e-06, - "loss": 0.0829, - "step": 886 - }, - { - "epoch": 4.326829268292683, - "grad_norm": 3.885495185852051, - "learning_rate": 3.0285200273395478e-06, - "loss": 0.2256, - "step": 887 - }, - { - "epoch": 4.331707317073171, - "grad_norm": 3.950394868850708, - "learning_rate": 3.024774791043481e-06, - "loss": 0.2402, - "step": 888 - }, - { - "epoch": 4.336585365853659, - "grad_norm": 4.147830963134766, - "learning_rate": 3.021028322309921e-06, - "loss": 0.2198, - "step": 889 - }, - { - "epoch": 4.341463414634147, - "grad_norm": 4.0821638107299805, - "learning_rate": 3.0172806299374734e-06, - "loss": 0.2304, - "step": 890 - }, - { - "epoch": 4.3463414634146345, - "grad_norm": 4.142312049865723, - "learning_rate": 3.0135317227276247e-06, - "loss": 0.2864, - "step": 891 - }, - { - "epoch": 4.351219512195122, - "grad_norm": 3.008504867553711, - "learning_rate": 3.0097816094847104e-06, - "loss": 0.2045, - "step": 892 - }, - { - "epoch": 4.35609756097561, - "grad_norm": 3.1674623489379883, - "learning_rate": 3.0060302990158984e-06, - "loss": 0.0864, - "step": 893 - }, - { - "epoch": 4.360975609756098, - "grad_norm": 3.3412492275238037, - "learning_rate": 3.002277800131171e-06, - "loss": 0.076, - "step": 894 - }, - { - "epoch": 4.365853658536586, - "grad_norm": 3.067330837249756, - "learning_rate": 2.998524121643298e-06, - "loss": 0.1724, - "step": 895 - }, - { - "epoch": 4.3707317073170735, - "grad_norm": 3.9015982151031494, - "learning_rate": 2.994769272367822e-06, - "loss": 0.2, - "step": 896 - }, - { - "epoch": 4.375609756097561, - "grad_norm": 3.0136911869049072, - "learning_rate": 2.991013261123035e-06, - "loss": 0.0852, - "step": 897 - }, - { - "epoch": 4.380487804878049, - "grad_norm": 3.6834237575531006, - "learning_rate": 2.9872560967299554e-06, - "loss": 0.1449, - "step": 898 - }, - { - "epoch": 4.385365853658537, - "grad_norm": 3.3486039638519287, - "learning_rate": 2.9834977880123132e-06, - "loss": 0.0659, - "step": 899 - }, - { - "epoch": 4.390243902439025, - "grad_norm": 2.971315622329712, - "learning_rate": 2.9797383437965243e-06, - "loss": 0.1114, - "step": 900 - }, - { - "epoch": 4.3951219512195125, - "grad_norm": 2.683359146118164, - "learning_rate": 2.975977772911671e-06, - "loss": 0.0822, - "step": 901 - }, - { - "epoch": 4.4, - "grad_norm": 2.9941935539245605, - "learning_rate": 2.972216084189482e-06, - "loss": 0.0858, - "step": 902 - }, - { - "epoch": 4.404878048780488, - "grad_norm": 2.4938626289367676, - "learning_rate": 2.9684532864643123e-06, - "loss": 0.1162, - "step": 903 - }, - { - "epoch": 4.409756097560976, - "grad_norm": 2.9364712238311768, - "learning_rate": 2.964689388573118e-06, - "loss": 0.0821, - "step": 904 - }, - { - "epoch": 4.414634146341464, - "grad_norm": 3.3638134002685547, - "learning_rate": 2.9609243993554434e-06, - "loss": 0.25, - "step": 905 - }, - { - "epoch": 4.419512195121952, - "grad_norm": 3.657277822494507, - "learning_rate": 2.9571583276533923e-06, - "loss": 0.0852, - "step": 906 - }, - { - "epoch": 4.424390243902439, - "grad_norm": 5.486263275146484, - "learning_rate": 2.9533911823116124e-06, - "loss": 0.5123, - "step": 907 - }, - { - "epoch": 4.429268292682927, - "grad_norm": 5.194574356079102, - "learning_rate": 2.9496229721772734e-06, - "loss": 0.1854, - "step": 908 - }, - { - "epoch": 4.434146341463415, - "grad_norm": 3.520110845565796, - "learning_rate": 2.9458537061000435e-06, - "loss": 0.1785, - "step": 909 - }, - { - "epoch": 4.439024390243903, - "grad_norm": 3.417991876602173, - "learning_rate": 2.9420833929320726e-06, - "loss": 0.1603, - "step": 910 - }, - { - "epoch": 4.443902439024391, - "grad_norm": 5.225805282592773, - "learning_rate": 2.93831204152797e-06, - "loss": 0.3046, - "step": 911 - }, - { - "epoch": 4.4487804878048784, - "grad_norm": 3.541433572769165, - "learning_rate": 2.9345396607447807e-06, - "loss": 0.0631, - "step": 912 - }, - { - "epoch": 4.453658536585366, - "grad_norm": 3.909377098083496, - "learning_rate": 2.9307662594419704e-06, - "loss": 0.125, - "step": 913 - }, - { - "epoch": 4.458536585365854, - "grad_norm": 3.6604416370391846, - "learning_rate": 2.9269918464814e-06, - "loss": 0.156, - "step": 914 - }, - { - "epoch": 4.463414634146342, - "grad_norm": 3.7413833141326904, - "learning_rate": 2.923216430727306e-06, - "loss": 0.3334, - "step": 915 - }, - { - "epoch": 4.46829268292683, - "grad_norm": 3.531996011734009, - "learning_rate": 2.9194400210462808e-06, - "loss": 0.2534, - "step": 916 - }, - { - "epoch": 4.473170731707317, - "grad_norm": 4.163621425628662, - "learning_rate": 2.91566262630725e-06, - "loss": 0.352, - "step": 917 - }, - { - "epoch": 4.478048780487805, - "grad_norm": 3.923635482788086, - "learning_rate": 2.9118842553814526e-06, - "loss": 0.1132, - "step": 918 - }, - { - "epoch": 4.482926829268292, - "grad_norm": 2.833768844604492, - "learning_rate": 2.9081049171424223e-06, - "loss": 0.086, - "step": 919 - }, - { - "epoch": 4.487804878048781, - "grad_norm": 2.9006292819976807, - "learning_rate": 2.9043246204659624e-06, - "loss": 0.0693, - "step": 920 - }, - { - "epoch": 4.492682926829268, - "grad_norm": 3.699376344680786, - "learning_rate": 2.9005433742301274e-06, - "loss": 0.2463, - "step": 921 - }, - { - "epoch": 4.4975609756097565, - "grad_norm": 4.882141590118408, - "learning_rate": 2.8967611873152037e-06, - "loss": 0.2275, - "step": 922 - }, - { - "epoch": 4.5024390243902435, - "grad_norm": 3.0554678440093994, - "learning_rate": 2.892978068603683e-06, - "loss": 0.0752, - "step": 923 - }, - { - "epoch": 4.507317073170732, - "grad_norm": 3.1225268840789795, - "learning_rate": 2.889194026980249e-06, - "loss": 0.1649, - "step": 924 - }, - { - "epoch": 4.512195121951219, - "grad_norm": 17.75234031677246, - "learning_rate": 2.8854090713317514e-06, - "loss": 0.0437, - "step": 925 - }, - { - "epoch": 4.517073170731708, - "grad_norm": 3.011223554611206, - "learning_rate": 2.8816232105471864e-06, - "loss": 0.0747, - "step": 926 - }, - { - "epoch": 4.521951219512195, - "grad_norm": 4.327573299407959, - "learning_rate": 2.877836453517677e-06, - "loss": 0.3884, - "step": 927 - }, - { - "epoch": 4.526829268292683, - "grad_norm": 3.8694965839385986, - "learning_rate": 2.8740488091364492e-06, - "loss": 0.2741, - "step": 928 - }, - { - "epoch": 4.53170731707317, - "grad_norm": 5.375877380371094, - "learning_rate": 2.870260286298814e-06, - "loss": 0.364, - "step": 929 - }, - { - "epoch": 4.536585365853659, - "grad_norm": 3.380891799926758, - "learning_rate": 2.866470893902147e-06, - "loss": 0.1495, - "step": 930 - }, - { - "epoch": 4.541463414634146, - "grad_norm": 3.723992109298706, - "learning_rate": 2.8626806408458626e-06, - "loss": 0.1403, - "step": 931 - }, - { - "epoch": 4.546341463414635, - "grad_norm": 3.0534417629241943, - "learning_rate": 2.8588895360313983e-06, - "loss": 0.0946, - "step": 932 - }, - { - "epoch": 4.5512195121951216, - "grad_norm": 2.8875234127044678, - "learning_rate": 2.8550975883621935e-06, - "loss": 0.1851, - "step": 933 - }, - { - "epoch": 4.55609756097561, - "grad_norm": 3.532166004180908, - "learning_rate": 2.8513048067436644e-06, - "loss": 0.178, - "step": 934 - }, - { - "epoch": 4.560975609756097, - "grad_norm": 2.942798376083374, - "learning_rate": 2.847511200083187e-06, - "loss": 0.1131, - "step": 935 - }, - { - "epoch": 4.565853658536585, - "grad_norm": 2.926874876022339, - "learning_rate": 2.843716777290074e-06, - "loss": 0.1251, - "step": 936 - }, - { - "epoch": 4.570731707317073, - "grad_norm": 3.525895357131958, - "learning_rate": 2.839921547275556e-06, - "loss": 0.0946, - "step": 937 - }, - { - "epoch": 4.575609756097561, - "grad_norm": 3.7033681869506836, - "learning_rate": 2.836125518952759e-06, - "loss": 0.1529, - "step": 938 - }, - { - "epoch": 4.580487804878048, - "grad_norm": 3.235154867172241, - "learning_rate": 2.8323287012366845e-06, - "loss": 0.2511, - "step": 939 - }, - { - "epoch": 4.585365853658536, - "grad_norm": 3.5275583267211914, - "learning_rate": 2.828531103044186e-06, - "loss": 0.1474, - "step": 940 - }, - { - "epoch": 4.590243902439024, - "grad_norm": 3.1356353759765625, - "learning_rate": 2.8247327332939512e-06, - "loss": 0.2249, - "step": 941 - }, - { - "epoch": 4.595121951219512, - "grad_norm": 3.789210081100464, - "learning_rate": 2.82093360090648e-06, - "loss": 0.2258, - "step": 942 - }, - { - "epoch": 4.6, - "grad_norm": 4.841623306274414, - "learning_rate": 2.8171337148040636e-06, - "loss": 0.2235, - "step": 943 - }, - { - "epoch": 4.6048780487804875, - "grad_norm": 3.161630630493164, - "learning_rate": 2.813333083910761e-06, - "loss": 0.1562, - "step": 944 - }, - { - "epoch": 4.609756097560975, - "grad_norm": 2.8718132972717285, - "learning_rate": 2.8095317171523835e-06, - "loss": 0.0625, - "step": 945 - }, - { - "epoch": 4.614634146341463, - "grad_norm": 3.6432454586029053, - "learning_rate": 2.805729623456469e-06, - "loss": 0.2205, - "step": 946 - }, - { - "epoch": 4.619512195121951, - "grad_norm": 4.382034778594971, - "learning_rate": 2.8019268117522624e-06, - "loss": 0.3241, - "step": 947 - }, - { - "epoch": 4.624390243902439, - "grad_norm": 3.2998175621032715, - "learning_rate": 2.798123290970695e-06, - "loss": 0.1983, - "step": 948 - }, - { - "epoch": 4.6292682926829265, - "grad_norm": 3.8665990829467773, - "learning_rate": 2.794319070044365e-06, - "loss": 0.3391, - "step": 949 - }, - { - "epoch": 4.634146341463414, - "grad_norm": 3.628403425216675, - "learning_rate": 2.790514157907512e-06, - "loss": 0.1329, - "step": 950 - }, - { - "epoch": 4.639024390243902, - "grad_norm": 2.8889615535736084, - "learning_rate": 2.786708563496002e-06, - "loss": 0.141, - "step": 951 - }, - { - "epoch": 4.64390243902439, - "grad_norm": 4.07351541519165, - "learning_rate": 2.782902295747299e-06, - "loss": 0.2935, - "step": 952 - }, - { - "epoch": 4.648780487804878, - "grad_norm": 4.220067024230957, - "learning_rate": 2.7790953636004536e-06, - "loss": 0.318, - "step": 953 - }, - { - "epoch": 4.6536585365853655, - "grad_norm": 3.8444325923919678, - "learning_rate": 2.775287775996074e-06, - "loss": 0.3388, - "step": 954 - }, - { - "epoch": 4.658536585365853, - "grad_norm": 3.197313070297241, - "learning_rate": 2.7714795418763067e-06, - "loss": 0.0925, - "step": 955 - }, - { - "epoch": 4.663414634146341, - "grad_norm": 4.0050811767578125, - "learning_rate": 2.7676706701848187e-06, - "loss": 0.2811, - "step": 956 - }, - { - "epoch": 4.668292682926829, - "grad_norm": 3.217160224914551, - "learning_rate": 2.763861169866774e-06, - "loss": 0.311, - "step": 957 - }, - { - "epoch": 4.673170731707317, - "grad_norm": 2.9892494678497314, - "learning_rate": 2.7600510498688104e-06, - "loss": 0.0582, - "step": 958 - }, - { - "epoch": 4.678048780487805, - "grad_norm": 3.954805374145508, - "learning_rate": 2.7562403191390246e-06, - "loss": 0.1238, - "step": 959 - }, - { - "epoch": 4.682926829268292, - "grad_norm": 2.9582695960998535, - "learning_rate": 2.7524289866269467e-06, - "loss": 0.1243, - "step": 960 - }, - { - "epoch": 4.68780487804878, - "grad_norm": 2.807002544403076, - "learning_rate": 2.748617061283518e-06, - "loss": 0.1388, - "step": 961 - }, - { - "epoch": 4.692682926829268, - "grad_norm": 3.980499505996704, - "learning_rate": 2.744804552061074e-06, - "loss": 0.1144, - "step": 962 - }, - { - "epoch": 4.697560975609756, - "grad_norm": 3.6389007568359375, - "learning_rate": 2.740991467913321e-06, - "loss": 0.2155, - "step": 963 - }, - { - "epoch": 4.702439024390244, - "grad_norm": 3.0950801372528076, - "learning_rate": 2.737177817795315e-06, - "loss": 0.0983, - "step": 964 - }, - { - "epoch": 4.7073170731707314, - "grad_norm": 3.1723053455352783, - "learning_rate": 2.7333636106634414e-06, - "loss": 0.1365, - "step": 965 - }, - { - "epoch": 4.712195121951219, - "grad_norm": 3.83921217918396, - "learning_rate": 2.7295488554753957e-06, - "loss": 0.1977, - "step": 966 - }, - { - "epoch": 4.717073170731707, - "grad_norm": 3.348057746887207, - "learning_rate": 2.725733561190157e-06, - "loss": 0.1311, - "step": 967 - }, - { - "epoch": 4.721951219512195, - "grad_norm": 3.828483819961548, - "learning_rate": 2.721917736767973e-06, - "loss": 0.2464, - "step": 968 - }, - { - "epoch": 4.726829268292683, - "grad_norm": 2.6004624366760254, - "learning_rate": 2.7181013911703357e-06, - "loss": 0.1088, - "step": 969 - }, - { - "epoch": 4.7317073170731705, - "grad_norm": 3.316990852355957, - "learning_rate": 2.714284533359961e-06, - "loss": 0.1492, - "step": 970 - }, - { - "epoch": 4.736585365853658, - "grad_norm": 3.8770010471343994, - "learning_rate": 2.710467172300768e-06, - "loss": 0.218, - "step": 971 - }, - { - "epoch": 4.741463414634146, - "grad_norm": 4.456376552581787, - "learning_rate": 2.706649316957857e-06, - "loss": 0.2199, - "step": 972 - }, - { - "epoch": 4.746341463414634, - "grad_norm": 3.3376309871673584, - "learning_rate": 2.7028309762974897e-06, - "loss": 0.0595, - "step": 973 - }, - { - "epoch": 4.751219512195122, - "grad_norm": 3.6755495071411133, - "learning_rate": 2.699012159287069e-06, - "loss": 0.1653, - "step": 974 - }, - { - "epoch": 4.7560975609756095, - "grad_norm": 2.939887046813965, - "learning_rate": 2.6951928748951125e-06, - "loss": 0.0681, - "step": 975 - }, - { - "epoch": 4.760975609756097, - "grad_norm": 3.4101195335388184, - "learning_rate": 2.69137313209124e-06, - "loss": 0.2046, - "step": 976 - }, - { - "epoch": 4.765853658536585, - "grad_norm": 3.9811208248138428, - "learning_rate": 2.687552939846145e-06, - "loss": 0.2255, - "step": 977 - }, - { - "epoch": 4.770731707317073, - "grad_norm": 3.484255313873291, - "learning_rate": 2.6837323071315766e-06, - "loss": 0.0512, - "step": 978 - }, - { - "epoch": 4.775609756097561, - "grad_norm": 3.9005143642425537, - "learning_rate": 2.679911242920321e-06, - "loss": 0.162, - "step": 979 - }, - { - "epoch": 4.780487804878049, - "grad_norm": 4.933374881744385, - "learning_rate": 2.6760897561861742e-06, - "loss": 0.398, - "step": 980 - }, - { - "epoch": 4.785365853658536, - "grad_norm": 3.0741539001464844, - "learning_rate": 2.672267855903927e-06, - "loss": 0.0507, - "step": 981 - }, - { - "epoch": 4.790243902439024, - "grad_norm": 3.023772716522217, - "learning_rate": 2.6684455510493413e-06, - "loss": 0.2066, - "step": 982 - }, - { - "epoch": 4.795121951219512, - "grad_norm": 3.0102407932281494, - "learning_rate": 2.6646228505991267e-06, - "loss": 0.2296, - "step": 983 - }, - { - "epoch": 4.8, - "grad_norm": 3.902200222015381, - "learning_rate": 2.6607997635309246e-06, - "loss": 0.14, - "step": 984 - }, - { - "epoch": 4.804878048780488, - "grad_norm": 3.836185932159424, - "learning_rate": 2.6569762988232838e-06, - "loss": 0.1583, - "step": 985 - }, - { - "epoch": 4.809756097560975, - "grad_norm": 3.539628744125366, - "learning_rate": 2.653152465455639e-06, - "loss": 0.2619, - "step": 986 - }, - { - "epoch": 4.814634146341463, - "grad_norm": 4.716914653778076, - "learning_rate": 2.6493282724082913e-06, - "loss": 0.3029, - "step": 987 - }, - { - "epoch": 4.819512195121951, - "grad_norm": 3.466914176940918, - "learning_rate": 2.6455037286623864e-06, - "loss": 0.095, - "step": 988 - }, - { - "epoch": 4.824390243902439, - "grad_norm": 2.1798667907714844, - "learning_rate": 2.6416788431998935e-06, - "loss": 0.1232, - "step": 989 - }, - { - "epoch": 4.829268292682927, - "grad_norm": 3.309039354324341, - "learning_rate": 2.637853625003585e-06, - "loss": 0.3671, - "step": 990 - }, - { - "epoch": 4.8341463414634145, - "grad_norm": 3.2619435787200928, - "learning_rate": 2.6340280830570142e-06, - "loss": 0.194, - "step": 991 - }, - { - "epoch": 4.839024390243902, - "grad_norm": 3.601161003112793, - "learning_rate": 2.6302022263444947e-06, - "loss": 0.1214, - "step": 992 - }, - { - "epoch": 4.84390243902439, - "grad_norm": 4.13787841796875, - "learning_rate": 2.6263760638510793e-06, - "loss": 0.311, - "step": 993 - }, - { - "epoch": 4.848780487804878, - "grad_norm": 3.0474166870117188, - "learning_rate": 2.6225496045625394e-06, - "loss": 0.1853, - "step": 994 - }, - { - "epoch": 4.853658536585366, - "grad_norm": 4.481237411499023, - "learning_rate": 2.6187228574653428e-06, - "loss": 0.2088, - "step": 995 - }, - { - "epoch": 4.8585365853658535, - "grad_norm": 3.235966444015503, - "learning_rate": 2.614895831546633e-06, - "loss": 0.1439, - "step": 996 - }, - { - "epoch": 4.863414634146341, - "grad_norm": 4.103270053863525, - "learning_rate": 2.6110685357942096e-06, - "loss": 0.2823, - "step": 997 - }, - { - "epoch": 4.868292682926829, - "grad_norm": 4.134536266326904, - "learning_rate": 2.6072409791965048e-06, - "loss": 0.2963, - "step": 998 - }, - { - "epoch": 4.873170731707317, - "grad_norm": 4.124892711639404, - "learning_rate": 2.6034131707425638e-06, - "loss": 0.4127, - "step": 999 - }, - { - "epoch": 4.878048780487805, - "grad_norm": 3.565139055252075, - "learning_rate": 2.5995851194220223e-06, - "loss": 0.1601, - "step": 1000 - }, - { - "epoch": 4.882926829268293, - "grad_norm": 2.7548017501831055, - "learning_rate": 2.595756834225089e-06, - "loss": 0.161, - "step": 1001 - }, - { - "epoch": 4.88780487804878, - "grad_norm": 3.9297611713409424, - "learning_rate": 2.5919283241425188e-06, - "loss": 0.1013, - "step": 1002 - }, - { - "epoch": 4.892682926829268, - "grad_norm": 2.4904236793518066, - "learning_rate": 2.5880995981655965e-06, - "loss": 0.1177, - "step": 1003 - }, - { - "epoch": 4.897560975609756, - "grad_norm": 3.513308048248291, - "learning_rate": 2.584270665286113e-06, - "loss": 0.0682, - "step": 1004 - }, - { - "epoch": 4.902439024390244, - "grad_norm": 4.221067428588867, - "learning_rate": 2.580441534496346e-06, - "loss": 0.1502, - "step": 1005 - }, - { - "epoch": 4.907317073170732, - "grad_norm": 3.4298903942108154, - "learning_rate": 2.576612214789039e-06, - "loss": 0.1772, - "step": 1006 - }, - { - "epoch": 4.912195121951219, - "grad_norm": 4.402887344360352, - "learning_rate": 2.5727827151573747e-06, - "loss": 0.2029, - "step": 1007 - }, - { - "epoch": 4.917073170731707, - "grad_norm": 4.194999694824219, - "learning_rate": 2.568953044594964e-06, - "loss": 0.1269, - "step": 1008 - }, - { - "epoch": 4.921951219512195, - "grad_norm": 3.657607078552246, - "learning_rate": 2.5651232120958157e-06, - "loss": 0.1311, - "step": 1009 - }, - { - "epoch": 4.926829268292683, - "grad_norm": 4.092184543609619, - "learning_rate": 2.56129322665432e-06, - "loss": 0.1085, - "step": 1010 - }, - { - "epoch": 4.931707317073171, - "grad_norm": 3.3648242950439453, - "learning_rate": 2.5574630972652263e-06, - "loss": 0.0782, - "step": 1011 - }, - { - "epoch": 4.9365853658536585, - "grad_norm": 3.7215166091918945, - "learning_rate": 2.553632832923622e-06, - "loss": 0.1391, - "step": 1012 - }, - { - "epoch": 4.941463414634146, - "grad_norm": 4.045740127563477, - "learning_rate": 2.5498024426249107e-06, - "loss": 0.3141, - "step": 1013 - }, - { - "epoch": 4.946341463414634, - "grad_norm": 3.2363107204437256, - "learning_rate": 2.545971935364794e-06, - "loss": 0.0679, - "step": 1014 - }, - { - "epoch": 4.951219512195122, - "grad_norm": 3.057283639907837, - "learning_rate": 2.5421413201392443e-06, - "loss": 0.1382, - "step": 1015 - }, - { - "epoch": 4.95609756097561, - "grad_norm": 3.591535806655884, - "learning_rate": 2.538310605944491e-06, - "loss": 0.112, - "step": 1016 - }, - { - "epoch": 4.9609756097560975, - "grad_norm": 3.1629281044006348, - "learning_rate": 2.534479801776996e-06, - "loss": 0.1261, - "step": 1017 - }, - { - "epoch": 4.965853658536585, - "grad_norm": 2.691740036010742, - "learning_rate": 2.53064891663343e-06, - "loss": 0.2328, - "step": 1018 - }, - { - "epoch": 4.970731707317073, - "grad_norm": 3.2620503902435303, - "learning_rate": 2.526817959510655e-06, - "loss": 0.193, - "step": 1019 - }, - { - "epoch": 4.975609756097561, - "grad_norm": 3.0721535682678223, - "learning_rate": 2.5229869394057038e-06, - "loss": 0.2444, - "step": 1020 - }, - { - "epoch": 4.980487804878049, - "grad_norm": 2.6279208660125732, - "learning_rate": 2.5191558653157542e-06, - "loss": 0.1103, - "step": 1021 - }, - { - "epoch": 4.985365853658537, - "grad_norm": 2.9295670986175537, - "learning_rate": 2.515324746238113e-06, - "loss": 0.0553, - "step": 1022 - }, - { - "epoch": 4.990243902439024, - "grad_norm": 3.3960084915161133, - "learning_rate": 2.511493591170191e-06, - "loss": 0.1686, - "step": 1023 - }, - { - "epoch": 4.995121951219512, - "grad_norm": 4.138705253601074, - "learning_rate": 2.5076624091094846e-06, - "loss": 0.1208, - "step": 1024 - }, - { - "epoch": 5.0, - "grad_norm": 2.603870391845703, - "learning_rate": 2.503831209053554e-06, - "loss": 0.1216, - "step": 1025 - } - ], - "logging_steps": 1, - "max_steps": 2050, - "num_input_tokens_seen": 0, - "num_train_epochs": 10, - "save_steps": 206, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 2.950975240148091e+17, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/metallama3_8b/limo/checkpoint-1230/chat_template.jinja b/metallama3_8b/limo/checkpoint-1230/chat_template.jinja deleted file mode 100644 index 39bd0c9f7fe30aea14eda194fee17703da4a4dbf..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1230/chat_template.jinja +++ /dev/null @@ -1,5 +0,0 @@ -{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> - -'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> - -' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo/checkpoint-1230/config.json b/metallama3_8b/limo/checkpoint-1230/config.json deleted file mode 100644 index ec5612543540085e09eed37e81b17ae51d1a6973..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1230/config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 128000, - "eos_token_id": 128009, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 8192, - "mlp_bias": false, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 500000.0, - "tie_word_embeddings": false, - "torch_dtype": "float32", - "transformers_version": "4.55.0", - "use_cache": false, - "vocab_size": 128256 -} diff --git a/metallama3_8b/limo/checkpoint-1230/generation_config.json b/metallama3_8b/limo/checkpoint-1230/generation_config.json deleted file mode 100644 index f53ccb516e57388491adda6b9950bcfa872e93ae..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1230/generation_config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 128000, - "eos_token_id": 128009, - "transformers_version": "4.55.0", - "use_cache": false -} diff --git a/metallama3_8b/limo/checkpoint-1230/model-00001-of-00007.safetensors b/metallama3_8b/limo/checkpoint-1230/model-00001-of-00007.safetensors deleted file mode 100644 index da3ded2a689a8346120577a5a20a337845ee6851..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1230/model-00001-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1ac2a731bf96352a97887ebd2906cdecb11494c9389c7e7385d1308e5994f2ae -size 4886466168 diff --git a/metallama3_8b/limo/checkpoint-1230/model-00002-of-00007.safetensors b/metallama3_8b/limo/checkpoint-1230/model-00002-of-00007.safetensors deleted file mode 100644 index e323648de8550afb47f20971e32e96591a39a476..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1230/model-00002-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ae4fc9a70211b238ac80fc1245cc91a865e5a77ad214403912bfdb34355ee884 -size 4832007448 diff --git a/metallama3_8b/limo/checkpoint-1230/model-00003-of-00007.safetensors b/metallama3_8b/limo/checkpoint-1230/model-00003-of-00007.safetensors deleted file mode 100644 index 9ab7958e4e76b62c8b669ee174c9a1687d2d1fc3..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1230/model-00003-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7fbb9045cce8fb44feff64644f9032b65f18da3f34511c1036038f1bd60d4336 -size 4999813112 diff --git a/metallama3_8b/limo/checkpoint-1230/model-00004-of-00007.safetensors b/metallama3_8b/limo/checkpoint-1230/model-00004-of-00007.safetensors deleted file mode 100644 index d260c79bf7485cc559dc753097b21855dc77174f..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1230/model-00004-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9e9b735ded36488642d5ef89fc49bdbd41b26c355c9631fef97505b238a9b2a0 -size 4999813128 diff --git a/metallama3_8b/limo/checkpoint-1230/model-00005-of-00007.safetensors b/metallama3_8b/limo/checkpoint-1230/model-00005-of-00007.safetensors deleted file mode 100644 index 70387d9de9907d9c9d957a4981c8b71f09340f72..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1230/model-00005-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dad43b53f6657ae1c52e4d67f2236e8416ce16ad08f5768f3b2cbbc1761078c7 -size 4832007496 diff --git a/metallama3_8b/limo/checkpoint-1230/model-00006-of-00007.safetensors b/metallama3_8b/limo/checkpoint-1230/model-00006-of-00007.safetensors deleted file mode 100644 index 277f23c54d33653557b83acc0d2a790cd69f830a..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1230/model-00006-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5c14eeb7242dc65e4e31d9905eab43df9407ec5fc8c82d329e8dfbf460e5dcb2 -size 4999813120 diff --git a/metallama3_8b/limo/checkpoint-1230/model-00007-of-00007.safetensors b/metallama3_8b/limo/checkpoint-1230/model-00007-of-00007.safetensors deleted file mode 100644 index 5fd07e48216e935e290c8bd487a9a9c0ecb0d20e..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1230/model-00007-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6b1852c10fdd878cf2192b706502c9358b0b6362c74363a4d41c8f4ae0626eba -size 2571158184 diff --git a/metallama3_8b/limo/checkpoint-1230/model.safetensors.index.json b/metallama3_8b/limo/checkpoint-1230/model.safetensors.index.json deleted file mode 100644 index 30d31d54f352f0c71ad48745af612a088822fa48..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1230/model.safetensors.index.json +++ /dev/null @@ -1,299 +0,0 @@ -{ - "metadata": { - "total_parameters": 2007565312, - "total_size": 32121044992 - }, - "weight_map": { - "lm_head.weight": "model-00007-of-00007.safetensors", - "model.embed_tokens.weight": "model-00001-of-00007.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.norm.weight": "model-00007-of-00007.safetensors" - } -} diff --git a/metallama3_8b/limo/checkpoint-1230/rng_state_0.pth b/metallama3_8b/limo/checkpoint-1230/rng_state_0.pth deleted file mode 100644 index c54ea122b283c04f6b60c1eedefeb301763a8f9f..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1230/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:418a5f105ae834c3075024076916b2a9475918fe034c12d0dd5b6d91f1aba467 -size 15024 diff --git a/metallama3_8b/limo/checkpoint-1230/rng_state_1.pth b/metallama3_8b/limo/checkpoint-1230/rng_state_1.pth deleted file mode 100644 index ea57ead2533e587fe50f62107d7cb32945fe1354..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1230/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6e07ace389d24bc1307b74f42a1e7b8f0117b0db853e2df64ff3f15cb92916a2 -size 15024 diff --git a/metallama3_8b/limo/checkpoint-1230/rng_state_2.pth b/metallama3_8b/limo/checkpoint-1230/rng_state_2.pth deleted file mode 100644 index 4689a9445d07528dc4fd91011a7f034c11773a68..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1230/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:da6a990f346d7014dffb28fa2bc7d3b890bd3c53712503fce3656da48d3d6e50 -size 15024 diff --git a/metallama3_8b/limo/checkpoint-1230/rng_state_3.pth b/metallama3_8b/limo/checkpoint-1230/rng_state_3.pth deleted file mode 100644 index 919b5e43a96a9afdeb196f402142bc3aab67f247..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1230/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e95f356ca38179b05993f55daece0223e96fa10b9a1b9ea2102a739211333f63 -size 15024 diff --git a/metallama3_8b/limo/checkpoint-1230/scheduler.pt b/metallama3_8b/limo/checkpoint-1230/scheduler.pt deleted file mode 100644 index 49fce6ee6ec79d995db4e6c671ab43a493bb02ab..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1230/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:52f7b535b1edea536614abde861d1ff7ba7967dbe32fcf39458e08c39d5bc336 -size 1064 diff --git a/metallama3_8b/limo/checkpoint-1230/special_tokens_map.json b/metallama3_8b/limo/checkpoint-1230/special_tokens_map.json deleted file mode 100644 index 14daf4588e61b4e4983af0fccaba4d5500c0977c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1230/special_tokens_map.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "additional_special_tokens": [ - { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } - ], - "bos_token": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": "<|eot_id|>" -} diff --git a/metallama3_8b/limo/checkpoint-1230/tokenizer.json b/metallama3_8b/limo/checkpoint-1230/tokenizer.json deleted file mode 100644 index 172311123ab62378f1f6d90f3068a676b7d939ed..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1230/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c1dcab308e7cf5970ea38815e0a62887d705c5b436f869ca27a5dcdd40c36a6 -size 17210148 diff --git a/metallama3_8b/limo/checkpoint-1230/tokenizer_config.json b/metallama3_8b/limo/checkpoint-1230/tokenizer_config.json deleted file mode 100644 index 6739fcd129e717b71b64001dcb25a03c143d66f5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1230/tokenizer_config.json +++ /dev/null @@ -1,2076 +0,0 @@ -{ - "added_tokens_decoder": { - "128000": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128001": { - "content": "<|end_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128002": { - "content": "<|reserved_special_token_0|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128003": { - "content": "<|reserved_special_token_1|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128004": { - "content": "<|reserved_special_token_2|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128005": { - "content": "<|reserved_special_token_3|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128006": { - "content": "<|start_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128007": { - "content": "<|end_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128008": { - "content": "<|reserved_special_token_4|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128009": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128010": { - "content": "<|reserved_special_token_5|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128011": { - "content": "<|reserved_special_token_6|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128012": { - "content": "<|reserved_special_token_7|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128013": { - "content": "<|reserved_special_token_8|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128014": { - "content": "<|reserved_special_token_9|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128015": { - "content": "<|reserved_special_token_10|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128016": { - "content": "<|reserved_special_token_11|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128017": { - "content": "<|reserved_special_token_12|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128018": { - "content": "<|reserved_special_token_13|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128019": { - "content": "<|reserved_special_token_14|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128020": { - "content": "<|reserved_special_token_15|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128021": { - "content": "<|reserved_special_token_16|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128022": { - "content": "<|reserved_special_token_17|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128023": { - "content": "<|reserved_special_token_18|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128024": { - "content": "<|reserved_special_token_19|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128025": { - "content": "<|reserved_special_token_20|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128026": { - "content": "<|reserved_special_token_21|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128027": { - "content": "<|reserved_special_token_22|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128028": { - "content": "<|reserved_special_token_23|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128029": { - "content": "<|reserved_special_token_24|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128030": { - "content": "<|reserved_special_token_25|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128031": { - "content": "<|reserved_special_token_26|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128032": { - "content": "<|reserved_special_token_27|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128033": { - "content": "<|reserved_special_token_28|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128034": { - "content": "<|reserved_special_token_29|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128035": { - "content": "<|reserved_special_token_30|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128036": { - "content": "<|reserved_special_token_31|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128037": { - "content": "<|reserved_special_token_32|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128038": { - "content": "<|reserved_special_token_33|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128039": { - "content": "<|reserved_special_token_34|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128040": { - "content": "<|reserved_special_token_35|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128041": { - "content": "<|reserved_special_token_36|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128042": { - "content": "<|reserved_special_token_37|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128043": { - "content": "<|reserved_special_token_38|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128044": { - "content": "<|reserved_special_token_39|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128045": { - "content": "<|reserved_special_token_40|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128046": { - "content": "<|reserved_special_token_41|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128047": { - "content": "<|reserved_special_token_42|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128048": { - "content": "<|reserved_special_token_43|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128049": { - "content": "<|reserved_special_token_44|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128050": { - "content": "<|reserved_special_token_45|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128051": { - "content": "<|reserved_special_token_46|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128052": { - "content": "<|reserved_special_token_47|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128053": { - "content": "<|reserved_special_token_48|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128054": { - "content": "<|reserved_special_token_49|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128055": { - "content": "<|reserved_special_token_50|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128056": { - "content": "<|reserved_special_token_51|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128057": { - "content": "<|reserved_special_token_52|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128058": { - "content": "<|reserved_special_token_53|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128059": { - "content": "<|reserved_special_token_54|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128060": { - "content": "<|reserved_special_token_55|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128061": { - "content": "<|reserved_special_token_56|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128062": { - "content": "<|reserved_special_token_57|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128063": { - "content": "<|reserved_special_token_58|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128064": { - "content": "<|reserved_special_token_59|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128065": { - "content": "<|reserved_special_token_60|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128066": { - "content": "<|reserved_special_token_61|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128067": { - "content": "<|reserved_special_token_62|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128068": { - "content": "<|reserved_special_token_63|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128069": { - "content": "<|reserved_special_token_64|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128070": { - "content": "<|reserved_special_token_65|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128071": { - "content": "<|reserved_special_token_66|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128072": { - "content": "<|reserved_special_token_67|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128073": { - "content": "<|reserved_special_token_68|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128074": { - "content": "<|reserved_special_token_69|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128075": { - "content": "<|reserved_special_token_70|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128076": { - "content": "<|reserved_special_token_71|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128077": { - "content": "<|reserved_special_token_72|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128078": { - "content": "<|reserved_special_token_73|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128079": { - "content": "<|reserved_special_token_74|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128080": { - "content": "<|reserved_special_token_75|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128081": { - "content": "<|reserved_special_token_76|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128082": { - "content": "<|reserved_special_token_77|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128083": { - "content": "<|reserved_special_token_78|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128084": { - "content": "<|reserved_special_token_79|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128085": { - "content": "<|reserved_special_token_80|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128086": { - "content": "<|reserved_special_token_81|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128087": { - "content": "<|reserved_special_token_82|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128088": { - "content": "<|reserved_special_token_83|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128089": { - "content": "<|reserved_special_token_84|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128090": { - "content": "<|reserved_special_token_85|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128091": { - "content": "<|reserved_special_token_86|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128092": { - "content": "<|reserved_special_token_87|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128093": { - "content": "<|reserved_special_token_88|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128094": { - "content": "<|reserved_special_token_89|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128095": { - "content": "<|reserved_special_token_90|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128096": { - "content": "<|reserved_special_token_91|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128097": { - "content": "<|reserved_special_token_92|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128098": { - "content": "<|reserved_special_token_93|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128099": { - "content": "<|reserved_special_token_94|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128100": { - "content": "<|reserved_special_token_95|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128101": { - "content": "<|reserved_special_token_96|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128102": { - "content": "<|reserved_special_token_97|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128103": { - "content": "<|reserved_special_token_98|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128104": { - "content": "<|reserved_special_token_99|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128105": { - "content": "<|reserved_special_token_100|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128106": { - "content": "<|reserved_special_token_101|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128107": { - "content": "<|reserved_special_token_102|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128108": { - "content": "<|reserved_special_token_103|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128109": { - "content": "<|reserved_special_token_104|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128110": { - "content": "<|reserved_special_token_105|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128111": { - "content": "<|reserved_special_token_106|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128112": { - "content": "<|reserved_special_token_107|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128113": { - "content": "<|reserved_special_token_108|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128114": { - "content": "<|reserved_special_token_109|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128115": { - "content": "<|reserved_special_token_110|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128116": { - "content": "<|reserved_special_token_111|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128117": { - "content": "<|reserved_special_token_112|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128118": { - "content": "<|reserved_special_token_113|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128119": { - "content": "<|reserved_special_token_114|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128120": { - "content": "<|reserved_special_token_115|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128121": { - "content": "<|reserved_special_token_116|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128122": { - "content": "<|reserved_special_token_117|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128123": { - "content": "<|reserved_special_token_118|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128124": { - "content": "<|reserved_special_token_119|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128125": { - "content": "<|reserved_special_token_120|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128126": { - "content": "<|reserved_special_token_121|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128127": { - "content": "<|reserved_special_token_122|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128128": { - "content": "<|reserved_special_token_123|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128129": { - "content": "<|reserved_special_token_124|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128130": { - "content": "<|reserved_special_token_125|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128131": { - "content": "<|reserved_special_token_126|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128132": { - "content": "<|reserved_special_token_127|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128133": { - "content": "<|reserved_special_token_128|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128134": { - "content": "<|reserved_special_token_129|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128135": { - "content": "<|reserved_special_token_130|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128136": { - "content": "<|reserved_special_token_131|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128137": { - "content": "<|reserved_special_token_132|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128138": { - "content": "<|reserved_special_token_133|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128139": { - "content": "<|reserved_special_token_134|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128140": { - "content": "<|reserved_special_token_135|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128141": { - "content": "<|reserved_special_token_136|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128142": { - "content": "<|reserved_special_token_137|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128143": { - "content": "<|reserved_special_token_138|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128144": { - "content": "<|reserved_special_token_139|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128145": { - "content": "<|reserved_special_token_140|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128146": { - "content": "<|reserved_special_token_141|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128147": { - "content": "<|reserved_special_token_142|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128148": { - "content": "<|reserved_special_token_143|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128149": { - "content": "<|reserved_special_token_144|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128150": { - "content": "<|reserved_special_token_145|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128151": { - "content": "<|reserved_special_token_146|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128152": { - "content": "<|reserved_special_token_147|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128153": { - "content": "<|reserved_special_token_148|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128154": { - "content": "<|reserved_special_token_149|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128155": { - "content": "<|reserved_special_token_150|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128156": { - "content": "<|reserved_special_token_151|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128157": { - "content": "<|reserved_special_token_152|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128158": { - "content": "<|reserved_special_token_153|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128159": { - "content": "<|reserved_special_token_154|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128160": { - "content": "<|reserved_special_token_155|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128161": { - "content": "<|reserved_special_token_156|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128162": { - "content": "<|reserved_special_token_157|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128163": { - "content": "<|reserved_special_token_158|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128164": { - "content": "<|reserved_special_token_159|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128165": { - "content": "<|reserved_special_token_160|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128166": { - "content": "<|reserved_special_token_161|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128167": { - "content": "<|reserved_special_token_162|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128168": { - "content": "<|reserved_special_token_163|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128169": { - "content": "<|reserved_special_token_164|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128170": { - "content": "<|reserved_special_token_165|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128171": { - "content": "<|reserved_special_token_166|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128172": { - "content": "<|reserved_special_token_167|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128173": { - "content": "<|reserved_special_token_168|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128174": { - "content": "<|reserved_special_token_169|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128175": { - "content": "<|reserved_special_token_170|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128176": { - "content": "<|reserved_special_token_171|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128177": { - "content": "<|reserved_special_token_172|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128178": { - "content": "<|reserved_special_token_173|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128179": { - "content": "<|reserved_special_token_174|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128180": { - "content": "<|reserved_special_token_175|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128181": { - "content": "<|reserved_special_token_176|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128182": { - "content": "<|reserved_special_token_177|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128183": { - "content": "<|reserved_special_token_178|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128184": { - "content": "<|reserved_special_token_179|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128185": { - "content": "<|reserved_special_token_180|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128186": { - "content": "<|reserved_special_token_181|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128187": { - "content": "<|reserved_special_token_182|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128188": { - "content": "<|reserved_special_token_183|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128189": { - "content": "<|reserved_special_token_184|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128190": { - "content": "<|reserved_special_token_185|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128191": { - "content": "<|reserved_special_token_186|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128192": { - "content": "<|reserved_special_token_187|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128193": { - "content": "<|reserved_special_token_188|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128194": { - "content": "<|reserved_special_token_189|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128195": { - "content": "<|reserved_special_token_190|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128196": { - "content": "<|reserved_special_token_191|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128197": { - "content": "<|reserved_special_token_192|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128198": { - "content": "<|reserved_special_token_193|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128199": { - "content": "<|reserved_special_token_194|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128200": { - "content": "<|reserved_special_token_195|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128201": { - "content": "<|reserved_special_token_196|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128202": { - "content": "<|reserved_special_token_197|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128203": { - "content": "<|reserved_special_token_198|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128204": { - "content": "<|reserved_special_token_199|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128205": { - "content": "<|reserved_special_token_200|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128206": { - "content": "<|reserved_special_token_201|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128207": { - "content": "<|reserved_special_token_202|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128208": { - "content": "<|reserved_special_token_203|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128209": { - "content": "<|reserved_special_token_204|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128210": { - "content": "<|reserved_special_token_205|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128211": { - "content": "<|reserved_special_token_206|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128212": { - "content": "<|reserved_special_token_207|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128213": { - "content": "<|reserved_special_token_208|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128214": { - "content": "<|reserved_special_token_209|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128215": { - "content": "<|reserved_special_token_210|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128216": { - "content": "<|reserved_special_token_211|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128217": { - "content": "<|reserved_special_token_212|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128218": { - "content": "<|reserved_special_token_213|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128219": { - "content": "<|reserved_special_token_214|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128220": { - "content": "<|reserved_special_token_215|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128221": { - "content": "<|reserved_special_token_216|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128222": { - "content": "<|reserved_special_token_217|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128223": { - "content": "<|reserved_special_token_218|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128224": { - "content": "<|reserved_special_token_219|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128225": { - "content": "<|reserved_special_token_220|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128226": { - "content": "<|reserved_special_token_221|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128227": { - "content": "<|reserved_special_token_222|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128228": { - "content": "<|reserved_special_token_223|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128229": { - "content": "<|reserved_special_token_224|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128230": { - "content": "<|reserved_special_token_225|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128231": { - "content": "<|reserved_special_token_226|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128232": { - "content": "<|reserved_special_token_227|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128233": { - "content": "<|reserved_special_token_228|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128234": { - "content": "<|reserved_special_token_229|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128235": { - "content": "<|reserved_special_token_230|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128236": { - "content": "<|reserved_special_token_231|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128237": { - "content": "<|reserved_special_token_232|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128238": { - "content": "<|reserved_special_token_233|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128239": { - "content": "<|reserved_special_token_234|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128240": { - "content": "<|reserved_special_token_235|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128241": { - "content": "<|reserved_special_token_236|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128242": { - "content": "<|reserved_special_token_237|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128243": { - "content": "<|reserved_special_token_238|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128244": { - "content": "<|reserved_special_token_239|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128245": { - "content": "<|reserved_special_token_240|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128246": { - "content": "<|reserved_special_token_241|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128247": { - "content": "<|reserved_special_token_242|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128248": { - "content": "<|reserved_special_token_243|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128249": { - "content": "<|reserved_special_token_244|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128250": { - "content": "<|reserved_special_token_245|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128251": { - "content": "<|reserved_special_token_246|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128252": { - "content": "<|reserved_special_token_247|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128253": { - "content": "<|reserved_special_token_248|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128254": { - "content": "<|reserved_special_token_249|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128255": { - "content": "<|reserved_special_token_250|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128256": { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - } - }, - "additional_special_tokens": [ - "<|eom_id|>" - ], - "bos_token": "<|begin_of_text|>", - "clean_up_tokenization_spaces": true, - "eos_token": "<|eot_id|>", - "extra_special_tokens": {}, - "model_input_names": [ - "input_ids", - "attention_mask" - ], - "model_max_length": 1000000000000000019884624838656, - "pad_token": "<|eot_id|>", - "padding_side": "right", - "split_special_tokens": false, - "tokenizer_class": "PreTrainedTokenizerFast" -} diff --git a/metallama3_8b/limo/checkpoint-1230/trainer_state.json b/metallama3_8b/limo/checkpoint-1230/trainer_state.json deleted file mode 100644 index b9620e9e508dc6c702a14cd1b0eef9cd19a96194..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1230/trainer_state.json +++ /dev/null @@ -1,8644 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 6.0, - "eval_steps": 500, - "global_step": 1230, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.004878048780487805, - "grad_norm": 27.79998016357422, - "learning_rate": 5e-06, - "loss": 1.4179, - "step": 1 - }, - { - "epoch": 0.00975609756097561, - "grad_norm": 4.086409091949463, - "learning_rate": 4.999997064365715e-06, - "loss": 1.1405, - "step": 2 - }, - { - "epoch": 0.014634146341463415, - "grad_norm": 4.499151229858398, - "learning_rate": 4.999988257469751e-06, - "loss": 0.8682, - "step": 3 - }, - { - "epoch": 0.01951219512195122, - "grad_norm": 4.555822849273682, - "learning_rate": 4.999973579332793e-06, - "loss": 0.9961, - "step": 4 - }, - { - "epoch": 0.024390243902439025, - "grad_norm": 5.6235246658325195, - "learning_rate": 4.999953029989312e-06, - "loss": 1.0173, - "step": 5 - }, - { - "epoch": 0.02926829268292683, - "grad_norm": 3.9943182468414307, - "learning_rate": 4.999926609487568e-06, - "loss": 1.1083, - "step": 6 - }, - { - "epoch": 0.03414634146341464, - "grad_norm": 5.685941219329834, - "learning_rate": 4.9998943178896106e-06, - "loss": 1.1109, - "step": 7 - }, - { - "epoch": 0.03902439024390244, - "grad_norm": 15.914257049560547, - "learning_rate": 4.999856155271276e-06, - "loss": 1.821, - "step": 8 - }, - { - "epoch": 0.04390243902439024, - "grad_norm": 4.147185325622559, - "learning_rate": 4.999812121722191e-06, - "loss": 1.0417, - "step": 9 - }, - { - "epoch": 0.04878048780487805, - "grad_norm": 11.123332977294922, - "learning_rate": 4.999762217345766e-06, - "loss": 1.5672, - "step": 10 - }, - { - "epoch": 0.05365853658536585, - "grad_norm": 2.842331886291504, - "learning_rate": 4.999706442259205e-06, - "loss": 0.7297, - "step": 11 - }, - { - "epoch": 0.05853658536585366, - "grad_norm": 37.685062408447266, - "learning_rate": 4.999644796593492e-06, - "loss": 0.9112, - "step": 12 - }, - { - "epoch": 0.06341463414634146, - "grad_norm": 11.214252471923828, - "learning_rate": 4.999577280493407e-06, - "loss": 0.7854, - "step": 13 - }, - { - "epoch": 0.06829268292682927, - "grad_norm": 5.10387659072876, - "learning_rate": 4.99950389411751e-06, - "loss": 1.1317, - "step": 14 - }, - { - "epoch": 0.07317073170731707, - "grad_norm": 3.685403347015381, - "learning_rate": 4.999424637638148e-06, - "loss": 0.7864, - "step": 15 - }, - { - "epoch": 0.07804878048780488, - "grad_norm": 2.9567184448242188, - "learning_rate": 4.999339511241458e-06, - "loss": 0.8494, - "step": 16 - }, - { - "epoch": 0.08292682926829269, - "grad_norm": 11.396956443786621, - "learning_rate": 4.9992485151273584e-06, - "loss": 1.2189, - "step": 17 - }, - { - "epoch": 0.08780487804878048, - "grad_norm": 7.007385730743408, - "learning_rate": 4.999151649509554e-06, - "loss": 1.0532, - "step": 18 - }, - { - "epoch": 0.09268292682926829, - "grad_norm": 3.4347329139709473, - "learning_rate": 4.9990489146155356e-06, - "loss": 1.088, - "step": 19 - }, - { - "epoch": 0.0975609756097561, - "grad_norm": 3.1865031719207764, - "learning_rate": 4.9989403106865765e-06, - "loss": 1.0414, - "step": 20 - }, - { - "epoch": 0.1024390243902439, - "grad_norm": 3.4605791568756104, - "learning_rate": 4.9988258379777334e-06, - "loss": 0.8878, - "step": 21 - }, - { - "epoch": 0.1073170731707317, - "grad_norm": 2.860478639602661, - "learning_rate": 4.998705496757846e-06, - "loss": 0.9151, - "step": 22 - }, - { - "epoch": 0.11219512195121951, - "grad_norm": 9.101946830749512, - "learning_rate": 4.998579287309538e-06, - "loss": 1.4304, - "step": 23 - }, - { - "epoch": 0.11707317073170732, - "grad_norm": 24.21122169494629, - "learning_rate": 4.998447209929211e-06, - "loss": 1.0858, - "step": 24 - }, - { - "epoch": 0.12195121951219512, - "grad_norm": 3.286980152130127, - "learning_rate": 4.998309264927053e-06, - "loss": 0.6571, - "step": 25 - }, - { - "epoch": 0.12682926829268293, - "grad_norm": 4.0232062339782715, - "learning_rate": 4.998165452627025e-06, - "loss": 0.8493, - "step": 26 - }, - { - "epoch": 0.13170731707317074, - "grad_norm": 3.7688663005828857, - "learning_rate": 4.998015773366874e-06, - "loss": 0.9224, - "step": 27 - }, - { - "epoch": 0.13658536585365855, - "grad_norm": 2.9382026195526123, - "learning_rate": 4.997860227498122e-06, - "loss": 0.7588, - "step": 28 - }, - { - "epoch": 0.14146341463414633, - "grad_norm": 4.327457904815674, - "learning_rate": 4.99769881538607e-06, - "loss": 1.1817, - "step": 29 - }, - { - "epoch": 0.14634146341463414, - "grad_norm": 3.47487735748291, - "learning_rate": 4.997531537409794e-06, - "loss": 1.0737, - "step": 30 - }, - { - "epoch": 0.15121951219512195, - "grad_norm": 3.0616214275360107, - "learning_rate": 4.99735839396215e-06, - "loss": 0.7899, - "step": 31 - }, - { - "epoch": 0.15609756097560976, - "grad_norm": 3.065070152282715, - "learning_rate": 4.9971793854497655e-06, - "loss": 0.7745, - "step": 32 - }, - { - "epoch": 0.16097560975609757, - "grad_norm": 3.5202279090881348, - "learning_rate": 4.996994512293042e-06, - "loss": 0.984, - "step": 33 - }, - { - "epoch": 0.16585365853658537, - "grad_norm": 3.421769142150879, - "learning_rate": 4.996803774926157e-06, - "loss": 0.8235, - "step": 34 - }, - { - "epoch": 0.17073170731707318, - "grad_norm": 4.6582207679748535, - "learning_rate": 4.996607173797059e-06, - "loss": 1.3227, - "step": 35 - }, - { - "epoch": 0.17560975609756097, - "grad_norm": 2.9829282760620117, - "learning_rate": 4.996404709367466e-06, - "loss": 0.8854, - "step": 36 - }, - { - "epoch": 0.18048780487804877, - "grad_norm": 2.5982632637023926, - "learning_rate": 4.996196382112868e-06, - "loss": 0.6786, - "step": 37 - }, - { - "epoch": 0.18536585365853658, - "grad_norm": 2.9807393550872803, - "learning_rate": 4.9959821925225235e-06, - "loss": 0.9344, - "step": 38 - }, - { - "epoch": 0.1902439024390244, - "grad_norm": 2.7364351749420166, - "learning_rate": 4.995762141099456e-06, - "loss": 0.814, - "step": 39 - }, - { - "epoch": 0.1951219512195122, - "grad_norm": 3.4324638843536377, - "learning_rate": 4.995536228360461e-06, - "loss": 1.0276, - "step": 40 - }, - { - "epoch": 0.2, - "grad_norm": 2.911834716796875, - "learning_rate": 4.995304454836095e-06, - "loss": 0.9291, - "step": 41 - }, - { - "epoch": 0.2048780487804878, - "grad_norm": 3.0294723510742188, - "learning_rate": 4.9950668210706795e-06, - "loss": 0.8145, - "step": 42 - }, - { - "epoch": 0.2097560975609756, - "grad_norm": 4.681829452514648, - "learning_rate": 4.994823327622299e-06, - "loss": 0.8779, - "step": 43 - }, - { - "epoch": 0.2146341463414634, - "grad_norm": 3.643914222717285, - "learning_rate": 4.9945739750628e-06, - "loss": 0.8196, - "step": 44 - }, - { - "epoch": 0.21951219512195122, - "grad_norm": 2.7542076110839844, - "learning_rate": 4.994318763977789e-06, - "loss": 0.8443, - "step": 45 - }, - { - "epoch": 0.22439024390243903, - "grad_norm": 6.873605728149414, - "learning_rate": 4.994057694966632e-06, - "loss": 1.0328, - "step": 46 - }, - { - "epoch": 0.22926829268292684, - "grad_norm": 3.11810040473938, - "learning_rate": 4.993790768642449e-06, - "loss": 1.0673, - "step": 47 - }, - { - "epoch": 0.23414634146341465, - "grad_norm": 4.360548496246338, - "learning_rate": 4.99351798563212e-06, - "loss": 1.3198, - "step": 48 - }, - { - "epoch": 0.23902439024390243, - "grad_norm": 2.6894314289093018, - "learning_rate": 4.993239346576278e-06, - "loss": 0.8743, - "step": 49 - }, - { - "epoch": 0.24390243902439024, - "grad_norm": 3.2640421390533447, - "learning_rate": 4.99295485212931e-06, - "loss": 1.109, - "step": 50 - }, - { - "epoch": 0.24878048780487805, - "grad_norm": 3.1565866470336914, - "learning_rate": 4.992664502959351e-06, - "loss": 0.9291, - "step": 51 - }, - { - "epoch": 0.25365853658536586, - "grad_norm": 3.4829447269439697, - "learning_rate": 4.99236829974829e-06, - "loss": 0.8159, - "step": 52 - }, - { - "epoch": 0.25853658536585367, - "grad_norm": 2.7535626888275146, - "learning_rate": 4.992066243191762e-06, - "loss": 1.0359, - "step": 53 - }, - { - "epoch": 0.2634146341463415, - "grad_norm": 2.482935905456543, - "learning_rate": 4.991758333999148e-06, - "loss": 0.8091, - "step": 54 - }, - { - "epoch": 0.2682926829268293, - "grad_norm": 2.917445659637451, - "learning_rate": 4.991444572893575e-06, - "loss": 0.6925, - "step": 55 - }, - { - "epoch": 0.2731707317073171, - "grad_norm": 2.9802236557006836, - "learning_rate": 4.991124960611916e-06, - "loss": 0.6329, - "step": 56 - }, - { - "epoch": 0.2780487804878049, - "grad_norm": 2.9677224159240723, - "learning_rate": 4.99079949790478e-06, - "loss": 0.8069, - "step": 57 - }, - { - "epoch": 0.28292682926829266, - "grad_norm": 2.8304293155670166, - "learning_rate": 4.99046818553652e-06, - "loss": 0.8682, - "step": 58 - }, - { - "epoch": 0.28780487804878047, - "grad_norm": 5.253443717956543, - "learning_rate": 4.9901310242852246e-06, - "loss": 1.1069, - "step": 59 - }, - { - "epoch": 0.2926829268292683, - "grad_norm": 3.686016082763672, - "learning_rate": 4.9897880149427206e-06, - "loss": 0.9465, - "step": 60 - }, - { - "epoch": 0.2975609756097561, - "grad_norm": 3.6372263431549072, - "learning_rate": 4.989439158314566e-06, - "loss": 0.9738, - "step": 61 - }, - { - "epoch": 0.3024390243902439, - "grad_norm": 3.0756819248199463, - "learning_rate": 4.989084455220056e-06, - "loss": 0.6417, - "step": 62 - }, - { - "epoch": 0.3073170731707317, - "grad_norm": 3.379222869873047, - "learning_rate": 4.988723906492212e-06, - "loss": 1.0092, - "step": 63 - }, - { - "epoch": 0.3121951219512195, - "grad_norm": 3.4571032524108887, - "learning_rate": 4.988357512977785e-06, - "loss": 0.6691, - "step": 64 - }, - { - "epoch": 0.3170731707317073, - "grad_norm": 3.1982104778289795, - "learning_rate": 4.987985275537252e-06, - "loss": 0.6651, - "step": 65 - }, - { - "epoch": 0.32195121951219513, - "grad_norm": 2.9723124504089355, - "learning_rate": 4.9876071950448185e-06, - "loss": 0.9227, - "step": 66 - }, - { - "epoch": 0.32682926829268294, - "grad_norm": 2.5521399974823, - "learning_rate": 4.987223272388407e-06, - "loss": 0.6664, - "step": 67 - }, - { - "epoch": 0.33170731707317075, - "grad_norm": 2.8934121131896973, - "learning_rate": 4.986833508469663e-06, - "loss": 0.997, - "step": 68 - }, - { - "epoch": 0.33658536585365856, - "grad_norm": 4.7546586990356445, - "learning_rate": 4.98643790420395e-06, - "loss": 0.8551, - "step": 69 - }, - { - "epoch": 0.34146341463414637, - "grad_norm": 3.091616153717041, - "learning_rate": 4.986036460520348e-06, - "loss": 0.8874, - "step": 70 - }, - { - "epoch": 0.3463414634146341, - "grad_norm": 4.1724677085876465, - "learning_rate": 4.98562917836165e-06, - "loss": 1.1393, - "step": 71 - }, - { - "epoch": 0.35121951219512193, - "grad_norm": 2.6568572521209717, - "learning_rate": 4.985216058684362e-06, - "loss": 0.6379, - "step": 72 - }, - { - "epoch": 0.35609756097560974, - "grad_norm": 2.396416187286377, - "learning_rate": 4.984797102458697e-06, - "loss": 1.0292, - "step": 73 - }, - { - "epoch": 0.36097560975609755, - "grad_norm": 3.0667319297790527, - "learning_rate": 4.984372310668579e-06, - "loss": 0.7048, - "step": 74 - }, - { - "epoch": 0.36585365853658536, - "grad_norm": 2.4820518493652344, - "learning_rate": 4.983941684311633e-06, - "loss": 1.2353, - "step": 75 - }, - { - "epoch": 0.37073170731707317, - "grad_norm": 4.062836647033691, - "learning_rate": 4.983505224399188e-06, - "loss": 0.8933, - "step": 76 - }, - { - "epoch": 0.375609756097561, - "grad_norm": 2.4480767250061035, - "learning_rate": 4.983062931956275e-06, - "loss": 0.8221, - "step": 77 - }, - { - "epoch": 0.3804878048780488, - "grad_norm": 3.134138822555542, - "learning_rate": 4.9826148080216195e-06, - "loss": 0.8899, - "step": 78 - }, - { - "epoch": 0.3853658536585366, - "grad_norm": 2.8165836334228516, - "learning_rate": 4.9821608536476445e-06, - "loss": 1.2451, - "step": 79 - }, - { - "epoch": 0.3902439024390244, - "grad_norm": 3.734433650970459, - "learning_rate": 4.981701069900465e-06, - "loss": 0.8536, - "step": 80 - }, - { - "epoch": 0.3951219512195122, - "grad_norm": 2.853421449661255, - "learning_rate": 4.9812354578598876e-06, - "loss": 0.7857, - "step": 81 - }, - { - "epoch": 0.4, - "grad_norm": 2.541687250137329, - "learning_rate": 4.980764018619405e-06, - "loss": 0.8332, - "step": 82 - }, - { - "epoch": 0.40487804878048783, - "grad_norm": 4.405911445617676, - "learning_rate": 4.980286753286196e-06, - "loss": 0.9927, - "step": 83 - }, - { - "epoch": 0.4097560975609756, - "grad_norm": 3.3034985065460205, - "learning_rate": 4.97980366298112e-06, - "loss": 0.8161, - "step": 84 - }, - { - "epoch": 0.4146341463414634, - "grad_norm": 2.6678085327148438, - "learning_rate": 4.97931474883872e-06, - "loss": 0.8017, - "step": 85 - }, - { - "epoch": 0.4195121951219512, - "grad_norm": 2.58524227142334, - "learning_rate": 4.978820012007213e-06, - "loss": 0.8811, - "step": 86 - }, - { - "epoch": 0.424390243902439, - "grad_norm": 2.482597827911377, - "learning_rate": 4.978319453648495e-06, - "loss": 0.9461, - "step": 87 - }, - { - "epoch": 0.4292682926829268, - "grad_norm": 2.5731301307678223, - "learning_rate": 4.977813074938128e-06, - "loss": 0.8835, - "step": 88 - }, - { - "epoch": 0.43414634146341463, - "grad_norm": 2.7914488315582275, - "learning_rate": 4.977300877065347e-06, - "loss": 0.8466, - "step": 89 - }, - { - "epoch": 0.43902439024390244, - "grad_norm": 2.416043758392334, - "learning_rate": 4.976782861233053e-06, - "loss": 0.7132, - "step": 90 - }, - { - "epoch": 0.44390243902439025, - "grad_norm": 3.7616264820098877, - "learning_rate": 4.976259028657812e-06, - "loss": 0.7639, - "step": 91 - }, - { - "epoch": 0.44878048780487806, - "grad_norm": 2.6081621646881104, - "learning_rate": 4.975729380569845e-06, - "loss": 0.8055, - "step": 92 - }, - { - "epoch": 0.45365853658536587, - "grad_norm": 3.3343570232391357, - "learning_rate": 4.975193918213035e-06, - "loss": 0.6042, - "step": 93 - }, - { - "epoch": 0.4585365853658537, - "grad_norm": 2.517544746398926, - "learning_rate": 4.974652642844921e-06, - "loss": 0.7672, - "step": 94 - }, - { - "epoch": 0.4634146341463415, - "grad_norm": 4.173468589782715, - "learning_rate": 4.974105555736693e-06, - "loss": 1.0682, - "step": 95 - }, - { - "epoch": 0.4682926829268293, - "grad_norm": 2.8422317504882812, - "learning_rate": 4.973552658173186e-06, - "loss": 0.7841, - "step": 96 - }, - { - "epoch": 0.47317073170731705, - "grad_norm": 5.042182445526123, - "learning_rate": 4.972993951452887e-06, - "loss": 0.8851, - "step": 97 - }, - { - "epoch": 0.47804878048780486, - "grad_norm": 5.977590560913086, - "learning_rate": 4.9724294368879214e-06, - "loss": 0.9059, - "step": 98 - }, - { - "epoch": 0.48292682926829267, - "grad_norm": 4.227641582489014, - "learning_rate": 4.971859115804055e-06, - "loss": 1.0152, - "step": 99 - }, - { - "epoch": 0.4878048780487805, - "grad_norm": 3.180952548980713, - "learning_rate": 4.9712829895406935e-06, - "loss": 0.8092, - "step": 100 - }, - { - "epoch": 0.4926829268292683, - "grad_norm": 11.220394134521484, - "learning_rate": 4.970701059450872e-06, - "loss": 0.8239, - "step": 101 - }, - { - "epoch": 0.4975609756097561, - "grad_norm": 2.346975088119507, - "learning_rate": 4.970113326901258e-06, - "loss": 0.9283, - "step": 102 - }, - { - "epoch": 0.5024390243902439, - "grad_norm": 2.9470982551574707, - "learning_rate": 4.9695197932721455e-06, - "loss": 0.9429, - "step": 103 - }, - { - "epoch": 0.5073170731707317, - "grad_norm": 3.6048219203948975, - "learning_rate": 4.968920459957453e-06, - "loss": 0.9231, - "step": 104 - }, - { - "epoch": 0.5121951219512195, - "grad_norm": 2.8181886672973633, - "learning_rate": 4.968315328364719e-06, - "loss": 1.0005, - "step": 105 - }, - { - "epoch": 0.5170731707317073, - "grad_norm": 3.114147424697876, - "learning_rate": 4.9677043999151e-06, - "loss": 1.1326, - "step": 106 - }, - { - "epoch": 0.5219512195121951, - "grad_norm": 2.965885639190674, - "learning_rate": 4.967087676043366e-06, - "loss": 0.541, - "step": 107 - }, - { - "epoch": 0.526829268292683, - "grad_norm": 3.098677635192871, - "learning_rate": 4.966465158197897e-06, - "loss": 0.9473, - "step": 108 - }, - { - "epoch": 0.5317073170731708, - "grad_norm": 2.8640191555023193, - "learning_rate": 4.965836847840681e-06, - "loss": 0.6678, - "step": 109 - }, - { - "epoch": 0.5365853658536586, - "grad_norm": 3.0950934886932373, - "learning_rate": 4.96520274644731e-06, - "loss": 0.9251, - "step": 110 - }, - { - "epoch": 0.5414634146341464, - "grad_norm": 2.99444317817688, - "learning_rate": 4.964562855506976e-06, - "loss": 0.7807, - "step": 111 - }, - { - "epoch": 0.5463414634146342, - "grad_norm": 2.348639726638794, - "learning_rate": 4.963917176522466e-06, - "loss": 0.6395, - "step": 112 - }, - { - "epoch": 0.551219512195122, - "grad_norm": 3.5988354682922363, - "learning_rate": 4.963265711010164e-06, - "loss": 1.0658, - "step": 113 - }, - { - "epoch": 0.5560975609756098, - "grad_norm": 3.3423564434051514, - "learning_rate": 4.9626084605000395e-06, - "loss": 0.8974, - "step": 114 - }, - { - "epoch": 0.5609756097560976, - "grad_norm": 2.8353331089019775, - "learning_rate": 4.961945426535652e-06, - "loss": 0.6144, - "step": 115 - }, - { - "epoch": 0.5658536585365853, - "grad_norm": 2.752387046813965, - "learning_rate": 4.961276610674141e-06, - "loss": 0.9083, - "step": 116 - }, - { - "epoch": 0.5707317073170731, - "grad_norm": 2.2654404640197754, - "learning_rate": 4.960602014486225e-06, - "loss": 1.0101, - "step": 117 - }, - { - "epoch": 0.5756097560975609, - "grad_norm": 3.344377040863037, - "learning_rate": 4.959921639556199e-06, - "loss": 0.8391, - "step": 118 - }, - { - "epoch": 0.5804878048780487, - "grad_norm": 3.1620500087738037, - "learning_rate": 4.959235487481928e-06, - "loss": 1.0431, - "step": 119 - }, - { - "epoch": 0.5853658536585366, - "grad_norm": 2.857048273086548, - "learning_rate": 4.958543559874846e-06, - "loss": 0.5864, - "step": 120 - }, - { - "epoch": 0.5902439024390244, - "grad_norm": 3.1736063957214355, - "learning_rate": 4.9578458583599495e-06, - "loss": 0.7868, - "step": 121 - }, - { - "epoch": 0.5951219512195122, - "grad_norm": 3.5520827770233154, - "learning_rate": 4.957142384575795e-06, - "loss": 0.7901, - "step": 122 - }, - { - "epoch": 0.6, - "grad_norm": 3.265103578567505, - "learning_rate": 4.956433140174498e-06, - "loss": 0.9067, - "step": 123 - }, - { - "epoch": 0.6048780487804878, - "grad_norm": 3.1181187629699707, - "learning_rate": 4.9557181268217225e-06, - "loss": 0.8971, - "step": 124 - }, - { - "epoch": 0.6097560975609756, - "grad_norm": 2.4123694896698, - "learning_rate": 4.954997346196683e-06, - "loss": 1.2123, - "step": 125 - }, - { - "epoch": 0.6146341463414634, - "grad_norm": 2.9646875858306885, - "learning_rate": 4.954270799992138e-06, - "loss": 0.7696, - "step": 126 - }, - { - "epoch": 0.6195121951219512, - "grad_norm": 2.7457995414733887, - "learning_rate": 4.953538489914387e-06, - "loss": 0.7919, - "step": 127 - }, - { - "epoch": 0.624390243902439, - "grad_norm": 5.096850395202637, - "learning_rate": 4.9528004176832654e-06, - "loss": 0.6494, - "step": 128 - }, - { - "epoch": 0.6292682926829268, - "grad_norm": 3.124955177307129, - "learning_rate": 4.952056585032142e-06, - "loss": 1.0546, - "step": 129 - }, - { - "epoch": 0.6341463414634146, - "grad_norm": 2.4860167503356934, - "learning_rate": 4.951306993707913e-06, - "loss": 0.7907, - "step": 130 - }, - { - "epoch": 0.6390243902439025, - "grad_norm": 2.3380239009857178, - "learning_rate": 4.950551645470998e-06, - "loss": 0.7433, - "step": 131 - }, - { - "epoch": 0.6439024390243903, - "grad_norm": 2.8945236206054688, - "learning_rate": 4.9497905420953406e-06, - "loss": 0.7682, - "step": 132 - }, - { - "epoch": 0.6487804878048781, - "grad_norm": 3.429776430130005, - "learning_rate": 4.949023685368395e-06, - "loss": 0.8411, - "step": 133 - }, - { - "epoch": 0.6536585365853659, - "grad_norm": 2.8853516578674316, - "learning_rate": 4.948251077091131e-06, - "loss": 1.0792, - "step": 134 - }, - { - "epoch": 0.6585365853658537, - "grad_norm": 2.145598888397217, - "learning_rate": 4.947472719078025e-06, - "loss": 0.8033, - "step": 135 - }, - { - "epoch": 0.6634146341463415, - "grad_norm": 2.5064377784729004, - "learning_rate": 4.9466886131570565e-06, - "loss": 0.939, - "step": 136 - }, - { - "epoch": 0.6682926829268293, - "grad_norm": 2.5700225830078125, - "learning_rate": 4.945898761169704e-06, - "loss": 1.0418, - "step": 137 - }, - { - "epoch": 0.6731707317073171, - "grad_norm": 2.3390917778015137, - "learning_rate": 4.945103164970941e-06, - "loss": 0.6158, - "step": 138 - }, - { - "epoch": 0.6780487804878049, - "grad_norm": 2.1538751125335693, - "learning_rate": 4.9443018264292304e-06, - "loss": 0.6995, - "step": 139 - }, - { - "epoch": 0.6829268292682927, - "grad_norm": 5.255710601806641, - "learning_rate": 4.9434947474265225e-06, - "loss": 1.0382, - "step": 140 - }, - { - "epoch": 0.6878048780487804, - "grad_norm": 2.5547356605529785, - "learning_rate": 4.942681929858249e-06, - "loss": 1.037, - "step": 141 - }, - { - "epoch": 0.6926829268292682, - "grad_norm": 2.613280773162842, - "learning_rate": 4.941863375633315e-06, - "loss": 0.9071, - "step": 142 - }, - { - "epoch": 0.697560975609756, - "grad_norm": 2.9957327842712402, - "learning_rate": 4.9410390866741056e-06, - "loss": 0.7908, - "step": 143 - }, - { - "epoch": 0.7024390243902439, - "grad_norm": 2.410107374191284, - "learning_rate": 4.9402090649164655e-06, - "loss": 0.7739, - "step": 144 - }, - { - "epoch": 0.7073170731707317, - "grad_norm": 2.352013349533081, - "learning_rate": 4.9393733123097085e-06, - "loss": 0.939, - "step": 145 - }, - { - "epoch": 0.7121951219512195, - "grad_norm": 2.5164194107055664, - "learning_rate": 4.9385318308166065e-06, - "loss": 0.8729, - "step": 146 - }, - { - "epoch": 0.7170731707317073, - "grad_norm": 4.213881015777588, - "learning_rate": 4.937684622413385e-06, - "loss": 0.6124, - "step": 147 - }, - { - "epoch": 0.7219512195121951, - "grad_norm": 2.7950191497802734, - "learning_rate": 4.9368316890897185e-06, - "loss": 0.975, - "step": 148 - }, - { - "epoch": 0.7268292682926829, - "grad_norm": 2.8618874549865723, - "learning_rate": 4.9359730328487264e-06, - "loss": 0.5832, - "step": 149 - }, - { - "epoch": 0.7317073170731707, - "grad_norm": 2.6943812370300293, - "learning_rate": 4.935108655706972e-06, - "loss": 0.8124, - "step": 150 - }, - { - "epoch": 0.7365853658536585, - "grad_norm": 3.2164082527160645, - "learning_rate": 4.934238559694448e-06, - "loss": 1.1446, - "step": 151 - }, - { - "epoch": 0.7414634146341463, - "grad_norm": 3.05002498626709, - "learning_rate": 4.9333627468545845e-06, - "loss": 0.7884, - "step": 152 - }, - { - "epoch": 0.7463414634146341, - "grad_norm": 2.863351583480835, - "learning_rate": 4.932481219244231e-06, - "loss": 0.7918, - "step": 153 - }, - { - "epoch": 0.751219512195122, - "grad_norm": 2.4947102069854736, - "learning_rate": 4.931593978933666e-06, - "loss": 0.775, - "step": 154 - }, - { - "epoch": 0.7560975609756098, - "grad_norm": 2.918886184692383, - "learning_rate": 4.930701028006577e-06, - "loss": 0.993, - "step": 155 - }, - { - "epoch": 0.7609756097560976, - "grad_norm": 2.835956573486328, - "learning_rate": 4.929802368560066e-06, - "loss": 0.7911, - "step": 156 - }, - { - "epoch": 0.7658536585365854, - "grad_norm": 3.3073575496673584, - "learning_rate": 4.928898002704642e-06, - "loss": 0.9346, - "step": 157 - }, - { - "epoch": 0.7707317073170732, - "grad_norm": 3.086146354675293, - "learning_rate": 4.927987932564215e-06, - "loss": 0.817, - "step": 158 - }, - { - "epoch": 0.775609756097561, - "grad_norm": 2.5419743061065674, - "learning_rate": 4.927072160276092e-06, - "loss": 0.7918, - "step": 159 - }, - { - "epoch": 0.7804878048780488, - "grad_norm": 3.984297275543213, - "learning_rate": 4.926150687990969e-06, - "loss": 0.7153, - "step": 160 - }, - { - "epoch": 0.7853658536585366, - "grad_norm": 2.4703335762023926, - "learning_rate": 4.925223517872934e-06, - "loss": 0.8982, - "step": 161 - }, - { - "epoch": 0.7902439024390244, - "grad_norm": 2.81785249710083, - "learning_rate": 4.9242906520994484e-06, - "loss": 0.9839, - "step": 162 - }, - { - "epoch": 0.7951219512195122, - "grad_norm": 2.3304924964904785, - "learning_rate": 4.923352092861358e-06, - "loss": 0.8406, - "step": 163 - }, - { - "epoch": 0.8, - "grad_norm": 2.339498519897461, - "learning_rate": 4.922407842362875e-06, - "loss": 0.6602, - "step": 164 - }, - { - "epoch": 0.8048780487804879, - "grad_norm": 3.488255262374878, - "learning_rate": 4.921457902821578e-06, - "loss": 0.9779, - "step": 165 - }, - { - "epoch": 0.8097560975609757, - "grad_norm": 2.8528945446014404, - "learning_rate": 4.920502276468408e-06, - "loss": 0.8821, - "step": 166 - }, - { - "epoch": 0.8146341463414634, - "grad_norm": 3.4649784564971924, - "learning_rate": 4.9195409655476605e-06, - "loss": 0.7539, - "step": 167 - }, - { - "epoch": 0.8195121951219512, - "grad_norm": 2.3109042644500732, - "learning_rate": 4.918573972316982e-06, - "loss": 0.9807, - "step": 168 - }, - { - "epoch": 0.824390243902439, - "grad_norm": 2.678666353225708, - "learning_rate": 4.917601299047361e-06, - "loss": 0.8318, - "step": 169 - }, - { - "epoch": 0.8292682926829268, - "grad_norm": 2.730614185333252, - "learning_rate": 4.916622948023129e-06, - "loss": 0.7816, - "step": 170 - }, - { - "epoch": 0.8341463414634146, - "grad_norm": 2.9835665225982666, - "learning_rate": 4.915638921541952e-06, - "loss": 0.6633, - "step": 171 - }, - { - "epoch": 0.8390243902439024, - "grad_norm": 3.31217360496521, - "learning_rate": 4.914649221914822e-06, - "loss": 0.9296, - "step": 172 - }, - { - "epoch": 0.8439024390243902, - "grad_norm": 2.9021658897399902, - "learning_rate": 4.913653851466057e-06, - "loss": 0.6864, - "step": 173 - }, - { - "epoch": 0.848780487804878, - "grad_norm": 3.3672914505004883, - "learning_rate": 4.912652812533291e-06, - "loss": 0.8599, - "step": 174 - }, - { - "epoch": 0.8536585365853658, - "grad_norm": 2.4871644973754883, - "learning_rate": 4.911646107467472e-06, - "loss": 0.8949, - "step": 175 - }, - { - "epoch": 0.8585365853658536, - "grad_norm": 2.728022813796997, - "learning_rate": 4.9106337386328524e-06, - "loss": 0.9758, - "step": 176 - }, - { - "epoch": 0.8634146341463415, - "grad_norm": 2.704252243041992, - "learning_rate": 4.909615708406991e-06, - "loss": 0.8954, - "step": 177 - }, - { - "epoch": 0.8682926829268293, - "grad_norm": 2.4002223014831543, - "learning_rate": 4.908592019180738e-06, - "loss": 0.7157, - "step": 178 - }, - { - "epoch": 0.8731707317073171, - "grad_norm": 2.1927788257598877, - "learning_rate": 4.907562673358234e-06, - "loss": 0.6358, - "step": 179 - }, - { - "epoch": 0.8780487804878049, - "grad_norm": 2.458500623703003, - "learning_rate": 4.906527673356907e-06, - "loss": 0.6685, - "step": 180 - }, - { - "epoch": 0.8829268292682927, - "grad_norm": 2.5924787521362305, - "learning_rate": 4.905487021607462e-06, - "loss": 0.5686, - "step": 181 - }, - { - "epoch": 0.8878048780487805, - "grad_norm": 3.0923380851745605, - "learning_rate": 4.904440720553876e-06, - "loss": 0.8538, - "step": 182 - }, - { - "epoch": 0.8926829268292683, - "grad_norm": 2.8001527786254883, - "learning_rate": 4.903388772653396e-06, - "loss": 0.8292, - "step": 183 - }, - { - "epoch": 0.8975609756097561, - "grad_norm": 2.4344072341918945, - "learning_rate": 4.902331180376529e-06, - "loss": 0.7946, - "step": 184 - }, - { - "epoch": 0.9024390243902439, - "grad_norm": 2.6313226222991943, - "learning_rate": 4.901267946207038e-06, - "loss": 0.9269, - "step": 185 - }, - { - "epoch": 0.9073170731707317, - "grad_norm": 2.4776692390441895, - "learning_rate": 4.900199072641937e-06, - "loss": 0.7433, - "step": 186 - }, - { - "epoch": 0.9121951219512195, - "grad_norm": 2.339869260787964, - "learning_rate": 4.899124562191484e-06, - "loss": 0.6577, - "step": 187 - }, - { - "epoch": 0.9170731707317074, - "grad_norm": 3.076890468597412, - "learning_rate": 4.8980444173791735e-06, - "loss": 0.5989, - "step": 188 - }, - { - "epoch": 0.9219512195121952, - "grad_norm": 2.83957839012146, - "learning_rate": 4.896958640741735e-06, - "loss": 0.9364, - "step": 189 - }, - { - "epoch": 0.926829268292683, - "grad_norm": 2.770867347717285, - "learning_rate": 4.895867234829121e-06, - "loss": 1.0328, - "step": 190 - }, - { - "epoch": 0.9317073170731708, - "grad_norm": 2.7819619178771973, - "learning_rate": 4.894770202204509e-06, - "loss": 0.772, - "step": 191 - }, - { - "epoch": 0.9365853658536586, - "grad_norm": 3.925703763961792, - "learning_rate": 4.893667545444285e-06, - "loss": 0.8128, - "step": 192 - }, - { - "epoch": 0.9414634146341463, - "grad_norm": 3.034944534301758, - "learning_rate": 4.8925592671380495e-06, - "loss": 0.7418, - "step": 193 - }, - { - "epoch": 0.9463414634146341, - "grad_norm": 2.3350143432617188, - "learning_rate": 4.891445369888601e-06, - "loss": 0.5979, - "step": 194 - }, - { - "epoch": 0.9512195121951219, - "grad_norm": 2.6433160305023193, - "learning_rate": 4.890325856311936e-06, - "loss": 0.9664, - "step": 195 - }, - { - "epoch": 0.9560975609756097, - "grad_norm": 2.715142011642456, - "learning_rate": 4.889200729037241e-06, - "loss": 0.8482, - "step": 196 - }, - { - "epoch": 0.9609756097560975, - "grad_norm": 2.6157352924346924, - "learning_rate": 4.888069990706884e-06, - "loss": 0.7173, - "step": 197 - }, - { - "epoch": 0.9658536585365853, - "grad_norm": 3.7308952808380127, - "learning_rate": 4.886933643976414e-06, - "loss": 0.5433, - "step": 198 - }, - { - "epoch": 0.9707317073170731, - "grad_norm": 3.1134045124053955, - "learning_rate": 4.885791691514548e-06, - "loss": 0.5997, - "step": 199 - }, - { - "epoch": 0.975609756097561, - "grad_norm": 2.421365976333618, - "learning_rate": 4.884644136003172e-06, - "loss": 0.6477, - "step": 200 - }, - { - "epoch": 0.9804878048780488, - "grad_norm": 2.8676180839538574, - "learning_rate": 4.883490980137327e-06, - "loss": 1.3465, - "step": 201 - }, - { - "epoch": 0.9853658536585366, - "grad_norm": 2.236189603805542, - "learning_rate": 4.882332226625208e-06, - "loss": 0.7533, - "step": 202 - }, - { - "epoch": 0.9902439024390244, - "grad_norm": 2.2514970302581787, - "learning_rate": 4.881167878188158e-06, - "loss": 0.8555, - "step": 203 - }, - { - "epoch": 0.9951219512195122, - "grad_norm": 2.6856095790863037, - "learning_rate": 4.8799979375606565e-06, - "loss": 0.7634, - "step": 204 - }, - { - "epoch": 1.0, - "grad_norm": 2.5563852787017822, - "learning_rate": 4.878822407490319e-06, - "loss": 0.66, - "step": 205 - }, - { - "epoch": 1.0048780487804878, - "grad_norm": 4.7092814445495605, - "learning_rate": 4.8776412907378845e-06, - "loss": 0.7429, - "step": 206 - }, - { - "epoch": 1.0097560975609756, - "grad_norm": 2.9133448600769043, - "learning_rate": 4.876454590077216e-06, - "loss": 0.5735, - "step": 207 - }, - { - "epoch": 1.0146341463414634, - "grad_norm": 2.7012641429901123, - "learning_rate": 4.875262308295289e-06, - "loss": 0.8065, - "step": 208 - }, - { - "epoch": 1.0195121951219512, - "grad_norm": 3.703998327255249, - "learning_rate": 4.874064448192185e-06, - "loss": 0.7148, - "step": 209 - }, - { - "epoch": 1.024390243902439, - "grad_norm": 3.044930934906006, - "learning_rate": 4.872861012581088e-06, - "loss": 0.5606, - "step": 210 - }, - { - "epoch": 1.0292682926829269, - "grad_norm": 3.661381244659424, - "learning_rate": 4.871652004288275e-06, - "loss": 0.6492, - "step": 211 - }, - { - "epoch": 1.0341463414634147, - "grad_norm": 3.18344783782959, - "learning_rate": 4.870437426153113e-06, - "loss": 0.633, - "step": 212 - }, - { - "epoch": 1.0390243902439025, - "grad_norm": 4.596707820892334, - "learning_rate": 4.869217281028045e-06, - "loss": 0.842, - "step": 213 - }, - { - "epoch": 1.0439024390243903, - "grad_norm": 4.116331577301025, - "learning_rate": 4.867991571778592e-06, - "loss": 0.8371, - "step": 214 - }, - { - "epoch": 1.048780487804878, - "grad_norm": 3.152939558029175, - "learning_rate": 4.866760301283342e-06, - "loss": 0.4728, - "step": 215 - }, - { - "epoch": 1.053658536585366, - "grad_norm": 2.8732805252075195, - "learning_rate": 4.865523472433942e-06, - "loss": 0.651, - "step": 216 - }, - { - "epoch": 1.0585365853658537, - "grad_norm": 2.967480421066284, - "learning_rate": 4.8642810881350935e-06, - "loss": 0.6361, - "step": 217 - }, - { - "epoch": 1.0634146341463415, - "grad_norm": 2.816798210144043, - "learning_rate": 4.863033151304546e-06, - "loss": 0.6206, - "step": 218 - }, - { - "epoch": 1.0682926829268293, - "grad_norm": 3.168349027633667, - "learning_rate": 4.861779664873088e-06, - "loss": 0.7782, - "step": 219 - }, - { - "epoch": 1.0731707317073171, - "grad_norm": 3.7496471405029297, - "learning_rate": 4.8605206317845425e-06, - "loss": 0.8504, - "step": 220 - }, - { - "epoch": 1.078048780487805, - "grad_norm": 2.7087056636810303, - "learning_rate": 4.859256054995758e-06, - "loss": 0.7771, - "step": 221 - }, - { - "epoch": 1.0829268292682928, - "grad_norm": 2.803703546524048, - "learning_rate": 4.8579859374766e-06, - "loss": 0.4308, - "step": 222 - }, - { - "epoch": 1.0878048780487806, - "grad_norm": 2.4199142456054688, - "learning_rate": 4.856710282209952e-06, - "loss": 0.3739, - "step": 223 - }, - { - "epoch": 1.0926829268292684, - "grad_norm": 2.384037494659424, - "learning_rate": 4.855429092191698e-06, - "loss": 0.6548, - "step": 224 - }, - { - "epoch": 1.0975609756097562, - "grad_norm": 3.0230021476745605, - "learning_rate": 4.854142370430725e-06, - "loss": 0.6932, - "step": 225 - }, - { - "epoch": 1.102439024390244, - "grad_norm": 3.0248661041259766, - "learning_rate": 4.8528501199489045e-06, - "loss": 0.6491, - "step": 226 - }, - { - "epoch": 1.1073170731707318, - "grad_norm": 4.046666145324707, - "learning_rate": 4.851552343781099e-06, - "loss": 0.7946, - "step": 227 - }, - { - "epoch": 1.1121951219512196, - "grad_norm": 2.8751168251037598, - "learning_rate": 4.850249044975145e-06, - "loss": 0.7629, - "step": 228 - }, - { - "epoch": 1.1170731707317074, - "grad_norm": 2.8649816513061523, - "learning_rate": 4.848940226591849e-06, - "loss": 0.9114, - "step": 229 - }, - { - "epoch": 1.1219512195121952, - "grad_norm": 3.2590744495391846, - "learning_rate": 4.847625891704982e-06, - "loss": 0.535, - "step": 230 - }, - { - "epoch": 1.126829268292683, - "grad_norm": 3.230659008026123, - "learning_rate": 4.846306043401268e-06, - "loss": 0.7134, - "step": 231 - }, - { - "epoch": 1.1317073170731708, - "grad_norm": 3.5220088958740234, - "learning_rate": 4.844980684780381e-06, - "loss": 0.5375, - "step": 232 - }, - { - "epoch": 1.1365853658536587, - "grad_norm": 3.074052095413208, - "learning_rate": 4.8436498189549345e-06, - "loss": 0.5486, - "step": 233 - }, - { - "epoch": 1.1414634146341462, - "grad_norm": 2.511216163635254, - "learning_rate": 4.842313449050477e-06, - "loss": 0.5203, - "step": 234 - }, - { - "epoch": 1.146341463414634, - "grad_norm": 2.6082136631011963, - "learning_rate": 4.840971578205486e-06, - "loss": 0.4978, - "step": 235 - }, - { - "epoch": 1.1512195121951219, - "grad_norm": 2.4481778144836426, - "learning_rate": 4.839624209571352e-06, - "loss": 0.348, - "step": 236 - }, - { - "epoch": 1.1560975609756097, - "grad_norm": 2.7532148361206055, - "learning_rate": 4.838271346312381e-06, - "loss": 0.8068, - "step": 237 - }, - { - "epoch": 1.1609756097560975, - "grad_norm": 2.6562349796295166, - "learning_rate": 4.836912991605782e-06, - "loss": 0.8823, - "step": 238 - }, - { - "epoch": 1.1658536585365853, - "grad_norm": 3.032168388366699, - "learning_rate": 4.835549148641663e-06, - "loss": 0.501, - "step": 239 - }, - { - "epoch": 1.170731707317073, - "grad_norm": 3.4816956520080566, - "learning_rate": 4.834179820623018e-06, - "loss": 0.6406, - "step": 240 - }, - { - "epoch": 1.175609756097561, - "grad_norm": 2.480642318725586, - "learning_rate": 4.832805010765724e-06, - "loss": 0.537, - "step": 241 - }, - { - "epoch": 1.1804878048780487, - "grad_norm": 2.7662222385406494, - "learning_rate": 4.831424722298531e-06, - "loss": 0.6464, - "step": 242 - }, - { - "epoch": 1.1853658536585365, - "grad_norm": 3.2929866313934326, - "learning_rate": 4.830038958463061e-06, - "loss": 0.6888, - "step": 243 - }, - { - "epoch": 1.1902439024390243, - "grad_norm": 5.094089031219482, - "learning_rate": 4.828647722513785e-06, - "loss": 0.8342, - "step": 244 - }, - { - "epoch": 1.1951219512195121, - "grad_norm": 3.6679818630218506, - "learning_rate": 4.827251017718034e-06, - "loss": 0.7849, - "step": 245 - }, - { - "epoch": 1.2, - "grad_norm": 3.97290301322937, - "learning_rate": 4.8258488473559794e-06, - "loss": 0.7995, - "step": 246 - }, - { - "epoch": 1.2048780487804878, - "grad_norm": 3.3555023670196533, - "learning_rate": 4.824441214720629e-06, - "loss": 0.8718, - "step": 247 - }, - { - "epoch": 1.2097560975609756, - "grad_norm": 2.309361219406128, - "learning_rate": 4.823028123117818e-06, - "loss": 0.3731, - "step": 248 - }, - { - "epoch": 1.2146341463414634, - "grad_norm": 2.607269763946533, - "learning_rate": 4.8216095758662015e-06, - "loss": 0.7321, - "step": 249 - }, - { - "epoch": 1.2195121951219512, - "grad_norm": 2.5667428970336914, - "learning_rate": 4.82018557629725e-06, - "loss": 0.7561, - "step": 250 - }, - { - "epoch": 1.224390243902439, - "grad_norm": 2.7664871215820312, - "learning_rate": 4.8187561277552376e-06, - "loss": 0.638, - "step": 251 - }, - { - "epoch": 1.2292682926829268, - "grad_norm": 2.2880401611328125, - "learning_rate": 4.817321233597232e-06, - "loss": 0.6996, - "step": 252 - }, - { - "epoch": 1.2341463414634146, - "grad_norm": 2.7615559101104736, - "learning_rate": 4.815880897193095e-06, - "loss": 0.5432, - "step": 253 - }, - { - "epoch": 1.2390243902439024, - "grad_norm": 2.9052155017852783, - "learning_rate": 4.814435121925466e-06, - "loss": 0.781, - "step": 254 - }, - { - "epoch": 1.2439024390243902, - "grad_norm": 3.2035205364227295, - "learning_rate": 4.812983911189761e-06, - "loss": 0.6884, - "step": 255 - }, - { - "epoch": 1.248780487804878, - "grad_norm": 2.8139917850494385, - "learning_rate": 4.811527268394157e-06, - "loss": 0.4984, - "step": 256 - }, - { - "epoch": 1.2536585365853659, - "grad_norm": 2.849602699279785, - "learning_rate": 4.810065196959591e-06, - "loss": 0.553, - "step": 257 - }, - { - "epoch": 1.2585365853658537, - "grad_norm": 2.8745057582855225, - "learning_rate": 4.8085977003197496e-06, - "loss": 0.7955, - "step": 258 - }, - { - "epoch": 1.2634146341463415, - "grad_norm": 3.4053122997283936, - "learning_rate": 4.807124781921059e-06, - "loss": 0.9715, - "step": 259 - }, - { - "epoch": 1.2682926829268293, - "grad_norm": 3.1741702556610107, - "learning_rate": 4.805646445222679e-06, - "loss": 0.6306, - "step": 260 - }, - { - "epoch": 1.273170731707317, - "grad_norm": 2.5348331928253174, - "learning_rate": 4.804162693696494e-06, - "loss": 0.5192, - "step": 261 - }, - { - "epoch": 1.278048780487805, - "grad_norm": 3.2491304874420166, - "learning_rate": 4.802673530827105e-06, - "loss": 0.5369, - "step": 262 - }, - { - "epoch": 1.2829268292682927, - "grad_norm": 2.670273780822754, - "learning_rate": 4.801178960111823e-06, - "loss": 0.5864, - "step": 263 - }, - { - "epoch": 1.2878048780487805, - "grad_norm": 2.5655579566955566, - "learning_rate": 4.799678985060658e-06, - "loss": 0.7864, - "step": 264 - }, - { - "epoch": 1.2926829268292683, - "grad_norm": 2.6352531909942627, - "learning_rate": 4.798173609196314e-06, - "loss": 0.8198, - "step": 265 - }, - { - "epoch": 1.2975609756097561, - "grad_norm": 3.028343677520752, - "learning_rate": 4.796662836054176e-06, - "loss": 0.4621, - "step": 266 - }, - { - "epoch": 1.302439024390244, - "grad_norm": 2.757690191268921, - "learning_rate": 4.795146669182304e-06, - "loss": 0.6237, - "step": 267 - }, - { - "epoch": 1.3073170731707318, - "grad_norm": 2.564842462539673, - "learning_rate": 4.793625112141431e-06, - "loss": 0.4981, - "step": 268 - }, - { - "epoch": 1.3121951219512196, - "grad_norm": 2.69234299659729, - "learning_rate": 4.792098168504943e-06, - "loss": 0.5384, - "step": 269 - }, - { - "epoch": 1.3170731707317074, - "grad_norm": 2.794144868850708, - "learning_rate": 4.790565841858879e-06, - "loss": 0.5535, - "step": 270 - }, - { - "epoch": 1.3219512195121952, - "grad_norm": 2.850296974182129, - "learning_rate": 4.789028135801919e-06, - "loss": 0.7492, - "step": 271 - }, - { - "epoch": 1.326829268292683, - "grad_norm": 3.287806987762451, - "learning_rate": 4.787485053945377e-06, - "loss": 0.8367, - "step": 272 - }, - { - "epoch": 1.3317073170731708, - "grad_norm": 2.479343891143799, - "learning_rate": 4.785936599913193e-06, - "loss": 0.6875, - "step": 273 - }, - { - "epoch": 1.3365853658536586, - "grad_norm": 3.171198844909668, - "learning_rate": 4.784382777341922e-06, - "loss": 0.733, - "step": 274 - }, - { - "epoch": 1.3414634146341464, - "grad_norm": 2.866610050201416, - "learning_rate": 4.782823589880729e-06, - "loss": 0.9719, - "step": 275 - }, - { - "epoch": 1.346341463414634, - "grad_norm": 2.3714404106140137, - "learning_rate": 4.7812590411913755e-06, - "loss": 0.6979, - "step": 276 - }, - { - "epoch": 1.3512195121951218, - "grad_norm": 2.3838706016540527, - "learning_rate": 4.779689134948217e-06, - "loss": 0.9697, - "step": 277 - }, - { - "epoch": 1.3560975609756096, - "grad_norm": 3.2992005348205566, - "learning_rate": 4.77811387483819e-06, - "loss": 0.4799, - "step": 278 - }, - { - "epoch": 1.3609756097560974, - "grad_norm": 3.403024435043335, - "learning_rate": 4.776533264560804e-06, - "loss": 0.7478, - "step": 279 - }, - { - "epoch": 1.3658536585365852, - "grad_norm": 2.669820785522461, - "learning_rate": 4.774947307828134e-06, - "loss": 0.8622, - "step": 280 - }, - { - "epoch": 1.370731707317073, - "grad_norm": 2.4695041179656982, - "learning_rate": 4.773356008364812e-06, - "loss": 0.5792, - "step": 281 - }, - { - "epoch": 1.3756097560975609, - "grad_norm": 3.1744325160980225, - "learning_rate": 4.771759369908017e-06, - "loss": 0.4368, - "step": 282 - }, - { - "epoch": 1.3804878048780487, - "grad_norm": 2.8564929962158203, - "learning_rate": 4.7701573962074635e-06, - "loss": 0.6337, - "step": 283 - }, - { - "epoch": 1.3853658536585365, - "grad_norm": 2.4109890460968018, - "learning_rate": 4.7685500910254015e-06, - "loss": 0.5042, - "step": 284 - }, - { - "epoch": 1.3902439024390243, - "grad_norm": 2.389765977859497, - "learning_rate": 4.766937458136598e-06, - "loss": 0.7427, - "step": 285 - }, - { - "epoch": 1.395121951219512, - "grad_norm": 2.412153720855713, - "learning_rate": 4.765319501328332e-06, - "loss": 0.6956, - "step": 286 - }, - { - "epoch": 1.4, - "grad_norm": 2.6756227016448975, - "learning_rate": 4.763696224400391e-06, - "loss": 0.5152, - "step": 287 - }, - { - "epoch": 1.4048780487804877, - "grad_norm": 2.4644389152526855, - "learning_rate": 4.762067631165049e-06, - "loss": 0.5583, - "step": 288 - }, - { - "epoch": 1.4097560975609755, - "grad_norm": 2.6496896743774414, - "learning_rate": 4.760433725447071e-06, - "loss": 0.6824, - "step": 289 - }, - { - "epoch": 1.4146341463414633, - "grad_norm": 2.9843268394470215, - "learning_rate": 4.758794511083697e-06, - "loss": 0.7914, - "step": 290 - }, - { - "epoch": 1.4195121951219511, - "grad_norm": 3.639101266860962, - "learning_rate": 4.757149991924633e-06, - "loss": 0.6827, - "step": 291 - }, - { - "epoch": 1.424390243902439, - "grad_norm": 3.2047319412231445, - "learning_rate": 4.755500171832045e-06, - "loss": 0.5908, - "step": 292 - }, - { - "epoch": 1.4292682926829268, - "grad_norm": 2.463202953338623, - "learning_rate": 4.753845054680548e-06, - "loss": 0.6469, - "step": 293 - }, - { - "epoch": 1.4341463414634146, - "grad_norm": 2.711195945739746, - "learning_rate": 4.752184644357197e-06, - "loss": 0.5412, - "step": 294 - }, - { - "epoch": 1.4390243902439024, - "grad_norm": 2.239082098007202, - "learning_rate": 4.750518944761477e-06, - "loss": 0.5324, - "step": 295 - }, - { - "epoch": 1.4439024390243902, - "grad_norm": 2.711050271987915, - "learning_rate": 4.748847959805297e-06, - "loss": 0.5317, - "step": 296 - }, - { - "epoch": 1.448780487804878, - "grad_norm": 2.4389946460723877, - "learning_rate": 4.7471716934129774e-06, - "loss": 0.5199, - "step": 297 - }, - { - "epoch": 1.4536585365853658, - "grad_norm": 2.6532390117645264, - "learning_rate": 4.745490149521242e-06, - "loss": 0.4874, - "step": 298 - }, - { - "epoch": 1.4585365853658536, - "grad_norm": 2.2970616817474365, - "learning_rate": 4.743803332079209e-06, - "loss": 0.5416, - "step": 299 - }, - { - "epoch": 1.4634146341463414, - "grad_norm": 2.4206762313842773, - "learning_rate": 4.742111245048382e-06, - "loss": 0.5628, - "step": 300 - }, - { - "epoch": 1.4682926829268292, - "grad_norm": 2.7086844444274902, - "learning_rate": 4.740413892402639e-06, - "loss": 0.5847, - "step": 301 - }, - { - "epoch": 1.473170731707317, - "grad_norm": 2.848602771759033, - "learning_rate": 4.738711278128228e-06, - "loss": 0.5889, - "step": 302 - }, - { - "epoch": 1.4780487804878049, - "grad_norm": 3.5257909297943115, - "learning_rate": 4.7370034062237476e-06, - "loss": 0.3917, - "step": 303 - }, - { - "epoch": 1.4829268292682927, - "grad_norm": 6.47664213180542, - "learning_rate": 4.73529028070015e-06, - "loss": 0.5592, - "step": 304 - }, - { - "epoch": 1.4878048780487805, - "grad_norm": 2.8833930492401123, - "learning_rate": 4.733571905580723e-06, - "loss": 0.843, - "step": 305 - }, - { - "epoch": 1.4926829268292683, - "grad_norm": 2.9924156665802, - "learning_rate": 4.731848284901082e-06, - "loss": 0.7041, - "step": 306 - }, - { - "epoch": 1.497560975609756, - "grad_norm": 2.9858405590057373, - "learning_rate": 4.730119422709165e-06, - "loss": 0.4914, - "step": 307 - }, - { - "epoch": 1.502439024390244, - "grad_norm": 3.4032366275787354, - "learning_rate": 4.728385323065215e-06, - "loss": 0.644, - "step": 308 - }, - { - "epoch": 1.5073170731707317, - "grad_norm": 2.86360502243042, - "learning_rate": 4.7266459900417815e-06, - "loss": 0.5335, - "step": 309 - }, - { - "epoch": 1.5121951219512195, - "grad_norm": 3.183012008666992, - "learning_rate": 4.724901427723698e-06, - "loss": 0.8275, - "step": 310 - }, - { - "epoch": 1.5170731707317073, - "grad_norm": 3.4128706455230713, - "learning_rate": 4.723151640208084e-06, - "loss": 0.4091, - "step": 311 - }, - { - "epoch": 1.5219512195121951, - "grad_norm": 2.765897512435913, - "learning_rate": 4.721396631604327e-06, - "loss": 0.4414, - "step": 312 - }, - { - "epoch": 1.526829268292683, - "grad_norm": 3.2348268032073975, - "learning_rate": 4.7196364060340785e-06, - "loss": 0.5423, - "step": 313 - }, - { - "epoch": 1.5317073170731708, - "grad_norm": 2.7270045280456543, - "learning_rate": 4.7178709676312416e-06, - "loss": 0.8072, - "step": 314 - }, - { - "epoch": 1.5365853658536586, - "grad_norm": 2.525298833847046, - "learning_rate": 4.716100320541961e-06, - "loss": 1.0254, - "step": 315 - }, - { - "epoch": 1.5414634146341464, - "grad_norm": 2.371321678161621, - "learning_rate": 4.714324468924614e-06, - "loss": 0.6541, - "step": 316 - }, - { - "epoch": 1.5463414634146342, - "grad_norm": 3.0820438861846924, - "learning_rate": 4.712543416949803e-06, - "loss": 0.7519, - "step": 317 - }, - { - "epoch": 1.551219512195122, - "grad_norm": 2.710369348526001, - "learning_rate": 4.71075716880034e-06, - "loss": 0.7232, - "step": 318 - }, - { - "epoch": 1.5560975609756098, - "grad_norm": 2.4568352699279785, - "learning_rate": 4.708965728671243e-06, - "loss": 0.8059, - "step": 319 - }, - { - "epoch": 1.5609756097560976, - "grad_norm": 2.7511191368103027, - "learning_rate": 4.7071691007697214e-06, - "loss": 0.6579, - "step": 320 - }, - { - "epoch": 1.5658536585365854, - "grad_norm": 2.6519858837127686, - "learning_rate": 4.705367289315172e-06, - "loss": 0.6989, - "step": 321 - }, - { - "epoch": 1.5707317073170732, - "grad_norm": 2.763019323348999, - "learning_rate": 4.703560298539158e-06, - "loss": 0.4916, - "step": 322 - }, - { - "epoch": 1.575609756097561, - "grad_norm": 2.6480252742767334, - "learning_rate": 4.701748132685415e-06, - "loss": 0.5076, - "step": 323 - }, - { - "epoch": 1.5804878048780489, - "grad_norm": 2.4289543628692627, - "learning_rate": 4.699930796009825e-06, - "loss": 0.559, - "step": 324 - }, - { - "epoch": 1.5853658536585367, - "grad_norm": 4.0515899658203125, - "learning_rate": 4.698108292780418e-06, - "loss": 0.7388, - "step": 325 - }, - { - "epoch": 1.5902439024390245, - "grad_norm": 2.5959129333496094, - "learning_rate": 4.696280627277356e-06, - "loss": 0.5469, - "step": 326 - }, - { - "epoch": 1.5951219512195123, - "grad_norm": 2.3453526496887207, - "learning_rate": 4.6944478037929255e-06, - "loss": 0.5494, - "step": 327 - }, - { - "epoch": 1.6, - "grad_norm": 3.7527170181274414, - "learning_rate": 4.692609826631525e-06, - "loss": 0.7536, - "step": 328 - }, - { - "epoch": 1.604878048780488, - "grad_norm": 3.423588275909424, - "learning_rate": 4.690766700109659e-06, - "loss": 0.4586, - "step": 329 - }, - { - "epoch": 1.6097560975609757, - "grad_norm": 2.620429754257202, - "learning_rate": 4.6889184285559234e-06, - "loss": 0.4799, - "step": 330 - }, - { - "epoch": 1.6146341463414635, - "grad_norm": 6.416718006134033, - "learning_rate": 4.687065016310996e-06, - "loss": 0.7502, - "step": 331 - }, - { - "epoch": 1.6195121951219513, - "grad_norm": 2.7324717044830322, - "learning_rate": 4.685206467727631e-06, - "loss": 0.5923, - "step": 332 - }, - { - "epoch": 1.6243902439024391, - "grad_norm": 2.582935333251953, - "learning_rate": 4.683342787170644e-06, - "loss": 0.5619, - "step": 333 - }, - { - "epoch": 1.629268292682927, - "grad_norm": 2.8339877128601074, - "learning_rate": 4.6814739790169006e-06, - "loss": 0.55, - "step": 334 - }, - { - "epoch": 1.6341463414634148, - "grad_norm": 2.733982563018799, - "learning_rate": 4.679600047655313e-06, - "loss": 0.7243, - "step": 335 - }, - { - "epoch": 1.6390243902439026, - "grad_norm": 3.192747116088867, - "learning_rate": 4.6777209974868194e-06, - "loss": 1.132, - "step": 336 - }, - { - "epoch": 1.6439024390243904, - "grad_norm": 2.5185582637786865, - "learning_rate": 4.675836832924387e-06, - "loss": 0.55, - "step": 337 - }, - { - "epoch": 1.6487804878048782, - "grad_norm": 2.7306225299835205, - "learning_rate": 4.673947558392989e-06, - "loss": 0.4418, - "step": 338 - }, - { - "epoch": 1.653658536585366, - "grad_norm": 2.7026166915893555, - "learning_rate": 4.6720531783296e-06, - "loss": 0.5897, - "step": 339 - }, - { - "epoch": 1.6585365853658538, - "grad_norm": 2.5981674194335938, - "learning_rate": 4.670153697183185e-06, - "loss": 0.5889, - "step": 340 - }, - { - "epoch": 1.6634146341463416, - "grad_norm": 3.0985405445098877, - "learning_rate": 4.668249119414692e-06, - "loss": 0.5607, - "step": 341 - }, - { - "epoch": 1.6682926829268294, - "grad_norm": 2.7609124183654785, - "learning_rate": 4.666339449497033e-06, - "loss": 0.6284, - "step": 342 - }, - { - "epoch": 1.6731707317073172, - "grad_norm": 3.186077356338501, - "learning_rate": 4.664424691915084e-06, - "loss": 0.5751, - "step": 343 - }, - { - "epoch": 1.678048780487805, - "grad_norm": 3.644227981567383, - "learning_rate": 4.6625048511656675e-06, - "loss": 0.586, - "step": 344 - }, - { - "epoch": 1.6829268292682928, - "grad_norm": 3.196373462677002, - "learning_rate": 4.660579931757543e-06, - "loss": 0.5086, - "step": 345 - }, - { - "epoch": 1.6878048780487804, - "grad_norm": 2.7773900032043457, - "learning_rate": 4.6586499382113985e-06, - "loss": 0.5934, - "step": 346 - }, - { - "epoch": 1.6926829268292682, - "grad_norm": 2.3397631645202637, - "learning_rate": 4.6567148750598375e-06, - "loss": 0.7654, - "step": 347 - }, - { - "epoch": 1.697560975609756, - "grad_norm": 2.5567805767059326, - "learning_rate": 4.6547747468473705e-06, - "loss": 0.8908, - "step": 348 - }, - { - "epoch": 1.7024390243902439, - "grad_norm": 2.9218900203704834, - "learning_rate": 4.652829558130404e-06, - "loss": 0.4383, - "step": 349 - }, - { - "epoch": 1.7073170731707317, - "grad_norm": 2.962965250015259, - "learning_rate": 4.6508793134772265e-06, - "loss": 0.6031, - "step": 350 - }, - { - "epoch": 1.7121951219512195, - "grad_norm": 2.487739324569702, - "learning_rate": 4.648924017468003e-06, - "loss": 0.533, - "step": 351 - }, - { - "epoch": 1.7170731707317073, - "grad_norm": 2.769474506378174, - "learning_rate": 4.646963674694761e-06, - "loss": 0.8125, - "step": 352 - }, - { - "epoch": 1.721951219512195, - "grad_norm": 2.678243398666382, - "learning_rate": 4.64499828976138e-06, - "loss": 0.386, - "step": 353 - }, - { - "epoch": 1.726829268292683, - "grad_norm": 3.2764477729797363, - "learning_rate": 4.64302786728358e-06, - "loss": 0.4792, - "step": 354 - }, - { - "epoch": 1.7317073170731707, - "grad_norm": 2.6092708110809326, - "learning_rate": 4.641052411888913e-06, - "loss": 0.5031, - "step": 355 - }, - { - "epoch": 1.7365853658536585, - "grad_norm": 3.4002952575683594, - "learning_rate": 4.6390719282167515e-06, - "loss": 0.4726, - "step": 356 - }, - { - "epoch": 1.7414634146341463, - "grad_norm": 2.7558157444000244, - "learning_rate": 4.637086420918276e-06, - "loss": 0.7794, - "step": 357 - }, - { - "epoch": 1.7463414634146341, - "grad_norm": 2.239021062850952, - "learning_rate": 4.635095894656465e-06, - "loss": 0.6202, - "step": 358 - }, - { - "epoch": 1.751219512195122, - "grad_norm": 2.0502119064331055, - "learning_rate": 4.633100354106085e-06, - "loss": 0.3743, - "step": 359 - }, - { - "epoch": 1.7560975609756098, - "grad_norm": 2.842203140258789, - "learning_rate": 4.631099803953677e-06, - "loss": 0.8143, - "step": 360 - }, - { - "epoch": 1.7609756097560976, - "grad_norm": 2.8408772945404053, - "learning_rate": 4.629094248897546e-06, - "loss": 0.4986, - "step": 361 - }, - { - "epoch": 1.7658536585365854, - "grad_norm": 2.755530595779419, - "learning_rate": 4.627083693647757e-06, - "loss": 0.5833, - "step": 362 - }, - { - "epoch": 1.7707317073170732, - "grad_norm": 2.717116355895996, - "learning_rate": 4.625068142926111e-06, - "loss": 0.885, - "step": 363 - }, - { - "epoch": 1.775609756097561, - "grad_norm": 2.2784435749053955, - "learning_rate": 4.623047601466144e-06, - "loss": 0.7351, - "step": 364 - }, - { - "epoch": 1.7804878048780488, - "grad_norm": 2.3133914470672607, - "learning_rate": 4.621022074013114e-06, - "loss": 0.6426, - "step": 365 - }, - { - "epoch": 1.7853658536585366, - "grad_norm": 3.13562273979187, - "learning_rate": 4.618991565323987e-06, - "loss": 0.5588, - "step": 366 - }, - { - "epoch": 1.7902439024390244, - "grad_norm": 2.458186388015747, - "learning_rate": 4.616956080167426e-06, - "loss": 0.5424, - "step": 367 - }, - { - "epoch": 1.7951219512195122, - "grad_norm": 2.4780080318450928, - "learning_rate": 4.614915623323786e-06, - "loss": 0.8664, - "step": 368 - }, - { - "epoch": 1.8, - "grad_norm": 2.623966932296753, - "learning_rate": 4.612870199585092e-06, - "loss": 0.4495, - "step": 369 - }, - { - "epoch": 1.8048780487804879, - "grad_norm": 2.7326242923736572, - "learning_rate": 4.610819813755038e-06, - "loss": 0.5099, - "step": 370 - }, - { - "epoch": 1.8097560975609757, - "grad_norm": 2.951014757156372, - "learning_rate": 4.608764470648971e-06, - "loss": 0.4322, - "step": 371 - }, - { - "epoch": 1.8146341463414632, - "grad_norm": 2.869870185852051, - "learning_rate": 4.606704175093879e-06, - "loss": 0.4744, - "step": 372 - }, - { - "epoch": 1.819512195121951, - "grad_norm": 2.686054229736328, - "learning_rate": 4.604638931928383e-06, - "loss": 0.797, - "step": 373 - }, - { - "epoch": 1.8243902439024389, - "grad_norm": 2.6421749591827393, - "learning_rate": 4.602568746002718e-06, - "loss": 0.4904, - "step": 374 - }, - { - "epoch": 1.8292682926829267, - "grad_norm": 2.949144124984741, - "learning_rate": 4.600493622178734e-06, - "loss": 0.8682, - "step": 375 - }, - { - "epoch": 1.8341463414634145, - "grad_norm": 2.554733991622925, - "learning_rate": 4.598413565329876e-06, - "loss": 0.5426, - "step": 376 - }, - { - "epoch": 1.8390243902439023, - "grad_norm": 2.3334367275238037, - "learning_rate": 4.596328580341169e-06, - "loss": 0.5628, - "step": 377 - }, - { - "epoch": 1.84390243902439, - "grad_norm": 2.577664613723755, - "learning_rate": 4.5942386721092195e-06, - "loss": 0.7073, - "step": 378 - }, - { - "epoch": 1.848780487804878, - "grad_norm": 3.1247141361236572, - "learning_rate": 4.592143845542189e-06, - "loss": 0.6526, - "step": 379 - }, - { - "epoch": 1.8536585365853657, - "grad_norm": 2.7015256881713867, - "learning_rate": 4.590044105559797e-06, - "loss": 0.8377, - "step": 380 - }, - { - "epoch": 1.8585365853658535, - "grad_norm": 2.573819398880005, - "learning_rate": 4.587939457093296e-06, - "loss": 0.5485, - "step": 381 - }, - { - "epoch": 1.8634146341463413, - "grad_norm": 2.8607687950134277, - "learning_rate": 4.585829905085468e-06, - "loss": 0.6065, - "step": 382 - }, - { - "epoch": 1.8682926829268292, - "grad_norm": 2.526625394821167, - "learning_rate": 4.5837154544906135e-06, - "loss": 0.7812, - "step": 383 - }, - { - "epoch": 1.873170731707317, - "grad_norm": 2.4161314964294434, - "learning_rate": 4.581596110274535e-06, - "loss": 0.7061, - "step": 384 - }, - { - "epoch": 1.8780487804878048, - "grad_norm": 2.34195876121521, - "learning_rate": 4.579471877414527e-06, - "loss": 0.9446, - "step": 385 - }, - { - "epoch": 1.8829268292682926, - "grad_norm": 3.7710156440734863, - "learning_rate": 4.577342760899368e-06, - "loss": 0.78, - "step": 386 - }, - { - "epoch": 1.8878048780487804, - "grad_norm": 2.5192313194274902, - "learning_rate": 4.575208765729302e-06, - "loss": 0.5205, - "step": 387 - }, - { - "epoch": 1.8926829268292682, - "grad_norm": 2.467484951019287, - "learning_rate": 4.573069896916035e-06, - "loss": 0.7827, - "step": 388 - }, - { - "epoch": 1.897560975609756, - "grad_norm": 2.640676259994507, - "learning_rate": 4.5709261594827125e-06, - "loss": 0.6512, - "step": 389 - }, - { - "epoch": 1.9024390243902438, - "grad_norm": 2.976623296737671, - "learning_rate": 4.568777558463922e-06, - "loss": 0.5548, - "step": 390 - }, - { - "epoch": 1.9073170731707316, - "grad_norm": 2.289722442626953, - "learning_rate": 4.566624098905665e-06, - "loss": 0.7038, - "step": 391 - }, - { - "epoch": 1.9121951219512194, - "grad_norm": 2.9512040615081787, - "learning_rate": 4.564465785865359e-06, - "loss": 0.5416, - "step": 392 - }, - { - "epoch": 1.9170731707317072, - "grad_norm": 2.394874095916748, - "learning_rate": 4.56230262441182e-06, - "loss": 0.4068, - "step": 393 - }, - { - "epoch": 1.921951219512195, - "grad_norm": 6.885486602783203, - "learning_rate": 4.560134619625247e-06, - "loss": 0.6197, - "step": 394 - }, - { - "epoch": 1.9268292682926829, - "grad_norm": 2.311272144317627, - "learning_rate": 4.5579617765972155e-06, - "loss": 0.5692, - "step": 395 - }, - { - "epoch": 1.9317073170731707, - "grad_norm": 2.4662933349609375, - "learning_rate": 4.555784100430662e-06, - "loss": 0.4836, - "step": 396 - }, - { - "epoch": 1.9365853658536585, - "grad_norm": 2.602741241455078, - "learning_rate": 4.553601596239877e-06, - "loss": 0.4594, - "step": 397 - }, - { - "epoch": 1.9414634146341463, - "grad_norm": 3.443909168243408, - "learning_rate": 4.551414269150489e-06, - "loss": 0.6053, - "step": 398 - }, - { - "epoch": 1.946341463414634, - "grad_norm": 2.5391502380371094, - "learning_rate": 4.54922212429945e-06, - "loss": 0.5133, - "step": 399 - }, - { - "epoch": 1.951219512195122, - "grad_norm": 2.7105700969696045, - "learning_rate": 4.547025166835027e-06, - "loss": 0.6984, - "step": 400 - }, - { - "epoch": 1.9560975609756097, - "grad_norm": 2.6098098754882812, - "learning_rate": 4.544823401916794e-06, - "loss": 0.7944, - "step": 401 - }, - { - "epoch": 1.9609756097560975, - "grad_norm": 2.7527425289154053, - "learning_rate": 4.542616834715612e-06, - "loss": 0.639, - "step": 402 - }, - { - "epoch": 1.9658536585365853, - "grad_norm": 2.760303258895874, - "learning_rate": 4.540405470413618e-06, - "loss": 0.4229, - "step": 403 - }, - { - "epoch": 1.9707317073170731, - "grad_norm": 2.4989006519317627, - "learning_rate": 4.53818931420422e-06, - "loss": 0.7482, - "step": 404 - }, - { - "epoch": 1.975609756097561, - "grad_norm": 2.3687169551849365, - "learning_rate": 4.535968371292076e-06, - "loss": 0.6146, - "step": 405 - }, - { - "epoch": 1.9804878048780488, - "grad_norm": 2.4285244941711426, - "learning_rate": 4.533742646893086e-06, - "loss": 0.6964, - "step": 406 - }, - { - "epoch": 1.9853658536585366, - "grad_norm": 2.337266206741333, - "learning_rate": 4.531512146234383e-06, - "loss": 0.6248, - "step": 407 - }, - { - "epoch": 1.9902439024390244, - "grad_norm": 2.704972743988037, - "learning_rate": 4.529276874554312e-06, - "loss": 0.8715, - "step": 408 - }, - { - "epoch": 1.9951219512195122, - "grad_norm": 2.2151944637298584, - "learning_rate": 4.527036837102426e-06, - "loss": 0.4945, - "step": 409 - }, - { - "epoch": 2.0, - "grad_norm": 2.691330671310425, - "learning_rate": 4.524792039139471e-06, - "loss": 0.7085, - "step": 410 - }, - { - "epoch": 2.004878048780488, - "grad_norm": 2.9423086643218994, - "learning_rate": 4.522542485937369e-06, - "loss": 0.3178, - "step": 411 - }, - { - "epoch": 2.0097560975609756, - "grad_norm": 2.860677719116211, - "learning_rate": 4.520288182779214e-06, - "loss": 0.5092, - "step": 412 - }, - { - "epoch": 2.0146341463414634, - "grad_norm": 2.7503843307495117, - "learning_rate": 4.518029134959253e-06, - "loss": 0.314, - "step": 413 - }, - { - "epoch": 2.0195121951219512, - "grad_norm": 4.541809558868408, - "learning_rate": 4.515765347782878e-06, - "loss": 0.5287, - "step": 414 - }, - { - "epoch": 2.024390243902439, - "grad_norm": 9.126826286315918, - "learning_rate": 4.5134968265666085e-06, - "loss": 0.8221, - "step": 415 - }, - { - "epoch": 2.029268292682927, - "grad_norm": 4.4358229637146, - "learning_rate": 4.511223576638084e-06, - "loss": 0.5402, - "step": 416 - }, - { - "epoch": 2.0341463414634147, - "grad_norm": 3.1090731620788574, - "learning_rate": 4.508945603336049e-06, - "loss": 0.617, - "step": 417 - }, - { - "epoch": 2.0390243902439025, - "grad_norm": 2.6933369636535645, - "learning_rate": 4.50666291201034e-06, - "loss": 0.3541, - "step": 418 - }, - { - "epoch": 2.0439024390243903, - "grad_norm": 5.898099899291992, - "learning_rate": 4.504375508021876e-06, - "loss": 0.4842, - "step": 419 - }, - { - "epoch": 2.048780487804878, - "grad_norm": 2.950939178466797, - "learning_rate": 4.50208339674264e-06, - "loss": 0.6168, - "step": 420 - }, - { - "epoch": 2.053658536585366, - "grad_norm": 3.2513322830200195, - "learning_rate": 4.499786583555675e-06, - "loss": 0.6425, - "step": 421 - }, - { - "epoch": 2.0585365853658537, - "grad_norm": 2.911562442779541, - "learning_rate": 4.497485073855061e-06, - "loss": 0.364, - "step": 422 - }, - { - "epoch": 2.0634146341463415, - "grad_norm": 4.2179274559021, - "learning_rate": 4.495178873045913e-06, - "loss": 0.3687, - "step": 423 - }, - { - "epoch": 2.0682926829268293, - "grad_norm": 3.2010395526885986, - "learning_rate": 4.4928679865443605e-06, - "loss": 0.4068, - "step": 424 - }, - { - "epoch": 2.073170731707317, - "grad_norm": 3.2425589561462402, - "learning_rate": 4.4905524197775366e-06, - "loss": 0.4759, - "step": 425 - }, - { - "epoch": 2.078048780487805, - "grad_norm": 2.9252519607543945, - "learning_rate": 4.4882321781835666e-06, - "loss": 0.4197, - "step": 426 - }, - { - "epoch": 2.0829268292682928, - "grad_norm": 2.7859911918640137, - "learning_rate": 4.4859072672115565e-06, - "loss": 0.2294, - "step": 427 - }, - { - "epoch": 2.0878048780487806, - "grad_norm": 3.138796091079712, - "learning_rate": 4.483577692321577e-06, - "loss": 0.7572, - "step": 428 - }, - { - "epoch": 2.0926829268292684, - "grad_norm": 3.1447339057922363, - "learning_rate": 4.481243458984651e-06, - "loss": 0.4035, - "step": 429 - }, - { - "epoch": 2.097560975609756, - "grad_norm": 3.1876862049102783, - "learning_rate": 4.478904572682743e-06, - "loss": 0.5776, - "step": 430 - }, - { - "epoch": 2.102439024390244, - "grad_norm": 2.934257745742798, - "learning_rate": 4.476561038908745e-06, - "loss": 0.4005, - "step": 431 - }, - { - "epoch": 2.107317073170732, - "grad_norm": 2.904954433441162, - "learning_rate": 4.474212863166464e-06, - "loss": 0.5689, - "step": 432 - }, - { - "epoch": 2.1121951219512196, - "grad_norm": 3.6023731231689453, - "learning_rate": 4.471860050970608e-06, - "loss": 0.5068, - "step": 433 - }, - { - "epoch": 2.1170731707317074, - "grad_norm": 4.073422431945801, - "learning_rate": 4.469502607846774e-06, - "loss": 0.8349, - "step": 434 - }, - { - "epoch": 2.1219512195121952, - "grad_norm": 2.813789129257202, - "learning_rate": 4.467140539331434e-06, - "loss": 0.3641, - "step": 435 - }, - { - "epoch": 2.126829268292683, - "grad_norm": 3.874516248703003, - "learning_rate": 4.464773850971924e-06, - "loss": 0.222, - "step": 436 - }, - { - "epoch": 2.131707317073171, - "grad_norm": 3.1221084594726562, - "learning_rate": 4.46240254832643e-06, - "loss": 0.3799, - "step": 437 - }, - { - "epoch": 2.1365853658536587, - "grad_norm": 3.298933267593384, - "learning_rate": 4.460026636963971e-06, - "loss": 0.4759, - "step": 438 - }, - { - "epoch": 2.1414634146341465, - "grad_norm": 2.456233024597168, - "learning_rate": 4.4576461224643965e-06, - "loss": 0.384, - "step": 439 - }, - { - "epoch": 2.1463414634146343, - "grad_norm": 2.8427460193634033, - "learning_rate": 4.455261010418359e-06, - "loss": 0.391, - "step": 440 - }, - { - "epoch": 2.151219512195122, - "grad_norm": 3.0267624855041504, - "learning_rate": 4.452871306427314e-06, - "loss": 0.6177, - "step": 441 - }, - { - "epoch": 2.15609756097561, - "grad_norm": 3.437302827835083, - "learning_rate": 4.450477016103498e-06, - "loss": 0.5143, - "step": 442 - }, - { - "epoch": 2.1609756097560977, - "grad_norm": 3.152210235595703, - "learning_rate": 4.4480781450699205e-06, - "loss": 0.3783, - "step": 443 - }, - { - "epoch": 2.1658536585365855, - "grad_norm": 3.507753372192383, - "learning_rate": 4.4456746989603464e-06, - "loss": 0.3574, - "step": 444 - }, - { - "epoch": 2.1707317073170733, - "grad_norm": 2.8855366706848145, - "learning_rate": 4.443266683419289e-06, - "loss": 0.5088, - "step": 445 - }, - { - "epoch": 2.175609756097561, - "grad_norm": 2.7776072025299072, - "learning_rate": 4.440854104101988e-06, - "loss": 0.3773, - "step": 446 - }, - { - "epoch": 2.180487804878049, - "grad_norm": 3.019484281539917, - "learning_rate": 4.438436966674406e-06, - "loss": 0.5002, - "step": 447 - }, - { - "epoch": 2.1853658536585368, - "grad_norm": 3.6962451934814453, - "learning_rate": 4.436015276813208e-06, - "loss": 0.4601, - "step": 448 - }, - { - "epoch": 2.1902439024390246, - "grad_norm": 3.1288888454437256, - "learning_rate": 4.4335890402057505e-06, - "loss": 0.5422, - "step": 449 - }, - { - "epoch": 2.1951219512195124, - "grad_norm": 3.7083234786987305, - "learning_rate": 4.431158262550067e-06, - "loss": 0.4684, - "step": 450 - }, - { - "epoch": 2.2, - "grad_norm": 3.1714789867401123, - "learning_rate": 4.428722949554858e-06, - "loss": 0.2528, - "step": 451 - }, - { - "epoch": 2.204878048780488, - "grad_norm": 3.0773637294769287, - "learning_rate": 4.426283106939474e-06, - "loss": 0.4061, - "step": 452 - }, - { - "epoch": 2.209756097560976, - "grad_norm": 2.604093551635742, - "learning_rate": 4.423838740433903e-06, - "loss": 0.4779, - "step": 453 - }, - { - "epoch": 2.2146341463414636, - "grad_norm": 2.9293880462646484, - "learning_rate": 4.4213898557787586e-06, - "loss": 0.233, - "step": 454 - }, - { - "epoch": 2.2195121951219514, - "grad_norm": 2.9195125102996826, - "learning_rate": 4.4189364587252636e-06, - "loss": 0.7756, - "step": 455 - }, - { - "epoch": 2.2243902439024392, - "grad_norm": 3.2263920307159424, - "learning_rate": 4.416478555035241e-06, - "loss": 0.2806, - "step": 456 - }, - { - "epoch": 2.229268292682927, - "grad_norm": 2.8109211921691895, - "learning_rate": 4.4140161504810935e-06, - "loss": 0.3923, - "step": 457 - }, - { - "epoch": 2.234146341463415, - "grad_norm": 2.645853281021118, - "learning_rate": 4.4115492508457986e-06, - "loss": 0.289, - "step": 458 - }, - { - "epoch": 2.2390243902439027, - "grad_norm": 3.3712451457977295, - "learning_rate": 4.409077861922887e-06, - "loss": 0.5053, - "step": 459 - }, - { - "epoch": 2.2439024390243905, - "grad_norm": 2.6892387866973877, - "learning_rate": 4.406601989516435e-06, - "loss": 0.3363, - "step": 460 - }, - { - "epoch": 2.2487804878048783, - "grad_norm": 2.3195693492889404, - "learning_rate": 4.404121639441047e-06, - "loss": 0.2367, - "step": 461 - }, - { - "epoch": 2.253658536585366, - "grad_norm": 3.0115339756011963, - "learning_rate": 4.401636817521843e-06, - "loss": 0.4942, - "step": 462 - }, - { - "epoch": 2.258536585365854, - "grad_norm": 2.9528865814208984, - "learning_rate": 4.399147529594447e-06, - "loss": 0.3328, - "step": 463 - }, - { - "epoch": 2.2634146341463417, - "grad_norm": 3.110799551010132, - "learning_rate": 4.3966537815049686e-06, - "loss": 0.3917, - "step": 464 - }, - { - "epoch": 2.2682926829268295, - "grad_norm": 3.2973792552948, - "learning_rate": 4.394155579109994e-06, - "loss": 0.5203, - "step": 465 - }, - { - "epoch": 2.2731707317073173, - "grad_norm": 4.7184038162231445, - "learning_rate": 4.391652928276572e-06, - "loss": 0.729, - "step": 466 - }, - { - "epoch": 2.278048780487805, - "grad_norm": 3.1992053985595703, - "learning_rate": 4.389145834882195e-06, - "loss": 0.4822, - "step": 467 - }, - { - "epoch": 2.2829268292682925, - "grad_norm": 4.320055961608887, - "learning_rate": 4.386634304814789e-06, - "loss": 0.3962, - "step": 468 - }, - { - "epoch": 2.2878048780487803, - "grad_norm": 3.704524517059326, - "learning_rate": 4.384118343972704e-06, - "loss": 0.5996, - "step": 469 - }, - { - "epoch": 2.292682926829268, - "grad_norm": 2.8172974586486816, - "learning_rate": 4.381597958264692e-06, - "loss": 0.6328, - "step": 470 - }, - { - "epoch": 2.297560975609756, - "grad_norm": 2.7418763637542725, - "learning_rate": 4.379073153609896e-06, - "loss": 0.6254, - "step": 471 - }, - { - "epoch": 2.3024390243902437, - "grad_norm": 5.364504337310791, - "learning_rate": 4.37654393593784e-06, - "loss": 0.6793, - "step": 472 - }, - { - "epoch": 2.3073170731707315, - "grad_norm": 2.935291290283203, - "learning_rate": 4.3740103111884096e-06, - "loss": 0.4161, - "step": 473 - }, - { - "epoch": 2.3121951219512193, - "grad_norm": 3.085155963897705, - "learning_rate": 4.371472285311842e-06, - "loss": 0.3329, - "step": 474 - }, - { - "epoch": 2.317073170731707, - "grad_norm": 2.2218778133392334, - "learning_rate": 4.368929864268709e-06, - "loss": 0.2687, - "step": 475 - }, - { - "epoch": 2.321951219512195, - "grad_norm": 3.3985276222229004, - "learning_rate": 4.366383054029907e-06, - "loss": 0.5934, - "step": 476 - }, - { - "epoch": 2.3268292682926828, - "grad_norm": 3.0726048946380615, - "learning_rate": 4.363831860576638e-06, - "loss": 0.5033, - "step": 477 - }, - { - "epoch": 2.3317073170731706, - "grad_norm": 2.728628635406494, - "learning_rate": 4.361276289900396e-06, - "loss": 0.4492, - "step": 478 - }, - { - "epoch": 2.3365853658536584, - "grad_norm": 3.1294424533843994, - "learning_rate": 4.358716348002962e-06, - "loss": 0.619, - "step": 479 - }, - { - "epoch": 2.341463414634146, - "grad_norm": 3.5564961433410645, - "learning_rate": 4.356152040896376e-06, - "loss": 0.4018, - "step": 480 - }, - { - "epoch": 2.346341463414634, - "grad_norm": 2.9329910278320312, - "learning_rate": 4.3535833746029335e-06, - "loss": 0.3062, - "step": 481 - }, - { - "epoch": 2.351219512195122, - "grad_norm": 3.744480848312378, - "learning_rate": 4.351010355155165e-06, - "loss": 0.3387, - "step": 482 - }, - { - "epoch": 2.3560975609756096, - "grad_norm": 2.537912130355835, - "learning_rate": 4.348432988595828e-06, - "loss": 0.3103, - "step": 483 - }, - { - "epoch": 2.3609756097560974, - "grad_norm": 3.232128858566284, - "learning_rate": 4.345851280977885e-06, - "loss": 0.6782, - "step": 484 - }, - { - "epoch": 2.3658536585365852, - "grad_norm": 3.601463794708252, - "learning_rate": 4.343265238364496e-06, - "loss": 0.3195, - "step": 485 - }, - { - "epoch": 2.370731707317073, - "grad_norm": 4.05529260635376, - "learning_rate": 4.340674866829001e-06, - "loss": 0.4639, - "step": 486 - }, - { - "epoch": 2.375609756097561, - "grad_norm": 4.128161430358887, - "learning_rate": 4.338080172454908e-06, - "loss": 0.7229, - "step": 487 - }, - { - "epoch": 2.3804878048780487, - "grad_norm": 2.665430784225464, - "learning_rate": 4.335481161335875e-06, - "loss": 0.4334, - "step": 488 - }, - { - "epoch": 2.3853658536585365, - "grad_norm": 3.777899742126465, - "learning_rate": 4.332877839575699e-06, - "loss": 0.3409, - "step": 489 - }, - { - "epoch": 2.3902439024390243, - "grad_norm": 2.9942116737365723, - "learning_rate": 4.330270213288301e-06, - "loss": 0.5221, - "step": 490 - }, - { - "epoch": 2.395121951219512, - "grad_norm": 3.518601417541504, - "learning_rate": 4.32765828859771e-06, - "loss": 0.7078, - "step": 491 - }, - { - "epoch": 2.4, - "grad_norm": 3.452350378036499, - "learning_rate": 4.325042071638051e-06, - "loss": 0.5902, - "step": 492 - }, - { - "epoch": 2.4048780487804877, - "grad_norm": 3.072655200958252, - "learning_rate": 4.322421568553529e-06, - "loss": 0.3746, - "step": 493 - }, - { - "epoch": 2.4097560975609755, - "grad_norm": 2.8621394634246826, - "learning_rate": 4.319796785498416e-06, - "loss": 0.3474, - "step": 494 - }, - { - "epoch": 2.4146341463414633, - "grad_norm": 3.3891537189483643, - "learning_rate": 4.317167728637032e-06, - "loss": 0.5171, - "step": 495 - }, - { - "epoch": 2.419512195121951, - "grad_norm": 2.505720376968384, - "learning_rate": 4.314534404143738e-06, - "loss": 0.4263, - "step": 496 - }, - { - "epoch": 2.424390243902439, - "grad_norm": 2.6280455589294434, - "learning_rate": 4.3118968182029155e-06, - "loss": 0.5072, - "step": 497 - }, - { - "epoch": 2.4292682926829268, - "grad_norm": 2.703711748123169, - "learning_rate": 4.3092549770089566e-06, - "loss": 0.2742, - "step": 498 - }, - { - "epoch": 2.4341463414634146, - "grad_norm": 3.0358169078826904, - "learning_rate": 4.306608886766243e-06, - "loss": 0.4814, - "step": 499 - }, - { - "epoch": 2.4390243902439024, - "grad_norm": 3.263326406478882, - "learning_rate": 4.303958553689137e-06, - "loss": 0.4188, - "step": 500 - }, - { - "epoch": 2.44390243902439, - "grad_norm": 2.833951950073242, - "learning_rate": 4.3013039840019675e-06, - "loss": 0.6436, - "step": 501 - }, - { - "epoch": 2.448780487804878, - "grad_norm": 3.6790921688079834, - "learning_rate": 4.2986451839390105e-06, - "loss": 0.2862, - "step": 502 - }, - { - "epoch": 2.453658536585366, - "grad_norm": 2.7376418113708496, - "learning_rate": 4.295982159744476e-06, - "loss": 0.4926, - "step": 503 - }, - { - "epoch": 2.4585365853658536, - "grad_norm": 3.575244665145874, - "learning_rate": 4.293314917672498e-06, - "loss": 0.5717, - "step": 504 - }, - { - "epoch": 2.4634146341463414, - "grad_norm": 2.8722269535064697, - "learning_rate": 4.290643463987114e-06, - "loss": 0.2707, - "step": 505 - }, - { - "epoch": 2.4682926829268292, - "grad_norm": 2.8118090629577637, - "learning_rate": 4.287967804962252e-06, - "loss": 0.347, - "step": 506 - }, - { - "epoch": 2.473170731707317, - "grad_norm": 3.345698356628418, - "learning_rate": 4.285287946881718e-06, - "loss": 0.2103, - "step": 507 - }, - { - "epoch": 2.478048780487805, - "grad_norm": 3.0156590938568115, - "learning_rate": 4.282603896039178e-06, - "loss": 0.6405, - "step": 508 - }, - { - "epoch": 2.4829268292682927, - "grad_norm": 3.102205753326416, - "learning_rate": 4.279915658738145e-06, - "loss": 0.4027, - "step": 509 - }, - { - "epoch": 2.4878048780487805, - "grad_norm": 2.8665261268615723, - "learning_rate": 4.277223241291966e-06, - "loss": 0.6503, - "step": 510 - }, - { - "epoch": 2.4926829268292683, - "grad_norm": 2.5396728515625, - "learning_rate": 4.274526650023801e-06, - "loss": 0.5006, - "step": 511 - }, - { - "epoch": 2.497560975609756, - "grad_norm": 3.4846577644348145, - "learning_rate": 4.271825891266617e-06, - "loss": 0.479, - "step": 512 - }, - { - "epoch": 2.502439024390244, - "grad_norm": 4.5995612144470215, - "learning_rate": 4.269120971363164e-06, - "loss": 0.6667, - "step": 513 - }, - { - "epoch": 2.5073170731707317, - "grad_norm": 3.2117559909820557, - "learning_rate": 4.266411896665967e-06, - "loss": 0.2977, - "step": 514 - }, - { - "epoch": 2.5121951219512195, - "grad_norm": 2.798161268234253, - "learning_rate": 4.263698673537309e-06, - "loss": 0.3912, - "step": 515 - }, - { - "epoch": 2.5170731707317073, - "grad_norm": 3.593287944793701, - "learning_rate": 4.260981308349214e-06, - "loss": 0.615, - "step": 516 - }, - { - "epoch": 2.521951219512195, - "grad_norm": 3.06075119972229, - "learning_rate": 4.258259807483434e-06, - "loss": 0.4559, - "step": 517 - }, - { - "epoch": 2.526829268292683, - "grad_norm": 2.893202543258667, - "learning_rate": 4.255534177331435e-06, - "loss": 0.4993, - "step": 518 - }, - { - "epoch": 2.5317073170731708, - "grad_norm": 3.613308906555176, - "learning_rate": 4.252804424294378e-06, - "loss": 0.4581, - "step": 519 - }, - { - "epoch": 2.5365853658536586, - "grad_norm": 3.1191842555999756, - "learning_rate": 4.25007055478311e-06, - "loss": 0.5403, - "step": 520 - }, - { - "epoch": 2.5414634146341464, - "grad_norm": 3.653355836868286, - "learning_rate": 4.247332575218144e-06, - "loss": 0.3658, - "step": 521 - }, - { - "epoch": 2.546341463414634, - "grad_norm": 3.1386306285858154, - "learning_rate": 4.244590492029643e-06, - "loss": 0.6342, - "step": 522 - }, - { - "epoch": 2.551219512195122, - "grad_norm": 3.0894742012023926, - "learning_rate": 4.241844311657411e-06, - "loss": 0.3411, - "step": 523 - }, - { - "epoch": 2.55609756097561, - "grad_norm": 3.205916404724121, - "learning_rate": 4.239094040550875e-06, - "loss": 0.2829, - "step": 524 - }, - { - "epoch": 2.5609756097560976, - "grad_norm": 2.378857374191284, - "learning_rate": 4.236339685169065e-06, - "loss": 0.4749, - "step": 525 - }, - { - "epoch": 2.5658536585365854, - "grad_norm": 3.8657875061035156, - "learning_rate": 4.233581251980604e-06, - "loss": 0.2485, - "step": 526 - }, - { - "epoch": 2.5707317073170732, - "grad_norm": 3.565807580947876, - "learning_rate": 4.230818747463696e-06, - "loss": 0.4488, - "step": 527 - }, - { - "epoch": 2.575609756097561, - "grad_norm": 2.6909685134887695, - "learning_rate": 4.228052178106101e-06, - "loss": 0.4495, - "step": 528 - }, - { - "epoch": 2.580487804878049, - "grad_norm": 2.937680244445801, - "learning_rate": 4.2252815504051285e-06, - "loss": 0.2396, - "step": 529 - }, - { - "epoch": 2.5853658536585367, - "grad_norm": 5.55731201171875, - "learning_rate": 4.222506870867618e-06, - "loss": 0.6784, - "step": 530 - }, - { - "epoch": 2.5902439024390245, - "grad_norm": 2.7388782501220703, - "learning_rate": 4.2197281460099245e-06, - "loss": 0.5543, - "step": 531 - }, - { - "epoch": 2.5951219512195123, - "grad_norm": 3.311134099960327, - "learning_rate": 4.216945382357905e-06, - "loss": 0.5281, - "step": 532 - }, - { - "epoch": 2.6, - "grad_norm": 3.511232376098633, - "learning_rate": 4.214158586446901e-06, - "loss": 0.8019, - "step": 533 - }, - { - "epoch": 2.604878048780488, - "grad_norm": 4.416641712188721, - "learning_rate": 4.211367764821722e-06, - "loss": 0.7769, - "step": 534 - }, - { - "epoch": 2.6097560975609757, - "grad_norm": 2.9849908351898193, - "learning_rate": 4.208572924036634e-06, - "loss": 0.4077, - "step": 535 - }, - { - "epoch": 2.6146341463414635, - "grad_norm": 2.8512160778045654, - "learning_rate": 4.2057740706553415e-06, - "loss": 0.433, - "step": 536 - }, - { - "epoch": 2.6195121951219513, - "grad_norm": 2.6729629039764404, - "learning_rate": 4.202971211250971e-06, - "loss": 0.5957, - "step": 537 - }, - { - "epoch": 2.624390243902439, - "grad_norm": 2.4570281505584717, - "learning_rate": 4.200164352406061e-06, - "loss": 0.3013, - "step": 538 - }, - { - "epoch": 2.629268292682927, - "grad_norm": 3.3771679401397705, - "learning_rate": 4.197353500712539e-06, - "loss": 0.5646, - "step": 539 - }, - { - "epoch": 2.6341463414634148, - "grad_norm": 3.163496494293213, - "learning_rate": 4.1945386627717115e-06, - "loss": 0.4529, - "step": 540 - }, - { - "epoch": 2.6390243902439026, - "grad_norm": 8.32056713104248, - "learning_rate": 4.191719845194246e-06, - "loss": 0.6076, - "step": 541 - }, - { - "epoch": 2.6439024390243904, - "grad_norm": 2.7657363414764404, - "learning_rate": 4.188897054600156e-06, - "loss": 0.4855, - "step": 542 - }, - { - "epoch": 2.648780487804878, - "grad_norm": 3.299283504486084, - "learning_rate": 4.186070297618787e-06, - "loss": 0.5836, - "step": 543 - }, - { - "epoch": 2.653658536585366, - "grad_norm": 2.3928205966949463, - "learning_rate": 4.183239580888799e-06, - "loss": 0.6266, - "step": 544 - }, - { - "epoch": 2.658536585365854, - "grad_norm": 3.395251750946045, - "learning_rate": 4.18040491105815e-06, - "loss": 0.429, - "step": 545 - }, - { - "epoch": 2.6634146341463416, - "grad_norm": 2.690936803817749, - "learning_rate": 4.177566294784085e-06, - "loss": 0.391, - "step": 546 - }, - { - "epoch": 2.6682926829268294, - "grad_norm": 3.7687628269195557, - "learning_rate": 4.174723738733114e-06, - "loss": 0.6548, - "step": 547 - }, - { - "epoch": 2.6731707317073172, - "grad_norm": 2.7884976863861084, - "learning_rate": 4.171877249581001e-06, - "loss": 0.5188, - "step": 548 - }, - { - "epoch": 2.678048780487805, - "grad_norm": 3.0811641216278076, - "learning_rate": 4.169026834012748e-06, - "loss": 0.3494, - "step": 549 - }, - { - "epoch": 2.682926829268293, - "grad_norm": 3.090078592300415, - "learning_rate": 4.166172498722577e-06, - "loss": 0.3621, - "step": 550 - }, - { - "epoch": 2.68780487804878, - "grad_norm": 3.925424098968506, - "learning_rate": 4.163314250413913e-06, - "loss": 0.7187, - "step": 551 - }, - { - "epoch": 2.692682926829268, - "grad_norm": 3.3590312004089355, - "learning_rate": 4.160452095799378e-06, - "loss": 0.428, - "step": 552 - }, - { - "epoch": 2.697560975609756, - "grad_norm": 3.08093523979187, - "learning_rate": 4.157586041600759e-06, - "loss": 0.202, - "step": 553 - }, - { - "epoch": 2.7024390243902436, - "grad_norm": 2.9391448497772217, - "learning_rate": 4.154716094549008e-06, - "loss": 0.5238, - "step": 554 - }, - { - "epoch": 2.7073170731707314, - "grad_norm": 2.9869461059570312, - "learning_rate": 4.151842261384217e-06, - "loss": 0.3073, - "step": 555 - }, - { - "epoch": 2.7121951219512193, - "grad_norm": 3.8973608016967773, - "learning_rate": 4.148964548855603e-06, - "loss": 0.8435, - "step": 556 - }, - { - "epoch": 2.717073170731707, - "grad_norm": 2.3596479892730713, - "learning_rate": 4.146082963721496e-06, - "loss": 0.2562, - "step": 557 - }, - { - "epoch": 2.721951219512195, - "grad_norm": 3.4964873790740967, - "learning_rate": 4.143197512749322e-06, - "loss": 1.0144, - "step": 558 - }, - { - "epoch": 2.7268292682926827, - "grad_norm": 2.8925280570983887, - "learning_rate": 4.140308202715581e-06, - "loss": 0.7581, - "step": 559 - }, - { - "epoch": 2.7317073170731705, - "grad_norm": 2.622724771499634, - "learning_rate": 4.13741504040584e-06, - "loss": 0.3114, - "step": 560 - }, - { - "epoch": 2.7365853658536583, - "grad_norm": 3.775834321975708, - "learning_rate": 4.134518032614713e-06, - "loss": 0.4384, - "step": 561 - }, - { - "epoch": 2.741463414634146, - "grad_norm": 2.691236972808838, - "learning_rate": 4.1316171861458445e-06, - "loss": 0.3141, - "step": 562 - }, - { - "epoch": 2.746341463414634, - "grad_norm": 3.059152841567993, - "learning_rate": 4.128712507811893e-06, - "loss": 0.5777, - "step": 563 - }, - { - "epoch": 2.7512195121951217, - "grad_norm": 2.867432117462158, - "learning_rate": 4.125804004434517e-06, - "loss": 0.5542, - "step": 564 - }, - { - "epoch": 2.7560975609756095, - "grad_norm": 2.796438694000244, - "learning_rate": 4.12289168284436e-06, - "loss": 0.3442, - "step": 565 - }, - { - "epoch": 2.7609756097560973, - "grad_norm": 3.052199125289917, - "learning_rate": 4.119975549881029e-06, - "loss": 0.4754, - "step": 566 - }, - { - "epoch": 2.765853658536585, - "grad_norm": 2.5463602542877197, - "learning_rate": 4.1170556123930846e-06, - "loss": 0.2988, - "step": 567 - }, - { - "epoch": 2.770731707317073, - "grad_norm": 3.003124475479126, - "learning_rate": 4.114131877238021e-06, - "loss": 0.4642, - "step": 568 - }, - { - "epoch": 2.7756097560975608, - "grad_norm": 2.4988298416137695, - "learning_rate": 4.111204351282254e-06, - "loss": 0.3493, - "step": 569 - }, - { - "epoch": 2.7804878048780486, - "grad_norm": 2.7403693199157715, - "learning_rate": 4.108273041401098e-06, - "loss": 0.4007, - "step": 570 - }, - { - "epoch": 2.7853658536585364, - "grad_norm": 4.101940155029297, - "learning_rate": 4.105337954478756e-06, - "loss": 0.7815, - "step": 571 - }, - { - "epoch": 2.790243902439024, - "grad_norm": 3.229969024658203, - "learning_rate": 4.102399097408304e-06, - "loss": 0.6099, - "step": 572 - }, - { - "epoch": 2.795121951219512, - "grad_norm": 3.234693765640259, - "learning_rate": 4.099456477091667e-06, - "loss": 0.2478, - "step": 573 - }, - { - "epoch": 2.8, - "grad_norm": 2.9824702739715576, - "learning_rate": 4.096510100439611e-06, - "loss": 0.6403, - "step": 574 - }, - { - "epoch": 2.8048780487804876, - "grad_norm": 2.8012478351593018, - "learning_rate": 4.093559974371725e-06, - "loss": 0.2509, - "step": 575 - }, - { - "epoch": 2.8097560975609754, - "grad_norm": 2.915400743484497, - "learning_rate": 4.0906061058164e-06, - "loss": 0.7552, - "step": 576 - }, - { - "epoch": 2.8146341463414632, - "grad_norm": 3.467665672302246, - "learning_rate": 4.087648501710819e-06, - "loss": 0.3146, - "step": 577 - }, - { - "epoch": 2.819512195121951, - "grad_norm": 3.1628401279449463, - "learning_rate": 4.084687169000938e-06, - "loss": 0.507, - "step": 578 - }, - { - "epoch": 2.824390243902439, - "grad_norm": 2.4069066047668457, - "learning_rate": 4.081722114641469e-06, - "loss": 0.4116, - "step": 579 - }, - { - "epoch": 2.8292682926829267, - "grad_norm": 3.698174238204956, - "learning_rate": 4.0787533455958626e-06, - "loss": 0.2264, - "step": 580 - }, - { - "epoch": 2.8341463414634145, - "grad_norm": 3.0896191596984863, - "learning_rate": 4.075780868836296e-06, - "loss": 0.3197, - "step": 581 - }, - { - "epoch": 2.8390243902439023, - "grad_norm": 3.098562240600586, - "learning_rate": 4.072804691343653e-06, - "loss": 0.4045, - "step": 582 - }, - { - "epoch": 2.84390243902439, - "grad_norm": 3.9232118129730225, - "learning_rate": 4.069824820107507e-06, - "loss": 0.9564, - "step": 583 - }, - { - "epoch": 2.848780487804878, - "grad_norm": 2.7176268100738525, - "learning_rate": 4.06684126212611e-06, - "loss": 0.2703, - "step": 584 - }, - { - "epoch": 2.8536585365853657, - "grad_norm": 2.4905827045440674, - "learning_rate": 4.063854024406369e-06, - "loss": 0.4828, - "step": 585 - }, - { - "epoch": 2.8585365853658535, - "grad_norm": 2.848784923553467, - "learning_rate": 4.060863113963835e-06, - "loss": 0.4131, - "step": 586 - }, - { - "epoch": 2.8634146341463413, - "grad_norm": 2.599665403366089, - "learning_rate": 4.057868537822683e-06, - "loss": 0.4464, - "step": 587 - }, - { - "epoch": 2.868292682926829, - "grad_norm": 3.1770827770233154, - "learning_rate": 4.054870303015695e-06, - "loss": 0.2825, - "step": 588 - }, - { - "epoch": 2.873170731707317, - "grad_norm": 3.18332839012146, - "learning_rate": 4.05186841658425e-06, - "loss": 0.4438, - "step": 589 - }, - { - "epoch": 2.8780487804878048, - "grad_norm": 2.7485718727111816, - "learning_rate": 4.048862885578301e-06, - "loss": 0.4817, - "step": 590 - }, - { - "epoch": 2.8829268292682926, - "grad_norm": 2.9712934494018555, - "learning_rate": 4.045853717056358e-06, - "loss": 0.5157, - "step": 591 - }, - { - "epoch": 2.8878048780487804, - "grad_norm": 2.246858835220337, - "learning_rate": 4.0428409180854775e-06, - "loss": 0.4029, - "step": 592 - }, - { - "epoch": 2.892682926829268, - "grad_norm": 2.683434247970581, - "learning_rate": 4.039824495741238e-06, - "loss": 0.3796, - "step": 593 - }, - { - "epoch": 2.897560975609756, - "grad_norm": 2.6297569274902344, - "learning_rate": 4.036804457107733e-06, - "loss": 0.4467, - "step": 594 - }, - { - "epoch": 2.902439024390244, - "grad_norm": 5.318776607513428, - "learning_rate": 4.0337808092775435e-06, - "loss": 0.7007, - "step": 595 - }, - { - "epoch": 2.9073170731707316, - "grad_norm": 3.069889783859253, - "learning_rate": 4.030753559351728e-06, - "loss": 0.3219, - "step": 596 - }, - { - "epoch": 2.9121951219512194, - "grad_norm": 1.9730123281478882, - "learning_rate": 4.027722714439808e-06, - "loss": 0.3038, - "step": 597 - }, - { - "epoch": 2.9170731707317072, - "grad_norm": 3.7959916591644287, - "learning_rate": 4.024688281659743e-06, - "loss": 0.7768, - "step": 598 - }, - { - "epoch": 2.921951219512195, - "grad_norm": 3.900886297225952, - "learning_rate": 4.021650268137924e-06, - "loss": 0.4667, - "step": 599 - }, - { - "epoch": 2.926829268292683, - "grad_norm": 2.6155691146850586, - "learning_rate": 4.018608681009143e-06, - "loss": 0.3852, - "step": 600 - }, - { - "epoch": 2.9317073170731707, - "grad_norm": 3.2715704441070557, - "learning_rate": 4.015563527416596e-06, - "loss": 0.4804, - "step": 601 - }, - { - "epoch": 2.9365853658536585, - "grad_norm": 3.001425266265869, - "learning_rate": 4.012514814511844e-06, - "loss": 0.4152, - "step": 602 - }, - { - "epoch": 2.9414634146341463, - "grad_norm": 2.685360908508301, - "learning_rate": 4.009462549454816e-06, - "loss": 0.5029, - "step": 603 - }, - { - "epoch": 2.946341463414634, - "grad_norm": 3.4670183658599854, - "learning_rate": 4.006406739413775e-06, - "loss": 0.4857, - "step": 604 - }, - { - "epoch": 2.951219512195122, - "grad_norm": 3.0613298416137695, - "learning_rate": 4.003347391565317e-06, - "loss": 0.4449, - "step": 605 - }, - { - "epoch": 2.9560975609756097, - "grad_norm": 3.207186698913574, - "learning_rate": 4.000284513094342e-06, - "loss": 0.4808, - "step": 606 - }, - { - "epoch": 2.9609756097560975, - "grad_norm": 2.910578727722168, - "learning_rate": 3.997218111194042e-06, - "loss": 0.4395, - "step": 607 - }, - { - "epoch": 2.9658536585365853, - "grad_norm": 2.581918954849243, - "learning_rate": 3.994148193065886e-06, - "loss": 0.3264, - "step": 608 - }, - { - "epoch": 2.970731707317073, - "grad_norm": 2.6517748832702637, - "learning_rate": 3.991074765919598e-06, - "loss": 0.3285, - "step": 609 - }, - { - "epoch": 2.975609756097561, - "grad_norm": 3.509756088256836, - "learning_rate": 3.987997836973147e-06, - "loss": 0.3638, - "step": 610 - }, - { - "epoch": 2.9804878048780488, - "grad_norm": 2.7382352352142334, - "learning_rate": 3.984917413452721e-06, - "loss": 0.3853, - "step": 611 - }, - { - "epoch": 2.9853658536585366, - "grad_norm": 3.998974323272705, - "learning_rate": 3.981833502592717e-06, - "loss": 0.6411, - "step": 612 - }, - { - "epoch": 2.9902439024390244, - "grad_norm": 3.305126428604126, - "learning_rate": 3.978746111635725e-06, - "loss": 0.2759, - "step": 613 - }, - { - "epoch": 2.995121951219512, - "grad_norm": 3.137300968170166, - "learning_rate": 3.9756552478325045e-06, - "loss": 0.4566, - "step": 614 - }, - { - "epoch": 3.0, - "grad_norm": 2.617291212081909, - "learning_rate": 3.972560918441972e-06, - "loss": 0.2221, - "step": 615 - }, - { - "epoch": 3.004878048780488, - "grad_norm": 2.787429094314575, - "learning_rate": 3.969463130731183e-06, - "loss": 0.2403, - "step": 616 - }, - { - "epoch": 3.0097560975609756, - "grad_norm": 3.0412075519561768, - "learning_rate": 3.966361891975316e-06, - "loss": 0.2635, - "step": 617 - }, - { - "epoch": 3.0146341463414634, - "grad_norm": 2.9949851036071777, - "learning_rate": 3.963257209457652e-06, - "loss": 0.3294, - "step": 618 - }, - { - "epoch": 3.0195121951219512, - "grad_norm": 3.0510809421539307, - "learning_rate": 3.960149090469561e-06, - "loss": 0.1338, - "step": 619 - }, - { - "epoch": 3.024390243902439, - "grad_norm": 3.669482707977295, - "learning_rate": 3.957037542310484e-06, - "loss": 0.1469, - "step": 620 - }, - { - "epoch": 3.029268292682927, - "grad_norm": 4.677116870880127, - "learning_rate": 3.953922572287915e-06, - "loss": 0.2788, - "step": 621 - }, - { - "epoch": 3.0341463414634147, - "grad_norm": 4.33144474029541, - "learning_rate": 3.950804187717384e-06, - "loss": 0.4521, - "step": 622 - }, - { - "epoch": 3.0390243902439025, - "grad_norm": 3.466639757156372, - "learning_rate": 3.947682395922439e-06, - "loss": 0.5113, - "step": 623 - }, - { - "epoch": 3.0439024390243903, - "grad_norm": 3.2332122325897217, - "learning_rate": 3.9445572042346346e-06, - "loss": 0.0968, - "step": 624 - }, - { - "epoch": 3.048780487804878, - "grad_norm": 2.6108055114746094, - "learning_rate": 3.941428619993505e-06, - "loss": 0.2462, - "step": 625 - }, - { - "epoch": 3.053658536585366, - "grad_norm": 3.2512595653533936, - "learning_rate": 3.938296650546552e-06, - "loss": 0.1782, - "step": 626 - }, - { - "epoch": 3.0585365853658537, - "grad_norm": 3.4350366592407227, - "learning_rate": 3.935161303249231e-06, - "loss": 0.2955, - "step": 627 - }, - { - "epoch": 3.0634146341463415, - "grad_norm": 3.42012619972229, - "learning_rate": 3.932022585464928e-06, - "loss": 0.3259, - "step": 628 - }, - { - "epoch": 3.0682926829268293, - "grad_norm": 3.458043336868286, - "learning_rate": 3.928880504564943e-06, - "loss": 0.2306, - "step": 629 - }, - { - "epoch": 3.073170731707317, - "grad_norm": 2.646616220474243, - "learning_rate": 3.92573506792848e-06, - "loss": 0.2197, - "step": 630 - }, - { - "epoch": 3.078048780487805, - "grad_norm": 3.5558857917785645, - "learning_rate": 3.9225862829426184e-06, - "loss": 0.1607, - "step": 631 - }, - { - "epoch": 3.0829268292682928, - "grad_norm": 3.6011338233947754, - "learning_rate": 3.919434157002303e-06, - "loss": 0.3087, - "step": 632 - }, - { - "epoch": 3.0878048780487806, - "grad_norm": 2.339879035949707, - "learning_rate": 3.916278697510325e-06, - "loss": 0.2213, - "step": 633 - }, - { - "epoch": 3.0926829268292684, - "grad_norm": 3.268162488937378, - "learning_rate": 3.913119911877305e-06, - "loss": 0.318, - "step": 634 - }, - { - "epoch": 3.097560975609756, - "grad_norm": 4.062571048736572, - "learning_rate": 3.909957807521674e-06, - "loss": 0.1757, - "step": 635 - }, - { - "epoch": 3.102439024390244, - "grad_norm": 2.997659683227539, - "learning_rate": 3.906792391869657e-06, - "loss": 0.2391, - "step": 636 - }, - { - "epoch": 3.107317073170732, - "grad_norm": 3.7037394046783447, - "learning_rate": 3.903623672355258e-06, - "loss": 0.2548, - "step": 637 - }, - { - "epoch": 3.1121951219512196, - "grad_norm": 3.110579252243042, - "learning_rate": 3.900451656420237e-06, - "loss": 0.2389, - "step": 638 - }, - { - "epoch": 3.1170731707317074, - "grad_norm": 3.3332321643829346, - "learning_rate": 3.897276351514097e-06, - "loss": 0.1371, - "step": 639 - }, - { - "epoch": 3.1219512195121952, - "grad_norm": 3.8275935649871826, - "learning_rate": 3.894097765094065e-06, - "loss": 0.3363, - "step": 640 - }, - { - "epoch": 3.126829268292683, - "grad_norm": 2.3731374740600586, - "learning_rate": 3.890915904625075e-06, - "loss": 0.1314, - "step": 641 - }, - { - "epoch": 3.131707317073171, - "grad_norm": 3.1511282920837402, - "learning_rate": 3.887730777579751e-06, - "loss": 0.3563, - "step": 642 - }, - { - "epoch": 3.1365853658536587, - "grad_norm": 4.2254862785339355, - "learning_rate": 3.884542391438387e-06, - "loss": 0.5053, - "step": 643 - }, - { - "epoch": 3.1414634146341465, - "grad_norm": 4.579670429229736, - "learning_rate": 3.88135075368893e-06, - "loss": 0.6259, - "step": 644 - }, - { - "epoch": 3.1463414634146343, - "grad_norm": 3.2102746963500977, - "learning_rate": 3.878155871826968e-06, - "loss": 0.2599, - "step": 645 - }, - { - "epoch": 3.151219512195122, - "grad_norm": 2.5569686889648438, - "learning_rate": 3.874957753355701e-06, - "loss": 0.2075, - "step": 646 - }, - { - "epoch": 3.15609756097561, - "grad_norm": 3.588925838470459, - "learning_rate": 3.8717564057859365e-06, - "loss": 0.4577, - "step": 647 - }, - { - "epoch": 3.1609756097560977, - "grad_norm": 3.6163878440856934, - "learning_rate": 3.868551836636063e-06, - "loss": 0.4023, - "step": 648 - }, - { - "epoch": 3.1658536585365855, - "grad_norm": 3.8688390254974365, - "learning_rate": 3.865344053432035e-06, - "loss": 0.1669, - "step": 649 - }, - { - "epoch": 3.1707317073170733, - "grad_norm": 3.419734001159668, - "learning_rate": 3.862133063707353e-06, - "loss": 0.2766, - "step": 650 - }, - { - "epoch": 3.175609756097561, - "grad_norm": 2.9860243797302246, - "learning_rate": 3.858918875003053e-06, - "loss": 0.1788, - "step": 651 - }, - { - "epoch": 3.180487804878049, - "grad_norm": 3.0619022846221924, - "learning_rate": 3.855701494867679e-06, - "loss": 0.224, - "step": 652 - }, - { - "epoch": 3.1853658536585368, - "grad_norm": 3.3668978214263916, - "learning_rate": 3.852480930857275e-06, - "loss": 0.4029, - "step": 653 - }, - { - "epoch": 3.1902439024390246, - "grad_norm": 3.543147563934326, - "learning_rate": 3.849257190535356e-06, - "loss": 0.2096, - "step": 654 - }, - { - "epoch": 3.1951219512195124, - "grad_norm": 3.793619155883789, - "learning_rate": 3.846030281472902e-06, - "loss": 0.5574, - "step": 655 - }, - { - "epoch": 3.2, - "grad_norm": 3.021289110183716, - "learning_rate": 3.842800211248333e-06, - "loss": 0.2233, - "step": 656 - }, - { - "epoch": 3.204878048780488, - "grad_norm": 4.582934856414795, - "learning_rate": 3.839566987447492e-06, - "loss": 0.3871, - "step": 657 - }, - { - "epoch": 3.209756097560976, - "grad_norm": 2.996340274810791, - "learning_rate": 3.8363306176636296e-06, - "loss": 0.4325, - "step": 658 - }, - { - "epoch": 3.2146341463414636, - "grad_norm": 3.3190877437591553, - "learning_rate": 3.833091109497384e-06, - "loss": 0.5321, - "step": 659 - }, - { - "epoch": 3.2195121951219514, - "grad_norm": 3.2532856464385986, - "learning_rate": 3.829848470556765e-06, - "loss": 0.1359, - "step": 660 - }, - { - "epoch": 3.2243902439024392, - "grad_norm": 2.7875044345855713, - "learning_rate": 3.8266027084571335e-06, - "loss": 0.3145, - "step": 661 - }, - { - "epoch": 3.229268292682927, - "grad_norm": 3.748253583908081, - "learning_rate": 3.823353830821187e-06, - "loss": 0.1252, - "step": 662 - }, - { - "epoch": 3.234146341463415, - "grad_norm": 2.858293294906616, - "learning_rate": 3.820101845278937e-06, - "loss": 0.2589, - "step": 663 - }, - { - "epoch": 3.2390243902439027, - "grad_norm": 3.7470967769622803, - "learning_rate": 3.816846759467696e-06, - "loss": 0.2594, - "step": 664 - }, - { - "epoch": 3.2439024390243905, - "grad_norm": 3.676196813583374, - "learning_rate": 3.8135885810320587e-06, - "loss": 0.2998, - "step": 665 - }, - { - "epoch": 3.2487804878048783, - "grad_norm": 3.0943140983581543, - "learning_rate": 3.810327317623881e-06, - "loss": 0.2238, - "step": 666 - }, - { - "epoch": 3.253658536585366, - "grad_norm": 3.5907349586486816, - "learning_rate": 3.8070629769022628e-06, - "loss": 0.3381, - "step": 667 - }, - { - "epoch": 3.258536585365854, - "grad_norm": 3.1195285320281982, - "learning_rate": 3.8037955665335335e-06, - "loss": 0.2407, - "step": 668 - }, - { - "epoch": 3.2634146341463417, - "grad_norm": 3.422292947769165, - "learning_rate": 3.800525094191231e-06, - "loss": 0.2957, - "step": 669 - }, - { - "epoch": 3.2682926829268295, - "grad_norm": 2.5264663696289062, - "learning_rate": 3.797251567556083e-06, - "loss": 0.2493, - "step": 670 - }, - { - "epoch": 3.2731707317073173, - "grad_norm": 3.350219964981079, - "learning_rate": 3.793974994315991e-06, - "loss": 0.1186, - "step": 671 - }, - { - "epoch": 3.278048780487805, - "grad_norm": 4.175906181335449, - "learning_rate": 3.790695382166013e-06, - "loss": 0.3453, - "step": 672 - }, - { - "epoch": 3.2829268292682925, - "grad_norm": 3.006072521209717, - "learning_rate": 3.7874127388083415e-06, - "loss": 0.1981, - "step": 673 - }, - { - "epoch": 3.2878048780487803, - "grad_norm": 3.368561029434204, - "learning_rate": 3.7841270719522895e-06, - "loss": 0.2934, - "step": 674 - }, - { - "epoch": 3.292682926829268, - "grad_norm": 4.374331951141357, - "learning_rate": 3.7808383893142692e-06, - "loss": 0.1359, - "step": 675 - }, - { - "epoch": 3.297560975609756, - "grad_norm": 3.297102451324463, - "learning_rate": 3.7775466986177763e-06, - "loss": 0.2498, - "step": 676 - }, - { - "epoch": 3.3024390243902437, - "grad_norm": 2.8914761543273926, - "learning_rate": 3.774252007593371e-06, - "loss": 0.1308, - "step": 677 - }, - { - "epoch": 3.3073170731707315, - "grad_norm": 3.1550722122192383, - "learning_rate": 3.7709543239786593e-06, - "loss": 0.3915, - "step": 678 - }, - { - "epoch": 3.3121951219512193, - "grad_norm": 3.2302658557891846, - "learning_rate": 3.767653655518277e-06, - "loss": 0.2558, - "step": 679 - }, - { - "epoch": 3.317073170731707, - "grad_norm": 4.4321770668029785, - "learning_rate": 3.7643500099638673e-06, - "loss": 0.1988, - "step": 680 - }, - { - "epoch": 3.321951219512195, - "grad_norm": 2.970566749572754, - "learning_rate": 3.7610433950740667e-06, - "loss": 0.4908, - "step": 681 - }, - { - "epoch": 3.3268292682926828, - "grad_norm": 3.5516228675842285, - "learning_rate": 3.757733818614485e-06, - "loss": 0.304, - "step": 682 - }, - { - "epoch": 3.3317073170731706, - "grad_norm": 2.7555387020111084, - "learning_rate": 3.7544212883576856e-06, - "loss": 0.2533, - "step": 683 - }, - { - "epoch": 3.3365853658536584, - "grad_norm": 3.61226749420166, - "learning_rate": 3.751105812083172e-06, - "loss": 0.1771, - "step": 684 - }, - { - "epoch": 3.341463414634146, - "grad_norm": 3.0466206073760986, - "learning_rate": 3.7477873975773655e-06, - "loss": 0.4213, - "step": 685 - }, - { - "epoch": 3.346341463414634, - "grad_norm": 3.6091527938842773, - "learning_rate": 3.7444660526335853e-06, - "loss": 0.3808, - "step": 686 - }, - { - "epoch": 3.351219512195122, - "grad_norm": 3.8443002700805664, - "learning_rate": 3.741141785052036e-06, - "loss": 0.6438, - "step": 687 - }, - { - "epoch": 3.3560975609756096, - "grad_norm": 3.845909833908081, - "learning_rate": 3.737814602639784e-06, - "loss": 0.3686, - "step": 688 - }, - { - "epoch": 3.3609756097560974, - "grad_norm": 2.904892921447754, - "learning_rate": 3.7344845132107427e-06, - "loss": 0.2934, - "step": 689 - }, - { - "epoch": 3.3658536585365852, - "grad_norm": 3.4766387939453125, - "learning_rate": 3.731151524585651e-06, - "loss": 0.3299, - "step": 690 - }, - { - "epoch": 3.370731707317073, - "grad_norm": 4.236767768859863, - "learning_rate": 3.7278156445920584e-06, - "loss": 0.6303, - "step": 691 - }, - { - "epoch": 3.375609756097561, - "grad_norm": 3.1122591495513916, - "learning_rate": 3.724476881064303e-06, - "loss": 0.2432, - "step": 692 - }, - { - "epoch": 3.3804878048780487, - "grad_norm": 3.0971457958221436, - "learning_rate": 3.721135241843496e-06, - "loss": 0.3131, - "step": 693 - }, - { - "epoch": 3.3853658536585365, - "grad_norm": 3.9365804195404053, - "learning_rate": 3.7177907347775016e-06, - "loss": 0.3372, - "step": 694 - }, - { - "epoch": 3.3902439024390243, - "grad_norm": 3.760373115539551, - "learning_rate": 3.71444336772092e-06, - "loss": 0.5055, - "step": 695 - }, - { - "epoch": 3.395121951219512, - "grad_norm": 4.360848426818848, - "learning_rate": 3.711093148535068e-06, - "loss": 0.6183, - "step": 696 - }, - { - "epoch": 3.4, - "grad_norm": 3.7713537216186523, - "learning_rate": 3.707740085087959e-06, - "loss": 0.1568, - "step": 697 - }, - { - "epoch": 3.4048780487804877, - "grad_norm": 3.8532230854034424, - "learning_rate": 3.7043841852542884e-06, - "loss": 0.2826, - "step": 698 - }, - { - "epoch": 3.4097560975609755, - "grad_norm": 3.0548605918884277, - "learning_rate": 3.701025456915411e-06, - "loss": 0.1918, - "step": 699 - }, - { - "epoch": 3.4146341463414633, - "grad_norm": 3.2431821823120117, - "learning_rate": 3.697663907959327e-06, - "loss": 0.2493, - "step": 700 - }, - { - "epoch": 3.419512195121951, - "grad_norm": 3.7301864624023438, - "learning_rate": 3.6942995462806574e-06, - "loss": 0.4913, - "step": 701 - }, - { - "epoch": 3.424390243902439, - "grad_norm": 2.5468900203704834, - "learning_rate": 3.6909323797806314e-06, - "loss": 0.1788, - "step": 702 - }, - { - "epoch": 3.4292682926829268, - "grad_norm": 3.3719515800476074, - "learning_rate": 3.6875624163670635e-06, - "loss": 0.4162, - "step": 703 - }, - { - "epoch": 3.4341463414634146, - "grad_norm": 3.528010368347168, - "learning_rate": 3.6841896639543394e-06, - "loss": 0.1924, - "step": 704 - }, - { - "epoch": 3.4390243902439024, - "grad_norm": 3.3636631965637207, - "learning_rate": 3.6808141304633924e-06, - "loss": 0.3177, - "step": 705 - }, - { - "epoch": 3.44390243902439, - "grad_norm": 3.418705463409424, - "learning_rate": 3.6774358238216878e-06, - "loss": 0.2301, - "step": 706 - }, - { - "epoch": 3.448780487804878, - "grad_norm": 4.720373630523682, - "learning_rate": 3.6740547519632048e-06, - "loss": 0.1894, - "step": 707 - }, - { - "epoch": 3.453658536585366, - "grad_norm": 2.9635703563690186, - "learning_rate": 3.670670922828414e-06, - "loss": 0.2642, - "step": 708 - }, - { - "epoch": 3.4585365853658536, - "grad_norm": 4.934754371643066, - "learning_rate": 3.667284344364264e-06, - "loss": 0.2275, - "step": 709 - }, - { - "epoch": 3.4634146341463414, - "grad_norm": 3.090585231781006, - "learning_rate": 3.6638950245241604e-06, - "loss": 0.4447, - "step": 710 - }, - { - "epoch": 3.4682926829268292, - "grad_norm": 4.360495090484619, - "learning_rate": 3.660502971267945e-06, - "loss": 0.2415, - "step": 711 - }, - { - "epoch": 3.473170731707317, - "grad_norm": 3.4893476963043213, - "learning_rate": 3.65710819256188e-06, - "loss": 0.0921, - "step": 712 - }, - { - "epoch": 3.478048780487805, - "grad_norm": 3.2423770427703857, - "learning_rate": 3.65371069637863e-06, - "loss": 0.2371, - "step": 713 - }, - { - "epoch": 3.4829268292682927, - "grad_norm": 3.0775890350341797, - "learning_rate": 3.650310490697238e-06, - "loss": 0.4026, - "step": 714 - }, - { - "epoch": 3.4878048780487805, - "grad_norm": 3.906625270843506, - "learning_rate": 3.646907583503114e-06, - "loss": 0.4312, - "step": 715 - }, - { - "epoch": 3.4926829268292683, - "grad_norm": 3.2140414714813232, - "learning_rate": 3.6435019827880093e-06, - "loss": 0.2309, - "step": 716 - }, - { - "epoch": 3.497560975609756, - "grad_norm": 3.048523426055908, - "learning_rate": 3.640093696550003e-06, - "loss": 0.296, - "step": 717 - }, - { - "epoch": 3.502439024390244, - "grad_norm": 2.9669039249420166, - "learning_rate": 3.6366827327934817e-06, - "loss": 0.2723, - "step": 718 - }, - { - "epoch": 3.5073170731707317, - "grad_norm": 3.6941726207733154, - "learning_rate": 3.6332690995291176e-06, - "loss": 0.3797, - "step": 719 - }, - { - "epoch": 3.5121951219512195, - "grad_norm": 5.135766506195068, - "learning_rate": 3.6298528047738545e-06, - "loss": 0.9868, - "step": 720 - }, - { - "epoch": 3.5170731707317073, - "grad_norm": 3.2021052837371826, - "learning_rate": 3.626433856550886e-06, - "loss": 0.4069, - "step": 721 - }, - { - "epoch": 3.521951219512195, - "grad_norm": 3.094444513320923, - "learning_rate": 3.623012262889637e-06, - "loss": 0.3368, - "step": 722 - }, - { - "epoch": 3.526829268292683, - "grad_norm": 3.609285354614258, - "learning_rate": 3.6195880318257465e-06, - "loss": 0.3972, - "step": 723 - }, - { - "epoch": 3.5317073170731708, - "grad_norm": 4.236501216888428, - "learning_rate": 3.616161171401046e-06, - "loss": 0.52, - "step": 724 - }, - { - "epoch": 3.5365853658536586, - "grad_norm": 3.504526376724243, - "learning_rate": 3.612731689663542e-06, - "loss": 0.23, - "step": 725 - }, - { - "epoch": 3.5414634146341464, - "grad_norm": 3.233591079711914, - "learning_rate": 3.6092995946673996e-06, - "loss": 0.4151, - "step": 726 - }, - { - "epoch": 3.546341463414634, - "grad_norm": 3.6701886653900146, - "learning_rate": 3.605864894472918e-06, - "loss": 0.2798, - "step": 727 - }, - { - "epoch": 3.551219512195122, - "grad_norm": 3.8713181018829346, - "learning_rate": 3.602427597146516e-06, - "loss": 0.4336, - "step": 728 - }, - { - "epoch": 3.55609756097561, - "grad_norm": 5.49612283706665, - "learning_rate": 3.5989877107607134e-06, - "loss": 0.4803, - "step": 729 - }, - { - "epoch": 3.5609756097560976, - "grad_norm": 3.771005392074585, - "learning_rate": 3.5955452433941075e-06, - "loss": 0.3698, - "step": 730 - }, - { - "epoch": 3.5658536585365854, - "grad_norm": 2.970822334289551, - "learning_rate": 3.5921002031313586e-06, - "loss": 0.2373, - "step": 731 - }, - { - "epoch": 3.5707317073170732, - "grad_norm": 3.517249584197998, - "learning_rate": 3.58865259806317e-06, - "loss": 0.1908, - "step": 732 - }, - { - "epoch": 3.575609756097561, - "grad_norm": 3.6825428009033203, - "learning_rate": 3.585202436286267e-06, - "loss": 0.3993, - "step": 733 - }, - { - "epoch": 3.580487804878049, - "grad_norm": 3.387479066848755, - "learning_rate": 3.581749725903381e-06, - "loss": 0.4237, - "step": 734 - }, - { - "epoch": 3.5853658536585367, - "grad_norm": 3.5004806518554688, - "learning_rate": 3.5782944750232274e-06, - "loss": 0.3011, - "step": 735 - }, - { - "epoch": 3.5902439024390245, - "grad_norm": 3.461731433868408, - "learning_rate": 3.574836691760489e-06, - "loss": 0.0896, - "step": 736 - }, - { - "epoch": 3.5951219512195123, - "grad_norm": 3.9598381519317627, - "learning_rate": 3.571376384235795e-06, - "loss": 0.2751, - "step": 737 - }, - { - "epoch": 3.6, - "grad_norm": 4.053933143615723, - "learning_rate": 3.5679135605757035e-06, - "loss": 0.2086, - "step": 738 - }, - { - "epoch": 3.604878048780488, - "grad_norm": 2.9683544635772705, - "learning_rate": 3.564448228912682e-06, - "loss": 0.1659, - "step": 739 - }, - { - "epoch": 3.6097560975609757, - "grad_norm": 3.6598448753356934, - "learning_rate": 3.5609803973850877e-06, - "loss": 0.2469, - "step": 740 - }, - { - "epoch": 3.6146341463414635, - "grad_norm": 3.449335813522339, - "learning_rate": 3.557510074137147e-06, - "loss": 0.375, - "step": 741 - }, - { - "epoch": 3.6195121951219513, - "grad_norm": 2.7666923999786377, - "learning_rate": 3.554037267318942e-06, - "loss": 0.3133, - "step": 742 - }, - { - "epoch": 3.624390243902439, - "grad_norm": 2.8951869010925293, - "learning_rate": 3.5505619850863847e-06, - "loss": 0.2243, - "step": 743 - }, - { - "epoch": 3.629268292682927, - "grad_norm": 3.477747678756714, - "learning_rate": 3.5470842356012007e-06, - "loss": 0.1321, - "step": 744 - }, - { - "epoch": 3.6341463414634148, - "grad_norm": 3.810480833053589, - "learning_rate": 3.5436040270309113e-06, - "loss": 0.361, - "step": 745 - }, - { - "epoch": 3.6390243902439026, - "grad_norm": 3.0730793476104736, - "learning_rate": 3.540121367548811e-06, - "loss": 0.1523, - "step": 746 - }, - { - "epoch": 3.6439024390243904, - "grad_norm": 3.6878390312194824, - "learning_rate": 3.5366362653339524e-06, - "loss": 0.4898, - "step": 747 - }, - { - "epoch": 3.648780487804878, - "grad_norm": 3.6432242393493652, - "learning_rate": 3.533148728571124e-06, - "loss": 0.1397, - "step": 748 - }, - { - "epoch": 3.653658536585366, - "grad_norm": 3.7047760486602783, - "learning_rate": 3.5296587654508317e-06, - "loss": 0.323, - "step": 749 - }, - { - "epoch": 3.658536585365854, - "grad_norm": 3.777132749557495, - "learning_rate": 3.526166384169279e-06, - "loss": 0.5577, - "step": 750 - }, - { - "epoch": 3.6634146341463416, - "grad_norm": 3.7970924377441406, - "learning_rate": 3.5226715929283507e-06, - "loss": 0.245, - "step": 751 - }, - { - "epoch": 3.6682926829268294, - "grad_norm": 2.8203537464141846, - "learning_rate": 3.519174399935588e-06, - "loss": 0.1619, - "step": 752 - }, - { - "epoch": 3.6731707317073172, - "grad_norm": 3.4040987491607666, - "learning_rate": 3.5156748134041767e-06, - "loss": 0.1047, - "step": 753 - }, - { - "epoch": 3.678048780487805, - "grad_norm": 3.927960157394409, - "learning_rate": 3.5121728415529203e-06, - "loss": 0.5713, - "step": 754 - }, - { - "epoch": 3.682926829268293, - "grad_norm": 3.3833277225494385, - "learning_rate": 3.5086684926062266e-06, - "loss": 0.2174, - "step": 755 - }, - { - "epoch": 3.68780487804878, - "grad_norm": 3.989307403564453, - "learning_rate": 3.505161774794085e-06, - "loss": 0.285, - "step": 756 - }, - { - "epoch": 3.692682926829268, - "grad_norm": 2.742429494857788, - "learning_rate": 3.5016526963520474e-06, - "loss": 0.1602, - "step": 757 - }, - { - "epoch": 3.697560975609756, - "grad_norm": 3.7082698345184326, - "learning_rate": 3.498141265521212e-06, - "loss": 0.666, - "step": 758 - }, - { - "epoch": 3.7024390243902436, - "grad_norm": 3.033196210861206, - "learning_rate": 3.4946274905481997e-06, - "loss": 0.2024, - "step": 759 - }, - { - "epoch": 3.7073170731707314, - "grad_norm": 3.7145371437072754, - "learning_rate": 3.4911113796851364e-06, - "loss": 0.2719, - "step": 760 - }, - { - "epoch": 3.7121951219512193, - "grad_norm": 3.580298900604248, - "learning_rate": 3.487592941189636e-06, - "loss": 0.1537, - "step": 761 - }, - { - "epoch": 3.717073170731707, - "grad_norm": 4.753757953643799, - "learning_rate": 3.484072183324776e-06, - "loss": 0.6149, - "step": 762 - }, - { - "epoch": 3.721951219512195, - "grad_norm": 3.5575687885284424, - "learning_rate": 3.4805491143590823e-06, - "loss": 0.4241, - "step": 763 - }, - { - "epoch": 3.7268292682926827, - "grad_norm": 3.215224266052246, - "learning_rate": 3.4770237425665103e-06, - "loss": 0.3037, - "step": 764 - }, - { - "epoch": 3.7317073170731705, - "grad_norm": 2.9899685382843018, - "learning_rate": 3.4734960762264204e-06, - "loss": 0.4854, - "step": 765 - }, - { - "epoch": 3.7365853658536583, - "grad_norm": 3.5880227088928223, - "learning_rate": 3.469966123623563e-06, - "loss": 0.3849, - "step": 766 - }, - { - "epoch": 3.741463414634146, - "grad_norm": 3.472750186920166, - "learning_rate": 3.46643389304806e-06, - "loss": 0.3159, - "step": 767 - }, - { - "epoch": 3.746341463414634, - "grad_norm": 4.355650901794434, - "learning_rate": 3.4628993927953786e-06, - "loss": 0.7527, - "step": 768 - }, - { - "epoch": 3.7512195121951217, - "grad_norm": 2.94575834274292, - "learning_rate": 3.45936263116632e-06, - "loss": 0.1716, - "step": 769 - }, - { - "epoch": 3.7560975609756095, - "grad_norm": 2.991525173187256, - "learning_rate": 3.4558236164669957e-06, - "loss": 0.2061, - "step": 770 - }, - { - "epoch": 3.7609756097560973, - "grad_norm": 3.134000301361084, - "learning_rate": 3.4522823570088073e-06, - "loss": 0.1338, - "step": 771 - }, - { - "epoch": 3.765853658536585, - "grad_norm": 3.722140312194824, - "learning_rate": 3.4487388611084295e-06, - "loss": 0.2615, - "step": 772 - }, - { - "epoch": 3.770731707317073, - "grad_norm": 3.7941153049468994, - "learning_rate": 3.445193137087788e-06, - "loss": 0.1401, - "step": 773 - }, - { - "epoch": 3.7756097560975608, - "grad_norm": 2.872941732406616, - "learning_rate": 3.4416451932740424e-06, - "loss": 0.2934, - "step": 774 - }, - { - "epoch": 3.7804878048780486, - "grad_norm": 4.5019941329956055, - "learning_rate": 3.4380950379995652e-06, - "loss": 0.4579, - "step": 775 - }, - { - "epoch": 3.7853658536585364, - "grad_norm": 2.682884931564331, - "learning_rate": 3.434542679601922e-06, - "loss": 0.2979, - "step": 776 - }, - { - "epoch": 3.790243902439024, - "grad_norm": 3.3044273853302, - "learning_rate": 3.4309881264238538e-06, - "loss": 0.1196, - "step": 777 - }, - { - "epoch": 3.795121951219512, - "grad_norm": 3.102760076522827, - "learning_rate": 3.4274313868132547e-06, - "loss": 0.2026, - "step": 778 - }, - { - "epoch": 3.8, - "grad_norm": 3.3304500579833984, - "learning_rate": 3.4238724691231534e-06, - "loss": 0.2135, - "step": 779 - }, - { - "epoch": 3.8048780487804876, - "grad_norm": 3.295119047164917, - "learning_rate": 3.4203113817116955e-06, - "loss": 0.4418, - "step": 780 - }, - { - "epoch": 3.8097560975609754, - "grad_norm": 3.6655640602111816, - "learning_rate": 3.4167481329421204e-06, - "loss": 0.203, - "step": 781 - }, - { - "epoch": 3.8146341463414632, - "grad_norm": 3.387830972671509, - "learning_rate": 3.4131827311827447e-06, - "loss": 0.3225, - "step": 782 - }, - { - "epoch": 3.819512195121951, - "grad_norm": 2.621633529663086, - "learning_rate": 3.4096151848069416e-06, - "loss": 0.1704, - "step": 783 - }, - { - "epoch": 3.824390243902439, - "grad_norm": 2.974344491958618, - "learning_rate": 3.4060455021931195e-06, - "loss": 0.2785, - "step": 784 - }, - { - "epoch": 3.8292682926829267, - "grad_norm": 3.452131748199463, - "learning_rate": 3.402473691724704e-06, - "loss": 0.223, - "step": 785 - }, - { - "epoch": 3.8341463414634145, - "grad_norm": 2.6373705863952637, - "learning_rate": 3.39889976179012e-06, - "loss": 0.2368, - "step": 786 - }, - { - "epoch": 3.8390243902439023, - "grad_norm": 2.863184928894043, - "learning_rate": 3.3953237207827673e-06, - "loss": 0.3294, - "step": 787 - }, - { - "epoch": 3.84390243902439, - "grad_norm": 5.104704856872559, - "learning_rate": 3.391745577101005e-06, - "loss": 0.5431, - "step": 788 - }, - { - "epoch": 3.848780487804878, - "grad_norm": 3.951310634613037, - "learning_rate": 3.3881653391481306e-06, - "loss": 0.2546, - "step": 789 - }, - { - "epoch": 3.8536585365853657, - "grad_norm": 3.9903225898742676, - "learning_rate": 3.384583015332359e-06, - "loss": 0.3293, - "step": 790 - }, - { - "epoch": 3.8585365853658535, - "grad_norm": 3.3149220943450928, - "learning_rate": 3.380998614066805e-06, - "loss": 0.1861, - "step": 791 - }, - { - "epoch": 3.8634146341463413, - "grad_norm": 3.6755223274230957, - "learning_rate": 3.3774121437694606e-06, - "loss": 0.2498, - "step": 792 - }, - { - "epoch": 3.868292682926829, - "grad_norm": 3.192918300628662, - "learning_rate": 3.3738236128631786e-06, - "loss": 0.1525, - "step": 793 - }, - { - "epoch": 3.873170731707317, - "grad_norm": 3.5358777046203613, - "learning_rate": 3.3702330297756503e-06, - "loss": 0.3622, - "step": 794 - }, - { - "epoch": 3.8780487804878048, - "grad_norm": 3.619878053665161, - "learning_rate": 3.366640402939387e-06, - "loss": 0.1051, - "step": 795 - }, - { - "epoch": 3.8829268292682926, - "grad_norm": 7.085352420806885, - "learning_rate": 3.363045740791698e-06, - "loss": 0.4606, - "step": 796 - }, - { - "epoch": 3.8878048780487804, - "grad_norm": 2.523165464401245, - "learning_rate": 3.3594490517746774e-06, - "loss": 0.2267, - "step": 797 - }, - { - "epoch": 3.892682926829268, - "grad_norm": 2.7026922702789307, - "learning_rate": 3.3558503443351733e-06, - "loss": 0.2792, - "step": 798 - }, - { - "epoch": 3.897560975609756, - "grad_norm": 2.9232428073883057, - "learning_rate": 3.352249626924777e-06, - "loss": 0.2579, - "step": 799 - }, - { - "epoch": 3.902439024390244, - "grad_norm": 4.760788440704346, - "learning_rate": 3.348646907999801e-06, - "loss": 0.6983, - "step": 800 - }, - { - "epoch": 3.9073170731707316, - "grad_norm": 3.198249578475952, - "learning_rate": 3.345042196021257e-06, - "loss": 0.3265, - "step": 801 - }, - { - "epoch": 3.9121951219512194, - "grad_norm": 4.069286823272705, - "learning_rate": 3.3414354994548385e-06, - "loss": 0.497, - "step": 802 - }, - { - "epoch": 3.9170731707317072, - "grad_norm": 3.4435410499572754, - "learning_rate": 3.337826826770898e-06, - "loss": 0.2812, - "step": 803 - }, - { - "epoch": 3.921951219512195, - "grad_norm": 3.9805212020874023, - "learning_rate": 3.3342161864444312e-06, - "loss": 0.2277, - "step": 804 - }, - { - "epoch": 3.926829268292683, - "grad_norm": 3.348925828933716, - "learning_rate": 3.3306035869550534e-06, - "loss": 0.1614, - "step": 805 - }, - { - "epoch": 3.9317073170731707, - "grad_norm": 4.7613701820373535, - "learning_rate": 3.326989036786981e-06, - "loss": 0.3269, - "step": 806 - }, - { - "epoch": 3.9365853658536585, - "grad_norm": 3.807502508163452, - "learning_rate": 3.3233725444290126e-06, - "loss": 0.2619, - "step": 807 - }, - { - "epoch": 3.9414634146341463, - "grad_norm": 3.2690203189849854, - "learning_rate": 3.3197541183745065e-06, - "loss": 0.4334, - "step": 808 - }, - { - "epoch": 3.946341463414634, - "grad_norm": 3.396993398666382, - "learning_rate": 3.3161337671213634e-06, - "loss": 0.2738, - "step": 809 - }, - { - "epoch": 3.951219512195122, - "grad_norm": 3.086669921875, - "learning_rate": 3.312511499172006e-06, - "loss": 0.1597, - "step": 810 - }, - { - "epoch": 3.9560975609756097, - "grad_norm": 3.5688745975494385, - "learning_rate": 3.3088873230333562e-06, - "loss": 0.3195, - "step": 811 - }, - { - "epoch": 3.9609756097560975, - "grad_norm": 3.4843621253967285, - "learning_rate": 3.3052612472168193e-06, - "loss": 0.1865, - "step": 812 - }, - { - "epoch": 3.9658536585365853, - "grad_norm": 2.8479580879211426, - "learning_rate": 3.3016332802382618e-06, - "loss": 0.3108, - "step": 813 - }, - { - "epoch": 3.970731707317073, - "grad_norm": 3.3241543769836426, - "learning_rate": 3.2980034306179897e-06, - "loss": 0.2099, - "step": 814 - }, - { - "epoch": 3.975609756097561, - "grad_norm": 2.817675828933716, - "learning_rate": 3.294371706880733e-06, - "loss": 0.3073, - "step": 815 - }, - { - "epoch": 3.9804878048780488, - "grad_norm": 2.9535388946533203, - "learning_rate": 3.290738117555622e-06, - "loss": 0.2024, - "step": 816 - }, - { - "epoch": 3.9853658536585366, - "grad_norm": 5.021281719207764, - "learning_rate": 3.2871026711761666e-06, - "loss": 0.508, - "step": 817 - }, - { - "epoch": 3.9902439024390244, - "grad_norm": 3.3377649784088135, - "learning_rate": 3.2834653762802414e-06, - "loss": 0.2116, - "step": 818 - }, - { - "epoch": 3.995121951219512, - "grad_norm": 4.412073135375977, - "learning_rate": 3.2798262414100594e-06, - "loss": 0.2177, - "step": 819 - }, - { - "epoch": 4.0, - "grad_norm": 3.174323797225952, - "learning_rate": 3.2761852751121566e-06, - "loss": 0.1737, - "step": 820 - }, - { - "epoch": 4.004878048780488, - "grad_norm": 2.921494960784912, - "learning_rate": 3.272542485937369e-06, - "loss": 0.2569, - "step": 821 - }, - { - "epoch": 4.009756097560976, - "grad_norm": 2.693495512008667, - "learning_rate": 3.2688978824408136e-06, - "loss": 0.1621, - "step": 822 - }, - { - "epoch": 4.014634146341463, - "grad_norm": 2.705796718597412, - "learning_rate": 3.2652514731818698e-06, - "loss": 0.1121, - "step": 823 - }, - { - "epoch": 4.019512195121951, - "grad_norm": 3.2621448040008545, - "learning_rate": 3.2616032667241564e-06, - "loss": 0.0835, - "step": 824 - }, - { - "epoch": 4.024390243902439, - "grad_norm": 3.6205084323883057, - "learning_rate": 3.257953271635513e-06, - "loss": 0.3731, - "step": 825 - }, - { - "epoch": 4.029268292682927, - "grad_norm": 3.2600371837615967, - "learning_rate": 3.2543014964879814e-06, - "loss": 0.1051, - "step": 826 - }, - { - "epoch": 4.034146341463415, - "grad_norm": 3.865178346633911, - "learning_rate": 3.250647949857781e-06, - "loss": 0.0916, - "step": 827 - }, - { - "epoch": 4.0390243902439025, - "grad_norm": 6.9700927734375, - "learning_rate": 3.2469926403252932e-06, - "loss": 0.4037, - "step": 828 - }, - { - "epoch": 4.04390243902439, - "grad_norm": 3.658712148666382, - "learning_rate": 3.2433355764750417e-06, - "loss": 0.0523, - "step": 829 - }, - { - "epoch": 4.048780487804878, - "grad_norm": 4.911301612854004, - "learning_rate": 3.2396767668956656e-06, - "loss": 0.2616, - "step": 830 - }, - { - "epoch": 4.053658536585366, - "grad_norm": 5.019360542297363, - "learning_rate": 3.2360162201799085e-06, - "loss": 0.195, - "step": 831 - }, - { - "epoch": 4.058536585365854, - "grad_norm": 3.493767261505127, - "learning_rate": 3.2323539449245906e-06, - "loss": 0.1245, - "step": 832 - }, - { - "epoch": 4.0634146341463415, - "grad_norm": 4.246248722076416, - "learning_rate": 3.2286899497305917e-06, - "loss": 0.1147, - "step": 833 - }, - { - "epoch": 4.068292682926829, - "grad_norm": 2.993704319000244, - "learning_rate": 3.2250242432028335e-06, - "loss": 0.2189, - "step": 834 - }, - { - "epoch": 4.073170731707317, - "grad_norm": 4.695023059844971, - "learning_rate": 3.221356833950254e-06, - "loss": 0.4685, - "step": 835 - }, - { - "epoch": 4.078048780487805, - "grad_norm": 2.777644634246826, - "learning_rate": 3.21768773058579e-06, - "loss": 0.1245, - "step": 836 - }, - { - "epoch": 4.082926829268293, - "grad_norm": 3.3545901775360107, - "learning_rate": 3.21401694172636e-06, - "loss": 0.1342, - "step": 837 - }, - { - "epoch": 4.087804878048781, - "grad_norm": 2.2222652435302734, - "learning_rate": 3.2103444759928383e-06, - "loss": 0.0484, - "step": 838 - }, - { - "epoch": 4.092682926829268, - "grad_norm": 2.580345630645752, - "learning_rate": 3.2066703420100377e-06, - "loss": 0.0592, - "step": 839 - }, - { - "epoch": 4.097560975609756, - "grad_norm": 3.8652923107147217, - "learning_rate": 3.2029945484066883e-06, - "loss": 0.2536, - "step": 840 - }, - { - "epoch": 4.102439024390244, - "grad_norm": 3.0441582202911377, - "learning_rate": 3.1993171038154203e-06, - "loss": 0.1221, - "step": 841 - }, - { - "epoch": 4.107317073170732, - "grad_norm": 2.2795114517211914, - "learning_rate": 3.1956380168727385e-06, - "loss": 0.1231, - "step": 842 - }, - { - "epoch": 4.11219512195122, - "grad_norm": 3.701009750366211, - "learning_rate": 3.191957296219007e-06, - "loss": 0.2144, - "step": 843 - }, - { - "epoch": 4.117073170731707, - "grad_norm": 3.452637195587158, - "learning_rate": 3.1882749504984247e-06, - "loss": 0.1026, - "step": 844 - }, - { - "epoch": 4.121951219512195, - "grad_norm": 2.4208810329437256, - "learning_rate": 3.1845909883590076e-06, - "loss": 0.1124, - "step": 845 - }, - { - "epoch": 4.126829268292683, - "grad_norm": 4.353063583374023, - "learning_rate": 3.180905418452569e-06, - "loss": 0.2804, - "step": 846 - }, - { - "epoch": 4.131707317073171, - "grad_norm": 3.1151084899902344, - "learning_rate": 3.1772182494346963e-06, - "loss": 0.1748, - "step": 847 - }, - { - "epoch": 4.136585365853659, - "grad_norm": 3.457940101623535, - "learning_rate": 3.1735294899647344e-06, - "loss": 0.1984, - "step": 848 - }, - { - "epoch": 4.1414634146341465, - "grad_norm": 3.3556935787200928, - "learning_rate": 3.169839148705762e-06, - "loss": 0.1332, - "step": 849 - }, - { - "epoch": 4.146341463414634, - "grad_norm": 3.5510823726654053, - "learning_rate": 3.1661472343245725e-06, - "loss": 0.4788, - "step": 850 - }, - { - "epoch": 4.151219512195122, - "grad_norm": 4.036712646484375, - "learning_rate": 3.162453755491655e-06, - "loss": 0.2437, - "step": 851 - }, - { - "epoch": 4.15609756097561, - "grad_norm": 4.417062282562256, - "learning_rate": 3.158758720881171e-06, - "loss": 0.203, - "step": 852 - }, - { - "epoch": 4.160975609756098, - "grad_norm": 3.920558214187622, - "learning_rate": 3.155062139170937e-06, - "loss": 0.1462, - "step": 853 - }, - { - "epoch": 4.1658536585365855, - "grad_norm": 6.472081661224365, - "learning_rate": 3.1513640190424034e-06, - "loss": 0.0972, - "step": 854 - }, - { - "epoch": 4.170731707317073, - "grad_norm": 3.975947141647339, - "learning_rate": 3.147664369180632e-06, - "loss": 0.1092, - "step": 855 - }, - { - "epoch": 4.175609756097561, - "grad_norm": 4.977376937866211, - "learning_rate": 3.143963198274278e-06, - "loss": 0.2215, - "step": 856 - }, - { - "epoch": 4.180487804878049, - "grad_norm": 3.595460891723633, - "learning_rate": 3.140260515015569e-06, - "loss": 0.1771, - "step": 857 - }, - { - "epoch": 4.185365853658537, - "grad_norm": 3.1085658073425293, - "learning_rate": 3.136556328100284e-06, - "loss": 0.1995, - "step": 858 - }, - { - "epoch": 4.190243902439025, - "grad_norm": 4.355626583099365, - "learning_rate": 3.132850646227734e-06, - "loss": 0.4048, - "step": 859 - }, - { - "epoch": 4.195121951219512, - "grad_norm": 3.8079614639282227, - "learning_rate": 3.12914347810074e-06, - "loss": 0.1914, - "step": 860 - }, - { - "epoch": 4.2, - "grad_norm": 3.725804328918457, - "learning_rate": 3.125434832425613e-06, - "loss": 0.1579, - "step": 861 - }, - { - "epoch": 4.204878048780488, - "grad_norm": 2.974649667739868, - "learning_rate": 3.121724717912138e-06, - "loss": 0.1814, - "step": 862 - }, - { - "epoch": 4.209756097560976, - "grad_norm": 3.6391279697418213, - "learning_rate": 3.118013143273542e-06, - "loss": 0.1481, - "step": 863 - }, - { - "epoch": 4.214634146341464, - "grad_norm": 3.216643810272217, - "learning_rate": 3.1143001172264893e-06, - "loss": 0.113, - "step": 864 - }, - { - "epoch": 4.219512195121951, - "grad_norm": 3.605855941772461, - "learning_rate": 3.1105856484910474e-06, - "loss": 0.1405, - "step": 865 - }, - { - "epoch": 4.224390243902439, - "grad_norm": 2.7186765670776367, - "learning_rate": 3.1068697457906736e-06, - "loss": 0.097, - "step": 866 - }, - { - "epoch": 4.229268292682927, - "grad_norm": 3.980973243713379, - "learning_rate": 3.1031524178521938e-06, - "loss": 0.2207, - "step": 867 - }, - { - "epoch": 4.234146341463415, - "grad_norm": 3.4623806476593018, - "learning_rate": 3.0994336734057804e-06, - "loss": 0.0552, - "step": 868 - }, - { - "epoch": 4.239024390243903, - "grad_norm": 3.7556748390197754, - "learning_rate": 3.0957135211849315e-06, - "loss": 0.1743, - "step": 869 - }, - { - "epoch": 4.2439024390243905, - "grad_norm": 3.3547914028167725, - "learning_rate": 3.0919919699264535e-06, - "loss": 0.1195, - "step": 870 - }, - { - "epoch": 4.248780487804878, - "grad_norm": 4.392014503479004, - "learning_rate": 3.0882690283704355e-06, - "loss": 0.6174, - "step": 871 - }, - { - "epoch": 4.253658536585366, - "grad_norm": 2.7031409740448, - "learning_rate": 3.084544705260234e-06, - "loss": 0.1359, - "step": 872 - }, - { - "epoch": 4.258536585365854, - "grad_norm": 2.3518481254577637, - "learning_rate": 3.080819009342451e-06, - "loss": 0.0786, - "step": 873 - }, - { - "epoch": 4.263414634146342, - "grad_norm": 2.636204481124878, - "learning_rate": 3.077091949366908e-06, - "loss": 0.0677, - "step": 874 - }, - { - "epoch": 4.2682926829268295, - "grad_norm": 2.8670942783355713, - "learning_rate": 3.073363534086636e-06, - "loss": 0.1084, - "step": 875 - }, - { - "epoch": 4.273170731707317, - "grad_norm": 2.7044737339019775, - "learning_rate": 3.0696337722578444e-06, - "loss": 0.0681, - "step": 876 - }, - { - "epoch": 4.278048780487805, - "grad_norm": 3.481539487838745, - "learning_rate": 3.0659026726399072e-06, - "loss": 0.2262, - "step": 877 - }, - { - "epoch": 4.282926829268293, - "grad_norm": 3.7746224403381348, - "learning_rate": 3.0621702439953393e-06, - "loss": 0.2169, - "step": 878 - }, - { - "epoch": 4.287804878048781, - "grad_norm": 3.6386263370513916, - "learning_rate": 3.0584364950897768e-06, - "loss": 0.0581, - "step": 879 - }, - { - "epoch": 4.2926829268292686, - "grad_norm": 3.389408588409424, - "learning_rate": 3.0547014346919574e-06, - "loss": 0.1687, - "step": 880 - }, - { - "epoch": 4.297560975609756, - "grad_norm": 3.6510157585144043, - "learning_rate": 3.0509650715736977e-06, - "loss": 0.1362, - "step": 881 - }, - { - "epoch": 4.302439024390244, - "grad_norm": 3.334210157394409, - "learning_rate": 3.0472274145098744e-06, - "loss": 0.1865, - "step": 882 - }, - { - "epoch": 4.307317073170732, - "grad_norm": 4.747341632843018, - "learning_rate": 3.0434884722784026e-06, - "loss": 0.2385, - "step": 883 - }, - { - "epoch": 4.31219512195122, - "grad_norm": 3.9266858100891113, - "learning_rate": 3.0397482536602168e-06, - "loss": 0.1004, - "step": 884 - }, - { - "epoch": 4.317073170731708, - "grad_norm": 2.984821081161499, - "learning_rate": 3.0360067674392475e-06, - "loss": 0.1469, - "step": 885 - }, - { - "epoch": 4.321951219512195, - "grad_norm": 2.6379380226135254, - "learning_rate": 3.0322640224024024e-06, - "loss": 0.0829, - "step": 886 - }, - { - "epoch": 4.326829268292683, - "grad_norm": 3.885495185852051, - "learning_rate": 3.0285200273395478e-06, - "loss": 0.2256, - "step": 887 - }, - { - "epoch": 4.331707317073171, - "grad_norm": 3.950394868850708, - "learning_rate": 3.024774791043481e-06, - "loss": 0.2402, - "step": 888 - }, - { - "epoch": 4.336585365853659, - "grad_norm": 4.147830963134766, - "learning_rate": 3.021028322309921e-06, - "loss": 0.2198, - "step": 889 - }, - { - "epoch": 4.341463414634147, - "grad_norm": 4.0821638107299805, - "learning_rate": 3.0172806299374734e-06, - "loss": 0.2304, - "step": 890 - }, - { - "epoch": 4.3463414634146345, - "grad_norm": 4.142312049865723, - "learning_rate": 3.0135317227276247e-06, - "loss": 0.2864, - "step": 891 - }, - { - "epoch": 4.351219512195122, - "grad_norm": 3.008504867553711, - "learning_rate": 3.0097816094847104e-06, - "loss": 0.2045, - "step": 892 - }, - { - "epoch": 4.35609756097561, - "grad_norm": 3.1674623489379883, - "learning_rate": 3.0060302990158984e-06, - "loss": 0.0864, - "step": 893 - }, - { - "epoch": 4.360975609756098, - "grad_norm": 3.3412492275238037, - "learning_rate": 3.002277800131171e-06, - "loss": 0.076, - "step": 894 - }, - { - "epoch": 4.365853658536586, - "grad_norm": 3.067330837249756, - "learning_rate": 2.998524121643298e-06, - "loss": 0.1724, - "step": 895 - }, - { - "epoch": 4.3707317073170735, - "grad_norm": 3.9015982151031494, - "learning_rate": 2.994769272367822e-06, - "loss": 0.2, - "step": 896 - }, - { - "epoch": 4.375609756097561, - "grad_norm": 3.0136911869049072, - "learning_rate": 2.991013261123035e-06, - "loss": 0.0852, - "step": 897 - }, - { - "epoch": 4.380487804878049, - "grad_norm": 3.6834237575531006, - "learning_rate": 2.9872560967299554e-06, - "loss": 0.1449, - "step": 898 - }, - { - "epoch": 4.385365853658537, - "grad_norm": 3.3486039638519287, - "learning_rate": 2.9834977880123132e-06, - "loss": 0.0659, - "step": 899 - }, - { - "epoch": 4.390243902439025, - "grad_norm": 2.971315622329712, - "learning_rate": 2.9797383437965243e-06, - "loss": 0.1114, - "step": 900 - }, - { - "epoch": 4.3951219512195125, - "grad_norm": 2.683359146118164, - "learning_rate": 2.975977772911671e-06, - "loss": 0.0822, - "step": 901 - }, - { - "epoch": 4.4, - "grad_norm": 2.9941935539245605, - "learning_rate": 2.972216084189482e-06, - "loss": 0.0858, - "step": 902 - }, - { - "epoch": 4.404878048780488, - "grad_norm": 2.4938626289367676, - "learning_rate": 2.9684532864643123e-06, - "loss": 0.1162, - "step": 903 - }, - { - "epoch": 4.409756097560976, - "grad_norm": 2.9364712238311768, - "learning_rate": 2.964689388573118e-06, - "loss": 0.0821, - "step": 904 - }, - { - "epoch": 4.414634146341464, - "grad_norm": 3.3638134002685547, - "learning_rate": 2.9609243993554434e-06, - "loss": 0.25, - "step": 905 - }, - { - "epoch": 4.419512195121952, - "grad_norm": 3.657277822494507, - "learning_rate": 2.9571583276533923e-06, - "loss": 0.0852, - "step": 906 - }, - { - "epoch": 4.424390243902439, - "grad_norm": 5.486263275146484, - "learning_rate": 2.9533911823116124e-06, - "loss": 0.5123, - "step": 907 - }, - { - "epoch": 4.429268292682927, - "grad_norm": 5.194574356079102, - "learning_rate": 2.9496229721772734e-06, - "loss": 0.1854, - "step": 908 - }, - { - "epoch": 4.434146341463415, - "grad_norm": 3.520110845565796, - "learning_rate": 2.9458537061000435e-06, - "loss": 0.1785, - "step": 909 - }, - { - "epoch": 4.439024390243903, - "grad_norm": 3.417991876602173, - "learning_rate": 2.9420833929320726e-06, - "loss": 0.1603, - "step": 910 - }, - { - "epoch": 4.443902439024391, - "grad_norm": 5.225805282592773, - "learning_rate": 2.93831204152797e-06, - "loss": 0.3046, - "step": 911 - }, - { - "epoch": 4.4487804878048784, - "grad_norm": 3.541433572769165, - "learning_rate": 2.9345396607447807e-06, - "loss": 0.0631, - "step": 912 - }, - { - "epoch": 4.453658536585366, - "grad_norm": 3.909377098083496, - "learning_rate": 2.9307662594419704e-06, - "loss": 0.125, - "step": 913 - }, - { - "epoch": 4.458536585365854, - "grad_norm": 3.6604416370391846, - "learning_rate": 2.9269918464814e-06, - "loss": 0.156, - "step": 914 - }, - { - "epoch": 4.463414634146342, - "grad_norm": 3.7413833141326904, - "learning_rate": 2.923216430727306e-06, - "loss": 0.3334, - "step": 915 - }, - { - "epoch": 4.46829268292683, - "grad_norm": 3.531996011734009, - "learning_rate": 2.9194400210462808e-06, - "loss": 0.2534, - "step": 916 - }, - { - "epoch": 4.473170731707317, - "grad_norm": 4.163621425628662, - "learning_rate": 2.91566262630725e-06, - "loss": 0.352, - "step": 917 - }, - { - "epoch": 4.478048780487805, - "grad_norm": 3.923635482788086, - "learning_rate": 2.9118842553814526e-06, - "loss": 0.1132, - "step": 918 - }, - { - "epoch": 4.482926829268292, - "grad_norm": 2.833768844604492, - "learning_rate": 2.9081049171424223e-06, - "loss": 0.086, - "step": 919 - }, - { - "epoch": 4.487804878048781, - "grad_norm": 2.9006292819976807, - "learning_rate": 2.9043246204659624e-06, - "loss": 0.0693, - "step": 920 - }, - { - "epoch": 4.492682926829268, - "grad_norm": 3.699376344680786, - "learning_rate": 2.9005433742301274e-06, - "loss": 0.2463, - "step": 921 - }, - { - "epoch": 4.4975609756097565, - "grad_norm": 4.882141590118408, - "learning_rate": 2.8967611873152037e-06, - "loss": 0.2275, - "step": 922 - }, - { - "epoch": 4.5024390243902435, - "grad_norm": 3.0554678440093994, - "learning_rate": 2.892978068603683e-06, - "loss": 0.0752, - "step": 923 - }, - { - "epoch": 4.507317073170732, - "grad_norm": 3.1225268840789795, - "learning_rate": 2.889194026980249e-06, - "loss": 0.1649, - "step": 924 - }, - { - "epoch": 4.512195121951219, - "grad_norm": 17.75234031677246, - "learning_rate": 2.8854090713317514e-06, - "loss": 0.0437, - "step": 925 - }, - { - "epoch": 4.517073170731708, - "grad_norm": 3.011223554611206, - "learning_rate": 2.8816232105471864e-06, - "loss": 0.0747, - "step": 926 - }, - { - "epoch": 4.521951219512195, - "grad_norm": 4.327573299407959, - "learning_rate": 2.877836453517677e-06, - "loss": 0.3884, - "step": 927 - }, - { - "epoch": 4.526829268292683, - "grad_norm": 3.8694965839385986, - "learning_rate": 2.8740488091364492e-06, - "loss": 0.2741, - "step": 928 - }, - { - "epoch": 4.53170731707317, - "grad_norm": 5.375877380371094, - "learning_rate": 2.870260286298814e-06, - "loss": 0.364, - "step": 929 - }, - { - "epoch": 4.536585365853659, - "grad_norm": 3.380891799926758, - "learning_rate": 2.866470893902147e-06, - "loss": 0.1495, - "step": 930 - }, - { - "epoch": 4.541463414634146, - "grad_norm": 3.723992109298706, - "learning_rate": 2.8626806408458626e-06, - "loss": 0.1403, - "step": 931 - }, - { - "epoch": 4.546341463414635, - "grad_norm": 3.0534417629241943, - "learning_rate": 2.8588895360313983e-06, - "loss": 0.0946, - "step": 932 - }, - { - "epoch": 4.5512195121951216, - "grad_norm": 2.8875234127044678, - "learning_rate": 2.8550975883621935e-06, - "loss": 0.1851, - "step": 933 - }, - { - "epoch": 4.55609756097561, - "grad_norm": 3.532166004180908, - "learning_rate": 2.8513048067436644e-06, - "loss": 0.178, - "step": 934 - }, - { - "epoch": 4.560975609756097, - "grad_norm": 2.942798376083374, - "learning_rate": 2.847511200083187e-06, - "loss": 0.1131, - "step": 935 - }, - { - "epoch": 4.565853658536585, - "grad_norm": 2.926874876022339, - "learning_rate": 2.843716777290074e-06, - "loss": 0.1251, - "step": 936 - }, - { - "epoch": 4.570731707317073, - "grad_norm": 3.525895357131958, - "learning_rate": 2.839921547275556e-06, - "loss": 0.0946, - "step": 937 - }, - { - "epoch": 4.575609756097561, - "grad_norm": 3.7033681869506836, - "learning_rate": 2.836125518952759e-06, - "loss": 0.1529, - "step": 938 - }, - { - "epoch": 4.580487804878048, - "grad_norm": 3.235154867172241, - "learning_rate": 2.8323287012366845e-06, - "loss": 0.2511, - "step": 939 - }, - { - "epoch": 4.585365853658536, - "grad_norm": 3.5275583267211914, - "learning_rate": 2.828531103044186e-06, - "loss": 0.1474, - "step": 940 - }, - { - "epoch": 4.590243902439024, - "grad_norm": 3.1356353759765625, - "learning_rate": 2.8247327332939512e-06, - "loss": 0.2249, - "step": 941 - }, - { - "epoch": 4.595121951219512, - "grad_norm": 3.789210081100464, - "learning_rate": 2.82093360090648e-06, - "loss": 0.2258, - "step": 942 - }, - { - "epoch": 4.6, - "grad_norm": 4.841623306274414, - "learning_rate": 2.8171337148040636e-06, - "loss": 0.2235, - "step": 943 - }, - { - "epoch": 4.6048780487804875, - "grad_norm": 3.161630630493164, - "learning_rate": 2.813333083910761e-06, - "loss": 0.1562, - "step": 944 - }, - { - "epoch": 4.609756097560975, - "grad_norm": 2.8718132972717285, - "learning_rate": 2.8095317171523835e-06, - "loss": 0.0625, - "step": 945 - }, - { - "epoch": 4.614634146341463, - "grad_norm": 3.6432454586029053, - "learning_rate": 2.805729623456469e-06, - "loss": 0.2205, - "step": 946 - }, - { - "epoch": 4.619512195121951, - "grad_norm": 4.382034778594971, - "learning_rate": 2.8019268117522624e-06, - "loss": 0.3241, - "step": 947 - }, - { - "epoch": 4.624390243902439, - "grad_norm": 3.2998175621032715, - "learning_rate": 2.798123290970695e-06, - "loss": 0.1983, - "step": 948 - }, - { - "epoch": 4.6292682926829265, - "grad_norm": 3.8665990829467773, - "learning_rate": 2.794319070044365e-06, - "loss": 0.3391, - "step": 949 - }, - { - "epoch": 4.634146341463414, - "grad_norm": 3.628403425216675, - "learning_rate": 2.790514157907512e-06, - "loss": 0.1329, - "step": 950 - }, - { - "epoch": 4.639024390243902, - "grad_norm": 2.8889615535736084, - "learning_rate": 2.786708563496002e-06, - "loss": 0.141, - "step": 951 - }, - { - "epoch": 4.64390243902439, - "grad_norm": 4.07351541519165, - "learning_rate": 2.782902295747299e-06, - "loss": 0.2935, - "step": 952 - }, - { - "epoch": 4.648780487804878, - "grad_norm": 4.220067024230957, - "learning_rate": 2.7790953636004536e-06, - "loss": 0.318, - "step": 953 - }, - { - "epoch": 4.6536585365853655, - "grad_norm": 3.8444325923919678, - "learning_rate": 2.775287775996074e-06, - "loss": 0.3388, - "step": 954 - }, - { - "epoch": 4.658536585365853, - "grad_norm": 3.197313070297241, - "learning_rate": 2.7714795418763067e-06, - "loss": 0.0925, - "step": 955 - }, - { - "epoch": 4.663414634146341, - "grad_norm": 4.0050811767578125, - "learning_rate": 2.7676706701848187e-06, - "loss": 0.2811, - "step": 956 - }, - { - "epoch": 4.668292682926829, - "grad_norm": 3.217160224914551, - "learning_rate": 2.763861169866774e-06, - "loss": 0.311, - "step": 957 - }, - { - "epoch": 4.673170731707317, - "grad_norm": 2.9892494678497314, - "learning_rate": 2.7600510498688104e-06, - "loss": 0.0582, - "step": 958 - }, - { - "epoch": 4.678048780487805, - "grad_norm": 3.954805374145508, - "learning_rate": 2.7562403191390246e-06, - "loss": 0.1238, - "step": 959 - }, - { - "epoch": 4.682926829268292, - "grad_norm": 2.9582695960998535, - "learning_rate": 2.7524289866269467e-06, - "loss": 0.1243, - "step": 960 - }, - { - "epoch": 4.68780487804878, - "grad_norm": 2.807002544403076, - "learning_rate": 2.748617061283518e-06, - "loss": 0.1388, - "step": 961 - }, - { - "epoch": 4.692682926829268, - "grad_norm": 3.980499505996704, - "learning_rate": 2.744804552061074e-06, - "loss": 0.1144, - "step": 962 - }, - { - "epoch": 4.697560975609756, - "grad_norm": 3.6389007568359375, - "learning_rate": 2.740991467913321e-06, - "loss": 0.2155, - "step": 963 - }, - { - "epoch": 4.702439024390244, - "grad_norm": 3.0950801372528076, - "learning_rate": 2.737177817795315e-06, - "loss": 0.0983, - "step": 964 - }, - { - "epoch": 4.7073170731707314, - "grad_norm": 3.1723053455352783, - "learning_rate": 2.7333636106634414e-06, - "loss": 0.1365, - "step": 965 - }, - { - "epoch": 4.712195121951219, - "grad_norm": 3.83921217918396, - "learning_rate": 2.7295488554753957e-06, - "loss": 0.1977, - "step": 966 - }, - { - "epoch": 4.717073170731707, - "grad_norm": 3.348057746887207, - "learning_rate": 2.725733561190157e-06, - "loss": 0.1311, - "step": 967 - }, - { - "epoch": 4.721951219512195, - "grad_norm": 3.828483819961548, - "learning_rate": 2.721917736767973e-06, - "loss": 0.2464, - "step": 968 - }, - { - "epoch": 4.726829268292683, - "grad_norm": 2.6004624366760254, - "learning_rate": 2.7181013911703357e-06, - "loss": 0.1088, - "step": 969 - }, - { - "epoch": 4.7317073170731705, - "grad_norm": 3.316990852355957, - "learning_rate": 2.714284533359961e-06, - "loss": 0.1492, - "step": 970 - }, - { - "epoch": 4.736585365853658, - "grad_norm": 3.8770010471343994, - "learning_rate": 2.710467172300768e-06, - "loss": 0.218, - "step": 971 - }, - { - "epoch": 4.741463414634146, - "grad_norm": 4.456376552581787, - "learning_rate": 2.706649316957857e-06, - "loss": 0.2199, - "step": 972 - }, - { - "epoch": 4.746341463414634, - "grad_norm": 3.3376309871673584, - "learning_rate": 2.7028309762974897e-06, - "loss": 0.0595, - "step": 973 - }, - { - "epoch": 4.751219512195122, - "grad_norm": 3.6755495071411133, - "learning_rate": 2.699012159287069e-06, - "loss": 0.1653, - "step": 974 - }, - { - "epoch": 4.7560975609756095, - "grad_norm": 2.939887046813965, - "learning_rate": 2.6951928748951125e-06, - "loss": 0.0681, - "step": 975 - }, - { - "epoch": 4.760975609756097, - "grad_norm": 3.4101195335388184, - "learning_rate": 2.69137313209124e-06, - "loss": 0.2046, - "step": 976 - }, - { - "epoch": 4.765853658536585, - "grad_norm": 3.9811208248138428, - "learning_rate": 2.687552939846145e-06, - "loss": 0.2255, - "step": 977 - }, - { - "epoch": 4.770731707317073, - "grad_norm": 3.484255313873291, - "learning_rate": 2.6837323071315766e-06, - "loss": 0.0512, - "step": 978 - }, - { - "epoch": 4.775609756097561, - "grad_norm": 3.9005143642425537, - "learning_rate": 2.679911242920321e-06, - "loss": 0.162, - "step": 979 - }, - { - "epoch": 4.780487804878049, - "grad_norm": 4.933374881744385, - "learning_rate": 2.6760897561861742e-06, - "loss": 0.398, - "step": 980 - }, - { - "epoch": 4.785365853658536, - "grad_norm": 3.0741539001464844, - "learning_rate": 2.672267855903927e-06, - "loss": 0.0507, - "step": 981 - }, - { - "epoch": 4.790243902439024, - "grad_norm": 3.023772716522217, - "learning_rate": 2.6684455510493413e-06, - "loss": 0.2066, - "step": 982 - }, - { - "epoch": 4.795121951219512, - "grad_norm": 3.0102407932281494, - "learning_rate": 2.6646228505991267e-06, - "loss": 0.2296, - "step": 983 - }, - { - "epoch": 4.8, - "grad_norm": 3.902200222015381, - "learning_rate": 2.6607997635309246e-06, - "loss": 0.14, - "step": 984 - }, - { - "epoch": 4.804878048780488, - "grad_norm": 3.836185932159424, - "learning_rate": 2.6569762988232838e-06, - "loss": 0.1583, - "step": 985 - }, - { - "epoch": 4.809756097560975, - "grad_norm": 3.539628744125366, - "learning_rate": 2.653152465455639e-06, - "loss": 0.2619, - "step": 986 - }, - { - "epoch": 4.814634146341463, - "grad_norm": 4.716914653778076, - "learning_rate": 2.6493282724082913e-06, - "loss": 0.3029, - "step": 987 - }, - { - "epoch": 4.819512195121951, - "grad_norm": 3.466914176940918, - "learning_rate": 2.6455037286623864e-06, - "loss": 0.095, - "step": 988 - }, - { - "epoch": 4.824390243902439, - "grad_norm": 2.1798667907714844, - "learning_rate": 2.6416788431998935e-06, - "loss": 0.1232, - "step": 989 - }, - { - "epoch": 4.829268292682927, - "grad_norm": 3.309039354324341, - "learning_rate": 2.637853625003585e-06, - "loss": 0.3671, - "step": 990 - }, - { - "epoch": 4.8341463414634145, - "grad_norm": 3.2619435787200928, - "learning_rate": 2.6340280830570142e-06, - "loss": 0.194, - "step": 991 - }, - { - "epoch": 4.839024390243902, - "grad_norm": 3.601161003112793, - "learning_rate": 2.6302022263444947e-06, - "loss": 0.1214, - "step": 992 - }, - { - "epoch": 4.84390243902439, - "grad_norm": 4.13787841796875, - "learning_rate": 2.6263760638510793e-06, - "loss": 0.311, - "step": 993 - }, - { - "epoch": 4.848780487804878, - "grad_norm": 3.0474166870117188, - "learning_rate": 2.6225496045625394e-06, - "loss": 0.1853, - "step": 994 - }, - { - "epoch": 4.853658536585366, - "grad_norm": 4.481237411499023, - "learning_rate": 2.6187228574653428e-06, - "loss": 0.2088, - "step": 995 - }, - { - "epoch": 4.8585365853658535, - "grad_norm": 3.235966444015503, - "learning_rate": 2.614895831546633e-06, - "loss": 0.1439, - "step": 996 - }, - { - "epoch": 4.863414634146341, - "grad_norm": 4.103270053863525, - "learning_rate": 2.6110685357942096e-06, - "loss": 0.2823, - "step": 997 - }, - { - "epoch": 4.868292682926829, - "grad_norm": 4.134536266326904, - "learning_rate": 2.6072409791965048e-06, - "loss": 0.2963, - "step": 998 - }, - { - "epoch": 4.873170731707317, - "grad_norm": 4.124892711639404, - "learning_rate": 2.6034131707425638e-06, - "loss": 0.4127, - "step": 999 - }, - { - "epoch": 4.878048780487805, - "grad_norm": 3.565139055252075, - "learning_rate": 2.5995851194220223e-06, - "loss": 0.1601, - "step": 1000 - }, - { - "epoch": 4.882926829268293, - "grad_norm": 2.7548017501831055, - "learning_rate": 2.595756834225089e-06, - "loss": 0.161, - "step": 1001 - }, - { - "epoch": 4.88780487804878, - "grad_norm": 3.9297611713409424, - "learning_rate": 2.5919283241425188e-06, - "loss": 0.1013, - "step": 1002 - }, - { - "epoch": 4.892682926829268, - "grad_norm": 2.4904236793518066, - "learning_rate": 2.5880995981655965e-06, - "loss": 0.1177, - "step": 1003 - }, - { - "epoch": 4.897560975609756, - "grad_norm": 3.513308048248291, - "learning_rate": 2.584270665286113e-06, - "loss": 0.0682, - "step": 1004 - }, - { - "epoch": 4.902439024390244, - "grad_norm": 4.221067428588867, - "learning_rate": 2.580441534496346e-06, - "loss": 0.1502, - "step": 1005 - }, - { - "epoch": 4.907317073170732, - "grad_norm": 3.4298903942108154, - "learning_rate": 2.576612214789039e-06, - "loss": 0.1772, - "step": 1006 - }, - { - "epoch": 4.912195121951219, - "grad_norm": 4.402887344360352, - "learning_rate": 2.5727827151573747e-06, - "loss": 0.2029, - "step": 1007 - }, - { - "epoch": 4.917073170731707, - "grad_norm": 4.194999694824219, - "learning_rate": 2.568953044594964e-06, - "loss": 0.1269, - "step": 1008 - }, - { - "epoch": 4.921951219512195, - "grad_norm": 3.657607078552246, - "learning_rate": 2.5651232120958157e-06, - "loss": 0.1311, - "step": 1009 - }, - { - "epoch": 4.926829268292683, - "grad_norm": 4.092184543609619, - "learning_rate": 2.56129322665432e-06, - "loss": 0.1085, - "step": 1010 - }, - { - "epoch": 4.931707317073171, - "grad_norm": 3.3648242950439453, - "learning_rate": 2.5574630972652263e-06, - "loss": 0.0782, - "step": 1011 - }, - { - "epoch": 4.9365853658536585, - "grad_norm": 3.7215166091918945, - "learning_rate": 2.553632832923622e-06, - "loss": 0.1391, - "step": 1012 - }, - { - "epoch": 4.941463414634146, - "grad_norm": 4.045740127563477, - "learning_rate": 2.5498024426249107e-06, - "loss": 0.3141, - "step": 1013 - }, - { - "epoch": 4.946341463414634, - "grad_norm": 3.2363107204437256, - "learning_rate": 2.545971935364794e-06, - "loss": 0.0679, - "step": 1014 - }, - { - "epoch": 4.951219512195122, - "grad_norm": 3.057283639907837, - "learning_rate": 2.5421413201392443e-06, - "loss": 0.1382, - "step": 1015 - }, - { - "epoch": 4.95609756097561, - "grad_norm": 3.591535806655884, - "learning_rate": 2.538310605944491e-06, - "loss": 0.112, - "step": 1016 - }, - { - "epoch": 4.9609756097560975, - "grad_norm": 3.1629281044006348, - "learning_rate": 2.534479801776996e-06, - "loss": 0.1261, - "step": 1017 - }, - { - "epoch": 4.965853658536585, - "grad_norm": 2.691740036010742, - "learning_rate": 2.53064891663343e-06, - "loss": 0.2328, - "step": 1018 - }, - { - "epoch": 4.970731707317073, - "grad_norm": 3.2620503902435303, - "learning_rate": 2.526817959510655e-06, - "loss": 0.193, - "step": 1019 - }, - { - "epoch": 4.975609756097561, - "grad_norm": 3.0721535682678223, - "learning_rate": 2.5229869394057038e-06, - "loss": 0.2444, - "step": 1020 - }, - { - "epoch": 4.980487804878049, - "grad_norm": 2.6279208660125732, - "learning_rate": 2.5191558653157542e-06, - "loss": 0.1103, - "step": 1021 - }, - { - "epoch": 4.985365853658537, - "grad_norm": 2.9295670986175537, - "learning_rate": 2.515324746238113e-06, - "loss": 0.0553, - "step": 1022 - }, - { - "epoch": 4.990243902439024, - "grad_norm": 3.3960084915161133, - "learning_rate": 2.511493591170191e-06, - "loss": 0.1686, - "step": 1023 - }, - { - "epoch": 4.995121951219512, - "grad_norm": 4.138705253601074, - "learning_rate": 2.5076624091094846e-06, - "loss": 0.1208, - "step": 1024 - }, - { - "epoch": 5.0, - "grad_norm": 2.603870391845703, - "learning_rate": 2.503831209053554e-06, - "loss": 0.1216, - "step": 1025 - }, - { - "epoch": 5.004878048780488, - "grad_norm": 2.525205612182617, - "learning_rate": 2.5e-06, - "loss": 0.0984, - "step": 1026 - }, - { - "epoch": 5.009756097560976, - "grad_norm": 3.2502501010894775, - "learning_rate": 2.4961687909464462e-06, - "loss": 0.1323, - "step": 1027 - }, - { - "epoch": 5.014634146341463, - "grad_norm": 5.363409519195557, - "learning_rate": 2.492337590890516e-06, - "loss": 0.3516, - "step": 1028 - }, - { - "epoch": 5.019512195121951, - "grad_norm": 2.887723445892334, - "learning_rate": 2.4885064088298097e-06, - "loss": 0.1931, - "step": 1029 - }, - { - "epoch": 5.024390243902439, - "grad_norm": 3.4529435634613037, - "learning_rate": 2.4846752537618875e-06, - "loss": 0.0675, - "step": 1030 - }, - { - "epoch": 5.029268292682927, - "grad_norm": 4.202361106872559, - "learning_rate": 2.480844134684246e-06, - "loss": 0.1643, - "step": 1031 - }, - { - "epoch": 5.034146341463415, - "grad_norm": 2.910275459289551, - "learning_rate": 2.4770130605942966e-06, - "loss": 0.11, - "step": 1032 - }, - { - "epoch": 5.0390243902439025, - "grad_norm": 3.5430362224578857, - "learning_rate": 2.4731820404893457e-06, - "loss": 0.0614, - "step": 1033 - }, - { - "epoch": 5.04390243902439, - "grad_norm": 4.501879692077637, - "learning_rate": 2.469351083366571e-06, - "loss": 0.0954, - "step": 1034 - }, - { - "epoch": 5.048780487804878, - "grad_norm": 2.732261896133423, - "learning_rate": 2.4655201982230044e-06, - "loss": 0.0275, - "step": 1035 - }, - { - "epoch": 5.053658536585366, - "grad_norm": 3.5926437377929688, - "learning_rate": 2.4616893940555094e-06, - "loss": 0.0661, - "step": 1036 - }, - { - "epoch": 5.058536585365854, - "grad_norm": 4.790312767028809, - "learning_rate": 2.457858679860757e-06, - "loss": 0.2976, - "step": 1037 - }, - { - "epoch": 5.0634146341463415, - "grad_norm": 4.453246116638184, - "learning_rate": 2.4540280646352072e-06, - "loss": 0.1216, - "step": 1038 - }, - { - "epoch": 5.068292682926829, - "grad_norm": 3.288011074066162, - "learning_rate": 2.45019755737509e-06, - "loss": 0.0877, - "step": 1039 - }, - { - "epoch": 5.073170731707317, - "grad_norm": 3.566927671432495, - "learning_rate": 2.4463671670763787e-06, - "loss": 0.1661, - "step": 1040 - }, - { - "epoch": 5.078048780487805, - "grad_norm": 3.250047206878662, - "learning_rate": 2.4425369027347746e-06, - "loss": 0.211, - "step": 1041 - }, - { - "epoch": 5.082926829268293, - "grad_norm": 3.0214977264404297, - "learning_rate": 2.4387067733456804e-06, - "loss": 0.093, - "step": 1042 - }, - { - "epoch": 5.087804878048781, - "grad_norm": 3.8162097930908203, - "learning_rate": 2.4348767879041847e-06, - "loss": 0.0777, - "step": 1043 - }, - { - "epoch": 5.092682926829268, - "grad_norm": 3.8071560859680176, - "learning_rate": 2.4310469554050366e-06, - "loss": 0.087, - "step": 1044 - }, - { - "epoch": 5.097560975609756, - "grad_norm": 3.1032073497772217, - "learning_rate": 2.4272172848426257e-06, - "loss": 0.1105, - "step": 1045 - }, - { - "epoch": 5.102439024390244, - "grad_norm": 2.8980185985565186, - "learning_rate": 2.423387785210962e-06, - "loss": 0.0704, - "step": 1046 - }, - { - "epoch": 5.107317073170732, - "grad_norm": 3.9110755920410156, - "learning_rate": 2.4195584655036544e-06, - "loss": 0.2118, - "step": 1047 - }, - { - "epoch": 5.11219512195122, - "grad_norm": 2.678884506225586, - "learning_rate": 2.4157293347138877e-06, - "loss": 0.0664, - "step": 1048 - }, - { - "epoch": 5.117073170731707, - "grad_norm": 3.183046340942383, - "learning_rate": 2.4119004018344043e-06, - "loss": 0.1767, - "step": 1049 - }, - { - "epoch": 5.121951219512195, - "grad_norm": 3.9198925495147705, - "learning_rate": 2.408071675857482e-06, - "loss": 0.1288, - "step": 1050 - }, - { - "epoch": 5.126829268292683, - "grad_norm": 4.378621578216553, - "learning_rate": 2.404243165774912e-06, - "loss": 0.1724, - "step": 1051 - }, - { - "epoch": 5.131707317073171, - "grad_norm": 2.5509133338928223, - "learning_rate": 2.4004148805779785e-06, - "loss": 0.0382, - "step": 1052 - }, - { - "epoch": 5.136585365853659, - "grad_norm": 3.692396402359009, - "learning_rate": 2.3965868292574375e-06, - "loss": 0.0942, - "step": 1053 - }, - { - "epoch": 5.1414634146341465, - "grad_norm": 3.8537800312042236, - "learning_rate": 2.392759020803496e-06, - "loss": 0.0819, - "step": 1054 - }, - { - "epoch": 5.146341463414634, - "grad_norm": 4.02876091003418, - "learning_rate": 2.3889314642057916e-06, - "loss": 0.0866, - "step": 1055 - }, - { - "epoch": 5.151219512195122, - "grad_norm": 3.531857490539551, - "learning_rate": 2.3851041684533677e-06, - "loss": 0.1557, - "step": 1056 - }, - { - "epoch": 5.15609756097561, - "grad_norm": 2.231265068054199, - "learning_rate": 2.381277142534658e-06, - "loss": 0.0421, - "step": 1057 - }, - { - "epoch": 5.160975609756098, - "grad_norm": 3.159226894378662, - "learning_rate": 2.3774503954374614e-06, - "loss": 0.0395, - "step": 1058 - }, - { - "epoch": 5.1658536585365855, - "grad_norm": 3.0375123023986816, - "learning_rate": 2.373623936148921e-06, - "loss": 0.1869, - "step": 1059 - }, - { - "epoch": 5.170731707317073, - "grad_norm": 5.4905900955200195, - "learning_rate": 2.369797773655506e-06, - "loss": 0.1426, - "step": 1060 - }, - { - "epoch": 5.175609756097561, - "grad_norm": 2.8739638328552246, - "learning_rate": 2.3659719169429866e-06, - "loss": 0.0788, - "step": 1061 - }, - { - "epoch": 5.180487804878049, - "grad_norm": 2.612183094024658, - "learning_rate": 2.3621463749964153e-06, - "loss": 0.0449, - "step": 1062 - }, - { - "epoch": 5.185365853658537, - "grad_norm": 2.0573198795318604, - "learning_rate": 2.3583211568001073e-06, - "loss": 0.0264, - "step": 1063 - }, - { - "epoch": 5.190243902439025, - "grad_norm": 2.3667244911193848, - "learning_rate": 2.3544962713376144e-06, - "loss": 0.0507, - "step": 1064 - }, - { - "epoch": 5.195121951219512, - "grad_norm": 2.1223740577697754, - "learning_rate": 2.3506717275917095e-06, - "loss": 0.0576, - "step": 1065 - }, - { - "epoch": 5.2, - "grad_norm": 2.2630319595336914, - "learning_rate": 2.346847534544362e-06, - "loss": 0.0523, - "step": 1066 - }, - { - "epoch": 5.204878048780488, - "grad_norm": 3.201913595199585, - "learning_rate": 2.3430237011767166e-06, - "loss": 0.0847, - "step": 1067 - }, - { - "epoch": 5.209756097560976, - "grad_norm": 2.2149481773376465, - "learning_rate": 2.3392002364690762e-06, - "loss": 0.0215, - "step": 1068 - }, - { - "epoch": 5.214634146341464, - "grad_norm": 4.425244331359863, - "learning_rate": 2.335377149400874e-06, - "loss": 0.1018, - "step": 1069 - }, - { - "epoch": 5.219512195121951, - "grad_norm": 4.548358917236328, - "learning_rate": 2.3315544489506596e-06, - "loss": 0.1485, - "step": 1070 - }, - { - "epoch": 5.224390243902439, - "grad_norm": 3.635796546936035, - "learning_rate": 2.3277321440960733e-06, - "loss": 0.111, - "step": 1071 - }, - { - "epoch": 5.229268292682927, - "grad_norm": 2.3180043697357178, - "learning_rate": 2.323910243813826e-06, - "loss": 0.0267, - "step": 1072 - }, - { - "epoch": 5.234146341463415, - "grad_norm": 3.675490379333496, - "learning_rate": 2.3200887570796798e-06, - "loss": 0.153, - "step": 1073 - }, - { - "epoch": 5.239024390243903, - "grad_norm": 2.883225202560425, - "learning_rate": 2.316267692868424e-06, - "loss": 0.0968, - "step": 1074 - }, - { - "epoch": 5.2439024390243905, - "grad_norm": 3.0320188999176025, - "learning_rate": 2.312447060153856e-06, - "loss": 0.0786, - "step": 1075 - }, - { - "epoch": 5.248780487804878, - "grad_norm": 2.682695150375366, - "learning_rate": 2.308626867908761e-06, - "loss": 0.0677, - "step": 1076 - }, - { - "epoch": 5.253658536585366, - "grad_norm": 3.941967010498047, - "learning_rate": 2.3048071251048884e-06, - "loss": 0.1059, - "step": 1077 - }, - { - "epoch": 5.258536585365854, - "grad_norm": 6.485599517822266, - "learning_rate": 2.300987840712932e-06, - "loss": 0.1331, - "step": 1078 - }, - { - "epoch": 5.263414634146342, - "grad_norm": 3.809269905090332, - "learning_rate": 2.297169023702511e-06, - "loss": 0.169, - "step": 1079 - }, - { - "epoch": 5.2682926829268295, - "grad_norm": 3.115626573562622, - "learning_rate": 2.2933506830421436e-06, - "loss": 0.1349, - "step": 1080 - }, - { - "epoch": 5.273170731707317, - "grad_norm": 2.2234909534454346, - "learning_rate": 2.2895328276992325e-06, - "loss": 0.0191, - "step": 1081 - }, - { - "epoch": 5.278048780487805, - "grad_norm": 3.896925926208496, - "learning_rate": 2.28571546664004e-06, - "loss": 0.1961, - "step": 1082 - }, - { - "epoch": 5.282926829268293, - "grad_norm": 2.4134509563446045, - "learning_rate": 2.281898608829665e-06, - "loss": 0.02, - "step": 1083 - }, - { - "epoch": 5.287804878048781, - "grad_norm": 2.7599191665649414, - "learning_rate": 2.2780822632320273e-06, - "loss": 0.0763, - "step": 1084 - }, - { - "epoch": 5.2926829268292686, - "grad_norm": 2.465637683868408, - "learning_rate": 2.2742664388098435e-06, - "loss": 0.0403, - "step": 1085 - }, - { - "epoch": 5.297560975609756, - "grad_norm": 2.4026618003845215, - "learning_rate": 2.270451144524605e-06, - "loss": 0.0982, - "step": 1086 - }, - { - "epoch": 5.302439024390244, - "grad_norm": 3.3339459896087646, - "learning_rate": 2.266636389336559e-06, - "loss": 0.09, - "step": 1087 - }, - { - "epoch": 5.307317073170732, - "grad_norm": 2.113255023956299, - "learning_rate": 2.262822182204686e-06, - "loss": 0.0267, - "step": 1088 - }, - { - "epoch": 5.31219512195122, - "grad_norm": 3.1760852336883545, - "learning_rate": 2.2590085320866798e-06, - "loss": 0.0295, - "step": 1089 - }, - { - "epoch": 5.317073170731708, - "grad_norm": 2.9674434661865234, - "learning_rate": 2.255195447938927e-06, - "loss": 0.0261, - "step": 1090 - }, - { - "epoch": 5.321951219512195, - "grad_norm": 3.4384074211120605, - "learning_rate": 2.251382938716482e-06, - "loss": 0.0936, - "step": 1091 - }, - { - "epoch": 5.326829268292683, - "grad_norm": 3.3814568519592285, - "learning_rate": 2.2475710133730533e-06, - "loss": 0.0426, - "step": 1092 - }, - { - "epoch": 5.331707317073171, - "grad_norm": 3.081317663192749, - "learning_rate": 2.243759680860975e-06, - "loss": 0.0799, - "step": 1093 - }, - { - "epoch": 5.336585365853659, - "grad_norm": 3.5608482360839844, - "learning_rate": 2.2399489501311896e-06, - "loss": 0.0906, - "step": 1094 - }, - { - "epoch": 5.341463414634147, - "grad_norm": 3.7886314392089844, - "learning_rate": 2.2361388301332265e-06, - "loss": 0.2152, - "step": 1095 - }, - { - "epoch": 5.3463414634146345, - "grad_norm": 1.9531102180480957, - "learning_rate": 2.2323293298151817e-06, - "loss": 0.0359, - "step": 1096 - }, - { - "epoch": 5.351219512195122, - "grad_norm": 2.2828023433685303, - "learning_rate": 2.2285204581236937e-06, - "loss": 0.0368, - "step": 1097 - }, - { - "epoch": 5.35609756097561, - "grad_norm": 3.110262870788574, - "learning_rate": 2.2247122240039268e-06, - "loss": 0.0426, - "step": 1098 - }, - { - "epoch": 5.360975609756098, - "grad_norm": 2.3293566703796387, - "learning_rate": 2.2209046363995464e-06, - "loss": 0.0223, - "step": 1099 - }, - { - "epoch": 5.365853658536586, - "grad_norm": 2.990884780883789, - "learning_rate": 2.217097704252701e-06, - "loss": 0.1276, - "step": 1100 - }, - { - "epoch": 5.3707317073170735, - "grad_norm": 2.568014144897461, - "learning_rate": 2.2132914365039993e-06, - "loss": 0.0639, - "step": 1101 - }, - { - "epoch": 5.375609756097561, - "grad_norm": 2.618478536605835, - "learning_rate": 2.2094858420924882e-06, - "loss": 0.0166, - "step": 1102 - }, - { - "epoch": 5.380487804878049, - "grad_norm": 4.526919364929199, - "learning_rate": 2.205680929955635e-06, - "loss": 0.144, - "step": 1103 - }, - { - "epoch": 5.385365853658537, - "grad_norm": 2.7236886024475098, - "learning_rate": 2.201876709029305e-06, - "loss": 0.1004, - "step": 1104 - }, - { - "epoch": 5.390243902439025, - "grad_norm": 2.1577632427215576, - "learning_rate": 2.198073188247738e-06, - "loss": 0.0453, - "step": 1105 - }, - { - "epoch": 5.3951219512195125, - "grad_norm": 2.5170321464538574, - "learning_rate": 2.1942703765435317e-06, - "loss": 0.0195, - "step": 1106 - }, - { - "epoch": 5.4, - "grad_norm": 3.962658643722534, - "learning_rate": 2.190468282847617e-06, - "loss": 0.1512, - "step": 1107 - }, - { - "epoch": 5.404878048780488, - "grad_norm": 4.297860622406006, - "learning_rate": 2.186666916089239e-06, - "loss": 0.2572, - "step": 1108 - }, - { - "epoch": 5.409756097560976, - "grad_norm": 2.8933565616607666, - "learning_rate": 2.1828662851959377e-06, - "loss": 0.0536, - "step": 1109 - }, - { - "epoch": 5.414634146341464, - "grad_norm": 2.9397451877593994, - "learning_rate": 2.1790663990935203e-06, - "loss": 0.0778, - "step": 1110 - }, - { - "epoch": 5.419512195121952, - "grad_norm": 3.5210094451904297, - "learning_rate": 2.1752672667060488e-06, - "loss": 0.0558, - "step": 1111 - }, - { - "epoch": 5.424390243902439, - "grad_norm": 2.9027626514434814, - "learning_rate": 2.1714688969558146e-06, - "loss": 0.041, - "step": 1112 - }, - { - "epoch": 5.429268292682927, - "grad_norm": 3.7691168785095215, - "learning_rate": 2.167671298763316e-06, - "loss": 0.1644, - "step": 1113 - }, - { - "epoch": 5.434146341463415, - "grad_norm": 3.493008852005005, - "learning_rate": 2.1638744810472414e-06, - "loss": 0.1587, - "step": 1114 - }, - { - "epoch": 5.439024390243903, - "grad_norm": 2.711196184158325, - "learning_rate": 2.1600784527244445e-06, - "loss": 0.0605, - "step": 1115 - }, - { - "epoch": 5.443902439024391, - "grad_norm": 4.365038871765137, - "learning_rate": 2.1562832227099266e-06, - "loss": 0.1897, - "step": 1116 - }, - { - "epoch": 5.4487804878048784, - "grad_norm": 4.621466159820557, - "learning_rate": 2.152488799916814e-06, - "loss": 0.1525, - "step": 1117 - }, - { - "epoch": 5.453658536585366, - "grad_norm": 4.8721089363098145, - "learning_rate": 2.148695193256336e-06, - "loss": 0.189, - "step": 1118 - }, - { - "epoch": 5.458536585365854, - "grad_norm": 2.8999173641204834, - "learning_rate": 2.1449024116378064e-06, - "loss": 0.095, - "step": 1119 - }, - { - "epoch": 5.463414634146342, - "grad_norm": 2.4865314960479736, - "learning_rate": 2.1411104639686013e-06, - "loss": 0.0432, - "step": 1120 - }, - { - "epoch": 5.46829268292683, - "grad_norm": 3.8497228622436523, - "learning_rate": 2.137319359154138e-06, - "loss": 0.0954, - "step": 1121 - }, - { - "epoch": 5.473170731707317, - "grad_norm": 2.3643507957458496, - "learning_rate": 2.133529106097853e-06, - "loss": 0.0362, - "step": 1122 - }, - { - "epoch": 5.478048780487805, - "grad_norm": 3.017826795578003, - "learning_rate": 2.1297397137011862e-06, - "loss": 0.0875, - "step": 1123 - }, - { - "epoch": 5.482926829268292, - "grad_norm": 3.239320755004883, - "learning_rate": 2.125951190863551e-06, - "loss": 0.0758, - "step": 1124 - }, - { - "epoch": 5.487804878048781, - "grad_norm": 2.566241979598999, - "learning_rate": 2.1221635464823237e-06, - "loss": 0.0605, - "step": 1125 - }, - { - "epoch": 5.492682926829268, - "grad_norm": 4.810088157653809, - "learning_rate": 2.1183767894528135e-06, - "loss": 0.2403, - "step": 1126 - }, - { - "epoch": 5.4975609756097565, - "grad_norm": 2.083263397216797, - "learning_rate": 2.114590928668249e-06, - "loss": 0.0223, - "step": 1127 - }, - { - "epoch": 5.5024390243902435, - "grad_norm": 2.6812374591827393, - "learning_rate": 2.1108059730197517e-06, - "loss": 0.0617, - "step": 1128 - }, - { - "epoch": 5.507317073170732, - "grad_norm": 3.196735143661499, - "learning_rate": 2.1070219313963173e-06, - "loss": 0.043, - "step": 1129 - }, - { - "epoch": 5.512195121951219, - "grad_norm": 2.775470495223999, - "learning_rate": 2.1032388126847967e-06, - "loss": 0.0595, - "step": 1130 - }, - { - "epoch": 5.517073170731708, - "grad_norm": 2.8632407188415527, - "learning_rate": 2.099456625769872e-06, - "loss": 0.0186, - "step": 1131 - }, - { - "epoch": 5.521951219512195, - "grad_norm": 4.075018405914307, - "learning_rate": 2.0956753795340376e-06, - "loss": 0.0616, - "step": 1132 - }, - { - "epoch": 5.526829268292683, - "grad_norm": 3.206327199935913, - "learning_rate": 2.091895082857578e-06, - "loss": 0.1895, - "step": 1133 - }, - { - "epoch": 5.53170731707317, - "grad_norm": 2.967588186264038, - "learning_rate": 2.0881157446185474e-06, - "loss": 0.0484, - "step": 1134 - }, - { - "epoch": 5.536585365853659, - "grad_norm": 2.850929021835327, - "learning_rate": 2.0843373736927506e-06, - "loss": 0.037, - "step": 1135 - }, - { - "epoch": 5.541463414634146, - "grad_norm": 2.2505147457122803, - "learning_rate": 2.08055997895372e-06, - "loss": 0.0227, - "step": 1136 - }, - { - "epoch": 5.546341463414635, - "grad_norm": 2.5258476734161377, - "learning_rate": 2.0767835692726944e-06, - "loss": 0.0296, - "step": 1137 - }, - { - "epoch": 5.5512195121951216, - "grad_norm": 3.498741388320923, - "learning_rate": 2.0730081535186e-06, - "loss": 0.16, - "step": 1138 - }, - { - "epoch": 5.55609756097561, - "grad_norm": 2.8635222911834717, - "learning_rate": 2.06923374055803e-06, - "loss": 0.0725, - "step": 1139 - }, - { - "epoch": 5.560975609756097, - "grad_norm": 2.2779290676116943, - "learning_rate": 2.0654603392552193e-06, - "loss": 0.0198, - "step": 1140 - }, - { - "epoch": 5.565853658536585, - "grad_norm": 3.1651058197021484, - "learning_rate": 2.0616879584720305e-06, - "loss": 0.1144, - "step": 1141 - }, - { - "epoch": 5.570731707317073, - "grad_norm": 2.4238595962524414, - "learning_rate": 2.057916607067928e-06, - "loss": 0.0491, - "step": 1142 - }, - { - "epoch": 5.575609756097561, - "grad_norm": 2.3248515129089355, - "learning_rate": 2.054146293899957e-06, - "loss": 0.035, - "step": 1143 - }, - { - "epoch": 5.580487804878048, - "grad_norm": 2.9506516456604004, - "learning_rate": 2.0503770278227274e-06, - "loss": 0.0639, - "step": 1144 - }, - { - "epoch": 5.585365853658536, - "grad_norm": 2.6403958797454834, - "learning_rate": 2.0466088176883876e-06, - "loss": 0.0258, - "step": 1145 - }, - { - "epoch": 5.590243902439024, - "grad_norm": 3.150115728378296, - "learning_rate": 2.042841672346608e-06, - "loss": 0.0634, - "step": 1146 - }, - { - "epoch": 5.595121951219512, - "grad_norm": 2.742691993713379, - "learning_rate": 2.039075600644557e-06, - "loss": 0.0464, - "step": 1147 - }, - { - "epoch": 5.6, - "grad_norm": 2.733694076538086, - "learning_rate": 2.0353106114268824e-06, - "loss": 0.0829, - "step": 1148 - }, - { - "epoch": 5.6048780487804875, - "grad_norm": 2.511229991912842, - "learning_rate": 2.031546713535688e-06, - "loss": 0.0321, - "step": 1149 - }, - { - "epoch": 5.609756097560975, - "grad_norm": 3.019669532775879, - "learning_rate": 2.027783915810518e-06, - "loss": 0.05, - "step": 1150 - }, - { - "epoch": 5.614634146341463, - "grad_norm": 3.497159242630005, - "learning_rate": 2.024022227088329e-06, - "loss": 0.1984, - "step": 1151 - }, - { - "epoch": 5.619512195121951, - "grad_norm": 3.4637508392333984, - "learning_rate": 2.020261656203476e-06, - "loss": 0.1673, - "step": 1152 - }, - { - "epoch": 5.624390243902439, - "grad_norm": 2.4312477111816406, - "learning_rate": 2.016502211987687e-06, - "loss": 0.1106, - "step": 1153 - }, - { - "epoch": 5.6292682926829265, - "grad_norm": 2.7801673412323, - "learning_rate": 2.0127439032700446e-06, - "loss": 0.0374, - "step": 1154 - }, - { - "epoch": 5.634146341463414, - "grad_norm": 2.9346680641174316, - "learning_rate": 2.0089867388769664e-06, - "loss": 0.0674, - "step": 1155 - }, - { - "epoch": 5.639024390243902, - "grad_norm": 2.274888277053833, - "learning_rate": 2.0052307276321793e-06, - "loss": 0.0365, - "step": 1156 - }, - { - "epoch": 5.64390243902439, - "grad_norm": 3.069890022277832, - "learning_rate": 2.001475878356703e-06, - "loss": 0.0758, - "step": 1157 - }, - { - "epoch": 5.648780487804878, - "grad_norm": 3.8594915866851807, - "learning_rate": 1.99772219986883e-06, - "loss": 0.176, - "step": 1158 - }, - { - "epoch": 5.6536585365853655, - "grad_norm": 3.4886410236358643, - "learning_rate": 1.9939697009841024e-06, - "loss": 0.0491, - "step": 1159 - }, - { - "epoch": 5.658536585365853, - "grad_norm": 2.697946786880493, - "learning_rate": 1.990218390515291e-06, - "loss": 0.0741, - "step": 1160 - }, - { - "epoch": 5.663414634146341, - "grad_norm": 3.5290887355804443, - "learning_rate": 1.9864682772723757e-06, - "loss": 0.0826, - "step": 1161 - }, - { - "epoch": 5.668292682926829, - "grad_norm": 2.0601298809051514, - "learning_rate": 1.9827193700625274e-06, - "loss": 0.0378, - "step": 1162 - }, - { - "epoch": 5.673170731707317, - "grad_norm": 3.8458635807037354, - "learning_rate": 1.978971677690081e-06, - "loss": 0.2466, - "step": 1163 - }, - { - "epoch": 5.678048780487805, - "grad_norm": 2.788210153579712, - "learning_rate": 1.97522520895652e-06, - "loss": 0.0205, - "step": 1164 - }, - { - "epoch": 5.682926829268292, - "grad_norm": 3.1904587745666504, - "learning_rate": 1.971479972660454e-06, - "loss": 0.0998, - "step": 1165 - }, - { - "epoch": 5.68780487804878, - "grad_norm": 2.4664318561553955, - "learning_rate": 1.967735977597598e-06, - "loss": 0.0217, - "step": 1166 - }, - { - "epoch": 5.692682926829268, - "grad_norm": 2.1392667293548584, - "learning_rate": 1.9639932325607538e-06, - "loss": 0.048, - "step": 1167 - }, - { - "epoch": 5.697560975609756, - "grad_norm": 3.7127058506011963, - "learning_rate": 1.9602517463397845e-06, - "loss": 0.0302, - "step": 1168 - }, - { - "epoch": 5.702439024390244, - "grad_norm": 2.916168689727783, - "learning_rate": 1.9565115277215978e-06, - "loss": 0.0724, - "step": 1169 - }, - { - "epoch": 5.7073170731707314, - "grad_norm": 2.4352428913116455, - "learning_rate": 1.952772585490127e-06, - "loss": 0.0464, - "step": 1170 - }, - { - "epoch": 5.712195121951219, - "grad_norm": 2.8311455249786377, - "learning_rate": 1.9490349284263036e-06, - "loss": 0.0239, - "step": 1171 - }, - { - "epoch": 5.717073170731707, - "grad_norm": 3.3592801094055176, - "learning_rate": 1.9452985653080443e-06, - "loss": 0.0719, - "step": 1172 - }, - { - "epoch": 5.721951219512195, - "grad_norm": 2.450922966003418, - "learning_rate": 1.9415635049102245e-06, - "loss": 0.0408, - "step": 1173 - }, - { - "epoch": 5.726829268292683, - "grad_norm": 4.750118255615234, - "learning_rate": 1.937829756004662e-06, - "loss": 0.2049, - "step": 1174 - }, - { - "epoch": 5.7317073170731705, - "grad_norm": 3.0643811225891113, - "learning_rate": 1.9340973273600944e-06, - "loss": 0.0636, - "step": 1175 - }, - { - "epoch": 5.736585365853658, - "grad_norm": 3.313904047012329, - "learning_rate": 1.930366227742157e-06, - "loss": 0.1252, - "step": 1176 - }, - { - "epoch": 5.741463414634146, - "grad_norm": 3.8996808528900146, - "learning_rate": 1.9266364659133653e-06, - "loss": 0.0687, - "step": 1177 - }, - { - "epoch": 5.746341463414634, - "grad_norm": 2.727555274963379, - "learning_rate": 1.922908050633093e-06, - "loss": 0.0333, - "step": 1178 - }, - { - "epoch": 5.751219512195122, - "grad_norm": 3.270087718963623, - "learning_rate": 1.919180990657551e-06, - "loss": 0.0792, - "step": 1179 - }, - { - "epoch": 5.7560975609756095, - "grad_norm": 2.6631274223327637, - "learning_rate": 1.9154552947397668e-06, - "loss": 0.069, - "step": 1180 - }, - { - "epoch": 5.760975609756097, - "grad_norm": 4.4460554122924805, - "learning_rate": 1.9117309716295658e-06, - "loss": 0.115, - "step": 1181 - }, - { - "epoch": 5.765853658536585, - "grad_norm": 2.5652341842651367, - "learning_rate": 1.9080080300735478e-06, - "loss": 0.0537, - "step": 1182 - }, - { - "epoch": 5.770731707317073, - "grad_norm": 3.046436071395874, - "learning_rate": 1.9042864788150695e-06, - "loss": 0.0817, - "step": 1183 - }, - { - "epoch": 5.775609756097561, - "grad_norm": 2.121629238128662, - "learning_rate": 1.9005663265942206e-06, - "loss": 0.0289, - "step": 1184 - }, - { - "epoch": 5.780487804878049, - "grad_norm": 2.271918535232544, - "learning_rate": 1.8968475821478066e-06, - "loss": 0.0357, - "step": 1185 - }, - { - "epoch": 5.785365853658536, - "grad_norm": 2.582473039627075, - "learning_rate": 1.8931302542093274e-06, - "loss": 0.0584, - "step": 1186 - }, - { - "epoch": 5.790243902439024, - "grad_norm": 2.502952814102173, - "learning_rate": 1.8894143515089539e-06, - "loss": 0.0324, - "step": 1187 - }, - { - "epoch": 5.795121951219512, - "grad_norm": 1.9735453128814697, - "learning_rate": 1.8856998827735118e-06, - "loss": 0.0338, - "step": 1188 - }, - { - "epoch": 5.8, - "grad_norm": 4.441845893859863, - "learning_rate": 1.8819868567264588e-06, - "loss": 0.1706, - "step": 1189 - }, - { - "epoch": 5.804878048780488, - "grad_norm": 2.5450692176818848, - "learning_rate": 1.8782752820878636e-06, - "loss": 0.0463, - "step": 1190 - }, - { - "epoch": 5.809756097560975, - "grad_norm": 3.718183755874634, - "learning_rate": 1.8745651675743876e-06, - "loss": 0.1188, - "step": 1191 - }, - { - "epoch": 5.814634146341463, - "grad_norm": 3.246532678604126, - "learning_rate": 1.870856521899261e-06, - "loss": 0.0984, - "step": 1192 - }, - { - "epoch": 5.819512195121951, - "grad_norm": 2.9522783756256104, - "learning_rate": 1.867149353772267e-06, - "loss": 0.0195, - "step": 1193 - }, - { - "epoch": 5.824390243902439, - "grad_norm": 2.3266429901123047, - "learning_rate": 1.863443671899717e-06, - "loss": 0.0236, - "step": 1194 - }, - { - "epoch": 5.829268292682927, - "grad_norm": 3.696749448776245, - "learning_rate": 1.8597394849844319e-06, - "loss": 0.1108, - "step": 1195 - }, - { - "epoch": 5.8341463414634145, - "grad_norm": 2.375624179840088, - "learning_rate": 1.8560368017257229e-06, - "loss": 0.0388, - "step": 1196 - }, - { - "epoch": 5.839024390243902, - "grad_norm": 4.0437092781066895, - "learning_rate": 1.8523356308193696e-06, - "loss": 0.3098, - "step": 1197 - }, - { - "epoch": 5.84390243902439, - "grad_norm": 3.165165424346924, - "learning_rate": 1.8486359809575977e-06, - "loss": 0.0775, - "step": 1198 - }, - { - "epoch": 5.848780487804878, - "grad_norm": 4.1991190910339355, - "learning_rate": 1.8449378608290638e-06, - "loss": 0.1222, - "step": 1199 - }, - { - "epoch": 5.853658536585366, - "grad_norm": 4.6657819747924805, - "learning_rate": 1.8412412791188306e-06, - "loss": 0.1146, - "step": 1200 - }, - { - "epoch": 5.8585365853658535, - "grad_norm": 4.569516181945801, - "learning_rate": 1.8375462445083464e-06, - "loss": 0.1113, - "step": 1201 - }, - { - "epoch": 5.863414634146341, - "grad_norm": 3.1565654277801514, - "learning_rate": 1.8338527656754285e-06, - "loss": 0.0416, - "step": 1202 - }, - { - "epoch": 5.868292682926829, - "grad_norm": 3.3474619388580322, - "learning_rate": 1.830160851294239e-06, - "loss": 0.0613, - "step": 1203 - }, - { - "epoch": 5.873170731707317, - "grad_norm": 4.30797004699707, - "learning_rate": 1.8264705100352662e-06, - "loss": 0.197, - "step": 1204 - }, - { - "epoch": 5.878048780487805, - "grad_norm": 2.7259573936462402, - "learning_rate": 1.8227817505653045e-06, - "loss": 0.0821, - "step": 1205 - }, - { - "epoch": 5.882926829268293, - "grad_norm": 3.515812873840332, - "learning_rate": 1.8190945815474323e-06, - "loss": 0.1246, - "step": 1206 - }, - { - "epoch": 5.88780487804878, - "grad_norm": 2.9223313331604004, - "learning_rate": 1.8154090116409934e-06, - "loss": 0.0703, - "step": 1207 - }, - { - "epoch": 5.892682926829268, - "grad_norm": 3.9529640674591064, - "learning_rate": 1.811725049501577e-06, - "loss": 0.1078, - "step": 1208 - }, - { - "epoch": 5.897560975609756, - "grad_norm": 4.1674580574035645, - "learning_rate": 1.8080427037809941e-06, - "loss": 0.1648, - "step": 1209 - }, - { - "epoch": 5.902439024390244, - "grad_norm": 3.1308021545410156, - "learning_rate": 1.8043619831272623e-06, - "loss": 0.061, - "step": 1210 - }, - { - "epoch": 5.907317073170732, - "grad_norm": 3.9667179584503174, - "learning_rate": 1.8006828961845807e-06, - "loss": 0.1863, - "step": 1211 - }, - { - "epoch": 5.912195121951219, - "grad_norm": 5.438168048858643, - "learning_rate": 1.7970054515933124e-06, - "loss": 0.2387, - "step": 1212 - }, - { - "epoch": 5.917073170731707, - "grad_norm": 5.505797863006592, - "learning_rate": 1.793329657989964e-06, - "loss": 0.2053, - "step": 1213 - }, - { - "epoch": 5.921951219512195, - "grad_norm": 2.8043150901794434, - "learning_rate": 1.7896555240071627e-06, - "loss": 0.026, - "step": 1214 - }, - { - "epoch": 5.926829268292683, - "grad_norm": 2.836164712905884, - "learning_rate": 1.7859830582736406e-06, - "loss": 0.0735, - "step": 1215 - }, - { - "epoch": 5.931707317073171, - "grad_norm": 2.8286306858062744, - "learning_rate": 1.782312269414211e-06, - "loss": 0.0586, - "step": 1216 - }, - { - "epoch": 5.9365853658536585, - "grad_norm": 4.4354329109191895, - "learning_rate": 1.7786431660497474e-06, - "loss": 0.3086, - "step": 1217 - }, - { - "epoch": 5.941463414634146, - "grad_norm": 4.0963640213012695, - "learning_rate": 1.7749757567971678e-06, - "loss": 0.0978, - "step": 1218 - }, - { - "epoch": 5.946341463414634, - "grad_norm": 2.726062536239624, - "learning_rate": 1.7713100502694091e-06, - "loss": 0.0976, - "step": 1219 - }, - { - "epoch": 5.951219512195122, - "grad_norm": 2.6566951274871826, - "learning_rate": 1.7676460550754104e-06, - "loss": 0.02, - "step": 1220 - }, - { - "epoch": 5.95609756097561, - "grad_norm": 2.7710952758789062, - "learning_rate": 1.7639837798200923e-06, - "loss": 0.0741, - "step": 1221 - }, - { - "epoch": 5.9609756097560975, - "grad_norm": 2.3678600788116455, - "learning_rate": 1.7603232331043346e-06, - "loss": 0.0542, - "step": 1222 - }, - { - "epoch": 5.965853658536585, - "grad_norm": 6.45259428024292, - "learning_rate": 1.7566644235249591e-06, - "loss": 0.3552, - "step": 1223 - }, - { - "epoch": 5.970731707317073, - "grad_norm": 1.8916475772857666, - "learning_rate": 1.7530073596747072e-06, - "loss": 0.0405, - "step": 1224 - }, - { - "epoch": 5.975609756097561, - "grad_norm": 2.1637566089630127, - "learning_rate": 1.74935205014222e-06, - "loss": 0.0178, - "step": 1225 - }, - { - "epoch": 5.980487804878049, - "grad_norm": 2.5959200859069824, - "learning_rate": 1.7456985035120194e-06, - "loss": 0.0264, - "step": 1226 - }, - { - "epoch": 5.985365853658537, - "grad_norm": 2.50264573097229, - "learning_rate": 1.7420467283644877e-06, - "loss": 0.0555, - "step": 1227 - }, - { - "epoch": 5.990243902439024, - "grad_norm": 2.4692020416259766, - "learning_rate": 1.738396733275844e-06, - "loss": 0.0546, - "step": 1228 - }, - { - "epoch": 5.995121951219512, - "grad_norm": 5.540846824645996, - "learning_rate": 1.7347485268181309e-06, - "loss": 0.1967, - "step": 1229 - }, - { - "epoch": 6.0, - "grad_norm": 1.8322839736938477, - "learning_rate": 1.7311021175591868e-06, - "loss": 0.0491, - "step": 1230 - } - ], - "logging_steps": 1, - "max_steps": 2050, - "num_input_tokens_seen": 0, - "num_train_epochs": 10, - "save_steps": 206, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 3.5409928540861235e+17, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/metallama3_8b/limo/checkpoint-1435/chat_template.jinja b/metallama3_8b/limo/checkpoint-1435/chat_template.jinja deleted file mode 100644 index 39bd0c9f7fe30aea14eda194fee17703da4a4dbf..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1435/chat_template.jinja +++ /dev/null @@ -1,5 +0,0 @@ -{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> - -'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> - -' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo/checkpoint-1435/config.json b/metallama3_8b/limo/checkpoint-1435/config.json deleted file mode 100644 index ec5612543540085e09eed37e81b17ae51d1a6973..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1435/config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 128000, - "eos_token_id": 128009, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 8192, - "mlp_bias": false, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 500000.0, - "tie_word_embeddings": false, - "torch_dtype": "float32", - "transformers_version": "4.55.0", - "use_cache": false, - "vocab_size": 128256 -} diff --git a/metallama3_8b/limo/checkpoint-1435/generation_config.json b/metallama3_8b/limo/checkpoint-1435/generation_config.json deleted file mode 100644 index f53ccb516e57388491adda6b9950bcfa872e93ae..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1435/generation_config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 128000, - "eos_token_id": 128009, - "transformers_version": "4.55.0", - "use_cache": false -} diff --git a/metallama3_8b/limo/checkpoint-1435/model-00001-of-00007.safetensors b/metallama3_8b/limo/checkpoint-1435/model-00001-of-00007.safetensors deleted file mode 100644 index 32bfcfa1c484dc0df86c65d0fb10a3bcb2bc090d..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1435/model-00001-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:257decce16baf82a843049ad762a87f7439a6bdb184c6204dd513c609896953c -size 4886466168 diff --git a/metallama3_8b/limo/checkpoint-1435/model-00002-of-00007.safetensors b/metallama3_8b/limo/checkpoint-1435/model-00002-of-00007.safetensors deleted file mode 100644 index 9e2b930608dc2f7be388f21330b610b98f8078c3..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1435/model-00002-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c2d574a8dbb3ff8b9a9b0ef3dd09973040f782d793e2bc99842f67ee884adc98 -size 4832007448 diff --git a/metallama3_8b/limo/checkpoint-1435/model-00003-of-00007.safetensors b/metallama3_8b/limo/checkpoint-1435/model-00003-of-00007.safetensors deleted file mode 100644 index 593128d4d6d18b1f592b111f89f8846fb3104712..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1435/model-00003-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0cfb754a972a09fd70061999fd122c17e33e0ef3eaaecf489a04b0ab9e3a4d43 -size 4999813112 diff --git a/metallama3_8b/limo/checkpoint-1435/model-00004-of-00007.safetensors b/metallama3_8b/limo/checkpoint-1435/model-00004-of-00007.safetensors deleted file mode 100644 index 0e47b8b9bdcd895a45e4c277d30a370aa07dcdcf..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1435/model-00004-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:28369c2d4f56479f76322be832b907e457129041a27964342f53537d5e5961cb -size 4999813128 diff --git a/metallama3_8b/limo/checkpoint-1435/model-00005-of-00007.safetensors b/metallama3_8b/limo/checkpoint-1435/model-00005-of-00007.safetensors deleted file mode 100644 index 14cff3ddf82706665bf03b016f79c8ade489ac8d..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1435/model-00005-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a8a6727b7e2edaea643fb6545408104c0c73728a76ed726c7f3523a8c8f87bb8 -size 4832007496 diff --git a/metallama3_8b/limo/checkpoint-1435/model-00006-of-00007.safetensors b/metallama3_8b/limo/checkpoint-1435/model-00006-of-00007.safetensors deleted file mode 100644 index 9a0ea2fb7f19ad357d52deb0e964c00c60cc0de5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1435/model-00006-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:674912e7a142673c30c5c5c0da3c29957e9b201003b3a2ac33a2a79a8b417048 -size 4999813120 diff --git a/metallama3_8b/limo/checkpoint-1435/model-00007-of-00007.safetensors b/metallama3_8b/limo/checkpoint-1435/model-00007-of-00007.safetensors deleted file mode 100644 index 8d8b84887c3d7e41d5de7ca3117e63a243a99800..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1435/model-00007-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:68d31f4c815246a9658b8877f5badc1071c68788065fea8209f16735291dc8eb -size 2571158184 diff --git a/metallama3_8b/limo/checkpoint-1435/model.safetensors.index.json b/metallama3_8b/limo/checkpoint-1435/model.safetensors.index.json deleted file mode 100644 index 30d31d54f352f0c71ad48745af612a088822fa48..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1435/model.safetensors.index.json +++ /dev/null @@ -1,299 +0,0 @@ -{ - "metadata": { - "total_parameters": 2007565312, - "total_size": 32121044992 - }, - "weight_map": { - "lm_head.weight": "model-00007-of-00007.safetensors", - "model.embed_tokens.weight": "model-00001-of-00007.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.norm.weight": "model-00007-of-00007.safetensors" - } -} diff --git a/metallama3_8b/limo/checkpoint-1435/rng_state_0.pth b/metallama3_8b/limo/checkpoint-1435/rng_state_0.pth deleted file mode 100644 index be2e24cc9d9ef8857272cec1451c810e205ec4e9..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1435/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ef002048764051a71fb00f8f978e9ec32b780dc850bdb059af362cc56494234b -size 15024 diff --git a/metallama3_8b/limo/checkpoint-1435/rng_state_1.pth b/metallama3_8b/limo/checkpoint-1435/rng_state_1.pth deleted file mode 100644 index efcf4dd2e74596ac28af81f9f8bd0be9a807deb3..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1435/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:37194a6d48612e1a46a2d5d317ead97c70d9fc4569b0118fcd5f84c3dc9daa5a -size 15024 diff --git a/metallama3_8b/limo/checkpoint-1435/rng_state_2.pth b/metallama3_8b/limo/checkpoint-1435/rng_state_2.pth deleted file mode 100644 index 4c9222e37d4e9d1745c0e126e0fe0c4a348e298d..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1435/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:17c179483659a784aa1ace2427daff48c556a6bcc3c330e6f3274e4dc95e4b49 -size 15024 diff --git a/metallama3_8b/limo/checkpoint-1435/rng_state_3.pth b/metallama3_8b/limo/checkpoint-1435/rng_state_3.pth deleted file mode 100644 index 7821bf0f5f0621fd0159152432f0a7bc66aa6823..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1435/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b56857c9b117629f35af2c3d64f522d33a9d8aa94faa81ec6956380a895118c4 -size 15024 diff --git a/metallama3_8b/limo/checkpoint-1435/scheduler.pt b/metallama3_8b/limo/checkpoint-1435/scheduler.pt deleted file mode 100644 index 245a5a0b67ff87b54a20213f6b8aacb9d2f36219..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1435/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fdad6b9516b30f2939653936ed8ada3abce4dacd244729258cb79512d2e56957 -size 1064 diff --git a/metallama3_8b/limo/checkpoint-1435/special_tokens_map.json b/metallama3_8b/limo/checkpoint-1435/special_tokens_map.json deleted file mode 100644 index 14daf4588e61b4e4983af0fccaba4d5500c0977c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1435/special_tokens_map.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "additional_special_tokens": [ - { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } - ], - "bos_token": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": "<|eot_id|>" -} diff --git a/metallama3_8b/limo/checkpoint-1435/tokenizer.json b/metallama3_8b/limo/checkpoint-1435/tokenizer.json deleted file mode 100644 index 172311123ab62378f1f6d90f3068a676b7d939ed..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1435/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c1dcab308e7cf5970ea38815e0a62887d705c5b436f869ca27a5dcdd40c36a6 -size 17210148 diff --git a/metallama3_8b/limo/checkpoint-1435/tokenizer_config.json b/metallama3_8b/limo/checkpoint-1435/tokenizer_config.json deleted file mode 100644 index 6739fcd129e717b71b64001dcb25a03c143d66f5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1435/tokenizer_config.json +++ /dev/null @@ -1,2076 +0,0 @@ -{ - "added_tokens_decoder": { - "128000": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128001": { - "content": "<|end_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128002": { - "content": "<|reserved_special_token_0|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128003": { - "content": "<|reserved_special_token_1|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128004": { - "content": "<|reserved_special_token_2|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128005": { - "content": "<|reserved_special_token_3|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128006": { - "content": "<|start_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128007": { - "content": "<|end_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128008": { - "content": "<|reserved_special_token_4|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128009": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128010": { - "content": "<|reserved_special_token_5|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128011": { - "content": "<|reserved_special_token_6|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128012": { - "content": "<|reserved_special_token_7|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128013": { - "content": "<|reserved_special_token_8|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128014": { - "content": "<|reserved_special_token_9|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128015": { - "content": "<|reserved_special_token_10|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128016": { - "content": "<|reserved_special_token_11|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128017": { - "content": "<|reserved_special_token_12|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128018": { - "content": "<|reserved_special_token_13|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128019": { - "content": "<|reserved_special_token_14|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128020": { - "content": "<|reserved_special_token_15|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128021": { - "content": "<|reserved_special_token_16|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128022": { - "content": "<|reserved_special_token_17|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128023": { - "content": "<|reserved_special_token_18|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128024": { - "content": "<|reserved_special_token_19|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128025": { - "content": "<|reserved_special_token_20|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128026": { - "content": "<|reserved_special_token_21|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128027": { - "content": "<|reserved_special_token_22|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128028": { - "content": "<|reserved_special_token_23|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128029": { - "content": "<|reserved_special_token_24|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128030": { - "content": "<|reserved_special_token_25|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128031": { - "content": "<|reserved_special_token_26|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128032": { - "content": "<|reserved_special_token_27|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128033": { - "content": "<|reserved_special_token_28|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128034": { - "content": "<|reserved_special_token_29|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128035": { - "content": "<|reserved_special_token_30|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128036": { - "content": "<|reserved_special_token_31|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128037": { - "content": "<|reserved_special_token_32|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128038": { - "content": "<|reserved_special_token_33|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128039": { - "content": "<|reserved_special_token_34|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128040": { - "content": "<|reserved_special_token_35|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128041": { - "content": "<|reserved_special_token_36|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128042": { - "content": "<|reserved_special_token_37|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128043": { - "content": "<|reserved_special_token_38|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128044": { - "content": "<|reserved_special_token_39|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128045": { - "content": "<|reserved_special_token_40|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128046": { - "content": "<|reserved_special_token_41|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128047": { - "content": "<|reserved_special_token_42|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128048": { - "content": "<|reserved_special_token_43|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128049": { - "content": "<|reserved_special_token_44|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128050": { - "content": "<|reserved_special_token_45|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128051": { - "content": "<|reserved_special_token_46|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128052": { - "content": "<|reserved_special_token_47|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128053": { - "content": "<|reserved_special_token_48|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128054": { - "content": "<|reserved_special_token_49|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128055": { - "content": "<|reserved_special_token_50|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128056": { - "content": "<|reserved_special_token_51|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128057": { - "content": "<|reserved_special_token_52|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128058": { - "content": "<|reserved_special_token_53|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128059": { - "content": "<|reserved_special_token_54|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128060": { - "content": "<|reserved_special_token_55|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128061": { - "content": "<|reserved_special_token_56|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128062": { - "content": "<|reserved_special_token_57|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128063": { - "content": "<|reserved_special_token_58|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128064": { - "content": "<|reserved_special_token_59|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128065": { - "content": "<|reserved_special_token_60|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128066": { - "content": "<|reserved_special_token_61|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128067": { - "content": "<|reserved_special_token_62|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128068": { - "content": "<|reserved_special_token_63|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128069": { - "content": "<|reserved_special_token_64|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128070": { - "content": "<|reserved_special_token_65|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128071": { - "content": "<|reserved_special_token_66|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128072": { - "content": "<|reserved_special_token_67|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128073": { - "content": "<|reserved_special_token_68|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128074": { - "content": "<|reserved_special_token_69|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128075": { - "content": "<|reserved_special_token_70|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128076": { - "content": "<|reserved_special_token_71|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128077": { - "content": "<|reserved_special_token_72|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128078": { - "content": "<|reserved_special_token_73|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128079": { - "content": "<|reserved_special_token_74|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128080": { - "content": "<|reserved_special_token_75|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128081": { - "content": "<|reserved_special_token_76|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128082": { - "content": "<|reserved_special_token_77|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128083": { - "content": "<|reserved_special_token_78|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128084": { - "content": "<|reserved_special_token_79|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128085": { - "content": "<|reserved_special_token_80|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128086": { - "content": "<|reserved_special_token_81|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128087": { - "content": "<|reserved_special_token_82|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128088": { - "content": "<|reserved_special_token_83|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128089": { - "content": "<|reserved_special_token_84|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128090": { - "content": "<|reserved_special_token_85|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128091": { - "content": "<|reserved_special_token_86|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128092": { - "content": "<|reserved_special_token_87|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128093": { - "content": "<|reserved_special_token_88|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128094": { - "content": "<|reserved_special_token_89|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128095": { - "content": "<|reserved_special_token_90|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128096": { - "content": "<|reserved_special_token_91|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128097": { - "content": "<|reserved_special_token_92|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128098": { - "content": "<|reserved_special_token_93|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128099": { - "content": "<|reserved_special_token_94|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128100": { - "content": "<|reserved_special_token_95|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128101": { - "content": "<|reserved_special_token_96|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128102": { - "content": "<|reserved_special_token_97|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128103": { - "content": "<|reserved_special_token_98|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128104": { - "content": "<|reserved_special_token_99|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128105": { - "content": "<|reserved_special_token_100|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128106": { - "content": "<|reserved_special_token_101|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128107": { - "content": "<|reserved_special_token_102|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128108": { - "content": "<|reserved_special_token_103|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128109": { - "content": "<|reserved_special_token_104|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128110": { - "content": "<|reserved_special_token_105|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128111": { - "content": "<|reserved_special_token_106|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128112": { - "content": "<|reserved_special_token_107|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128113": { - "content": "<|reserved_special_token_108|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128114": { - "content": "<|reserved_special_token_109|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128115": { - "content": "<|reserved_special_token_110|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128116": { - "content": "<|reserved_special_token_111|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128117": { - "content": "<|reserved_special_token_112|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128118": { - "content": "<|reserved_special_token_113|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128119": { - "content": "<|reserved_special_token_114|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128120": { - "content": "<|reserved_special_token_115|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128121": { - "content": "<|reserved_special_token_116|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128122": { - "content": "<|reserved_special_token_117|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128123": { - "content": "<|reserved_special_token_118|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128124": { - "content": "<|reserved_special_token_119|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128125": { - "content": "<|reserved_special_token_120|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128126": { - "content": "<|reserved_special_token_121|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128127": { - "content": "<|reserved_special_token_122|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128128": { - "content": "<|reserved_special_token_123|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128129": { - "content": "<|reserved_special_token_124|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128130": { - "content": "<|reserved_special_token_125|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128131": { - "content": "<|reserved_special_token_126|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128132": { - "content": "<|reserved_special_token_127|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128133": { - "content": "<|reserved_special_token_128|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128134": { - "content": "<|reserved_special_token_129|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128135": { - "content": "<|reserved_special_token_130|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128136": { - "content": "<|reserved_special_token_131|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128137": { - "content": "<|reserved_special_token_132|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128138": { - "content": "<|reserved_special_token_133|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128139": { - "content": "<|reserved_special_token_134|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128140": { - "content": "<|reserved_special_token_135|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128141": { - "content": "<|reserved_special_token_136|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128142": { - "content": "<|reserved_special_token_137|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128143": { - "content": "<|reserved_special_token_138|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128144": { - "content": "<|reserved_special_token_139|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128145": { - "content": "<|reserved_special_token_140|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128146": { - "content": "<|reserved_special_token_141|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128147": { - "content": "<|reserved_special_token_142|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128148": { - "content": "<|reserved_special_token_143|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128149": { - "content": "<|reserved_special_token_144|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128150": { - "content": "<|reserved_special_token_145|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128151": { - "content": "<|reserved_special_token_146|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128152": { - "content": "<|reserved_special_token_147|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128153": { - "content": "<|reserved_special_token_148|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128154": { - "content": "<|reserved_special_token_149|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128155": { - "content": "<|reserved_special_token_150|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128156": { - "content": "<|reserved_special_token_151|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128157": { - "content": "<|reserved_special_token_152|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128158": { - "content": "<|reserved_special_token_153|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128159": { - "content": "<|reserved_special_token_154|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128160": { - "content": "<|reserved_special_token_155|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128161": { - "content": "<|reserved_special_token_156|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128162": { - "content": "<|reserved_special_token_157|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128163": { - "content": "<|reserved_special_token_158|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128164": { - "content": "<|reserved_special_token_159|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128165": { - "content": "<|reserved_special_token_160|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128166": { - "content": "<|reserved_special_token_161|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128167": { - "content": "<|reserved_special_token_162|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128168": { - "content": "<|reserved_special_token_163|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128169": { - "content": "<|reserved_special_token_164|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128170": { - "content": "<|reserved_special_token_165|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128171": { - "content": "<|reserved_special_token_166|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128172": { - "content": "<|reserved_special_token_167|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128173": { - "content": "<|reserved_special_token_168|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128174": { - "content": "<|reserved_special_token_169|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128175": { - "content": "<|reserved_special_token_170|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128176": { - "content": "<|reserved_special_token_171|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128177": { - "content": "<|reserved_special_token_172|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128178": { - "content": "<|reserved_special_token_173|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128179": { - "content": "<|reserved_special_token_174|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128180": { - "content": "<|reserved_special_token_175|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128181": { - "content": "<|reserved_special_token_176|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128182": { - "content": "<|reserved_special_token_177|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128183": { - "content": "<|reserved_special_token_178|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128184": { - "content": "<|reserved_special_token_179|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128185": { - "content": "<|reserved_special_token_180|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128186": { - "content": "<|reserved_special_token_181|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128187": { - "content": "<|reserved_special_token_182|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128188": { - "content": "<|reserved_special_token_183|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128189": { - "content": "<|reserved_special_token_184|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128190": { - "content": "<|reserved_special_token_185|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128191": { - "content": "<|reserved_special_token_186|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128192": { - "content": "<|reserved_special_token_187|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128193": { - "content": "<|reserved_special_token_188|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128194": { - "content": "<|reserved_special_token_189|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128195": { - "content": "<|reserved_special_token_190|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128196": { - "content": "<|reserved_special_token_191|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128197": { - "content": "<|reserved_special_token_192|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128198": { - "content": "<|reserved_special_token_193|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128199": { - "content": "<|reserved_special_token_194|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128200": { - "content": "<|reserved_special_token_195|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128201": { - "content": "<|reserved_special_token_196|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128202": { - "content": "<|reserved_special_token_197|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128203": { - "content": "<|reserved_special_token_198|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128204": { - "content": "<|reserved_special_token_199|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128205": { - "content": "<|reserved_special_token_200|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128206": { - "content": "<|reserved_special_token_201|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128207": { - "content": "<|reserved_special_token_202|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128208": { - "content": "<|reserved_special_token_203|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128209": { - "content": "<|reserved_special_token_204|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128210": { - "content": "<|reserved_special_token_205|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128211": { - "content": "<|reserved_special_token_206|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128212": { - "content": "<|reserved_special_token_207|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128213": { - "content": "<|reserved_special_token_208|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128214": { - "content": "<|reserved_special_token_209|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128215": { - "content": "<|reserved_special_token_210|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128216": { - "content": "<|reserved_special_token_211|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128217": { - "content": "<|reserved_special_token_212|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128218": { - "content": "<|reserved_special_token_213|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128219": { - "content": "<|reserved_special_token_214|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128220": { - "content": "<|reserved_special_token_215|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128221": { - "content": "<|reserved_special_token_216|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128222": { - "content": "<|reserved_special_token_217|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128223": { - "content": "<|reserved_special_token_218|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128224": { - "content": "<|reserved_special_token_219|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128225": { - "content": "<|reserved_special_token_220|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128226": { - "content": "<|reserved_special_token_221|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128227": { - "content": "<|reserved_special_token_222|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128228": { - "content": "<|reserved_special_token_223|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128229": { - "content": "<|reserved_special_token_224|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128230": { - "content": "<|reserved_special_token_225|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128231": { - "content": "<|reserved_special_token_226|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128232": { - "content": "<|reserved_special_token_227|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128233": { - "content": "<|reserved_special_token_228|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128234": { - "content": "<|reserved_special_token_229|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128235": { - "content": "<|reserved_special_token_230|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128236": { - "content": "<|reserved_special_token_231|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128237": { - "content": "<|reserved_special_token_232|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128238": { - "content": "<|reserved_special_token_233|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128239": { - "content": "<|reserved_special_token_234|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128240": { - "content": "<|reserved_special_token_235|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128241": { - "content": "<|reserved_special_token_236|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128242": { - "content": "<|reserved_special_token_237|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128243": { - "content": "<|reserved_special_token_238|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128244": { - "content": "<|reserved_special_token_239|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128245": { - "content": "<|reserved_special_token_240|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128246": { - "content": "<|reserved_special_token_241|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128247": { - "content": "<|reserved_special_token_242|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128248": { - "content": "<|reserved_special_token_243|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128249": { - "content": "<|reserved_special_token_244|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128250": { - "content": "<|reserved_special_token_245|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128251": { - "content": "<|reserved_special_token_246|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128252": { - "content": "<|reserved_special_token_247|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128253": { - "content": "<|reserved_special_token_248|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128254": { - "content": "<|reserved_special_token_249|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128255": { - "content": "<|reserved_special_token_250|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128256": { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - } - }, - "additional_special_tokens": [ - "<|eom_id|>" - ], - "bos_token": "<|begin_of_text|>", - "clean_up_tokenization_spaces": true, - "eos_token": "<|eot_id|>", - "extra_special_tokens": {}, - "model_input_names": [ - "input_ids", - "attention_mask" - ], - "model_max_length": 1000000000000000019884624838656, - "pad_token": "<|eot_id|>", - "padding_side": "right", - "split_special_tokens": false, - "tokenizer_class": "PreTrainedTokenizerFast" -} diff --git a/metallama3_8b/limo/checkpoint-1435/trainer_state.json b/metallama3_8b/limo/checkpoint-1435/trainer_state.json deleted file mode 100644 index 7c92fee63c2833dea17a750774b4733276126401..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1435/trainer_state.json +++ /dev/null @@ -1,10079 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 7.0, - "eval_steps": 500, - "global_step": 1435, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.004878048780487805, - "grad_norm": 27.79998016357422, - "learning_rate": 5e-06, - "loss": 1.4179, - "step": 1 - }, - { - "epoch": 0.00975609756097561, - "grad_norm": 4.086409091949463, - "learning_rate": 4.999997064365715e-06, - "loss": 1.1405, - "step": 2 - }, - { - "epoch": 0.014634146341463415, - "grad_norm": 4.499151229858398, - "learning_rate": 4.999988257469751e-06, - "loss": 0.8682, - "step": 3 - }, - { - "epoch": 0.01951219512195122, - "grad_norm": 4.555822849273682, - "learning_rate": 4.999973579332793e-06, - "loss": 0.9961, - "step": 4 - }, - { - "epoch": 0.024390243902439025, - "grad_norm": 5.6235246658325195, - "learning_rate": 4.999953029989312e-06, - "loss": 1.0173, - "step": 5 - }, - { - "epoch": 0.02926829268292683, - "grad_norm": 3.9943182468414307, - "learning_rate": 4.999926609487568e-06, - "loss": 1.1083, - "step": 6 - }, - { - "epoch": 0.03414634146341464, - "grad_norm": 5.685941219329834, - "learning_rate": 4.9998943178896106e-06, - "loss": 1.1109, - "step": 7 - }, - { - "epoch": 0.03902439024390244, - "grad_norm": 15.914257049560547, - "learning_rate": 4.999856155271276e-06, - "loss": 1.821, - "step": 8 - }, - { - "epoch": 0.04390243902439024, - "grad_norm": 4.147185325622559, - "learning_rate": 4.999812121722191e-06, - "loss": 1.0417, - "step": 9 - }, - { - "epoch": 0.04878048780487805, - "grad_norm": 11.123332977294922, - "learning_rate": 4.999762217345766e-06, - "loss": 1.5672, - "step": 10 - }, - { - "epoch": 0.05365853658536585, - "grad_norm": 2.842331886291504, - "learning_rate": 4.999706442259205e-06, - "loss": 0.7297, - "step": 11 - }, - { - "epoch": 0.05853658536585366, - "grad_norm": 37.685062408447266, - "learning_rate": 4.999644796593492e-06, - "loss": 0.9112, - "step": 12 - }, - { - "epoch": 0.06341463414634146, - "grad_norm": 11.214252471923828, - "learning_rate": 4.999577280493407e-06, - "loss": 0.7854, - "step": 13 - }, - { - "epoch": 0.06829268292682927, - "grad_norm": 5.10387659072876, - "learning_rate": 4.99950389411751e-06, - "loss": 1.1317, - "step": 14 - }, - { - "epoch": 0.07317073170731707, - "grad_norm": 3.685403347015381, - "learning_rate": 4.999424637638148e-06, - "loss": 0.7864, - "step": 15 - }, - { - "epoch": 0.07804878048780488, - "grad_norm": 2.9567184448242188, - "learning_rate": 4.999339511241458e-06, - "loss": 0.8494, - "step": 16 - }, - { - "epoch": 0.08292682926829269, - "grad_norm": 11.396956443786621, - "learning_rate": 4.9992485151273584e-06, - "loss": 1.2189, - "step": 17 - }, - { - "epoch": 0.08780487804878048, - "grad_norm": 7.007385730743408, - "learning_rate": 4.999151649509554e-06, - "loss": 1.0532, - "step": 18 - }, - { - "epoch": 0.09268292682926829, - "grad_norm": 3.4347329139709473, - "learning_rate": 4.9990489146155356e-06, - "loss": 1.088, - "step": 19 - }, - { - "epoch": 0.0975609756097561, - "grad_norm": 3.1865031719207764, - "learning_rate": 4.9989403106865765e-06, - "loss": 1.0414, - "step": 20 - }, - { - "epoch": 0.1024390243902439, - "grad_norm": 3.4605791568756104, - "learning_rate": 4.9988258379777334e-06, - "loss": 0.8878, - "step": 21 - }, - { - "epoch": 0.1073170731707317, - "grad_norm": 2.860478639602661, - "learning_rate": 4.998705496757846e-06, - "loss": 0.9151, - "step": 22 - }, - { - "epoch": 0.11219512195121951, - "grad_norm": 9.101946830749512, - "learning_rate": 4.998579287309538e-06, - "loss": 1.4304, - "step": 23 - }, - { - "epoch": 0.11707317073170732, - "grad_norm": 24.21122169494629, - "learning_rate": 4.998447209929211e-06, - "loss": 1.0858, - "step": 24 - }, - { - "epoch": 0.12195121951219512, - "grad_norm": 3.286980152130127, - "learning_rate": 4.998309264927053e-06, - "loss": 0.6571, - "step": 25 - }, - { - "epoch": 0.12682926829268293, - "grad_norm": 4.0232062339782715, - "learning_rate": 4.998165452627025e-06, - "loss": 0.8493, - "step": 26 - }, - { - "epoch": 0.13170731707317074, - "grad_norm": 3.7688663005828857, - "learning_rate": 4.998015773366874e-06, - "loss": 0.9224, - "step": 27 - }, - { - "epoch": 0.13658536585365855, - "grad_norm": 2.9382026195526123, - "learning_rate": 4.997860227498122e-06, - "loss": 0.7588, - "step": 28 - }, - { - "epoch": 0.14146341463414633, - "grad_norm": 4.327457904815674, - "learning_rate": 4.99769881538607e-06, - "loss": 1.1817, - "step": 29 - }, - { - "epoch": 0.14634146341463414, - "grad_norm": 3.47487735748291, - "learning_rate": 4.997531537409794e-06, - "loss": 1.0737, - "step": 30 - }, - { - "epoch": 0.15121951219512195, - "grad_norm": 3.0616214275360107, - "learning_rate": 4.99735839396215e-06, - "loss": 0.7899, - "step": 31 - }, - { - "epoch": 0.15609756097560976, - "grad_norm": 3.065070152282715, - "learning_rate": 4.9971793854497655e-06, - "loss": 0.7745, - "step": 32 - }, - { - "epoch": 0.16097560975609757, - "grad_norm": 3.5202279090881348, - "learning_rate": 4.996994512293042e-06, - "loss": 0.984, - "step": 33 - }, - { - "epoch": 0.16585365853658537, - "grad_norm": 3.421769142150879, - "learning_rate": 4.996803774926157e-06, - "loss": 0.8235, - "step": 34 - }, - { - "epoch": 0.17073170731707318, - "grad_norm": 4.6582207679748535, - "learning_rate": 4.996607173797059e-06, - "loss": 1.3227, - "step": 35 - }, - { - "epoch": 0.17560975609756097, - "grad_norm": 2.9829282760620117, - "learning_rate": 4.996404709367466e-06, - "loss": 0.8854, - "step": 36 - }, - { - "epoch": 0.18048780487804877, - "grad_norm": 2.5982632637023926, - "learning_rate": 4.996196382112868e-06, - "loss": 0.6786, - "step": 37 - }, - { - "epoch": 0.18536585365853658, - "grad_norm": 2.9807393550872803, - "learning_rate": 4.9959821925225235e-06, - "loss": 0.9344, - "step": 38 - }, - { - "epoch": 0.1902439024390244, - "grad_norm": 2.7364351749420166, - "learning_rate": 4.995762141099456e-06, - "loss": 0.814, - "step": 39 - }, - { - "epoch": 0.1951219512195122, - "grad_norm": 3.4324638843536377, - "learning_rate": 4.995536228360461e-06, - "loss": 1.0276, - "step": 40 - }, - { - "epoch": 0.2, - "grad_norm": 2.911834716796875, - "learning_rate": 4.995304454836095e-06, - "loss": 0.9291, - "step": 41 - }, - { - "epoch": 0.2048780487804878, - "grad_norm": 3.0294723510742188, - "learning_rate": 4.9950668210706795e-06, - "loss": 0.8145, - "step": 42 - }, - { - "epoch": 0.2097560975609756, - "grad_norm": 4.681829452514648, - "learning_rate": 4.994823327622299e-06, - "loss": 0.8779, - "step": 43 - }, - { - "epoch": 0.2146341463414634, - "grad_norm": 3.643914222717285, - "learning_rate": 4.9945739750628e-06, - "loss": 0.8196, - "step": 44 - }, - { - "epoch": 0.21951219512195122, - "grad_norm": 2.7542076110839844, - "learning_rate": 4.994318763977789e-06, - "loss": 0.8443, - "step": 45 - }, - { - "epoch": 0.22439024390243903, - "grad_norm": 6.873605728149414, - "learning_rate": 4.994057694966632e-06, - "loss": 1.0328, - "step": 46 - }, - { - "epoch": 0.22926829268292684, - "grad_norm": 3.11810040473938, - "learning_rate": 4.993790768642449e-06, - "loss": 1.0673, - "step": 47 - }, - { - "epoch": 0.23414634146341465, - "grad_norm": 4.360548496246338, - "learning_rate": 4.99351798563212e-06, - "loss": 1.3198, - "step": 48 - }, - { - "epoch": 0.23902439024390243, - "grad_norm": 2.6894314289093018, - "learning_rate": 4.993239346576278e-06, - "loss": 0.8743, - "step": 49 - }, - { - "epoch": 0.24390243902439024, - "grad_norm": 3.2640421390533447, - "learning_rate": 4.99295485212931e-06, - "loss": 1.109, - "step": 50 - }, - { - "epoch": 0.24878048780487805, - "grad_norm": 3.1565866470336914, - "learning_rate": 4.992664502959351e-06, - "loss": 0.9291, - "step": 51 - }, - { - "epoch": 0.25365853658536586, - "grad_norm": 3.4829447269439697, - "learning_rate": 4.99236829974829e-06, - "loss": 0.8159, - "step": 52 - }, - { - "epoch": 0.25853658536585367, - "grad_norm": 2.7535626888275146, - "learning_rate": 4.992066243191762e-06, - "loss": 1.0359, - "step": 53 - }, - { - "epoch": 0.2634146341463415, - "grad_norm": 2.482935905456543, - "learning_rate": 4.991758333999148e-06, - "loss": 0.8091, - "step": 54 - }, - { - "epoch": 0.2682926829268293, - "grad_norm": 2.917445659637451, - "learning_rate": 4.991444572893575e-06, - "loss": 0.6925, - "step": 55 - }, - { - "epoch": 0.2731707317073171, - "grad_norm": 2.9802236557006836, - "learning_rate": 4.991124960611916e-06, - "loss": 0.6329, - "step": 56 - }, - { - "epoch": 0.2780487804878049, - "grad_norm": 2.9677224159240723, - "learning_rate": 4.99079949790478e-06, - "loss": 0.8069, - "step": 57 - }, - { - "epoch": 0.28292682926829266, - "grad_norm": 2.8304293155670166, - "learning_rate": 4.99046818553652e-06, - "loss": 0.8682, - "step": 58 - }, - { - "epoch": 0.28780487804878047, - "grad_norm": 5.253443717956543, - "learning_rate": 4.9901310242852246e-06, - "loss": 1.1069, - "step": 59 - }, - { - "epoch": 0.2926829268292683, - "grad_norm": 3.686016082763672, - "learning_rate": 4.9897880149427206e-06, - "loss": 0.9465, - "step": 60 - }, - { - "epoch": 0.2975609756097561, - "grad_norm": 3.6372263431549072, - "learning_rate": 4.989439158314566e-06, - "loss": 0.9738, - "step": 61 - }, - { - "epoch": 0.3024390243902439, - "grad_norm": 3.0756819248199463, - "learning_rate": 4.989084455220056e-06, - "loss": 0.6417, - "step": 62 - }, - { - "epoch": 0.3073170731707317, - "grad_norm": 3.379222869873047, - "learning_rate": 4.988723906492212e-06, - "loss": 1.0092, - "step": 63 - }, - { - "epoch": 0.3121951219512195, - "grad_norm": 3.4571032524108887, - "learning_rate": 4.988357512977785e-06, - "loss": 0.6691, - "step": 64 - }, - { - "epoch": 0.3170731707317073, - "grad_norm": 3.1982104778289795, - "learning_rate": 4.987985275537252e-06, - "loss": 0.6651, - "step": 65 - }, - { - "epoch": 0.32195121951219513, - "grad_norm": 2.9723124504089355, - "learning_rate": 4.9876071950448185e-06, - "loss": 0.9227, - "step": 66 - }, - { - "epoch": 0.32682926829268294, - "grad_norm": 2.5521399974823, - "learning_rate": 4.987223272388407e-06, - "loss": 0.6664, - "step": 67 - }, - { - "epoch": 0.33170731707317075, - "grad_norm": 2.8934121131896973, - "learning_rate": 4.986833508469663e-06, - "loss": 0.997, - "step": 68 - }, - { - "epoch": 0.33658536585365856, - "grad_norm": 4.7546586990356445, - "learning_rate": 4.98643790420395e-06, - "loss": 0.8551, - "step": 69 - }, - { - "epoch": 0.34146341463414637, - "grad_norm": 3.091616153717041, - "learning_rate": 4.986036460520348e-06, - "loss": 0.8874, - "step": 70 - }, - { - "epoch": 0.3463414634146341, - "grad_norm": 4.1724677085876465, - "learning_rate": 4.98562917836165e-06, - "loss": 1.1393, - "step": 71 - }, - { - "epoch": 0.35121951219512193, - "grad_norm": 2.6568572521209717, - "learning_rate": 4.985216058684362e-06, - "loss": 0.6379, - "step": 72 - }, - { - "epoch": 0.35609756097560974, - "grad_norm": 2.396416187286377, - "learning_rate": 4.984797102458697e-06, - "loss": 1.0292, - "step": 73 - }, - { - "epoch": 0.36097560975609755, - "grad_norm": 3.0667319297790527, - "learning_rate": 4.984372310668579e-06, - "loss": 0.7048, - "step": 74 - }, - { - "epoch": 0.36585365853658536, - "grad_norm": 2.4820518493652344, - "learning_rate": 4.983941684311633e-06, - "loss": 1.2353, - "step": 75 - }, - { - "epoch": 0.37073170731707317, - "grad_norm": 4.062836647033691, - "learning_rate": 4.983505224399188e-06, - "loss": 0.8933, - "step": 76 - }, - { - "epoch": 0.375609756097561, - "grad_norm": 2.4480767250061035, - "learning_rate": 4.983062931956275e-06, - "loss": 0.8221, - "step": 77 - }, - { - "epoch": 0.3804878048780488, - "grad_norm": 3.134138822555542, - "learning_rate": 4.9826148080216195e-06, - "loss": 0.8899, - "step": 78 - }, - { - "epoch": 0.3853658536585366, - "grad_norm": 2.8165836334228516, - "learning_rate": 4.9821608536476445e-06, - "loss": 1.2451, - "step": 79 - }, - { - "epoch": 0.3902439024390244, - "grad_norm": 3.734433650970459, - "learning_rate": 4.981701069900465e-06, - "loss": 0.8536, - "step": 80 - }, - { - "epoch": 0.3951219512195122, - "grad_norm": 2.853421449661255, - "learning_rate": 4.9812354578598876e-06, - "loss": 0.7857, - "step": 81 - }, - { - "epoch": 0.4, - "grad_norm": 2.541687250137329, - "learning_rate": 4.980764018619405e-06, - "loss": 0.8332, - "step": 82 - }, - { - "epoch": 0.40487804878048783, - "grad_norm": 4.405911445617676, - "learning_rate": 4.980286753286196e-06, - "loss": 0.9927, - "step": 83 - }, - { - "epoch": 0.4097560975609756, - "grad_norm": 3.3034985065460205, - "learning_rate": 4.97980366298112e-06, - "loss": 0.8161, - "step": 84 - }, - { - "epoch": 0.4146341463414634, - "grad_norm": 2.6678085327148438, - "learning_rate": 4.97931474883872e-06, - "loss": 0.8017, - "step": 85 - }, - { - "epoch": 0.4195121951219512, - "grad_norm": 2.58524227142334, - "learning_rate": 4.978820012007213e-06, - "loss": 0.8811, - "step": 86 - }, - { - "epoch": 0.424390243902439, - "grad_norm": 2.482597827911377, - "learning_rate": 4.978319453648495e-06, - "loss": 0.9461, - "step": 87 - }, - { - "epoch": 0.4292682926829268, - "grad_norm": 2.5731301307678223, - "learning_rate": 4.977813074938128e-06, - "loss": 0.8835, - "step": 88 - }, - { - "epoch": 0.43414634146341463, - "grad_norm": 2.7914488315582275, - "learning_rate": 4.977300877065347e-06, - "loss": 0.8466, - "step": 89 - }, - { - "epoch": 0.43902439024390244, - "grad_norm": 2.416043758392334, - "learning_rate": 4.976782861233053e-06, - "loss": 0.7132, - "step": 90 - }, - { - "epoch": 0.44390243902439025, - "grad_norm": 3.7616264820098877, - "learning_rate": 4.976259028657812e-06, - "loss": 0.7639, - "step": 91 - }, - { - "epoch": 0.44878048780487806, - "grad_norm": 2.6081621646881104, - "learning_rate": 4.975729380569845e-06, - "loss": 0.8055, - "step": 92 - }, - { - "epoch": 0.45365853658536587, - "grad_norm": 3.3343570232391357, - "learning_rate": 4.975193918213035e-06, - "loss": 0.6042, - "step": 93 - }, - { - "epoch": 0.4585365853658537, - "grad_norm": 2.517544746398926, - "learning_rate": 4.974652642844921e-06, - "loss": 0.7672, - "step": 94 - }, - { - "epoch": 0.4634146341463415, - "grad_norm": 4.173468589782715, - "learning_rate": 4.974105555736693e-06, - "loss": 1.0682, - "step": 95 - }, - { - "epoch": 0.4682926829268293, - "grad_norm": 2.8422317504882812, - "learning_rate": 4.973552658173186e-06, - "loss": 0.7841, - "step": 96 - }, - { - "epoch": 0.47317073170731705, - "grad_norm": 5.042182445526123, - "learning_rate": 4.972993951452887e-06, - "loss": 0.8851, - "step": 97 - }, - { - "epoch": 0.47804878048780486, - "grad_norm": 5.977590560913086, - "learning_rate": 4.9724294368879214e-06, - "loss": 0.9059, - "step": 98 - }, - { - "epoch": 0.48292682926829267, - "grad_norm": 4.227641582489014, - "learning_rate": 4.971859115804055e-06, - "loss": 1.0152, - "step": 99 - }, - { - "epoch": 0.4878048780487805, - "grad_norm": 3.180952548980713, - "learning_rate": 4.9712829895406935e-06, - "loss": 0.8092, - "step": 100 - }, - { - "epoch": 0.4926829268292683, - "grad_norm": 11.220394134521484, - "learning_rate": 4.970701059450872e-06, - "loss": 0.8239, - "step": 101 - }, - { - "epoch": 0.4975609756097561, - "grad_norm": 2.346975088119507, - "learning_rate": 4.970113326901258e-06, - "loss": 0.9283, - "step": 102 - }, - { - "epoch": 0.5024390243902439, - "grad_norm": 2.9470982551574707, - "learning_rate": 4.9695197932721455e-06, - "loss": 0.9429, - "step": 103 - }, - { - "epoch": 0.5073170731707317, - "grad_norm": 3.6048219203948975, - "learning_rate": 4.968920459957453e-06, - "loss": 0.9231, - "step": 104 - }, - { - "epoch": 0.5121951219512195, - "grad_norm": 2.8181886672973633, - "learning_rate": 4.968315328364719e-06, - "loss": 1.0005, - "step": 105 - }, - { - "epoch": 0.5170731707317073, - "grad_norm": 3.114147424697876, - "learning_rate": 4.9677043999151e-06, - "loss": 1.1326, - "step": 106 - }, - { - "epoch": 0.5219512195121951, - "grad_norm": 2.965885639190674, - "learning_rate": 4.967087676043366e-06, - "loss": 0.541, - "step": 107 - }, - { - "epoch": 0.526829268292683, - "grad_norm": 3.098677635192871, - "learning_rate": 4.966465158197897e-06, - "loss": 0.9473, - "step": 108 - }, - { - "epoch": 0.5317073170731708, - "grad_norm": 2.8640191555023193, - "learning_rate": 4.965836847840681e-06, - "loss": 0.6678, - "step": 109 - }, - { - "epoch": 0.5365853658536586, - "grad_norm": 3.0950934886932373, - "learning_rate": 4.96520274644731e-06, - "loss": 0.9251, - "step": 110 - }, - { - "epoch": 0.5414634146341464, - "grad_norm": 2.99444317817688, - "learning_rate": 4.964562855506976e-06, - "loss": 0.7807, - "step": 111 - }, - { - "epoch": 0.5463414634146342, - "grad_norm": 2.348639726638794, - "learning_rate": 4.963917176522466e-06, - "loss": 0.6395, - "step": 112 - }, - { - "epoch": 0.551219512195122, - "grad_norm": 3.5988354682922363, - "learning_rate": 4.963265711010164e-06, - "loss": 1.0658, - "step": 113 - }, - { - "epoch": 0.5560975609756098, - "grad_norm": 3.3423564434051514, - "learning_rate": 4.9626084605000395e-06, - "loss": 0.8974, - "step": 114 - }, - { - "epoch": 0.5609756097560976, - "grad_norm": 2.8353331089019775, - "learning_rate": 4.961945426535652e-06, - "loss": 0.6144, - "step": 115 - }, - { - "epoch": 0.5658536585365853, - "grad_norm": 2.752387046813965, - "learning_rate": 4.961276610674141e-06, - "loss": 0.9083, - "step": 116 - }, - { - "epoch": 0.5707317073170731, - "grad_norm": 2.2654404640197754, - "learning_rate": 4.960602014486225e-06, - "loss": 1.0101, - "step": 117 - }, - { - "epoch": 0.5756097560975609, - "grad_norm": 3.344377040863037, - "learning_rate": 4.959921639556199e-06, - "loss": 0.8391, - "step": 118 - }, - { - "epoch": 0.5804878048780487, - "grad_norm": 3.1620500087738037, - "learning_rate": 4.959235487481928e-06, - "loss": 1.0431, - "step": 119 - }, - { - "epoch": 0.5853658536585366, - "grad_norm": 2.857048273086548, - "learning_rate": 4.958543559874846e-06, - "loss": 0.5864, - "step": 120 - }, - { - "epoch": 0.5902439024390244, - "grad_norm": 3.1736063957214355, - "learning_rate": 4.9578458583599495e-06, - "loss": 0.7868, - "step": 121 - }, - { - "epoch": 0.5951219512195122, - "grad_norm": 3.5520827770233154, - "learning_rate": 4.957142384575795e-06, - "loss": 0.7901, - "step": 122 - }, - { - "epoch": 0.6, - "grad_norm": 3.265103578567505, - "learning_rate": 4.956433140174498e-06, - "loss": 0.9067, - "step": 123 - }, - { - "epoch": 0.6048780487804878, - "grad_norm": 3.1181187629699707, - "learning_rate": 4.9557181268217225e-06, - "loss": 0.8971, - "step": 124 - }, - { - "epoch": 0.6097560975609756, - "grad_norm": 2.4123694896698, - "learning_rate": 4.954997346196683e-06, - "loss": 1.2123, - "step": 125 - }, - { - "epoch": 0.6146341463414634, - "grad_norm": 2.9646875858306885, - "learning_rate": 4.954270799992138e-06, - "loss": 0.7696, - "step": 126 - }, - { - "epoch": 0.6195121951219512, - "grad_norm": 2.7457995414733887, - "learning_rate": 4.953538489914387e-06, - "loss": 0.7919, - "step": 127 - }, - { - "epoch": 0.624390243902439, - "grad_norm": 5.096850395202637, - "learning_rate": 4.9528004176832654e-06, - "loss": 0.6494, - "step": 128 - }, - { - "epoch": 0.6292682926829268, - "grad_norm": 3.124955177307129, - "learning_rate": 4.952056585032142e-06, - "loss": 1.0546, - "step": 129 - }, - { - "epoch": 0.6341463414634146, - "grad_norm": 2.4860167503356934, - "learning_rate": 4.951306993707913e-06, - "loss": 0.7907, - "step": 130 - }, - { - "epoch": 0.6390243902439025, - "grad_norm": 2.3380239009857178, - "learning_rate": 4.950551645470998e-06, - "loss": 0.7433, - "step": 131 - }, - { - "epoch": 0.6439024390243903, - "grad_norm": 2.8945236206054688, - "learning_rate": 4.9497905420953406e-06, - "loss": 0.7682, - "step": 132 - }, - { - "epoch": 0.6487804878048781, - "grad_norm": 3.429776430130005, - "learning_rate": 4.949023685368395e-06, - "loss": 0.8411, - "step": 133 - }, - { - "epoch": 0.6536585365853659, - "grad_norm": 2.8853516578674316, - "learning_rate": 4.948251077091131e-06, - "loss": 1.0792, - "step": 134 - }, - { - "epoch": 0.6585365853658537, - "grad_norm": 2.145598888397217, - "learning_rate": 4.947472719078025e-06, - "loss": 0.8033, - "step": 135 - }, - { - "epoch": 0.6634146341463415, - "grad_norm": 2.5064377784729004, - "learning_rate": 4.9466886131570565e-06, - "loss": 0.939, - "step": 136 - }, - { - "epoch": 0.6682926829268293, - "grad_norm": 2.5700225830078125, - "learning_rate": 4.945898761169704e-06, - "loss": 1.0418, - "step": 137 - }, - { - "epoch": 0.6731707317073171, - "grad_norm": 2.3390917778015137, - "learning_rate": 4.945103164970941e-06, - "loss": 0.6158, - "step": 138 - }, - { - "epoch": 0.6780487804878049, - "grad_norm": 2.1538751125335693, - "learning_rate": 4.9443018264292304e-06, - "loss": 0.6995, - "step": 139 - }, - { - "epoch": 0.6829268292682927, - "grad_norm": 5.255710601806641, - "learning_rate": 4.9434947474265225e-06, - "loss": 1.0382, - "step": 140 - }, - { - "epoch": 0.6878048780487804, - "grad_norm": 2.5547356605529785, - "learning_rate": 4.942681929858249e-06, - "loss": 1.037, - "step": 141 - }, - { - "epoch": 0.6926829268292682, - "grad_norm": 2.613280773162842, - "learning_rate": 4.941863375633315e-06, - "loss": 0.9071, - "step": 142 - }, - { - "epoch": 0.697560975609756, - "grad_norm": 2.9957327842712402, - "learning_rate": 4.9410390866741056e-06, - "loss": 0.7908, - "step": 143 - }, - { - "epoch": 0.7024390243902439, - "grad_norm": 2.410107374191284, - "learning_rate": 4.9402090649164655e-06, - "loss": 0.7739, - "step": 144 - }, - { - "epoch": 0.7073170731707317, - "grad_norm": 2.352013349533081, - "learning_rate": 4.9393733123097085e-06, - "loss": 0.939, - "step": 145 - }, - { - "epoch": 0.7121951219512195, - "grad_norm": 2.5164194107055664, - "learning_rate": 4.9385318308166065e-06, - "loss": 0.8729, - "step": 146 - }, - { - "epoch": 0.7170731707317073, - "grad_norm": 4.213881015777588, - "learning_rate": 4.937684622413385e-06, - "loss": 0.6124, - "step": 147 - }, - { - "epoch": 0.7219512195121951, - "grad_norm": 2.7950191497802734, - "learning_rate": 4.9368316890897185e-06, - "loss": 0.975, - "step": 148 - }, - { - "epoch": 0.7268292682926829, - "grad_norm": 2.8618874549865723, - "learning_rate": 4.9359730328487264e-06, - "loss": 0.5832, - "step": 149 - }, - { - "epoch": 0.7317073170731707, - "grad_norm": 2.6943812370300293, - "learning_rate": 4.935108655706972e-06, - "loss": 0.8124, - "step": 150 - }, - { - "epoch": 0.7365853658536585, - "grad_norm": 3.2164082527160645, - "learning_rate": 4.934238559694448e-06, - "loss": 1.1446, - "step": 151 - }, - { - "epoch": 0.7414634146341463, - "grad_norm": 3.05002498626709, - "learning_rate": 4.9333627468545845e-06, - "loss": 0.7884, - "step": 152 - }, - { - "epoch": 0.7463414634146341, - "grad_norm": 2.863351583480835, - "learning_rate": 4.932481219244231e-06, - "loss": 0.7918, - "step": 153 - }, - { - "epoch": 0.751219512195122, - "grad_norm": 2.4947102069854736, - "learning_rate": 4.931593978933666e-06, - "loss": 0.775, - "step": 154 - }, - { - "epoch": 0.7560975609756098, - "grad_norm": 2.918886184692383, - "learning_rate": 4.930701028006577e-06, - "loss": 0.993, - "step": 155 - }, - { - "epoch": 0.7609756097560976, - "grad_norm": 2.835956573486328, - "learning_rate": 4.929802368560066e-06, - "loss": 0.7911, - "step": 156 - }, - { - "epoch": 0.7658536585365854, - "grad_norm": 3.3073575496673584, - "learning_rate": 4.928898002704642e-06, - "loss": 0.9346, - "step": 157 - }, - { - "epoch": 0.7707317073170732, - "grad_norm": 3.086146354675293, - "learning_rate": 4.927987932564215e-06, - "loss": 0.817, - "step": 158 - }, - { - "epoch": 0.775609756097561, - "grad_norm": 2.5419743061065674, - "learning_rate": 4.927072160276092e-06, - "loss": 0.7918, - "step": 159 - }, - { - "epoch": 0.7804878048780488, - "grad_norm": 3.984297275543213, - "learning_rate": 4.926150687990969e-06, - "loss": 0.7153, - "step": 160 - }, - { - "epoch": 0.7853658536585366, - "grad_norm": 2.4703335762023926, - "learning_rate": 4.925223517872934e-06, - "loss": 0.8982, - "step": 161 - }, - { - "epoch": 0.7902439024390244, - "grad_norm": 2.81785249710083, - "learning_rate": 4.9242906520994484e-06, - "loss": 0.9839, - "step": 162 - }, - { - "epoch": 0.7951219512195122, - "grad_norm": 2.3304924964904785, - "learning_rate": 4.923352092861358e-06, - "loss": 0.8406, - "step": 163 - }, - { - "epoch": 0.8, - "grad_norm": 2.339498519897461, - "learning_rate": 4.922407842362875e-06, - "loss": 0.6602, - "step": 164 - }, - { - "epoch": 0.8048780487804879, - "grad_norm": 3.488255262374878, - "learning_rate": 4.921457902821578e-06, - "loss": 0.9779, - "step": 165 - }, - { - "epoch": 0.8097560975609757, - "grad_norm": 2.8528945446014404, - "learning_rate": 4.920502276468408e-06, - "loss": 0.8821, - "step": 166 - }, - { - "epoch": 0.8146341463414634, - "grad_norm": 3.4649784564971924, - "learning_rate": 4.9195409655476605e-06, - "loss": 0.7539, - "step": 167 - }, - { - "epoch": 0.8195121951219512, - "grad_norm": 2.3109042644500732, - "learning_rate": 4.918573972316982e-06, - "loss": 0.9807, - "step": 168 - }, - { - "epoch": 0.824390243902439, - "grad_norm": 2.678666353225708, - "learning_rate": 4.917601299047361e-06, - "loss": 0.8318, - "step": 169 - }, - { - "epoch": 0.8292682926829268, - "grad_norm": 2.730614185333252, - "learning_rate": 4.916622948023129e-06, - "loss": 0.7816, - "step": 170 - }, - { - "epoch": 0.8341463414634146, - "grad_norm": 2.9835665225982666, - "learning_rate": 4.915638921541952e-06, - "loss": 0.6633, - "step": 171 - }, - { - "epoch": 0.8390243902439024, - "grad_norm": 3.31217360496521, - "learning_rate": 4.914649221914822e-06, - "loss": 0.9296, - "step": 172 - }, - { - "epoch": 0.8439024390243902, - "grad_norm": 2.9021658897399902, - "learning_rate": 4.913653851466057e-06, - "loss": 0.6864, - "step": 173 - }, - { - "epoch": 0.848780487804878, - "grad_norm": 3.3672914505004883, - "learning_rate": 4.912652812533291e-06, - "loss": 0.8599, - "step": 174 - }, - { - "epoch": 0.8536585365853658, - "grad_norm": 2.4871644973754883, - "learning_rate": 4.911646107467472e-06, - "loss": 0.8949, - "step": 175 - }, - { - "epoch": 0.8585365853658536, - "grad_norm": 2.728022813796997, - "learning_rate": 4.9106337386328524e-06, - "loss": 0.9758, - "step": 176 - }, - { - "epoch": 0.8634146341463415, - "grad_norm": 2.704252243041992, - "learning_rate": 4.909615708406991e-06, - "loss": 0.8954, - "step": 177 - }, - { - "epoch": 0.8682926829268293, - "grad_norm": 2.4002223014831543, - "learning_rate": 4.908592019180738e-06, - "loss": 0.7157, - "step": 178 - }, - { - "epoch": 0.8731707317073171, - "grad_norm": 2.1927788257598877, - "learning_rate": 4.907562673358234e-06, - "loss": 0.6358, - "step": 179 - }, - { - "epoch": 0.8780487804878049, - "grad_norm": 2.458500623703003, - "learning_rate": 4.906527673356907e-06, - "loss": 0.6685, - "step": 180 - }, - { - "epoch": 0.8829268292682927, - "grad_norm": 2.5924787521362305, - "learning_rate": 4.905487021607462e-06, - "loss": 0.5686, - "step": 181 - }, - { - "epoch": 0.8878048780487805, - "grad_norm": 3.0923380851745605, - "learning_rate": 4.904440720553876e-06, - "loss": 0.8538, - "step": 182 - }, - { - "epoch": 0.8926829268292683, - "grad_norm": 2.8001527786254883, - "learning_rate": 4.903388772653396e-06, - "loss": 0.8292, - "step": 183 - }, - { - "epoch": 0.8975609756097561, - "grad_norm": 2.4344072341918945, - "learning_rate": 4.902331180376529e-06, - "loss": 0.7946, - "step": 184 - }, - { - "epoch": 0.9024390243902439, - "grad_norm": 2.6313226222991943, - "learning_rate": 4.901267946207038e-06, - "loss": 0.9269, - "step": 185 - }, - { - "epoch": 0.9073170731707317, - "grad_norm": 2.4776692390441895, - "learning_rate": 4.900199072641937e-06, - "loss": 0.7433, - "step": 186 - }, - { - "epoch": 0.9121951219512195, - "grad_norm": 2.339869260787964, - "learning_rate": 4.899124562191484e-06, - "loss": 0.6577, - "step": 187 - }, - { - "epoch": 0.9170731707317074, - "grad_norm": 3.076890468597412, - "learning_rate": 4.8980444173791735e-06, - "loss": 0.5989, - "step": 188 - }, - { - "epoch": 0.9219512195121952, - "grad_norm": 2.83957839012146, - "learning_rate": 4.896958640741735e-06, - "loss": 0.9364, - "step": 189 - }, - { - "epoch": 0.926829268292683, - "grad_norm": 2.770867347717285, - "learning_rate": 4.895867234829121e-06, - "loss": 1.0328, - "step": 190 - }, - { - "epoch": 0.9317073170731708, - "grad_norm": 2.7819619178771973, - "learning_rate": 4.894770202204509e-06, - "loss": 0.772, - "step": 191 - }, - { - "epoch": 0.9365853658536586, - "grad_norm": 3.925703763961792, - "learning_rate": 4.893667545444285e-06, - "loss": 0.8128, - "step": 192 - }, - { - "epoch": 0.9414634146341463, - "grad_norm": 3.034944534301758, - "learning_rate": 4.8925592671380495e-06, - "loss": 0.7418, - "step": 193 - }, - { - "epoch": 0.9463414634146341, - "grad_norm": 2.3350143432617188, - "learning_rate": 4.891445369888601e-06, - "loss": 0.5979, - "step": 194 - }, - { - "epoch": 0.9512195121951219, - "grad_norm": 2.6433160305023193, - "learning_rate": 4.890325856311936e-06, - "loss": 0.9664, - "step": 195 - }, - { - "epoch": 0.9560975609756097, - "grad_norm": 2.715142011642456, - "learning_rate": 4.889200729037241e-06, - "loss": 0.8482, - "step": 196 - }, - { - "epoch": 0.9609756097560975, - "grad_norm": 2.6157352924346924, - "learning_rate": 4.888069990706884e-06, - "loss": 0.7173, - "step": 197 - }, - { - "epoch": 0.9658536585365853, - "grad_norm": 3.7308952808380127, - "learning_rate": 4.886933643976414e-06, - "loss": 0.5433, - "step": 198 - }, - { - "epoch": 0.9707317073170731, - "grad_norm": 3.1134045124053955, - "learning_rate": 4.885791691514548e-06, - "loss": 0.5997, - "step": 199 - }, - { - "epoch": 0.975609756097561, - "grad_norm": 2.421365976333618, - "learning_rate": 4.884644136003172e-06, - "loss": 0.6477, - "step": 200 - }, - { - "epoch": 0.9804878048780488, - "grad_norm": 2.8676180839538574, - "learning_rate": 4.883490980137327e-06, - "loss": 1.3465, - "step": 201 - }, - { - "epoch": 0.9853658536585366, - "grad_norm": 2.236189603805542, - "learning_rate": 4.882332226625208e-06, - "loss": 0.7533, - "step": 202 - }, - { - "epoch": 0.9902439024390244, - "grad_norm": 2.2514970302581787, - "learning_rate": 4.881167878188158e-06, - "loss": 0.8555, - "step": 203 - }, - { - "epoch": 0.9951219512195122, - "grad_norm": 2.6856095790863037, - "learning_rate": 4.8799979375606565e-06, - "loss": 0.7634, - "step": 204 - }, - { - "epoch": 1.0, - "grad_norm": 2.5563852787017822, - "learning_rate": 4.878822407490319e-06, - "loss": 0.66, - "step": 205 - }, - { - "epoch": 1.0048780487804878, - "grad_norm": 4.7092814445495605, - "learning_rate": 4.8776412907378845e-06, - "loss": 0.7429, - "step": 206 - }, - { - "epoch": 1.0097560975609756, - "grad_norm": 2.9133448600769043, - "learning_rate": 4.876454590077216e-06, - "loss": 0.5735, - "step": 207 - }, - { - "epoch": 1.0146341463414634, - "grad_norm": 2.7012641429901123, - "learning_rate": 4.875262308295289e-06, - "loss": 0.8065, - "step": 208 - }, - { - "epoch": 1.0195121951219512, - "grad_norm": 3.703998327255249, - "learning_rate": 4.874064448192185e-06, - "loss": 0.7148, - "step": 209 - }, - { - "epoch": 1.024390243902439, - "grad_norm": 3.044930934906006, - "learning_rate": 4.872861012581088e-06, - "loss": 0.5606, - "step": 210 - }, - { - "epoch": 1.0292682926829269, - "grad_norm": 3.661381244659424, - "learning_rate": 4.871652004288275e-06, - "loss": 0.6492, - "step": 211 - }, - { - "epoch": 1.0341463414634147, - "grad_norm": 3.18344783782959, - "learning_rate": 4.870437426153113e-06, - "loss": 0.633, - "step": 212 - }, - { - "epoch": 1.0390243902439025, - "grad_norm": 4.596707820892334, - "learning_rate": 4.869217281028045e-06, - "loss": 0.842, - "step": 213 - }, - { - "epoch": 1.0439024390243903, - "grad_norm": 4.116331577301025, - "learning_rate": 4.867991571778592e-06, - "loss": 0.8371, - "step": 214 - }, - { - "epoch": 1.048780487804878, - "grad_norm": 3.152939558029175, - "learning_rate": 4.866760301283342e-06, - "loss": 0.4728, - "step": 215 - }, - { - "epoch": 1.053658536585366, - "grad_norm": 2.8732805252075195, - "learning_rate": 4.865523472433942e-06, - "loss": 0.651, - "step": 216 - }, - { - "epoch": 1.0585365853658537, - "grad_norm": 2.967480421066284, - "learning_rate": 4.8642810881350935e-06, - "loss": 0.6361, - "step": 217 - }, - { - "epoch": 1.0634146341463415, - "grad_norm": 2.816798210144043, - "learning_rate": 4.863033151304546e-06, - "loss": 0.6206, - "step": 218 - }, - { - "epoch": 1.0682926829268293, - "grad_norm": 3.168349027633667, - "learning_rate": 4.861779664873088e-06, - "loss": 0.7782, - "step": 219 - }, - { - "epoch": 1.0731707317073171, - "grad_norm": 3.7496471405029297, - "learning_rate": 4.8605206317845425e-06, - "loss": 0.8504, - "step": 220 - }, - { - "epoch": 1.078048780487805, - "grad_norm": 2.7087056636810303, - "learning_rate": 4.859256054995758e-06, - "loss": 0.7771, - "step": 221 - }, - { - "epoch": 1.0829268292682928, - "grad_norm": 2.803703546524048, - "learning_rate": 4.8579859374766e-06, - "loss": 0.4308, - "step": 222 - }, - { - "epoch": 1.0878048780487806, - "grad_norm": 2.4199142456054688, - "learning_rate": 4.856710282209952e-06, - "loss": 0.3739, - "step": 223 - }, - { - "epoch": 1.0926829268292684, - "grad_norm": 2.384037494659424, - "learning_rate": 4.855429092191698e-06, - "loss": 0.6548, - "step": 224 - }, - { - "epoch": 1.0975609756097562, - "grad_norm": 3.0230021476745605, - "learning_rate": 4.854142370430725e-06, - "loss": 0.6932, - "step": 225 - }, - { - "epoch": 1.102439024390244, - "grad_norm": 3.0248661041259766, - "learning_rate": 4.8528501199489045e-06, - "loss": 0.6491, - "step": 226 - }, - { - "epoch": 1.1073170731707318, - "grad_norm": 4.046666145324707, - "learning_rate": 4.851552343781099e-06, - "loss": 0.7946, - "step": 227 - }, - { - "epoch": 1.1121951219512196, - "grad_norm": 2.8751168251037598, - "learning_rate": 4.850249044975145e-06, - "loss": 0.7629, - "step": 228 - }, - { - "epoch": 1.1170731707317074, - "grad_norm": 2.8649816513061523, - "learning_rate": 4.848940226591849e-06, - "loss": 0.9114, - "step": 229 - }, - { - "epoch": 1.1219512195121952, - "grad_norm": 3.2590744495391846, - "learning_rate": 4.847625891704982e-06, - "loss": 0.535, - "step": 230 - }, - { - "epoch": 1.126829268292683, - "grad_norm": 3.230659008026123, - "learning_rate": 4.846306043401268e-06, - "loss": 0.7134, - "step": 231 - }, - { - "epoch": 1.1317073170731708, - "grad_norm": 3.5220088958740234, - "learning_rate": 4.844980684780381e-06, - "loss": 0.5375, - "step": 232 - }, - { - "epoch": 1.1365853658536587, - "grad_norm": 3.074052095413208, - "learning_rate": 4.8436498189549345e-06, - "loss": 0.5486, - "step": 233 - }, - { - "epoch": 1.1414634146341462, - "grad_norm": 2.511216163635254, - "learning_rate": 4.842313449050477e-06, - "loss": 0.5203, - "step": 234 - }, - { - "epoch": 1.146341463414634, - "grad_norm": 2.6082136631011963, - "learning_rate": 4.840971578205486e-06, - "loss": 0.4978, - "step": 235 - }, - { - "epoch": 1.1512195121951219, - "grad_norm": 2.4481778144836426, - "learning_rate": 4.839624209571352e-06, - "loss": 0.348, - "step": 236 - }, - { - "epoch": 1.1560975609756097, - "grad_norm": 2.7532148361206055, - "learning_rate": 4.838271346312381e-06, - "loss": 0.8068, - "step": 237 - }, - { - "epoch": 1.1609756097560975, - "grad_norm": 2.6562349796295166, - "learning_rate": 4.836912991605782e-06, - "loss": 0.8823, - "step": 238 - }, - { - "epoch": 1.1658536585365853, - "grad_norm": 3.032168388366699, - "learning_rate": 4.835549148641663e-06, - "loss": 0.501, - "step": 239 - }, - { - "epoch": 1.170731707317073, - "grad_norm": 3.4816956520080566, - "learning_rate": 4.834179820623018e-06, - "loss": 0.6406, - "step": 240 - }, - { - "epoch": 1.175609756097561, - "grad_norm": 2.480642318725586, - "learning_rate": 4.832805010765724e-06, - "loss": 0.537, - "step": 241 - }, - { - "epoch": 1.1804878048780487, - "grad_norm": 2.7662222385406494, - "learning_rate": 4.831424722298531e-06, - "loss": 0.6464, - "step": 242 - }, - { - "epoch": 1.1853658536585365, - "grad_norm": 3.2929866313934326, - "learning_rate": 4.830038958463061e-06, - "loss": 0.6888, - "step": 243 - }, - { - "epoch": 1.1902439024390243, - "grad_norm": 5.094089031219482, - "learning_rate": 4.828647722513785e-06, - "loss": 0.8342, - "step": 244 - }, - { - "epoch": 1.1951219512195121, - "grad_norm": 3.6679818630218506, - "learning_rate": 4.827251017718034e-06, - "loss": 0.7849, - "step": 245 - }, - { - "epoch": 1.2, - "grad_norm": 3.97290301322937, - "learning_rate": 4.8258488473559794e-06, - "loss": 0.7995, - "step": 246 - }, - { - "epoch": 1.2048780487804878, - "grad_norm": 3.3555023670196533, - "learning_rate": 4.824441214720629e-06, - "loss": 0.8718, - "step": 247 - }, - { - "epoch": 1.2097560975609756, - "grad_norm": 2.309361219406128, - "learning_rate": 4.823028123117818e-06, - "loss": 0.3731, - "step": 248 - }, - { - "epoch": 1.2146341463414634, - "grad_norm": 2.607269763946533, - "learning_rate": 4.8216095758662015e-06, - "loss": 0.7321, - "step": 249 - }, - { - "epoch": 1.2195121951219512, - "grad_norm": 2.5667428970336914, - "learning_rate": 4.82018557629725e-06, - "loss": 0.7561, - "step": 250 - }, - { - "epoch": 1.224390243902439, - "grad_norm": 2.7664871215820312, - "learning_rate": 4.8187561277552376e-06, - "loss": 0.638, - "step": 251 - }, - { - "epoch": 1.2292682926829268, - "grad_norm": 2.2880401611328125, - "learning_rate": 4.817321233597232e-06, - "loss": 0.6996, - "step": 252 - }, - { - "epoch": 1.2341463414634146, - "grad_norm": 2.7615559101104736, - "learning_rate": 4.815880897193095e-06, - "loss": 0.5432, - "step": 253 - }, - { - "epoch": 1.2390243902439024, - "grad_norm": 2.9052155017852783, - "learning_rate": 4.814435121925466e-06, - "loss": 0.781, - "step": 254 - }, - { - "epoch": 1.2439024390243902, - "grad_norm": 3.2035205364227295, - "learning_rate": 4.812983911189761e-06, - "loss": 0.6884, - "step": 255 - }, - { - "epoch": 1.248780487804878, - "grad_norm": 2.8139917850494385, - "learning_rate": 4.811527268394157e-06, - "loss": 0.4984, - "step": 256 - }, - { - "epoch": 1.2536585365853659, - "grad_norm": 2.849602699279785, - "learning_rate": 4.810065196959591e-06, - "loss": 0.553, - "step": 257 - }, - { - "epoch": 1.2585365853658537, - "grad_norm": 2.8745057582855225, - "learning_rate": 4.8085977003197496e-06, - "loss": 0.7955, - "step": 258 - }, - { - "epoch": 1.2634146341463415, - "grad_norm": 3.4053122997283936, - "learning_rate": 4.807124781921059e-06, - "loss": 0.9715, - "step": 259 - }, - { - "epoch": 1.2682926829268293, - "grad_norm": 3.1741702556610107, - "learning_rate": 4.805646445222679e-06, - "loss": 0.6306, - "step": 260 - }, - { - "epoch": 1.273170731707317, - "grad_norm": 2.5348331928253174, - "learning_rate": 4.804162693696494e-06, - "loss": 0.5192, - "step": 261 - }, - { - "epoch": 1.278048780487805, - "grad_norm": 3.2491304874420166, - "learning_rate": 4.802673530827105e-06, - "loss": 0.5369, - "step": 262 - }, - { - "epoch": 1.2829268292682927, - "grad_norm": 2.670273780822754, - "learning_rate": 4.801178960111823e-06, - "loss": 0.5864, - "step": 263 - }, - { - "epoch": 1.2878048780487805, - "grad_norm": 2.5655579566955566, - "learning_rate": 4.799678985060658e-06, - "loss": 0.7864, - "step": 264 - }, - { - "epoch": 1.2926829268292683, - "grad_norm": 2.6352531909942627, - "learning_rate": 4.798173609196314e-06, - "loss": 0.8198, - "step": 265 - }, - { - "epoch": 1.2975609756097561, - "grad_norm": 3.028343677520752, - "learning_rate": 4.796662836054176e-06, - "loss": 0.4621, - "step": 266 - }, - { - "epoch": 1.302439024390244, - "grad_norm": 2.757690191268921, - "learning_rate": 4.795146669182304e-06, - "loss": 0.6237, - "step": 267 - }, - { - "epoch": 1.3073170731707318, - "grad_norm": 2.564842462539673, - "learning_rate": 4.793625112141431e-06, - "loss": 0.4981, - "step": 268 - }, - { - "epoch": 1.3121951219512196, - "grad_norm": 2.69234299659729, - "learning_rate": 4.792098168504943e-06, - "loss": 0.5384, - "step": 269 - }, - { - "epoch": 1.3170731707317074, - "grad_norm": 2.794144868850708, - "learning_rate": 4.790565841858879e-06, - "loss": 0.5535, - "step": 270 - }, - { - "epoch": 1.3219512195121952, - "grad_norm": 2.850296974182129, - "learning_rate": 4.789028135801919e-06, - "loss": 0.7492, - "step": 271 - }, - { - "epoch": 1.326829268292683, - "grad_norm": 3.287806987762451, - "learning_rate": 4.787485053945377e-06, - "loss": 0.8367, - "step": 272 - }, - { - "epoch": 1.3317073170731708, - "grad_norm": 2.479343891143799, - "learning_rate": 4.785936599913193e-06, - "loss": 0.6875, - "step": 273 - }, - { - "epoch": 1.3365853658536586, - "grad_norm": 3.171198844909668, - "learning_rate": 4.784382777341922e-06, - "loss": 0.733, - "step": 274 - }, - { - "epoch": 1.3414634146341464, - "grad_norm": 2.866610050201416, - "learning_rate": 4.782823589880729e-06, - "loss": 0.9719, - "step": 275 - }, - { - "epoch": 1.346341463414634, - "grad_norm": 2.3714404106140137, - "learning_rate": 4.7812590411913755e-06, - "loss": 0.6979, - "step": 276 - }, - { - "epoch": 1.3512195121951218, - "grad_norm": 2.3838706016540527, - "learning_rate": 4.779689134948217e-06, - "loss": 0.9697, - "step": 277 - }, - { - "epoch": 1.3560975609756096, - "grad_norm": 3.2992005348205566, - "learning_rate": 4.77811387483819e-06, - "loss": 0.4799, - "step": 278 - }, - { - "epoch": 1.3609756097560974, - "grad_norm": 3.403024435043335, - "learning_rate": 4.776533264560804e-06, - "loss": 0.7478, - "step": 279 - }, - { - "epoch": 1.3658536585365852, - "grad_norm": 2.669820785522461, - "learning_rate": 4.774947307828134e-06, - "loss": 0.8622, - "step": 280 - }, - { - "epoch": 1.370731707317073, - "grad_norm": 2.4695041179656982, - "learning_rate": 4.773356008364812e-06, - "loss": 0.5792, - "step": 281 - }, - { - "epoch": 1.3756097560975609, - "grad_norm": 3.1744325160980225, - "learning_rate": 4.771759369908017e-06, - "loss": 0.4368, - "step": 282 - }, - { - "epoch": 1.3804878048780487, - "grad_norm": 2.8564929962158203, - "learning_rate": 4.7701573962074635e-06, - "loss": 0.6337, - "step": 283 - }, - { - "epoch": 1.3853658536585365, - "grad_norm": 2.4109890460968018, - "learning_rate": 4.7685500910254015e-06, - "loss": 0.5042, - "step": 284 - }, - { - "epoch": 1.3902439024390243, - "grad_norm": 2.389765977859497, - "learning_rate": 4.766937458136598e-06, - "loss": 0.7427, - "step": 285 - }, - { - "epoch": 1.395121951219512, - "grad_norm": 2.412153720855713, - "learning_rate": 4.765319501328332e-06, - "loss": 0.6956, - "step": 286 - }, - { - "epoch": 1.4, - "grad_norm": 2.6756227016448975, - "learning_rate": 4.763696224400391e-06, - "loss": 0.5152, - "step": 287 - }, - { - "epoch": 1.4048780487804877, - "grad_norm": 2.4644389152526855, - "learning_rate": 4.762067631165049e-06, - "loss": 0.5583, - "step": 288 - }, - { - "epoch": 1.4097560975609755, - "grad_norm": 2.6496896743774414, - "learning_rate": 4.760433725447071e-06, - "loss": 0.6824, - "step": 289 - }, - { - "epoch": 1.4146341463414633, - "grad_norm": 2.9843268394470215, - "learning_rate": 4.758794511083697e-06, - "loss": 0.7914, - "step": 290 - }, - { - "epoch": 1.4195121951219511, - "grad_norm": 3.639101266860962, - "learning_rate": 4.757149991924633e-06, - "loss": 0.6827, - "step": 291 - }, - { - "epoch": 1.424390243902439, - "grad_norm": 3.2047319412231445, - "learning_rate": 4.755500171832045e-06, - "loss": 0.5908, - "step": 292 - }, - { - "epoch": 1.4292682926829268, - "grad_norm": 2.463202953338623, - "learning_rate": 4.753845054680548e-06, - "loss": 0.6469, - "step": 293 - }, - { - "epoch": 1.4341463414634146, - "grad_norm": 2.711195945739746, - "learning_rate": 4.752184644357197e-06, - "loss": 0.5412, - "step": 294 - }, - { - "epoch": 1.4390243902439024, - "grad_norm": 2.239082098007202, - "learning_rate": 4.750518944761477e-06, - "loss": 0.5324, - "step": 295 - }, - { - "epoch": 1.4439024390243902, - "grad_norm": 2.711050271987915, - "learning_rate": 4.748847959805297e-06, - "loss": 0.5317, - "step": 296 - }, - { - "epoch": 1.448780487804878, - "grad_norm": 2.4389946460723877, - "learning_rate": 4.7471716934129774e-06, - "loss": 0.5199, - "step": 297 - }, - { - "epoch": 1.4536585365853658, - "grad_norm": 2.6532390117645264, - "learning_rate": 4.745490149521242e-06, - "loss": 0.4874, - "step": 298 - }, - { - "epoch": 1.4585365853658536, - "grad_norm": 2.2970616817474365, - "learning_rate": 4.743803332079209e-06, - "loss": 0.5416, - "step": 299 - }, - { - "epoch": 1.4634146341463414, - "grad_norm": 2.4206762313842773, - "learning_rate": 4.742111245048382e-06, - "loss": 0.5628, - "step": 300 - }, - { - "epoch": 1.4682926829268292, - "grad_norm": 2.7086844444274902, - "learning_rate": 4.740413892402639e-06, - "loss": 0.5847, - "step": 301 - }, - { - "epoch": 1.473170731707317, - "grad_norm": 2.848602771759033, - "learning_rate": 4.738711278128228e-06, - "loss": 0.5889, - "step": 302 - }, - { - "epoch": 1.4780487804878049, - "grad_norm": 3.5257909297943115, - "learning_rate": 4.7370034062237476e-06, - "loss": 0.3917, - "step": 303 - }, - { - "epoch": 1.4829268292682927, - "grad_norm": 6.47664213180542, - "learning_rate": 4.73529028070015e-06, - "loss": 0.5592, - "step": 304 - }, - { - "epoch": 1.4878048780487805, - "grad_norm": 2.8833930492401123, - "learning_rate": 4.733571905580723e-06, - "loss": 0.843, - "step": 305 - }, - { - "epoch": 1.4926829268292683, - "grad_norm": 2.9924156665802, - "learning_rate": 4.731848284901082e-06, - "loss": 0.7041, - "step": 306 - }, - { - "epoch": 1.497560975609756, - "grad_norm": 2.9858405590057373, - "learning_rate": 4.730119422709165e-06, - "loss": 0.4914, - "step": 307 - }, - { - "epoch": 1.502439024390244, - "grad_norm": 3.4032366275787354, - "learning_rate": 4.728385323065215e-06, - "loss": 0.644, - "step": 308 - }, - { - "epoch": 1.5073170731707317, - "grad_norm": 2.86360502243042, - "learning_rate": 4.7266459900417815e-06, - "loss": 0.5335, - "step": 309 - }, - { - "epoch": 1.5121951219512195, - "grad_norm": 3.183012008666992, - "learning_rate": 4.724901427723698e-06, - "loss": 0.8275, - "step": 310 - }, - { - "epoch": 1.5170731707317073, - "grad_norm": 3.4128706455230713, - "learning_rate": 4.723151640208084e-06, - "loss": 0.4091, - "step": 311 - }, - { - "epoch": 1.5219512195121951, - "grad_norm": 2.765897512435913, - "learning_rate": 4.721396631604327e-06, - "loss": 0.4414, - "step": 312 - }, - { - "epoch": 1.526829268292683, - "grad_norm": 3.2348268032073975, - "learning_rate": 4.7196364060340785e-06, - "loss": 0.5423, - "step": 313 - }, - { - "epoch": 1.5317073170731708, - "grad_norm": 2.7270045280456543, - "learning_rate": 4.7178709676312416e-06, - "loss": 0.8072, - "step": 314 - }, - { - "epoch": 1.5365853658536586, - "grad_norm": 2.525298833847046, - "learning_rate": 4.716100320541961e-06, - "loss": 1.0254, - "step": 315 - }, - { - "epoch": 1.5414634146341464, - "grad_norm": 2.371321678161621, - "learning_rate": 4.714324468924614e-06, - "loss": 0.6541, - "step": 316 - }, - { - "epoch": 1.5463414634146342, - "grad_norm": 3.0820438861846924, - "learning_rate": 4.712543416949803e-06, - "loss": 0.7519, - "step": 317 - }, - { - "epoch": 1.551219512195122, - "grad_norm": 2.710369348526001, - "learning_rate": 4.71075716880034e-06, - "loss": 0.7232, - "step": 318 - }, - { - "epoch": 1.5560975609756098, - "grad_norm": 2.4568352699279785, - "learning_rate": 4.708965728671243e-06, - "loss": 0.8059, - "step": 319 - }, - { - "epoch": 1.5609756097560976, - "grad_norm": 2.7511191368103027, - "learning_rate": 4.7071691007697214e-06, - "loss": 0.6579, - "step": 320 - }, - { - "epoch": 1.5658536585365854, - "grad_norm": 2.6519858837127686, - "learning_rate": 4.705367289315172e-06, - "loss": 0.6989, - "step": 321 - }, - { - "epoch": 1.5707317073170732, - "grad_norm": 2.763019323348999, - "learning_rate": 4.703560298539158e-06, - "loss": 0.4916, - "step": 322 - }, - { - "epoch": 1.575609756097561, - "grad_norm": 2.6480252742767334, - "learning_rate": 4.701748132685415e-06, - "loss": 0.5076, - "step": 323 - }, - { - "epoch": 1.5804878048780489, - "grad_norm": 2.4289543628692627, - "learning_rate": 4.699930796009825e-06, - "loss": 0.559, - "step": 324 - }, - { - "epoch": 1.5853658536585367, - "grad_norm": 4.0515899658203125, - "learning_rate": 4.698108292780418e-06, - "loss": 0.7388, - "step": 325 - }, - { - "epoch": 1.5902439024390245, - "grad_norm": 2.5959129333496094, - "learning_rate": 4.696280627277356e-06, - "loss": 0.5469, - "step": 326 - }, - { - "epoch": 1.5951219512195123, - "grad_norm": 2.3453526496887207, - "learning_rate": 4.6944478037929255e-06, - "loss": 0.5494, - "step": 327 - }, - { - "epoch": 1.6, - "grad_norm": 3.7527170181274414, - "learning_rate": 4.692609826631525e-06, - "loss": 0.7536, - "step": 328 - }, - { - "epoch": 1.604878048780488, - "grad_norm": 3.423588275909424, - "learning_rate": 4.690766700109659e-06, - "loss": 0.4586, - "step": 329 - }, - { - "epoch": 1.6097560975609757, - "grad_norm": 2.620429754257202, - "learning_rate": 4.6889184285559234e-06, - "loss": 0.4799, - "step": 330 - }, - { - "epoch": 1.6146341463414635, - "grad_norm": 6.416718006134033, - "learning_rate": 4.687065016310996e-06, - "loss": 0.7502, - "step": 331 - }, - { - "epoch": 1.6195121951219513, - "grad_norm": 2.7324717044830322, - "learning_rate": 4.685206467727631e-06, - "loss": 0.5923, - "step": 332 - }, - { - "epoch": 1.6243902439024391, - "grad_norm": 2.582935333251953, - "learning_rate": 4.683342787170644e-06, - "loss": 0.5619, - "step": 333 - }, - { - "epoch": 1.629268292682927, - "grad_norm": 2.8339877128601074, - "learning_rate": 4.6814739790169006e-06, - "loss": 0.55, - "step": 334 - }, - { - "epoch": 1.6341463414634148, - "grad_norm": 2.733982563018799, - "learning_rate": 4.679600047655313e-06, - "loss": 0.7243, - "step": 335 - }, - { - "epoch": 1.6390243902439026, - "grad_norm": 3.192747116088867, - "learning_rate": 4.6777209974868194e-06, - "loss": 1.132, - "step": 336 - }, - { - "epoch": 1.6439024390243904, - "grad_norm": 2.5185582637786865, - "learning_rate": 4.675836832924387e-06, - "loss": 0.55, - "step": 337 - }, - { - "epoch": 1.6487804878048782, - "grad_norm": 2.7306225299835205, - "learning_rate": 4.673947558392989e-06, - "loss": 0.4418, - "step": 338 - }, - { - "epoch": 1.653658536585366, - "grad_norm": 2.7026166915893555, - "learning_rate": 4.6720531783296e-06, - "loss": 0.5897, - "step": 339 - }, - { - "epoch": 1.6585365853658538, - "grad_norm": 2.5981674194335938, - "learning_rate": 4.670153697183185e-06, - "loss": 0.5889, - "step": 340 - }, - { - "epoch": 1.6634146341463416, - "grad_norm": 3.0985405445098877, - "learning_rate": 4.668249119414692e-06, - "loss": 0.5607, - "step": 341 - }, - { - "epoch": 1.6682926829268294, - "grad_norm": 2.7609124183654785, - "learning_rate": 4.666339449497033e-06, - "loss": 0.6284, - "step": 342 - }, - { - "epoch": 1.6731707317073172, - "grad_norm": 3.186077356338501, - "learning_rate": 4.664424691915084e-06, - "loss": 0.5751, - "step": 343 - }, - { - "epoch": 1.678048780487805, - "grad_norm": 3.644227981567383, - "learning_rate": 4.6625048511656675e-06, - "loss": 0.586, - "step": 344 - }, - { - "epoch": 1.6829268292682928, - "grad_norm": 3.196373462677002, - "learning_rate": 4.660579931757543e-06, - "loss": 0.5086, - "step": 345 - }, - { - "epoch": 1.6878048780487804, - "grad_norm": 2.7773900032043457, - "learning_rate": 4.6586499382113985e-06, - "loss": 0.5934, - "step": 346 - }, - { - "epoch": 1.6926829268292682, - "grad_norm": 2.3397631645202637, - "learning_rate": 4.6567148750598375e-06, - "loss": 0.7654, - "step": 347 - }, - { - "epoch": 1.697560975609756, - "grad_norm": 2.5567805767059326, - "learning_rate": 4.6547747468473705e-06, - "loss": 0.8908, - "step": 348 - }, - { - "epoch": 1.7024390243902439, - "grad_norm": 2.9218900203704834, - "learning_rate": 4.652829558130404e-06, - "loss": 0.4383, - "step": 349 - }, - { - "epoch": 1.7073170731707317, - "grad_norm": 2.962965250015259, - "learning_rate": 4.6508793134772265e-06, - "loss": 0.6031, - "step": 350 - }, - { - "epoch": 1.7121951219512195, - "grad_norm": 2.487739324569702, - "learning_rate": 4.648924017468003e-06, - "loss": 0.533, - "step": 351 - }, - { - "epoch": 1.7170731707317073, - "grad_norm": 2.769474506378174, - "learning_rate": 4.646963674694761e-06, - "loss": 0.8125, - "step": 352 - }, - { - "epoch": 1.721951219512195, - "grad_norm": 2.678243398666382, - "learning_rate": 4.64499828976138e-06, - "loss": 0.386, - "step": 353 - }, - { - "epoch": 1.726829268292683, - "grad_norm": 3.2764477729797363, - "learning_rate": 4.64302786728358e-06, - "loss": 0.4792, - "step": 354 - }, - { - "epoch": 1.7317073170731707, - "grad_norm": 2.6092708110809326, - "learning_rate": 4.641052411888913e-06, - "loss": 0.5031, - "step": 355 - }, - { - "epoch": 1.7365853658536585, - "grad_norm": 3.4002952575683594, - "learning_rate": 4.6390719282167515e-06, - "loss": 0.4726, - "step": 356 - }, - { - "epoch": 1.7414634146341463, - "grad_norm": 2.7558157444000244, - "learning_rate": 4.637086420918276e-06, - "loss": 0.7794, - "step": 357 - }, - { - "epoch": 1.7463414634146341, - "grad_norm": 2.239021062850952, - "learning_rate": 4.635095894656465e-06, - "loss": 0.6202, - "step": 358 - }, - { - "epoch": 1.751219512195122, - "grad_norm": 2.0502119064331055, - "learning_rate": 4.633100354106085e-06, - "loss": 0.3743, - "step": 359 - }, - { - "epoch": 1.7560975609756098, - "grad_norm": 2.842203140258789, - "learning_rate": 4.631099803953677e-06, - "loss": 0.8143, - "step": 360 - }, - { - "epoch": 1.7609756097560976, - "grad_norm": 2.8408772945404053, - "learning_rate": 4.629094248897546e-06, - "loss": 0.4986, - "step": 361 - }, - { - "epoch": 1.7658536585365854, - "grad_norm": 2.755530595779419, - "learning_rate": 4.627083693647757e-06, - "loss": 0.5833, - "step": 362 - }, - { - "epoch": 1.7707317073170732, - "grad_norm": 2.717116355895996, - "learning_rate": 4.625068142926111e-06, - "loss": 0.885, - "step": 363 - }, - { - "epoch": 1.775609756097561, - "grad_norm": 2.2784435749053955, - "learning_rate": 4.623047601466144e-06, - "loss": 0.7351, - "step": 364 - }, - { - "epoch": 1.7804878048780488, - "grad_norm": 2.3133914470672607, - "learning_rate": 4.621022074013114e-06, - "loss": 0.6426, - "step": 365 - }, - { - "epoch": 1.7853658536585366, - "grad_norm": 3.13562273979187, - "learning_rate": 4.618991565323987e-06, - "loss": 0.5588, - "step": 366 - }, - { - "epoch": 1.7902439024390244, - "grad_norm": 2.458186388015747, - "learning_rate": 4.616956080167426e-06, - "loss": 0.5424, - "step": 367 - }, - { - "epoch": 1.7951219512195122, - "grad_norm": 2.4780080318450928, - "learning_rate": 4.614915623323786e-06, - "loss": 0.8664, - "step": 368 - }, - { - "epoch": 1.8, - "grad_norm": 2.623966932296753, - "learning_rate": 4.612870199585092e-06, - "loss": 0.4495, - "step": 369 - }, - { - "epoch": 1.8048780487804879, - "grad_norm": 2.7326242923736572, - "learning_rate": 4.610819813755038e-06, - "loss": 0.5099, - "step": 370 - }, - { - "epoch": 1.8097560975609757, - "grad_norm": 2.951014757156372, - "learning_rate": 4.608764470648971e-06, - "loss": 0.4322, - "step": 371 - }, - { - "epoch": 1.8146341463414632, - "grad_norm": 2.869870185852051, - "learning_rate": 4.606704175093879e-06, - "loss": 0.4744, - "step": 372 - }, - { - "epoch": 1.819512195121951, - "grad_norm": 2.686054229736328, - "learning_rate": 4.604638931928383e-06, - "loss": 0.797, - "step": 373 - }, - { - "epoch": 1.8243902439024389, - "grad_norm": 2.6421749591827393, - "learning_rate": 4.602568746002718e-06, - "loss": 0.4904, - "step": 374 - }, - { - "epoch": 1.8292682926829267, - "grad_norm": 2.949144124984741, - "learning_rate": 4.600493622178734e-06, - "loss": 0.8682, - "step": 375 - }, - { - "epoch": 1.8341463414634145, - "grad_norm": 2.554733991622925, - "learning_rate": 4.598413565329876e-06, - "loss": 0.5426, - "step": 376 - }, - { - "epoch": 1.8390243902439023, - "grad_norm": 2.3334367275238037, - "learning_rate": 4.596328580341169e-06, - "loss": 0.5628, - "step": 377 - }, - { - "epoch": 1.84390243902439, - "grad_norm": 2.577664613723755, - "learning_rate": 4.5942386721092195e-06, - "loss": 0.7073, - "step": 378 - }, - { - "epoch": 1.848780487804878, - "grad_norm": 3.1247141361236572, - "learning_rate": 4.592143845542189e-06, - "loss": 0.6526, - "step": 379 - }, - { - "epoch": 1.8536585365853657, - "grad_norm": 2.7015256881713867, - "learning_rate": 4.590044105559797e-06, - "loss": 0.8377, - "step": 380 - }, - { - "epoch": 1.8585365853658535, - "grad_norm": 2.573819398880005, - "learning_rate": 4.587939457093296e-06, - "loss": 0.5485, - "step": 381 - }, - { - "epoch": 1.8634146341463413, - "grad_norm": 2.8607687950134277, - "learning_rate": 4.585829905085468e-06, - "loss": 0.6065, - "step": 382 - }, - { - "epoch": 1.8682926829268292, - "grad_norm": 2.526625394821167, - "learning_rate": 4.5837154544906135e-06, - "loss": 0.7812, - "step": 383 - }, - { - "epoch": 1.873170731707317, - "grad_norm": 2.4161314964294434, - "learning_rate": 4.581596110274535e-06, - "loss": 0.7061, - "step": 384 - }, - { - "epoch": 1.8780487804878048, - "grad_norm": 2.34195876121521, - "learning_rate": 4.579471877414527e-06, - "loss": 0.9446, - "step": 385 - }, - { - "epoch": 1.8829268292682926, - "grad_norm": 3.7710156440734863, - "learning_rate": 4.577342760899368e-06, - "loss": 0.78, - "step": 386 - }, - { - "epoch": 1.8878048780487804, - "grad_norm": 2.5192313194274902, - "learning_rate": 4.575208765729302e-06, - "loss": 0.5205, - "step": 387 - }, - { - "epoch": 1.8926829268292682, - "grad_norm": 2.467484951019287, - "learning_rate": 4.573069896916035e-06, - "loss": 0.7827, - "step": 388 - }, - { - "epoch": 1.897560975609756, - "grad_norm": 2.640676259994507, - "learning_rate": 4.5709261594827125e-06, - "loss": 0.6512, - "step": 389 - }, - { - "epoch": 1.9024390243902438, - "grad_norm": 2.976623296737671, - "learning_rate": 4.568777558463922e-06, - "loss": 0.5548, - "step": 390 - }, - { - "epoch": 1.9073170731707316, - "grad_norm": 2.289722442626953, - "learning_rate": 4.566624098905665e-06, - "loss": 0.7038, - "step": 391 - }, - { - "epoch": 1.9121951219512194, - "grad_norm": 2.9512040615081787, - "learning_rate": 4.564465785865359e-06, - "loss": 0.5416, - "step": 392 - }, - { - "epoch": 1.9170731707317072, - "grad_norm": 2.394874095916748, - "learning_rate": 4.56230262441182e-06, - "loss": 0.4068, - "step": 393 - }, - { - "epoch": 1.921951219512195, - "grad_norm": 6.885486602783203, - "learning_rate": 4.560134619625247e-06, - "loss": 0.6197, - "step": 394 - }, - { - "epoch": 1.9268292682926829, - "grad_norm": 2.311272144317627, - "learning_rate": 4.5579617765972155e-06, - "loss": 0.5692, - "step": 395 - }, - { - "epoch": 1.9317073170731707, - "grad_norm": 2.4662933349609375, - "learning_rate": 4.555784100430662e-06, - "loss": 0.4836, - "step": 396 - }, - { - "epoch": 1.9365853658536585, - "grad_norm": 2.602741241455078, - "learning_rate": 4.553601596239877e-06, - "loss": 0.4594, - "step": 397 - }, - { - "epoch": 1.9414634146341463, - "grad_norm": 3.443909168243408, - "learning_rate": 4.551414269150489e-06, - "loss": 0.6053, - "step": 398 - }, - { - "epoch": 1.946341463414634, - "grad_norm": 2.5391502380371094, - "learning_rate": 4.54922212429945e-06, - "loss": 0.5133, - "step": 399 - }, - { - "epoch": 1.951219512195122, - "grad_norm": 2.7105700969696045, - "learning_rate": 4.547025166835027e-06, - "loss": 0.6984, - "step": 400 - }, - { - "epoch": 1.9560975609756097, - "grad_norm": 2.6098098754882812, - "learning_rate": 4.544823401916794e-06, - "loss": 0.7944, - "step": 401 - }, - { - "epoch": 1.9609756097560975, - "grad_norm": 2.7527425289154053, - "learning_rate": 4.542616834715612e-06, - "loss": 0.639, - "step": 402 - }, - { - "epoch": 1.9658536585365853, - "grad_norm": 2.760303258895874, - "learning_rate": 4.540405470413618e-06, - "loss": 0.4229, - "step": 403 - }, - { - "epoch": 1.9707317073170731, - "grad_norm": 2.4989006519317627, - "learning_rate": 4.53818931420422e-06, - "loss": 0.7482, - "step": 404 - }, - { - "epoch": 1.975609756097561, - "grad_norm": 2.3687169551849365, - "learning_rate": 4.535968371292076e-06, - "loss": 0.6146, - "step": 405 - }, - { - "epoch": 1.9804878048780488, - "grad_norm": 2.4285244941711426, - "learning_rate": 4.533742646893086e-06, - "loss": 0.6964, - "step": 406 - }, - { - "epoch": 1.9853658536585366, - "grad_norm": 2.337266206741333, - "learning_rate": 4.531512146234383e-06, - "loss": 0.6248, - "step": 407 - }, - { - "epoch": 1.9902439024390244, - "grad_norm": 2.704972743988037, - "learning_rate": 4.529276874554312e-06, - "loss": 0.8715, - "step": 408 - }, - { - "epoch": 1.9951219512195122, - "grad_norm": 2.2151944637298584, - "learning_rate": 4.527036837102426e-06, - "loss": 0.4945, - "step": 409 - }, - { - "epoch": 2.0, - "grad_norm": 2.691330671310425, - "learning_rate": 4.524792039139471e-06, - "loss": 0.7085, - "step": 410 - }, - { - "epoch": 2.004878048780488, - "grad_norm": 2.9423086643218994, - "learning_rate": 4.522542485937369e-06, - "loss": 0.3178, - "step": 411 - }, - { - "epoch": 2.0097560975609756, - "grad_norm": 2.860677719116211, - "learning_rate": 4.520288182779214e-06, - "loss": 0.5092, - "step": 412 - }, - { - "epoch": 2.0146341463414634, - "grad_norm": 2.7503843307495117, - "learning_rate": 4.518029134959253e-06, - "loss": 0.314, - "step": 413 - }, - { - "epoch": 2.0195121951219512, - "grad_norm": 4.541809558868408, - "learning_rate": 4.515765347782878e-06, - "loss": 0.5287, - "step": 414 - }, - { - "epoch": 2.024390243902439, - "grad_norm": 9.126826286315918, - "learning_rate": 4.5134968265666085e-06, - "loss": 0.8221, - "step": 415 - }, - { - "epoch": 2.029268292682927, - "grad_norm": 4.4358229637146, - "learning_rate": 4.511223576638084e-06, - "loss": 0.5402, - "step": 416 - }, - { - "epoch": 2.0341463414634147, - "grad_norm": 3.1090731620788574, - "learning_rate": 4.508945603336049e-06, - "loss": 0.617, - "step": 417 - }, - { - "epoch": 2.0390243902439025, - "grad_norm": 2.6933369636535645, - "learning_rate": 4.50666291201034e-06, - "loss": 0.3541, - "step": 418 - }, - { - "epoch": 2.0439024390243903, - "grad_norm": 5.898099899291992, - "learning_rate": 4.504375508021876e-06, - "loss": 0.4842, - "step": 419 - }, - { - "epoch": 2.048780487804878, - "grad_norm": 2.950939178466797, - "learning_rate": 4.50208339674264e-06, - "loss": 0.6168, - "step": 420 - }, - { - "epoch": 2.053658536585366, - "grad_norm": 3.2513322830200195, - "learning_rate": 4.499786583555675e-06, - "loss": 0.6425, - "step": 421 - }, - { - "epoch": 2.0585365853658537, - "grad_norm": 2.911562442779541, - "learning_rate": 4.497485073855061e-06, - "loss": 0.364, - "step": 422 - }, - { - "epoch": 2.0634146341463415, - "grad_norm": 4.2179274559021, - "learning_rate": 4.495178873045913e-06, - "loss": 0.3687, - "step": 423 - }, - { - "epoch": 2.0682926829268293, - "grad_norm": 3.2010395526885986, - "learning_rate": 4.4928679865443605e-06, - "loss": 0.4068, - "step": 424 - }, - { - "epoch": 2.073170731707317, - "grad_norm": 3.2425589561462402, - "learning_rate": 4.4905524197775366e-06, - "loss": 0.4759, - "step": 425 - }, - { - "epoch": 2.078048780487805, - "grad_norm": 2.9252519607543945, - "learning_rate": 4.4882321781835666e-06, - "loss": 0.4197, - "step": 426 - }, - { - "epoch": 2.0829268292682928, - "grad_norm": 2.7859911918640137, - "learning_rate": 4.4859072672115565e-06, - "loss": 0.2294, - "step": 427 - }, - { - "epoch": 2.0878048780487806, - "grad_norm": 3.138796091079712, - "learning_rate": 4.483577692321577e-06, - "loss": 0.7572, - "step": 428 - }, - { - "epoch": 2.0926829268292684, - "grad_norm": 3.1447339057922363, - "learning_rate": 4.481243458984651e-06, - "loss": 0.4035, - "step": 429 - }, - { - "epoch": 2.097560975609756, - "grad_norm": 3.1876862049102783, - "learning_rate": 4.478904572682743e-06, - "loss": 0.5776, - "step": 430 - }, - { - "epoch": 2.102439024390244, - "grad_norm": 2.934257745742798, - "learning_rate": 4.476561038908745e-06, - "loss": 0.4005, - "step": 431 - }, - { - "epoch": 2.107317073170732, - "grad_norm": 2.904954433441162, - "learning_rate": 4.474212863166464e-06, - "loss": 0.5689, - "step": 432 - }, - { - "epoch": 2.1121951219512196, - "grad_norm": 3.6023731231689453, - "learning_rate": 4.471860050970608e-06, - "loss": 0.5068, - "step": 433 - }, - { - "epoch": 2.1170731707317074, - "grad_norm": 4.073422431945801, - "learning_rate": 4.469502607846774e-06, - "loss": 0.8349, - "step": 434 - }, - { - "epoch": 2.1219512195121952, - "grad_norm": 2.813789129257202, - "learning_rate": 4.467140539331434e-06, - "loss": 0.3641, - "step": 435 - }, - { - "epoch": 2.126829268292683, - "grad_norm": 3.874516248703003, - "learning_rate": 4.464773850971924e-06, - "loss": 0.222, - "step": 436 - }, - { - "epoch": 2.131707317073171, - "grad_norm": 3.1221084594726562, - "learning_rate": 4.46240254832643e-06, - "loss": 0.3799, - "step": 437 - }, - { - "epoch": 2.1365853658536587, - "grad_norm": 3.298933267593384, - "learning_rate": 4.460026636963971e-06, - "loss": 0.4759, - "step": 438 - }, - { - "epoch": 2.1414634146341465, - "grad_norm": 2.456233024597168, - "learning_rate": 4.4576461224643965e-06, - "loss": 0.384, - "step": 439 - }, - { - "epoch": 2.1463414634146343, - "grad_norm": 2.8427460193634033, - "learning_rate": 4.455261010418359e-06, - "loss": 0.391, - "step": 440 - }, - { - "epoch": 2.151219512195122, - "grad_norm": 3.0267624855041504, - "learning_rate": 4.452871306427314e-06, - "loss": 0.6177, - "step": 441 - }, - { - "epoch": 2.15609756097561, - "grad_norm": 3.437302827835083, - "learning_rate": 4.450477016103498e-06, - "loss": 0.5143, - "step": 442 - }, - { - "epoch": 2.1609756097560977, - "grad_norm": 3.152210235595703, - "learning_rate": 4.4480781450699205e-06, - "loss": 0.3783, - "step": 443 - }, - { - "epoch": 2.1658536585365855, - "grad_norm": 3.507753372192383, - "learning_rate": 4.4456746989603464e-06, - "loss": 0.3574, - "step": 444 - }, - { - "epoch": 2.1707317073170733, - "grad_norm": 2.8855366706848145, - "learning_rate": 4.443266683419289e-06, - "loss": 0.5088, - "step": 445 - }, - { - "epoch": 2.175609756097561, - "grad_norm": 2.7776072025299072, - "learning_rate": 4.440854104101988e-06, - "loss": 0.3773, - "step": 446 - }, - { - "epoch": 2.180487804878049, - "grad_norm": 3.019484281539917, - "learning_rate": 4.438436966674406e-06, - "loss": 0.5002, - "step": 447 - }, - { - "epoch": 2.1853658536585368, - "grad_norm": 3.6962451934814453, - "learning_rate": 4.436015276813208e-06, - "loss": 0.4601, - "step": 448 - }, - { - "epoch": 2.1902439024390246, - "grad_norm": 3.1288888454437256, - "learning_rate": 4.4335890402057505e-06, - "loss": 0.5422, - "step": 449 - }, - { - "epoch": 2.1951219512195124, - "grad_norm": 3.7083234786987305, - "learning_rate": 4.431158262550067e-06, - "loss": 0.4684, - "step": 450 - }, - { - "epoch": 2.2, - "grad_norm": 3.1714789867401123, - "learning_rate": 4.428722949554858e-06, - "loss": 0.2528, - "step": 451 - }, - { - "epoch": 2.204878048780488, - "grad_norm": 3.0773637294769287, - "learning_rate": 4.426283106939474e-06, - "loss": 0.4061, - "step": 452 - }, - { - "epoch": 2.209756097560976, - "grad_norm": 2.604093551635742, - "learning_rate": 4.423838740433903e-06, - "loss": 0.4779, - "step": 453 - }, - { - "epoch": 2.2146341463414636, - "grad_norm": 2.9293880462646484, - "learning_rate": 4.4213898557787586e-06, - "loss": 0.233, - "step": 454 - }, - { - "epoch": 2.2195121951219514, - "grad_norm": 2.9195125102996826, - "learning_rate": 4.4189364587252636e-06, - "loss": 0.7756, - "step": 455 - }, - { - "epoch": 2.2243902439024392, - "grad_norm": 3.2263920307159424, - "learning_rate": 4.416478555035241e-06, - "loss": 0.2806, - "step": 456 - }, - { - "epoch": 2.229268292682927, - "grad_norm": 2.8109211921691895, - "learning_rate": 4.4140161504810935e-06, - "loss": 0.3923, - "step": 457 - }, - { - "epoch": 2.234146341463415, - "grad_norm": 2.645853281021118, - "learning_rate": 4.4115492508457986e-06, - "loss": 0.289, - "step": 458 - }, - { - "epoch": 2.2390243902439027, - "grad_norm": 3.3712451457977295, - "learning_rate": 4.409077861922887e-06, - "loss": 0.5053, - "step": 459 - }, - { - "epoch": 2.2439024390243905, - "grad_norm": 2.6892387866973877, - "learning_rate": 4.406601989516435e-06, - "loss": 0.3363, - "step": 460 - }, - { - "epoch": 2.2487804878048783, - "grad_norm": 2.3195693492889404, - "learning_rate": 4.404121639441047e-06, - "loss": 0.2367, - "step": 461 - }, - { - "epoch": 2.253658536585366, - "grad_norm": 3.0115339756011963, - "learning_rate": 4.401636817521843e-06, - "loss": 0.4942, - "step": 462 - }, - { - "epoch": 2.258536585365854, - "grad_norm": 2.9528865814208984, - "learning_rate": 4.399147529594447e-06, - "loss": 0.3328, - "step": 463 - }, - { - "epoch": 2.2634146341463417, - "grad_norm": 3.110799551010132, - "learning_rate": 4.3966537815049686e-06, - "loss": 0.3917, - "step": 464 - }, - { - "epoch": 2.2682926829268295, - "grad_norm": 3.2973792552948, - "learning_rate": 4.394155579109994e-06, - "loss": 0.5203, - "step": 465 - }, - { - "epoch": 2.2731707317073173, - "grad_norm": 4.7184038162231445, - "learning_rate": 4.391652928276572e-06, - "loss": 0.729, - "step": 466 - }, - { - "epoch": 2.278048780487805, - "grad_norm": 3.1992053985595703, - "learning_rate": 4.389145834882195e-06, - "loss": 0.4822, - "step": 467 - }, - { - "epoch": 2.2829268292682925, - "grad_norm": 4.320055961608887, - "learning_rate": 4.386634304814789e-06, - "loss": 0.3962, - "step": 468 - }, - { - "epoch": 2.2878048780487803, - "grad_norm": 3.704524517059326, - "learning_rate": 4.384118343972704e-06, - "loss": 0.5996, - "step": 469 - }, - { - "epoch": 2.292682926829268, - "grad_norm": 2.8172974586486816, - "learning_rate": 4.381597958264692e-06, - "loss": 0.6328, - "step": 470 - }, - { - "epoch": 2.297560975609756, - "grad_norm": 2.7418763637542725, - "learning_rate": 4.379073153609896e-06, - "loss": 0.6254, - "step": 471 - }, - { - "epoch": 2.3024390243902437, - "grad_norm": 5.364504337310791, - "learning_rate": 4.37654393593784e-06, - "loss": 0.6793, - "step": 472 - }, - { - "epoch": 2.3073170731707315, - "grad_norm": 2.935291290283203, - "learning_rate": 4.3740103111884096e-06, - "loss": 0.4161, - "step": 473 - }, - { - "epoch": 2.3121951219512193, - "grad_norm": 3.085155963897705, - "learning_rate": 4.371472285311842e-06, - "loss": 0.3329, - "step": 474 - }, - { - "epoch": 2.317073170731707, - "grad_norm": 2.2218778133392334, - "learning_rate": 4.368929864268709e-06, - "loss": 0.2687, - "step": 475 - }, - { - "epoch": 2.321951219512195, - "grad_norm": 3.3985276222229004, - "learning_rate": 4.366383054029907e-06, - "loss": 0.5934, - "step": 476 - }, - { - "epoch": 2.3268292682926828, - "grad_norm": 3.0726048946380615, - "learning_rate": 4.363831860576638e-06, - "loss": 0.5033, - "step": 477 - }, - { - "epoch": 2.3317073170731706, - "grad_norm": 2.728628635406494, - "learning_rate": 4.361276289900396e-06, - "loss": 0.4492, - "step": 478 - }, - { - "epoch": 2.3365853658536584, - "grad_norm": 3.1294424533843994, - "learning_rate": 4.358716348002962e-06, - "loss": 0.619, - "step": 479 - }, - { - "epoch": 2.341463414634146, - "grad_norm": 3.5564961433410645, - "learning_rate": 4.356152040896376e-06, - "loss": 0.4018, - "step": 480 - }, - { - "epoch": 2.346341463414634, - "grad_norm": 2.9329910278320312, - "learning_rate": 4.3535833746029335e-06, - "loss": 0.3062, - "step": 481 - }, - { - "epoch": 2.351219512195122, - "grad_norm": 3.744480848312378, - "learning_rate": 4.351010355155165e-06, - "loss": 0.3387, - "step": 482 - }, - { - "epoch": 2.3560975609756096, - "grad_norm": 2.537912130355835, - "learning_rate": 4.348432988595828e-06, - "loss": 0.3103, - "step": 483 - }, - { - "epoch": 2.3609756097560974, - "grad_norm": 3.232128858566284, - "learning_rate": 4.345851280977885e-06, - "loss": 0.6782, - "step": 484 - }, - { - "epoch": 2.3658536585365852, - "grad_norm": 3.601463794708252, - "learning_rate": 4.343265238364496e-06, - "loss": 0.3195, - "step": 485 - }, - { - "epoch": 2.370731707317073, - "grad_norm": 4.05529260635376, - "learning_rate": 4.340674866829001e-06, - "loss": 0.4639, - "step": 486 - }, - { - "epoch": 2.375609756097561, - "grad_norm": 4.128161430358887, - "learning_rate": 4.338080172454908e-06, - "loss": 0.7229, - "step": 487 - }, - { - "epoch": 2.3804878048780487, - "grad_norm": 2.665430784225464, - "learning_rate": 4.335481161335875e-06, - "loss": 0.4334, - "step": 488 - }, - { - "epoch": 2.3853658536585365, - "grad_norm": 3.777899742126465, - "learning_rate": 4.332877839575699e-06, - "loss": 0.3409, - "step": 489 - }, - { - "epoch": 2.3902439024390243, - "grad_norm": 2.9942116737365723, - "learning_rate": 4.330270213288301e-06, - "loss": 0.5221, - "step": 490 - }, - { - "epoch": 2.395121951219512, - "grad_norm": 3.518601417541504, - "learning_rate": 4.32765828859771e-06, - "loss": 0.7078, - "step": 491 - }, - { - "epoch": 2.4, - "grad_norm": 3.452350378036499, - "learning_rate": 4.325042071638051e-06, - "loss": 0.5902, - "step": 492 - }, - { - "epoch": 2.4048780487804877, - "grad_norm": 3.072655200958252, - "learning_rate": 4.322421568553529e-06, - "loss": 0.3746, - "step": 493 - }, - { - "epoch": 2.4097560975609755, - "grad_norm": 2.8621394634246826, - "learning_rate": 4.319796785498416e-06, - "loss": 0.3474, - "step": 494 - }, - { - "epoch": 2.4146341463414633, - "grad_norm": 3.3891537189483643, - "learning_rate": 4.317167728637032e-06, - "loss": 0.5171, - "step": 495 - }, - { - "epoch": 2.419512195121951, - "grad_norm": 2.505720376968384, - "learning_rate": 4.314534404143738e-06, - "loss": 0.4263, - "step": 496 - }, - { - "epoch": 2.424390243902439, - "grad_norm": 2.6280455589294434, - "learning_rate": 4.3118968182029155e-06, - "loss": 0.5072, - "step": 497 - }, - { - "epoch": 2.4292682926829268, - "grad_norm": 2.703711748123169, - "learning_rate": 4.3092549770089566e-06, - "loss": 0.2742, - "step": 498 - }, - { - "epoch": 2.4341463414634146, - "grad_norm": 3.0358169078826904, - "learning_rate": 4.306608886766243e-06, - "loss": 0.4814, - "step": 499 - }, - { - "epoch": 2.4390243902439024, - "grad_norm": 3.263326406478882, - "learning_rate": 4.303958553689137e-06, - "loss": 0.4188, - "step": 500 - }, - { - "epoch": 2.44390243902439, - "grad_norm": 2.833951950073242, - "learning_rate": 4.3013039840019675e-06, - "loss": 0.6436, - "step": 501 - }, - { - "epoch": 2.448780487804878, - "grad_norm": 3.6790921688079834, - "learning_rate": 4.2986451839390105e-06, - "loss": 0.2862, - "step": 502 - }, - { - "epoch": 2.453658536585366, - "grad_norm": 2.7376418113708496, - "learning_rate": 4.295982159744476e-06, - "loss": 0.4926, - "step": 503 - }, - { - "epoch": 2.4585365853658536, - "grad_norm": 3.575244665145874, - "learning_rate": 4.293314917672498e-06, - "loss": 0.5717, - "step": 504 - }, - { - "epoch": 2.4634146341463414, - "grad_norm": 2.8722269535064697, - "learning_rate": 4.290643463987114e-06, - "loss": 0.2707, - "step": 505 - }, - { - "epoch": 2.4682926829268292, - "grad_norm": 2.8118090629577637, - "learning_rate": 4.287967804962252e-06, - "loss": 0.347, - "step": 506 - }, - { - "epoch": 2.473170731707317, - "grad_norm": 3.345698356628418, - "learning_rate": 4.285287946881718e-06, - "loss": 0.2103, - "step": 507 - }, - { - "epoch": 2.478048780487805, - "grad_norm": 3.0156590938568115, - "learning_rate": 4.282603896039178e-06, - "loss": 0.6405, - "step": 508 - }, - { - "epoch": 2.4829268292682927, - "grad_norm": 3.102205753326416, - "learning_rate": 4.279915658738145e-06, - "loss": 0.4027, - "step": 509 - }, - { - "epoch": 2.4878048780487805, - "grad_norm": 2.8665261268615723, - "learning_rate": 4.277223241291966e-06, - "loss": 0.6503, - "step": 510 - }, - { - "epoch": 2.4926829268292683, - "grad_norm": 2.5396728515625, - "learning_rate": 4.274526650023801e-06, - "loss": 0.5006, - "step": 511 - }, - { - "epoch": 2.497560975609756, - "grad_norm": 3.4846577644348145, - "learning_rate": 4.271825891266617e-06, - "loss": 0.479, - "step": 512 - }, - { - "epoch": 2.502439024390244, - "grad_norm": 4.5995612144470215, - "learning_rate": 4.269120971363164e-06, - "loss": 0.6667, - "step": 513 - }, - { - "epoch": 2.5073170731707317, - "grad_norm": 3.2117559909820557, - "learning_rate": 4.266411896665967e-06, - "loss": 0.2977, - "step": 514 - }, - { - "epoch": 2.5121951219512195, - "grad_norm": 2.798161268234253, - "learning_rate": 4.263698673537309e-06, - "loss": 0.3912, - "step": 515 - }, - { - "epoch": 2.5170731707317073, - "grad_norm": 3.593287944793701, - "learning_rate": 4.260981308349214e-06, - "loss": 0.615, - "step": 516 - }, - { - "epoch": 2.521951219512195, - "grad_norm": 3.06075119972229, - "learning_rate": 4.258259807483434e-06, - "loss": 0.4559, - "step": 517 - }, - { - "epoch": 2.526829268292683, - "grad_norm": 2.893202543258667, - "learning_rate": 4.255534177331435e-06, - "loss": 0.4993, - "step": 518 - }, - { - "epoch": 2.5317073170731708, - "grad_norm": 3.613308906555176, - "learning_rate": 4.252804424294378e-06, - "loss": 0.4581, - "step": 519 - }, - { - "epoch": 2.5365853658536586, - "grad_norm": 3.1191842555999756, - "learning_rate": 4.25007055478311e-06, - "loss": 0.5403, - "step": 520 - }, - { - "epoch": 2.5414634146341464, - "grad_norm": 3.653355836868286, - "learning_rate": 4.247332575218144e-06, - "loss": 0.3658, - "step": 521 - }, - { - "epoch": 2.546341463414634, - "grad_norm": 3.1386306285858154, - "learning_rate": 4.244590492029643e-06, - "loss": 0.6342, - "step": 522 - }, - { - "epoch": 2.551219512195122, - "grad_norm": 3.0894742012023926, - "learning_rate": 4.241844311657411e-06, - "loss": 0.3411, - "step": 523 - }, - { - "epoch": 2.55609756097561, - "grad_norm": 3.205916404724121, - "learning_rate": 4.239094040550875e-06, - "loss": 0.2829, - "step": 524 - }, - { - "epoch": 2.5609756097560976, - "grad_norm": 2.378857374191284, - "learning_rate": 4.236339685169065e-06, - "loss": 0.4749, - "step": 525 - }, - { - "epoch": 2.5658536585365854, - "grad_norm": 3.8657875061035156, - "learning_rate": 4.233581251980604e-06, - "loss": 0.2485, - "step": 526 - }, - { - "epoch": 2.5707317073170732, - "grad_norm": 3.565807580947876, - "learning_rate": 4.230818747463696e-06, - "loss": 0.4488, - "step": 527 - }, - { - "epoch": 2.575609756097561, - "grad_norm": 2.6909685134887695, - "learning_rate": 4.228052178106101e-06, - "loss": 0.4495, - "step": 528 - }, - { - "epoch": 2.580487804878049, - "grad_norm": 2.937680244445801, - "learning_rate": 4.2252815504051285e-06, - "loss": 0.2396, - "step": 529 - }, - { - "epoch": 2.5853658536585367, - "grad_norm": 5.55731201171875, - "learning_rate": 4.222506870867618e-06, - "loss": 0.6784, - "step": 530 - }, - { - "epoch": 2.5902439024390245, - "grad_norm": 2.7388782501220703, - "learning_rate": 4.2197281460099245e-06, - "loss": 0.5543, - "step": 531 - }, - { - "epoch": 2.5951219512195123, - "grad_norm": 3.311134099960327, - "learning_rate": 4.216945382357905e-06, - "loss": 0.5281, - "step": 532 - }, - { - "epoch": 2.6, - "grad_norm": 3.511232376098633, - "learning_rate": 4.214158586446901e-06, - "loss": 0.8019, - "step": 533 - }, - { - "epoch": 2.604878048780488, - "grad_norm": 4.416641712188721, - "learning_rate": 4.211367764821722e-06, - "loss": 0.7769, - "step": 534 - }, - { - "epoch": 2.6097560975609757, - "grad_norm": 2.9849908351898193, - "learning_rate": 4.208572924036634e-06, - "loss": 0.4077, - "step": 535 - }, - { - "epoch": 2.6146341463414635, - "grad_norm": 2.8512160778045654, - "learning_rate": 4.2057740706553415e-06, - "loss": 0.433, - "step": 536 - }, - { - "epoch": 2.6195121951219513, - "grad_norm": 2.6729629039764404, - "learning_rate": 4.202971211250971e-06, - "loss": 0.5957, - "step": 537 - }, - { - "epoch": 2.624390243902439, - "grad_norm": 2.4570281505584717, - "learning_rate": 4.200164352406061e-06, - "loss": 0.3013, - "step": 538 - }, - { - "epoch": 2.629268292682927, - "grad_norm": 3.3771679401397705, - "learning_rate": 4.197353500712539e-06, - "loss": 0.5646, - "step": 539 - }, - { - "epoch": 2.6341463414634148, - "grad_norm": 3.163496494293213, - "learning_rate": 4.1945386627717115e-06, - "loss": 0.4529, - "step": 540 - }, - { - "epoch": 2.6390243902439026, - "grad_norm": 8.32056713104248, - "learning_rate": 4.191719845194246e-06, - "loss": 0.6076, - "step": 541 - }, - { - "epoch": 2.6439024390243904, - "grad_norm": 2.7657363414764404, - "learning_rate": 4.188897054600156e-06, - "loss": 0.4855, - "step": 542 - }, - { - "epoch": 2.648780487804878, - "grad_norm": 3.299283504486084, - "learning_rate": 4.186070297618787e-06, - "loss": 0.5836, - "step": 543 - }, - { - "epoch": 2.653658536585366, - "grad_norm": 2.3928205966949463, - "learning_rate": 4.183239580888799e-06, - "loss": 0.6266, - "step": 544 - }, - { - "epoch": 2.658536585365854, - "grad_norm": 3.395251750946045, - "learning_rate": 4.18040491105815e-06, - "loss": 0.429, - "step": 545 - }, - { - "epoch": 2.6634146341463416, - "grad_norm": 2.690936803817749, - "learning_rate": 4.177566294784085e-06, - "loss": 0.391, - "step": 546 - }, - { - "epoch": 2.6682926829268294, - "grad_norm": 3.7687628269195557, - "learning_rate": 4.174723738733114e-06, - "loss": 0.6548, - "step": 547 - }, - { - "epoch": 2.6731707317073172, - "grad_norm": 2.7884976863861084, - "learning_rate": 4.171877249581001e-06, - "loss": 0.5188, - "step": 548 - }, - { - "epoch": 2.678048780487805, - "grad_norm": 3.0811641216278076, - "learning_rate": 4.169026834012748e-06, - "loss": 0.3494, - "step": 549 - }, - { - "epoch": 2.682926829268293, - "grad_norm": 3.090078592300415, - "learning_rate": 4.166172498722577e-06, - "loss": 0.3621, - "step": 550 - }, - { - "epoch": 2.68780487804878, - "grad_norm": 3.925424098968506, - "learning_rate": 4.163314250413913e-06, - "loss": 0.7187, - "step": 551 - }, - { - "epoch": 2.692682926829268, - "grad_norm": 3.3590312004089355, - "learning_rate": 4.160452095799378e-06, - "loss": 0.428, - "step": 552 - }, - { - "epoch": 2.697560975609756, - "grad_norm": 3.08093523979187, - "learning_rate": 4.157586041600759e-06, - "loss": 0.202, - "step": 553 - }, - { - "epoch": 2.7024390243902436, - "grad_norm": 2.9391448497772217, - "learning_rate": 4.154716094549008e-06, - "loss": 0.5238, - "step": 554 - }, - { - "epoch": 2.7073170731707314, - "grad_norm": 2.9869461059570312, - "learning_rate": 4.151842261384217e-06, - "loss": 0.3073, - "step": 555 - }, - { - "epoch": 2.7121951219512193, - "grad_norm": 3.8973608016967773, - "learning_rate": 4.148964548855603e-06, - "loss": 0.8435, - "step": 556 - }, - { - "epoch": 2.717073170731707, - "grad_norm": 2.3596479892730713, - "learning_rate": 4.146082963721496e-06, - "loss": 0.2562, - "step": 557 - }, - { - "epoch": 2.721951219512195, - "grad_norm": 3.4964873790740967, - "learning_rate": 4.143197512749322e-06, - "loss": 1.0144, - "step": 558 - }, - { - "epoch": 2.7268292682926827, - "grad_norm": 2.8925280570983887, - "learning_rate": 4.140308202715581e-06, - "loss": 0.7581, - "step": 559 - }, - { - "epoch": 2.7317073170731705, - "grad_norm": 2.622724771499634, - "learning_rate": 4.13741504040584e-06, - "loss": 0.3114, - "step": 560 - }, - { - "epoch": 2.7365853658536583, - "grad_norm": 3.775834321975708, - "learning_rate": 4.134518032614713e-06, - "loss": 0.4384, - "step": 561 - }, - { - "epoch": 2.741463414634146, - "grad_norm": 2.691236972808838, - "learning_rate": 4.1316171861458445e-06, - "loss": 0.3141, - "step": 562 - }, - { - "epoch": 2.746341463414634, - "grad_norm": 3.059152841567993, - "learning_rate": 4.128712507811893e-06, - "loss": 0.5777, - "step": 563 - }, - { - "epoch": 2.7512195121951217, - "grad_norm": 2.867432117462158, - "learning_rate": 4.125804004434517e-06, - "loss": 0.5542, - "step": 564 - }, - { - "epoch": 2.7560975609756095, - "grad_norm": 2.796438694000244, - "learning_rate": 4.12289168284436e-06, - "loss": 0.3442, - "step": 565 - }, - { - "epoch": 2.7609756097560973, - "grad_norm": 3.052199125289917, - "learning_rate": 4.119975549881029e-06, - "loss": 0.4754, - "step": 566 - }, - { - "epoch": 2.765853658536585, - "grad_norm": 2.5463602542877197, - "learning_rate": 4.1170556123930846e-06, - "loss": 0.2988, - "step": 567 - }, - { - "epoch": 2.770731707317073, - "grad_norm": 3.003124475479126, - "learning_rate": 4.114131877238021e-06, - "loss": 0.4642, - "step": 568 - }, - { - "epoch": 2.7756097560975608, - "grad_norm": 2.4988298416137695, - "learning_rate": 4.111204351282254e-06, - "loss": 0.3493, - "step": 569 - }, - { - "epoch": 2.7804878048780486, - "grad_norm": 2.7403693199157715, - "learning_rate": 4.108273041401098e-06, - "loss": 0.4007, - "step": 570 - }, - { - "epoch": 2.7853658536585364, - "grad_norm": 4.101940155029297, - "learning_rate": 4.105337954478756e-06, - "loss": 0.7815, - "step": 571 - }, - { - "epoch": 2.790243902439024, - "grad_norm": 3.229969024658203, - "learning_rate": 4.102399097408304e-06, - "loss": 0.6099, - "step": 572 - }, - { - "epoch": 2.795121951219512, - "grad_norm": 3.234693765640259, - "learning_rate": 4.099456477091667e-06, - "loss": 0.2478, - "step": 573 - }, - { - "epoch": 2.8, - "grad_norm": 2.9824702739715576, - "learning_rate": 4.096510100439611e-06, - "loss": 0.6403, - "step": 574 - }, - { - "epoch": 2.8048780487804876, - "grad_norm": 2.8012478351593018, - "learning_rate": 4.093559974371725e-06, - "loss": 0.2509, - "step": 575 - }, - { - "epoch": 2.8097560975609754, - "grad_norm": 2.915400743484497, - "learning_rate": 4.0906061058164e-06, - "loss": 0.7552, - "step": 576 - }, - { - "epoch": 2.8146341463414632, - "grad_norm": 3.467665672302246, - "learning_rate": 4.087648501710819e-06, - "loss": 0.3146, - "step": 577 - }, - { - "epoch": 2.819512195121951, - "grad_norm": 3.1628401279449463, - "learning_rate": 4.084687169000938e-06, - "loss": 0.507, - "step": 578 - }, - { - "epoch": 2.824390243902439, - "grad_norm": 2.4069066047668457, - "learning_rate": 4.081722114641469e-06, - "loss": 0.4116, - "step": 579 - }, - { - "epoch": 2.8292682926829267, - "grad_norm": 3.698174238204956, - "learning_rate": 4.0787533455958626e-06, - "loss": 0.2264, - "step": 580 - }, - { - "epoch": 2.8341463414634145, - "grad_norm": 3.0896191596984863, - "learning_rate": 4.075780868836296e-06, - "loss": 0.3197, - "step": 581 - }, - { - "epoch": 2.8390243902439023, - "grad_norm": 3.098562240600586, - "learning_rate": 4.072804691343653e-06, - "loss": 0.4045, - "step": 582 - }, - { - "epoch": 2.84390243902439, - "grad_norm": 3.9232118129730225, - "learning_rate": 4.069824820107507e-06, - "loss": 0.9564, - "step": 583 - }, - { - "epoch": 2.848780487804878, - "grad_norm": 2.7176268100738525, - "learning_rate": 4.06684126212611e-06, - "loss": 0.2703, - "step": 584 - }, - { - "epoch": 2.8536585365853657, - "grad_norm": 2.4905827045440674, - "learning_rate": 4.063854024406369e-06, - "loss": 0.4828, - "step": 585 - }, - { - "epoch": 2.8585365853658535, - "grad_norm": 2.848784923553467, - "learning_rate": 4.060863113963835e-06, - "loss": 0.4131, - "step": 586 - }, - { - "epoch": 2.8634146341463413, - "grad_norm": 2.599665403366089, - "learning_rate": 4.057868537822683e-06, - "loss": 0.4464, - "step": 587 - }, - { - "epoch": 2.868292682926829, - "grad_norm": 3.1770827770233154, - "learning_rate": 4.054870303015695e-06, - "loss": 0.2825, - "step": 588 - }, - { - "epoch": 2.873170731707317, - "grad_norm": 3.18332839012146, - "learning_rate": 4.05186841658425e-06, - "loss": 0.4438, - "step": 589 - }, - { - "epoch": 2.8780487804878048, - "grad_norm": 2.7485718727111816, - "learning_rate": 4.048862885578301e-06, - "loss": 0.4817, - "step": 590 - }, - { - "epoch": 2.8829268292682926, - "grad_norm": 2.9712934494018555, - "learning_rate": 4.045853717056358e-06, - "loss": 0.5157, - "step": 591 - }, - { - "epoch": 2.8878048780487804, - "grad_norm": 2.246858835220337, - "learning_rate": 4.0428409180854775e-06, - "loss": 0.4029, - "step": 592 - }, - { - "epoch": 2.892682926829268, - "grad_norm": 2.683434247970581, - "learning_rate": 4.039824495741238e-06, - "loss": 0.3796, - "step": 593 - }, - { - "epoch": 2.897560975609756, - "grad_norm": 2.6297569274902344, - "learning_rate": 4.036804457107733e-06, - "loss": 0.4467, - "step": 594 - }, - { - "epoch": 2.902439024390244, - "grad_norm": 5.318776607513428, - "learning_rate": 4.0337808092775435e-06, - "loss": 0.7007, - "step": 595 - }, - { - "epoch": 2.9073170731707316, - "grad_norm": 3.069889783859253, - "learning_rate": 4.030753559351728e-06, - "loss": 0.3219, - "step": 596 - }, - { - "epoch": 2.9121951219512194, - "grad_norm": 1.9730123281478882, - "learning_rate": 4.027722714439808e-06, - "loss": 0.3038, - "step": 597 - }, - { - "epoch": 2.9170731707317072, - "grad_norm": 3.7959916591644287, - "learning_rate": 4.024688281659743e-06, - "loss": 0.7768, - "step": 598 - }, - { - "epoch": 2.921951219512195, - "grad_norm": 3.900886297225952, - "learning_rate": 4.021650268137924e-06, - "loss": 0.4667, - "step": 599 - }, - { - "epoch": 2.926829268292683, - "grad_norm": 2.6155691146850586, - "learning_rate": 4.018608681009143e-06, - "loss": 0.3852, - "step": 600 - }, - { - "epoch": 2.9317073170731707, - "grad_norm": 3.2715704441070557, - "learning_rate": 4.015563527416596e-06, - "loss": 0.4804, - "step": 601 - }, - { - "epoch": 2.9365853658536585, - "grad_norm": 3.001425266265869, - "learning_rate": 4.012514814511844e-06, - "loss": 0.4152, - "step": 602 - }, - { - "epoch": 2.9414634146341463, - "grad_norm": 2.685360908508301, - "learning_rate": 4.009462549454816e-06, - "loss": 0.5029, - "step": 603 - }, - { - "epoch": 2.946341463414634, - "grad_norm": 3.4670183658599854, - "learning_rate": 4.006406739413775e-06, - "loss": 0.4857, - "step": 604 - }, - { - "epoch": 2.951219512195122, - "grad_norm": 3.0613298416137695, - "learning_rate": 4.003347391565317e-06, - "loss": 0.4449, - "step": 605 - }, - { - "epoch": 2.9560975609756097, - "grad_norm": 3.207186698913574, - "learning_rate": 4.000284513094342e-06, - "loss": 0.4808, - "step": 606 - }, - { - "epoch": 2.9609756097560975, - "grad_norm": 2.910578727722168, - "learning_rate": 3.997218111194042e-06, - "loss": 0.4395, - "step": 607 - }, - { - "epoch": 2.9658536585365853, - "grad_norm": 2.581918954849243, - "learning_rate": 3.994148193065886e-06, - "loss": 0.3264, - "step": 608 - }, - { - "epoch": 2.970731707317073, - "grad_norm": 2.6517748832702637, - "learning_rate": 3.991074765919598e-06, - "loss": 0.3285, - "step": 609 - }, - { - "epoch": 2.975609756097561, - "grad_norm": 3.509756088256836, - "learning_rate": 3.987997836973147e-06, - "loss": 0.3638, - "step": 610 - }, - { - "epoch": 2.9804878048780488, - "grad_norm": 2.7382352352142334, - "learning_rate": 3.984917413452721e-06, - "loss": 0.3853, - "step": 611 - }, - { - "epoch": 2.9853658536585366, - "grad_norm": 3.998974323272705, - "learning_rate": 3.981833502592717e-06, - "loss": 0.6411, - "step": 612 - }, - { - "epoch": 2.9902439024390244, - "grad_norm": 3.305126428604126, - "learning_rate": 3.978746111635725e-06, - "loss": 0.2759, - "step": 613 - }, - { - "epoch": 2.995121951219512, - "grad_norm": 3.137300968170166, - "learning_rate": 3.9756552478325045e-06, - "loss": 0.4566, - "step": 614 - }, - { - "epoch": 3.0, - "grad_norm": 2.617291212081909, - "learning_rate": 3.972560918441972e-06, - "loss": 0.2221, - "step": 615 - }, - { - "epoch": 3.004878048780488, - "grad_norm": 2.787429094314575, - "learning_rate": 3.969463130731183e-06, - "loss": 0.2403, - "step": 616 - }, - { - "epoch": 3.0097560975609756, - "grad_norm": 3.0412075519561768, - "learning_rate": 3.966361891975316e-06, - "loss": 0.2635, - "step": 617 - }, - { - "epoch": 3.0146341463414634, - "grad_norm": 2.9949851036071777, - "learning_rate": 3.963257209457652e-06, - "loss": 0.3294, - "step": 618 - }, - { - "epoch": 3.0195121951219512, - "grad_norm": 3.0510809421539307, - "learning_rate": 3.960149090469561e-06, - "loss": 0.1338, - "step": 619 - }, - { - "epoch": 3.024390243902439, - "grad_norm": 3.669482707977295, - "learning_rate": 3.957037542310484e-06, - "loss": 0.1469, - "step": 620 - }, - { - "epoch": 3.029268292682927, - "grad_norm": 4.677116870880127, - "learning_rate": 3.953922572287915e-06, - "loss": 0.2788, - "step": 621 - }, - { - "epoch": 3.0341463414634147, - "grad_norm": 4.33144474029541, - "learning_rate": 3.950804187717384e-06, - "loss": 0.4521, - "step": 622 - }, - { - "epoch": 3.0390243902439025, - "grad_norm": 3.466639757156372, - "learning_rate": 3.947682395922439e-06, - "loss": 0.5113, - "step": 623 - }, - { - "epoch": 3.0439024390243903, - "grad_norm": 3.2332122325897217, - "learning_rate": 3.9445572042346346e-06, - "loss": 0.0968, - "step": 624 - }, - { - "epoch": 3.048780487804878, - "grad_norm": 2.6108055114746094, - "learning_rate": 3.941428619993505e-06, - "loss": 0.2462, - "step": 625 - }, - { - "epoch": 3.053658536585366, - "grad_norm": 3.2512595653533936, - "learning_rate": 3.938296650546552e-06, - "loss": 0.1782, - "step": 626 - }, - { - "epoch": 3.0585365853658537, - "grad_norm": 3.4350366592407227, - "learning_rate": 3.935161303249231e-06, - "loss": 0.2955, - "step": 627 - }, - { - "epoch": 3.0634146341463415, - "grad_norm": 3.42012619972229, - "learning_rate": 3.932022585464928e-06, - "loss": 0.3259, - "step": 628 - }, - { - "epoch": 3.0682926829268293, - "grad_norm": 3.458043336868286, - "learning_rate": 3.928880504564943e-06, - "loss": 0.2306, - "step": 629 - }, - { - "epoch": 3.073170731707317, - "grad_norm": 2.646616220474243, - "learning_rate": 3.92573506792848e-06, - "loss": 0.2197, - "step": 630 - }, - { - "epoch": 3.078048780487805, - "grad_norm": 3.5558857917785645, - "learning_rate": 3.9225862829426184e-06, - "loss": 0.1607, - "step": 631 - }, - { - "epoch": 3.0829268292682928, - "grad_norm": 3.6011338233947754, - "learning_rate": 3.919434157002303e-06, - "loss": 0.3087, - "step": 632 - }, - { - "epoch": 3.0878048780487806, - "grad_norm": 2.339879035949707, - "learning_rate": 3.916278697510325e-06, - "loss": 0.2213, - "step": 633 - }, - { - "epoch": 3.0926829268292684, - "grad_norm": 3.268162488937378, - "learning_rate": 3.913119911877305e-06, - "loss": 0.318, - "step": 634 - }, - { - "epoch": 3.097560975609756, - "grad_norm": 4.062571048736572, - "learning_rate": 3.909957807521674e-06, - "loss": 0.1757, - "step": 635 - }, - { - "epoch": 3.102439024390244, - "grad_norm": 2.997659683227539, - "learning_rate": 3.906792391869657e-06, - "loss": 0.2391, - "step": 636 - }, - { - "epoch": 3.107317073170732, - "grad_norm": 3.7037394046783447, - "learning_rate": 3.903623672355258e-06, - "loss": 0.2548, - "step": 637 - }, - { - "epoch": 3.1121951219512196, - "grad_norm": 3.110579252243042, - "learning_rate": 3.900451656420237e-06, - "loss": 0.2389, - "step": 638 - }, - { - "epoch": 3.1170731707317074, - "grad_norm": 3.3332321643829346, - "learning_rate": 3.897276351514097e-06, - "loss": 0.1371, - "step": 639 - }, - { - "epoch": 3.1219512195121952, - "grad_norm": 3.8275935649871826, - "learning_rate": 3.894097765094065e-06, - "loss": 0.3363, - "step": 640 - }, - { - "epoch": 3.126829268292683, - "grad_norm": 2.3731374740600586, - "learning_rate": 3.890915904625075e-06, - "loss": 0.1314, - "step": 641 - }, - { - "epoch": 3.131707317073171, - "grad_norm": 3.1511282920837402, - "learning_rate": 3.887730777579751e-06, - "loss": 0.3563, - "step": 642 - }, - { - "epoch": 3.1365853658536587, - "grad_norm": 4.2254862785339355, - "learning_rate": 3.884542391438387e-06, - "loss": 0.5053, - "step": 643 - }, - { - "epoch": 3.1414634146341465, - "grad_norm": 4.579670429229736, - "learning_rate": 3.88135075368893e-06, - "loss": 0.6259, - "step": 644 - }, - { - "epoch": 3.1463414634146343, - "grad_norm": 3.2102746963500977, - "learning_rate": 3.878155871826968e-06, - "loss": 0.2599, - "step": 645 - }, - { - "epoch": 3.151219512195122, - "grad_norm": 2.5569686889648438, - "learning_rate": 3.874957753355701e-06, - "loss": 0.2075, - "step": 646 - }, - { - "epoch": 3.15609756097561, - "grad_norm": 3.588925838470459, - "learning_rate": 3.8717564057859365e-06, - "loss": 0.4577, - "step": 647 - }, - { - "epoch": 3.1609756097560977, - "grad_norm": 3.6163878440856934, - "learning_rate": 3.868551836636063e-06, - "loss": 0.4023, - "step": 648 - }, - { - "epoch": 3.1658536585365855, - "grad_norm": 3.8688390254974365, - "learning_rate": 3.865344053432035e-06, - "loss": 0.1669, - "step": 649 - }, - { - "epoch": 3.1707317073170733, - "grad_norm": 3.419734001159668, - "learning_rate": 3.862133063707353e-06, - "loss": 0.2766, - "step": 650 - }, - { - "epoch": 3.175609756097561, - "grad_norm": 2.9860243797302246, - "learning_rate": 3.858918875003053e-06, - "loss": 0.1788, - "step": 651 - }, - { - "epoch": 3.180487804878049, - "grad_norm": 3.0619022846221924, - "learning_rate": 3.855701494867679e-06, - "loss": 0.224, - "step": 652 - }, - { - "epoch": 3.1853658536585368, - "grad_norm": 3.3668978214263916, - "learning_rate": 3.852480930857275e-06, - "loss": 0.4029, - "step": 653 - }, - { - "epoch": 3.1902439024390246, - "grad_norm": 3.543147563934326, - "learning_rate": 3.849257190535356e-06, - "loss": 0.2096, - "step": 654 - }, - { - "epoch": 3.1951219512195124, - "grad_norm": 3.793619155883789, - "learning_rate": 3.846030281472902e-06, - "loss": 0.5574, - "step": 655 - }, - { - "epoch": 3.2, - "grad_norm": 3.021289110183716, - "learning_rate": 3.842800211248333e-06, - "loss": 0.2233, - "step": 656 - }, - { - "epoch": 3.204878048780488, - "grad_norm": 4.582934856414795, - "learning_rate": 3.839566987447492e-06, - "loss": 0.3871, - "step": 657 - }, - { - "epoch": 3.209756097560976, - "grad_norm": 2.996340274810791, - "learning_rate": 3.8363306176636296e-06, - "loss": 0.4325, - "step": 658 - }, - { - "epoch": 3.2146341463414636, - "grad_norm": 3.3190877437591553, - "learning_rate": 3.833091109497384e-06, - "loss": 0.5321, - "step": 659 - }, - { - "epoch": 3.2195121951219514, - "grad_norm": 3.2532856464385986, - "learning_rate": 3.829848470556765e-06, - "loss": 0.1359, - "step": 660 - }, - { - "epoch": 3.2243902439024392, - "grad_norm": 2.7875044345855713, - "learning_rate": 3.8266027084571335e-06, - "loss": 0.3145, - "step": 661 - }, - { - "epoch": 3.229268292682927, - "grad_norm": 3.748253583908081, - "learning_rate": 3.823353830821187e-06, - "loss": 0.1252, - "step": 662 - }, - { - "epoch": 3.234146341463415, - "grad_norm": 2.858293294906616, - "learning_rate": 3.820101845278937e-06, - "loss": 0.2589, - "step": 663 - }, - { - "epoch": 3.2390243902439027, - "grad_norm": 3.7470967769622803, - "learning_rate": 3.816846759467696e-06, - "loss": 0.2594, - "step": 664 - }, - { - "epoch": 3.2439024390243905, - "grad_norm": 3.676196813583374, - "learning_rate": 3.8135885810320587e-06, - "loss": 0.2998, - "step": 665 - }, - { - "epoch": 3.2487804878048783, - "grad_norm": 3.0943140983581543, - "learning_rate": 3.810327317623881e-06, - "loss": 0.2238, - "step": 666 - }, - { - "epoch": 3.253658536585366, - "grad_norm": 3.5907349586486816, - "learning_rate": 3.8070629769022628e-06, - "loss": 0.3381, - "step": 667 - }, - { - "epoch": 3.258536585365854, - "grad_norm": 3.1195285320281982, - "learning_rate": 3.8037955665335335e-06, - "loss": 0.2407, - "step": 668 - }, - { - "epoch": 3.2634146341463417, - "grad_norm": 3.422292947769165, - "learning_rate": 3.800525094191231e-06, - "loss": 0.2957, - "step": 669 - }, - { - "epoch": 3.2682926829268295, - "grad_norm": 2.5264663696289062, - "learning_rate": 3.797251567556083e-06, - "loss": 0.2493, - "step": 670 - }, - { - "epoch": 3.2731707317073173, - "grad_norm": 3.350219964981079, - "learning_rate": 3.793974994315991e-06, - "loss": 0.1186, - "step": 671 - }, - { - "epoch": 3.278048780487805, - "grad_norm": 4.175906181335449, - "learning_rate": 3.790695382166013e-06, - "loss": 0.3453, - "step": 672 - }, - { - "epoch": 3.2829268292682925, - "grad_norm": 3.006072521209717, - "learning_rate": 3.7874127388083415e-06, - "loss": 0.1981, - "step": 673 - }, - { - "epoch": 3.2878048780487803, - "grad_norm": 3.368561029434204, - "learning_rate": 3.7841270719522895e-06, - "loss": 0.2934, - "step": 674 - }, - { - "epoch": 3.292682926829268, - "grad_norm": 4.374331951141357, - "learning_rate": 3.7808383893142692e-06, - "loss": 0.1359, - "step": 675 - }, - { - "epoch": 3.297560975609756, - "grad_norm": 3.297102451324463, - "learning_rate": 3.7775466986177763e-06, - "loss": 0.2498, - "step": 676 - }, - { - "epoch": 3.3024390243902437, - "grad_norm": 2.8914761543273926, - "learning_rate": 3.774252007593371e-06, - "loss": 0.1308, - "step": 677 - }, - { - "epoch": 3.3073170731707315, - "grad_norm": 3.1550722122192383, - "learning_rate": 3.7709543239786593e-06, - "loss": 0.3915, - "step": 678 - }, - { - "epoch": 3.3121951219512193, - "grad_norm": 3.2302658557891846, - "learning_rate": 3.767653655518277e-06, - "loss": 0.2558, - "step": 679 - }, - { - "epoch": 3.317073170731707, - "grad_norm": 4.4321770668029785, - "learning_rate": 3.7643500099638673e-06, - "loss": 0.1988, - "step": 680 - }, - { - "epoch": 3.321951219512195, - "grad_norm": 2.970566749572754, - "learning_rate": 3.7610433950740667e-06, - "loss": 0.4908, - "step": 681 - }, - { - "epoch": 3.3268292682926828, - "grad_norm": 3.5516228675842285, - "learning_rate": 3.757733818614485e-06, - "loss": 0.304, - "step": 682 - }, - { - "epoch": 3.3317073170731706, - "grad_norm": 2.7555387020111084, - "learning_rate": 3.7544212883576856e-06, - "loss": 0.2533, - "step": 683 - }, - { - "epoch": 3.3365853658536584, - "grad_norm": 3.61226749420166, - "learning_rate": 3.751105812083172e-06, - "loss": 0.1771, - "step": 684 - }, - { - "epoch": 3.341463414634146, - "grad_norm": 3.0466206073760986, - "learning_rate": 3.7477873975773655e-06, - "loss": 0.4213, - "step": 685 - }, - { - "epoch": 3.346341463414634, - "grad_norm": 3.6091527938842773, - "learning_rate": 3.7444660526335853e-06, - "loss": 0.3808, - "step": 686 - }, - { - "epoch": 3.351219512195122, - "grad_norm": 3.8443002700805664, - "learning_rate": 3.741141785052036e-06, - "loss": 0.6438, - "step": 687 - }, - { - "epoch": 3.3560975609756096, - "grad_norm": 3.845909833908081, - "learning_rate": 3.737814602639784e-06, - "loss": 0.3686, - "step": 688 - }, - { - "epoch": 3.3609756097560974, - "grad_norm": 2.904892921447754, - "learning_rate": 3.7344845132107427e-06, - "loss": 0.2934, - "step": 689 - }, - { - "epoch": 3.3658536585365852, - "grad_norm": 3.4766387939453125, - "learning_rate": 3.731151524585651e-06, - "loss": 0.3299, - "step": 690 - }, - { - "epoch": 3.370731707317073, - "grad_norm": 4.236767768859863, - "learning_rate": 3.7278156445920584e-06, - "loss": 0.6303, - "step": 691 - }, - { - "epoch": 3.375609756097561, - "grad_norm": 3.1122591495513916, - "learning_rate": 3.724476881064303e-06, - "loss": 0.2432, - "step": 692 - }, - { - "epoch": 3.3804878048780487, - "grad_norm": 3.0971457958221436, - "learning_rate": 3.721135241843496e-06, - "loss": 0.3131, - "step": 693 - }, - { - "epoch": 3.3853658536585365, - "grad_norm": 3.9365804195404053, - "learning_rate": 3.7177907347775016e-06, - "loss": 0.3372, - "step": 694 - }, - { - "epoch": 3.3902439024390243, - "grad_norm": 3.760373115539551, - "learning_rate": 3.71444336772092e-06, - "loss": 0.5055, - "step": 695 - }, - { - "epoch": 3.395121951219512, - "grad_norm": 4.360848426818848, - "learning_rate": 3.711093148535068e-06, - "loss": 0.6183, - "step": 696 - }, - { - "epoch": 3.4, - "grad_norm": 3.7713537216186523, - "learning_rate": 3.707740085087959e-06, - "loss": 0.1568, - "step": 697 - }, - { - "epoch": 3.4048780487804877, - "grad_norm": 3.8532230854034424, - "learning_rate": 3.7043841852542884e-06, - "loss": 0.2826, - "step": 698 - }, - { - "epoch": 3.4097560975609755, - "grad_norm": 3.0548605918884277, - "learning_rate": 3.701025456915411e-06, - "loss": 0.1918, - "step": 699 - }, - { - "epoch": 3.4146341463414633, - "grad_norm": 3.2431821823120117, - "learning_rate": 3.697663907959327e-06, - "loss": 0.2493, - "step": 700 - }, - { - "epoch": 3.419512195121951, - "grad_norm": 3.7301864624023438, - "learning_rate": 3.6942995462806574e-06, - "loss": 0.4913, - "step": 701 - }, - { - "epoch": 3.424390243902439, - "grad_norm": 2.5468900203704834, - "learning_rate": 3.6909323797806314e-06, - "loss": 0.1788, - "step": 702 - }, - { - "epoch": 3.4292682926829268, - "grad_norm": 3.3719515800476074, - "learning_rate": 3.6875624163670635e-06, - "loss": 0.4162, - "step": 703 - }, - { - "epoch": 3.4341463414634146, - "grad_norm": 3.528010368347168, - "learning_rate": 3.6841896639543394e-06, - "loss": 0.1924, - "step": 704 - }, - { - "epoch": 3.4390243902439024, - "grad_norm": 3.3636631965637207, - "learning_rate": 3.6808141304633924e-06, - "loss": 0.3177, - "step": 705 - }, - { - "epoch": 3.44390243902439, - "grad_norm": 3.418705463409424, - "learning_rate": 3.6774358238216878e-06, - "loss": 0.2301, - "step": 706 - }, - { - "epoch": 3.448780487804878, - "grad_norm": 4.720373630523682, - "learning_rate": 3.6740547519632048e-06, - "loss": 0.1894, - "step": 707 - }, - { - "epoch": 3.453658536585366, - "grad_norm": 2.9635703563690186, - "learning_rate": 3.670670922828414e-06, - "loss": 0.2642, - "step": 708 - }, - { - "epoch": 3.4585365853658536, - "grad_norm": 4.934754371643066, - "learning_rate": 3.667284344364264e-06, - "loss": 0.2275, - "step": 709 - }, - { - "epoch": 3.4634146341463414, - "grad_norm": 3.090585231781006, - "learning_rate": 3.6638950245241604e-06, - "loss": 0.4447, - "step": 710 - }, - { - "epoch": 3.4682926829268292, - "grad_norm": 4.360495090484619, - "learning_rate": 3.660502971267945e-06, - "loss": 0.2415, - "step": 711 - }, - { - "epoch": 3.473170731707317, - "grad_norm": 3.4893476963043213, - "learning_rate": 3.65710819256188e-06, - "loss": 0.0921, - "step": 712 - }, - { - "epoch": 3.478048780487805, - "grad_norm": 3.2423770427703857, - "learning_rate": 3.65371069637863e-06, - "loss": 0.2371, - "step": 713 - }, - { - "epoch": 3.4829268292682927, - "grad_norm": 3.0775890350341797, - "learning_rate": 3.650310490697238e-06, - "loss": 0.4026, - "step": 714 - }, - { - "epoch": 3.4878048780487805, - "grad_norm": 3.906625270843506, - "learning_rate": 3.646907583503114e-06, - "loss": 0.4312, - "step": 715 - }, - { - "epoch": 3.4926829268292683, - "grad_norm": 3.2140414714813232, - "learning_rate": 3.6435019827880093e-06, - "loss": 0.2309, - "step": 716 - }, - { - "epoch": 3.497560975609756, - "grad_norm": 3.048523426055908, - "learning_rate": 3.640093696550003e-06, - "loss": 0.296, - "step": 717 - }, - { - "epoch": 3.502439024390244, - "grad_norm": 2.9669039249420166, - "learning_rate": 3.6366827327934817e-06, - "loss": 0.2723, - "step": 718 - }, - { - "epoch": 3.5073170731707317, - "grad_norm": 3.6941726207733154, - "learning_rate": 3.6332690995291176e-06, - "loss": 0.3797, - "step": 719 - }, - { - "epoch": 3.5121951219512195, - "grad_norm": 5.135766506195068, - "learning_rate": 3.6298528047738545e-06, - "loss": 0.9868, - "step": 720 - }, - { - "epoch": 3.5170731707317073, - "grad_norm": 3.2021052837371826, - "learning_rate": 3.626433856550886e-06, - "loss": 0.4069, - "step": 721 - }, - { - "epoch": 3.521951219512195, - "grad_norm": 3.094444513320923, - "learning_rate": 3.623012262889637e-06, - "loss": 0.3368, - "step": 722 - }, - { - "epoch": 3.526829268292683, - "grad_norm": 3.609285354614258, - "learning_rate": 3.6195880318257465e-06, - "loss": 0.3972, - "step": 723 - }, - { - "epoch": 3.5317073170731708, - "grad_norm": 4.236501216888428, - "learning_rate": 3.616161171401046e-06, - "loss": 0.52, - "step": 724 - }, - { - "epoch": 3.5365853658536586, - "grad_norm": 3.504526376724243, - "learning_rate": 3.612731689663542e-06, - "loss": 0.23, - "step": 725 - }, - { - "epoch": 3.5414634146341464, - "grad_norm": 3.233591079711914, - "learning_rate": 3.6092995946673996e-06, - "loss": 0.4151, - "step": 726 - }, - { - "epoch": 3.546341463414634, - "grad_norm": 3.6701886653900146, - "learning_rate": 3.605864894472918e-06, - "loss": 0.2798, - "step": 727 - }, - { - "epoch": 3.551219512195122, - "grad_norm": 3.8713181018829346, - "learning_rate": 3.602427597146516e-06, - "loss": 0.4336, - "step": 728 - }, - { - "epoch": 3.55609756097561, - "grad_norm": 5.49612283706665, - "learning_rate": 3.5989877107607134e-06, - "loss": 0.4803, - "step": 729 - }, - { - "epoch": 3.5609756097560976, - "grad_norm": 3.771005392074585, - "learning_rate": 3.5955452433941075e-06, - "loss": 0.3698, - "step": 730 - }, - { - "epoch": 3.5658536585365854, - "grad_norm": 2.970822334289551, - "learning_rate": 3.5921002031313586e-06, - "loss": 0.2373, - "step": 731 - }, - { - "epoch": 3.5707317073170732, - "grad_norm": 3.517249584197998, - "learning_rate": 3.58865259806317e-06, - "loss": 0.1908, - "step": 732 - }, - { - "epoch": 3.575609756097561, - "grad_norm": 3.6825428009033203, - "learning_rate": 3.585202436286267e-06, - "loss": 0.3993, - "step": 733 - }, - { - "epoch": 3.580487804878049, - "grad_norm": 3.387479066848755, - "learning_rate": 3.581749725903381e-06, - "loss": 0.4237, - "step": 734 - }, - { - "epoch": 3.5853658536585367, - "grad_norm": 3.5004806518554688, - "learning_rate": 3.5782944750232274e-06, - "loss": 0.3011, - "step": 735 - }, - { - "epoch": 3.5902439024390245, - "grad_norm": 3.461731433868408, - "learning_rate": 3.574836691760489e-06, - "loss": 0.0896, - "step": 736 - }, - { - "epoch": 3.5951219512195123, - "grad_norm": 3.9598381519317627, - "learning_rate": 3.571376384235795e-06, - "loss": 0.2751, - "step": 737 - }, - { - "epoch": 3.6, - "grad_norm": 4.053933143615723, - "learning_rate": 3.5679135605757035e-06, - "loss": 0.2086, - "step": 738 - }, - { - "epoch": 3.604878048780488, - "grad_norm": 2.9683544635772705, - "learning_rate": 3.564448228912682e-06, - "loss": 0.1659, - "step": 739 - }, - { - "epoch": 3.6097560975609757, - "grad_norm": 3.6598448753356934, - "learning_rate": 3.5609803973850877e-06, - "loss": 0.2469, - "step": 740 - }, - { - "epoch": 3.6146341463414635, - "grad_norm": 3.449335813522339, - "learning_rate": 3.557510074137147e-06, - "loss": 0.375, - "step": 741 - }, - { - "epoch": 3.6195121951219513, - "grad_norm": 2.7666923999786377, - "learning_rate": 3.554037267318942e-06, - "loss": 0.3133, - "step": 742 - }, - { - "epoch": 3.624390243902439, - "grad_norm": 2.8951869010925293, - "learning_rate": 3.5505619850863847e-06, - "loss": 0.2243, - "step": 743 - }, - { - "epoch": 3.629268292682927, - "grad_norm": 3.477747678756714, - "learning_rate": 3.5470842356012007e-06, - "loss": 0.1321, - "step": 744 - }, - { - "epoch": 3.6341463414634148, - "grad_norm": 3.810480833053589, - "learning_rate": 3.5436040270309113e-06, - "loss": 0.361, - "step": 745 - }, - { - "epoch": 3.6390243902439026, - "grad_norm": 3.0730793476104736, - "learning_rate": 3.540121367548811e-06, - "loss": 0.1523, - "step": 746 - }, - { - "epoch": 3.6439024390243904, - "grad_norm": 3.6878390312194824, - "learning_rate": 3.5366362653339524e-06, - "loss": 0.4898, - "step": 747 - }, - { - "epoch": 3.648780487804878, - "grad_norm": 3.6432242393493652, - "learning_rate": 3.533148728571124e-06, - "loss": 0.1397, - "step": 748 - }, - { - "epoch": 3.653658536585366, - "grad_norm": 3.7047760486602783, - "learning_rate": 3.5296587654508317e-06, - "loss": 0.323, - "step": 749 - }, - { - "epoch": 3.658536585365854, - "grad_norm": 3.777132749557495, - "learning_rate": 3.526166384169279e-06, - "loss": 0.5577, - "step": 750 - }, - { - "epoch": 3.6634146341463416, - "grad_norm": 3.7970924377441406, - "learning_rate": 3.5226715929283507e-06, - "loss": 0.245, - "step": 751 - }, - { - "epoch": 3.6682926829268294, - "grad_norm": 2.8203537464141846, - "learning_rate": 3.519174399935588e-06, - "loss": 0.1619, - "step": 752 - }, - { - "epoch": 3.6731707317073172, - "grad_norm": 3.4040987491607666, - "learning_rate": 3.5156748134041767e-06, - "loss": 0.1047, - "step": 753 - }, - { - "epoch": 3.678048780487805, - "grad_norm": 3.927960157394409, - "learning_rate": 3.5121728415529203e-06, - "loss": 0.5713, - "step": 754 - }, - { - "epoch": 3.682926829268293, - "grad_norm": 3.3833277225494385, - "learning_rate": 3.5086684926062266e-06, - "loss": 0.2174, - "step": 755 - }, - { - "epoch": 3.68780487804878, - "grad_norm": 3.989307403564453, - "learning_rate": 3.505161774794085e-06, - "loss": 0.285, - "step": 756 - }, - { - "epoch": 3.692682926829268, - "grad_norm": 2.742429494857788, - "learning_rate": 3.5016526963520474e-06, - "loss": 0.1602, - "step": 757 - }, - { - "epoch": 3.697560975609756, - "grad_norm": 3.7082698345184326, - "learning_rate": 3.498141265521212e-06, - "loss": 0.666, - "step": 758 - }, - { - "epoch": 3.7024390243902436, - "grad_norm": 3.033196210861206, - "learning_rate": 3.4946274905481997e-06, - "loss": 0.2024, - "step": 759 - }, - { - "epoch": 3.7073170731707314, - "grad_norm": 3.7145371437072754, - "learning_rate": 3.4911113796851364e-06, - "loss": 0.2719, - "step": 760 - }, - { - "epoch": 3.7121951219512193, - "grad_norm": 3.580298900604248, - "learning_rate": 3.487592941189636e-06, - "loss": 0.1537, - "step": 761 - }, - { - "epoch": 3.717073170731707, - "grad_norm": 4.753757953643799, - "learning_rate": 3.484072183324776e-06, - "loss": 0.6149, - "step": 762 - }, - { - "epoch": 3.721951219512195, - "grad_norm": 3.5575687885284424, - "learning_rate": 3.4805491143590823e-06, - "loss": 0.4241, - "step": 763 - }, - { - "epoch": 3.7268292682926827, - "grad_norm": 3.215224266052246, - "learning_rate": 3.4770237425665103e-06, - "loss": 0.3037, - "step": 764 - }, - { - "epoch": 3.7317073170731705, - "grad_norm": 2.9899685382843018, - "learning_rate": 3.4734960762264204e-06, - "loss": 0.4854, - "step": 765 - }, - { - "epoch": 3.7365853658536583, - "grad_norm": 3.5880227088928223, - "learning_rate": 3.469966123623563e-06, - "loss": 0.3849, - "step": 766 - }, - { - "epoch": 3.741463414634146, - "grad_norm": 3.472750186920166, - "learning_rate": 3.46643389304806e-06, - "loss": 0.3159, - "step": 767 - }, - { - "epoch": 3.746341463414634, - "grad_norm": 4.355650901794434, - "learning_rate": 3.4628993927953786e-06, - "loss": 0.7527, - "step": 768 - }, - { - "epoch": 3.7512195121951217, - "grad_norm": 2.94575834274292, - "learning_rate": 3.45936263116632e-06, - "loss": 0.1716, - "step": 769 - }, - { - "epoch": 3.7560975609756095, - "grad_norm": 2.991525173187256, - "learning_rate": 3.4558236164669957e-06, - "loss": 0.2061, - "step": 770 - }, - { - "epoch": 3.7609756097560973, - "grad_norm": 3.134000301361084, - "learning_rate": 3.4522823570088073e-06, - "loss": 0.1338, - "step": 771 - }, - { - "epoch": 3.765853658536585, - "grad_norm": 3.722140312194824, - "learning_rate": 3.4487388611084295e-06, - "loss": 0.2615, - "step": 772 - }, - { - "epoch": 3.770731707317073, - "grad_norm": 3.7941153049468994, - "learning_rate": 3.445193137087788e-06, - "loss": 0.1401, - "step": 773 - }, - { - "epoch": 3.7756097560975608, - "grad_norm": 2.872941732406616, - "learning_rate": 3.4416451932740424e-06, - "loss": 0.2934, - "step": 774 - }, - { - "epoch": 3.7804878048780486, - "grad_norm": 4.5019941329956055, - "learning_rate": 3.4380950379995652e-06, - "loss": 0.4579, - "step": 775 - }, - { - "epoch": 3.7853658536585364, - "grad_norm": 2.682884931564331, - "learning_rate": 3.434542679601922e-06, - "loss": 0.2979, - "step": 776 - }, - { - "epoch": 3.790243902439024, - "grad_norm": 3.3044273853302, - "learning_rate": 3.4309881264238538e-06, - "loss": 0.1196, - "step": 777 - }, - { - "epoch": 3.795121951219512, - "grad_norm": 3.102760076522827, - "learning_rate": 3.4274313868132547e-06, - "loss": 0.2026, - "step": 778 - }, - { - "epoch": 3.8, - "grad_norm": 3.3304500579833984, - "learning_rate": 3.4238724691231534e-06, - "loss": 0.2135, - "step": 779 - }, - { - "epoch": 3.8048780487804876, - "grad_norm": 3.295119047164917, - "learning_rate": 3.4203113817116955e-06, - "loss": 0.4418, - "step": 780 - }, - { - "epoch": 3.8097560975609754, - "grad_norm": 3.6655640602111816, - "learning_rate": 3.4167481329421204e-06, - "loss": 0.203, - "step": 781 - }, - { - "epoch": 3.8146341463414632, - "grad_norm": 3.387830972671509, - "learning_rate": 3.4131827311827447e-06, - "loss": 0.3225, - "step": 782 - }, - { - "epoch": 3.819512195121951, - "grad_norm": 2.621633529663086, - "learning_rate": 3.4096151848069416e-06, - "loss": 0.1704, - "step": 783 - }, - { - "epoch": 3.824390243902439, - "grad_norm": 2.974344491958618, - "learning_rate": 3.4060455021931195e-06, - "loss": 0.2785, - "step": 784 - }, - { - "epoch": 3.8292682926829267, - "grad_norm": 3.452131748199463, - "learning_rate": 3.402473691724704e-06, - "loss": 0.223, - "step": 785 - }, - { - "epoch": 3.8341463414634145, - "grad_norm": 2.6373705863952637, - "learning_rate": 3.39889976179012e-06, - "loss": 0.2368, - "step": 786 - }, - { - "epoch": 3.8390243902439023, - "grad_norm": 2.863184928894043, - "learning_rate": 3.3953237207827673e-06, - "loss": 0.3294, - "step": 787 - }, - { - "epoch": 3.84390243902439, - "grad_norm": 5.104704856872559, - "learning_rate": 3.391745577101005e-06, - "loss": 0.5431, - "step": 788 - }, - { - "epoch": 3.848780487804878, - "grad_norm": 3.951310634613037, - "learning_rate": 3.3881653391481306e-06, - "loss": 0.2546, - "step": 789 - }, - { - "epoch": 3.8536585365853657, - "grad_norm": 3.9903225898742676, - "learning_rate": 3.384583015332359e-06, - "loss": 0.3293, - "step": 790 - }, - { - "epoch": 3.8585365853658535, - "grad_norm": 3.3149220943450928, - "learning_rate": 3.380998614066805e-06, - "loss": 0.1861, - "step": 791 - }, - { - "epoch": 3.8634146341463413, - "grad_norm": 3.6755223274230957, - "learning_rate": 3.3774121437694606e-06, - "loss": 0.2498, - "step": 792 - }, - { - "epoch": 3.868292682926829, - "grad_norm": 3.192918300628662, - "learning_rate": 3.3738236128631786e-06, - "loss": 0.1525, - "step": 793 - }, - { - "epoch": 3.873170731707317, - "grad_norm": 3.5358777046203613, - "learning_rate": 3.3702330297756503e-06, - "loss": 0.3622, - "step": 794 - }, - { - "epoch": 3.8780487804878048, - "grad_norm": 3.619878053665161, - "learning_rate": 3.366640402939387e-06, - "loss": 0.1051, - "step": 795 - }, - { - "epoch": 3.8829268292682926, - "grad_norm": 7.085352420806885, - "learning_rate": 3.363045740791698e-06, - "loss": 0.4606, - "step": 796 - }, - { - "epoch": 3.8878048780487804, - "grad_norm": 2.523165464401245, - "learning_rate": 3.3594490517746774e-06, - "loss": 0.2267, - "step": 797 - }, - { - "epoch": 3.892682926829268, - "grad_norm": 2.7026922702789307, - "learning_rate": 3.3558503443351733e-06, - "loss": 0.2792, - "step": 798 - }, - { - "epoch": 3.897560975609756, - "grad_norm": 2.9232428073883057, - "learning_rate": 3.352249626924777e-06, - "loss": 0.2579, - "step": 799 - }, - { - "epoch": 3.902439024390244, - "grad_norm": 4.760788440704346, - "learning_rate": 3.348646907999801e-06, - "loss": 0.6983, - "step": 800 - }, - { - "epoch": 3.9073170731707316, - "grad_norm": 3.198249578475952, - "learning_rate": 3.345042196021257e-06, - "loss": 0.3265, - "step": 801 - }, - { - "epoch": 3.9121951219512194, - "grad_norm": 4.069286823272705, - "learning_rate": 3.3414354994548385e-06, - "loss": 0.497, - "step": 802 - }, - { - "epoch": 3.9170731707317072, - "grad_norm": 3.4435410499572754, - "learning_rate": 3.337826826770898e-06, - "loss": 0.2812, - "step": 803 - }, - { - "epoch": 3.921951219512195, - "grad_norm": 3.9805212020874023, - "learning_rate": 3.3342161864444312e-06, - "loss": 0.2277, - "step": 804 - }, - { - "epoch": 3.926829268292683, - "grad_norm": 3.348925828933716, - "learning_rate": 3.3306035869550534e-06, - "loss": 0.1614, - "step": 805 - }, - { - "epoch": 3.9317073170731707, - "grad_norm": 4.7613701820373535, - "learning_rate": 3.326989036786981e-06, - "loss": 0.3269, - "step": 806 - }, - { - "epoch": 3.9365853658536585, - "grad_norm": 3.807502508163452, - "learning_rate": 3.3233725444290126e-06, - "loss": 0.2619, - "step": 807 - }, - { - "epoch": 3.9414634146341463, - "grad_norm": 3.2690203189849854, - "learning_rate": 3.3197541183745065e-06, - "loss": 0.4334, - "step": 808 - }, - { - "epoch": 3.946341463414634, - "grad_norm": 3.396993398666382, - "learning_rate": 3.3161337671213634e-06, - "loss": 0.2738, - "step": 809 - }, - { - "epoch": 3.951219512195122, - "grad_norm": 3.086669921875, - "learning_rate": 3.312511499172006e-06, - "loss": 0.1597, - "step": 810 - }, - { - "epoch": 3.9560975609756097, - "grad_norm": 3.5688745975494385, - "learning_rate": 3.3088873230333562e-06, - "loss": 0.3195, - "step": 811 - }, - { - "epoch": 3.9609756097560975, - "grad_norm": 3.4843621253967285, - "learning_rate": 3.3052612472168193e-06, - "loss": 0.1865, - "step": 812 - }, - { - "epoch": 3.9658536585365853, - "grad_norm": 2.8479580879211426, - "learning_rate": 3.3016332802382618e-06, - "loss": 0.3108, - "step": 813 - }, - { - "epoch": 3.970731707317073, - "grad_norm": 3.3241543769836426, - "learning_rate": 3.2980034306179897e-06, - "loss": 0.2099, - "step": 814 - }, - { - "epoch": 3.975609756097561, - "grad_norm": 2.817675828933716, - "learning_rate": 3.294371706880733e-06, - "loss": 0.3073, - "step": 815 - }, - { - "epoch": 3.9804878048780488, - "grad_norm": 2.9535388946533203, - "learning_rate": 3.290738117555622e-06, - "loss": 0.2024, - "step": 816 - }, - { - "epoch": 3.9853658536585366, - "grad_norm": 5.021281719207764, - "learning_rate": 3.2871026711761666e-06, - "loss": 0.508, - "step": 817 - }, - { - "epoch": 3.9902439024390244, - "grad_norm": 3.3377649784088135, - "learning_rate": 3.2834653762802414e-06, - "loss": 0.2116, - "step": 818 - }, - { - "epoch": 3.995121951219512, - "grad_norm": 4.412073135375977, - "learning_rate": 3.2798262414100594e-06, - "loss": 0.2177, - "step": 819 - }, - { - "epoch": 4.0, - "grad_norm": 3.174323797225952, - "learning_rate": 3.2761852751121566e-06, - "loss": 0.1737, - "step": 820 - }, - { - "epoch": 4.004878048780488, - "grad_norm": 2.921494960784912, - "learning_rate": 3.272542485937369e-06, - "loss": 0.2569, - "step": 821 - }, - { - "epoch": 4.009756097560976, - "grad_norm": 2.693495512008667, - "learning_rate": 3.2688978824408136e-06, - "loss": 0.1621, - "step": 822 - }, - { - "epoch": 4.014634146341463, - "grad_norm": 2.705796718597412, - "learning_rate": 3.2652514731818698e-06, - "loss": 0.1121, - "step": 823 - }, - { - "epoch": 4.019512195121951, - "grad_norm": 3.2621448040008545, - "learning_rate": 3.2616032667241564e-06, - "loss": 0.0835, - "step": 824 - }, - { - "epoch": 4.024390243902439, - "grad_norm": 3.6205084323883057, - "learning_rate": 3.257953271635513e-06, - "loss": 0.3731, - "step": 825 - }, - { - "epoch": 4.029268292682927, - "grad_norm": 3.2600371837615967, - "learning_rate": 3.2543014964879814e-06, - "loss": 0.1051, - "step": 826 - }, - { - "epoch": 4.034146341463415, - "grad_norm": 3.865178346633911, - "learning_rate": 3.250647949857781e-06, - "loss": 0.0916, - "step": 827 - }, - { - "epoch": 4.0390243902439025, - "grad_norm": 6.9700927734375, - "learning_rate": 3.2469926403252932e-06, - "loss": 0.4037, - "step": 828 - }, - { - "epoch": 4.04390243902439, - "grad_norm": 3.658712148666382, - "learning_rate": 3.2433355764750417e-06, - "loss": 0.0523, - "step": 829 - }, - { - "epoch": 4.048780487804878, - "grad_norm": 4.911301612854004, - "learning_rate": 3.2396767668956656e-06, - "loss": 0.2616, - "step": 830 - }, - { - "epoch": 4.053658536585366, - "grad_norm": 5.019360542297363, - "learning_rate": 3.2360162201799085e-06, - "loss": 0.195, - "step": 831 - }, - { - "epoch": 4.058536585365854, - "grad_norm": 3.493767261505127, - "learning_rate": 3.2323539449245906e-06, - "loss": 0.1245, - "step": 832 - }, - { - "epoch": 4.0634146341463415, - "grad_norm": 4.246248722076416, - "learning_rate": 3.2286899497305917e-06, - "loss": 0.1147, - "step": 833 - }, - { - "epoch": 4.068292682926829, - "grad_norm": 2.993704319000244, - "learning_rate": 3.2250242432028335e-06, - "loss": 0.2189, - "step": 834 - }, - { - "epoch": 4.073170731707317, - "grad_norm": 4.695023059844971, - "learning_rate": 3.221356833950254e-06, - "loss": 0.4685, - "step": 835 - }, - { - "epoch": 4.078048780487805, - "grad_norm": 2.777644634246826, - "learning_rate": 3.21768773058579e-06, - "loss": 0.1245, - "step": 836 - }, - { - "epoch": 4.082926829268293, - "grad_norm": 3.3545901775360107, - "learning_rate": 3.21401694172636e-06, - "loss": 0.1342, - "step": 837 - }, - { - "epoch": 4.087804878048781, - "grad_norm": 2.2222652435302734, - "learning_rate": 3.2103444759928383e-06, - "loss": 0.0484, - "step": 838 - }, - { - "epoch": 4.092682926829268, - "grad_norm": 2.580345630645752, - "learning_rate": 3.2066703420100377e-06, - "loss": 0.0592, - "step": 839 - }, - { - "epoch": 4.097560975609756, - "grad_norm": 3.8652923107147217, - "learning_rate": 3.2029945484066883e-06, - "loss": 0.2536, - "step": 840 - }, - { - "epoch": 4.102439024390244, - "grad_norm": 3.0441582202911377, - "learning_rate": 3.1993171038154203e-06, - "loss": 0.1221, - "step": 841 - }, - { - "epoch": 4.107317073170732, - "grad_norm": 2.2795114517211914, - "learning_rate": 3.1956380168727385e-06, - "loss": 0.1231, - "step": 842 - }, - { - "epoch": 4.11219512195122, - "grad_norm": 3.701009750366211, - "learning_rate": 3.191957296219007e-06, - "loss": 0.2144, - "step": 843 - }, - { - "epoch": 4.117073170731707, - "grad_norm": 3.452637195587158, - "learning_rate": 3.1882749504984247e-06, - "loss": 0.1026, - "step": 844 - }, - { - "epoch": 4.121951219512195, - "grad_norm": 2.4208810329437256, - "learning_rate": 3.1845909883590076e-06, - "loss": 0.1124, - "step": 845 - }, - { - "epoch": 4.126829268292683, - "grad_norm": 4.353063583374023, - "learning_rate": 3.180905418452569e-06, - "loss": 0.2804, - "step": 846 - }, - { - "epoch": 4.131707317073171, - "grad_norm": 3.1151084899902344, - "learning_rate": 3.1772182494346963e-06, - "loss": 0.1748, - "step": 847 - }, - { - "epoch": 4.136585365853659, - "grad_norm": 3.457940101623535, - "learning_rate": 3.1735294899647344e-06, - "loss": 0.1984, - "step": 848 - }, - { - "epoch": 4.1414634146341465, - "grad_norm": 3.3556935787200928, - "learning_rate": 3.169839148705762e-06, - "loss": 0.1332, - "step": 849 - }, - { - "epoch": 4.146341463414634, - "grad_norm": 3.5510823726654053, - "learning_rate": 3.1661472343245725e-06, - "loss": 0.4788, - "step": 850 - }, - { - "epoch": 4.151219512195122, - "grad_norm": 4.036712646484375, - "learning_rate": 3.162453755491655e-06, - "loss": 0.2437, - "step": 851 - }, - { - "epoch": 4.15609756097561, - "grad_norm": 4.417062282562256, - "learning_rate": 3.158758720881171e-06, - "loss": 0.203, - "step": 852 - }, - { - "epoch": 4.160975609756098, - "grad_norm": 3.920558214187622, - "learning_rate": 3.155062139170937e-06, - "loss": 0.1462, - "step": 853 - }, - { - "epoch": 4.1658536585365855, - "grad_norm": 6.472081661224365, - "learning_rate": 3.1513640190424034e-06, - "loss": 0.0972, - "step": 854 - }, - { - "epoch": 4.170731707317073, - "grad_norm": 3.975947141647339, - "learning_rate": 3.147664369180632e-06, - "loss": 0.1092, - "step": 855 - }, - { - "epoch": 4.175609756097561, - "grad_norm": 4.977376937866211, - "learning_rate": 3.143963198274278e-06, - "loss": 0.2215, - "step": 856 - }, - { - "epoch": 4.180487804878049, - "grad_norm": 3.595460891723633, - "learning_rate": 3.140260515015569e-06, - "loss": 0.1771, - "step": 857 - }, - { - "epoch": 4.185365853658537, - "grad_norm": 3.1085658073425293, - "learning_rate": 3.136556328100284e-06, - "loss": 0.1995, - "step": 858 - }, - { - "epoch": 4.190243902439025, - "grad_norm": 4.355626583099365, - "learning_rate": 3.132850646227734e-06, - "loss": 0.4048, - "step": 859 - }, - { - "epoch": 4.195121951219512, - "grad_norm": 3.8079614639282227, - "learning_rate": 3.12914347810074e-06, - "loss": 0.1914, - "step": 860 - }, - { - "epoch": 4.2, - "grad_norm": 3.725804328918457, - "learning_rate": 3.125434832425613e-06, - "loss": 0.1579, - "step": 861 - }, - { - "epoch": 4.204878048780488, - "grad_norm": 2.974649667739868, - "learning_rate": 3.121724717912138e-06, - "loss": 0.1814, - "step": 862 - }, - { - "epoch": 4.209756097560976, - "grad_norm": 3.6391279697418213, - "learning_rate": 3.118013143273542e-06, - "loss": 0.1481, - "step": 863 - }, - { - "epoch": 4.214634146341464, - "grad_norm": 3.216643810272217, - "learning_rate": 3.1143001172264893e-06, - "loss": 0.113, - "step": 864 - }, - { - "epoch": 4.219512195121951, - "grad_norm": 3.605855941772461, - "learning_rate": 3.1105856484910474e-06, - "loss": 0.1405, - "step": 865 - }, - { - "epoch": 4.224390243902439, - "grad_norm": 2.7186765670776367, - "learning_rate": 3.1068697457906736e-06, - "loss": 0.097, - "step": 866 - }, - { - "epoch": 4.229268292682927, - "grad_norm": 3.980973243713379, - "learning_rate": 3.1031524178521938e-06, - "loss": 0.2207, - "step": 867 - }, - { - "epoch": 4.234146341463415, - "grad_norm": 3.4623806476593018, - "learning_rate": 3.0994336734057804e-06, - "loss": 0.0552, - "step": 868 - }, - { - "epoch": 4.239024390243903, - "grad_norm": 3.7556748390197754, - "learning_rate": 3.0957135211849315e-06, - "loss": 0.1743, - "step": 869 - }, - { - "epoch": 4.2439024390243905, - "grad_norm": 3.3547914028167725, - "learning_rate": 3.0919919699264535e-06, - "loss": 0.1195, - "step": 870 - }, - { - "epoch": 4.248780487804878, - "grad_norm": 4.392014503479004, - "learning_rate": 3.0882690283704355e-06, - "loss": 0.6174, - "step": 871 - }, - { - "epoch": 4.253658536585366, - "grad_norm": 2.7031409740448, - "learning_rate": 3.084544705260234e-06, - "loss": 0.1359, - "step": 872 - }, - { - "epoch": 4.258536585365854, - "grad_norm": 2.3518481254577637, - "learning_rate": 3.080819009342451e-06, - "loss": 0.0786, - "step": 873 - }, - { - "epoch": 4.263414634146342, - "grad_norm": 2.636204481124878, - "learning_rate": 3.077091949366908e-06, - "loss": 0.0677, - "step": 874 - }, - { - "epoch": 4.2682926829268295, - "grad_norm": 2.8670942783355713, - "learning_rate": 3.073363534086636e-06, - "loss": 0.1084, - "step": 875 - }, - { - "epoch": 4.273170731707317, - "grad_norm": 2.7044737339019775, - "learning_rate": 3.0696337722578444e-06, - "loss": 0.0681, - "step": 876 - }, - { - "epoch": 4.278048780487805, - "grad_norm": 3.481539487838745, - "learning_rate": 3.0659026726399072e-06, - "loss": 0.2262, - "step": 877 - }, - { - "epoch": 4.282926829268293, - "grad_norm": 3.7746224403381348, - "learning_rate": 3.0621702439953393e-06, - "loss": 0.2169, - "step": 878 - }, - { - "epoch": 4.287804878048781, - "grad_norm": 3.6386263370513916, - "learning_rate": 3.0584364950897768e-06, - "loss": 0.0581, - "step": 879 - }, - { - "epoch": 4.2926829268292686, - "grad_norm": 3.389408588409424, - "learning_rate": 3.0547014346919574e-06, - "loss": 0.1687, - "step": 880 - }, - { - "epoch": 4.297560975609756, - "grad_norm": 3.6510157585144043, - "learning_rate": 3.0509650715736977e-06, - "loss": 0.1362, - "step": 881 - }, - { - "epoch": 4.302439024390244, - "grad_norm": 3.334210157394409, - "learning_rate": 3.0472274145098744e-06, - "loss": 0.1865, - "step": 882 - }, - { - "epoch": 4.307317073170732, - "grad_norm": 4.747341632843018, - "learning_rate": 3.0434884722784026e-06, - "loss": 0.2385, - "step": 883 - }, - { - "epoch": 4.31219512195122, - "grad_norm": 3.9266858100891113, - "learning_rate": 3.0397482536602168e-06, - "loss": 0.1004, - "step": 884 - }, - { - "epoch": 4.317073170731708, - "grad_norm": 2.984821081161499, - "learning_rate": 3.0360067674392475e-06, - "loss": 0.1469, - "step": 885 - }, - { - "epoch": 4.321951219512195, - "grad_norm": 2.6379380226135254, - "learning_rate": 3.0322640224024024e-06, - "loss": 0.0829, - "step": 886 - }, - { - "epoch": 4.326829268292683, - "grad_norm": 3.885495185852051, - "learning_rate": 3.0285200273395478e-06, - "loss": 0.2256, - "step": 887 - }, - { - "epoch": 4.331707317073171, - "grad_norm": 3.950394868850708, - "learning_rate": 3.024774791043481e-06, - "loss": 0.2402, - "step": 888 - }, - { - "epoch": 4.336585365853659, - "grad_norm": 4.147830963134766, - "learning_rate": 3.021028322309921e-06, - "loss": 0.2198, - "step": 889 - }, - { - "epoch": 4.341463414634147, - "grad_norm": 4.0821638107299805, - "learning_rate": 3.0172806299374734e-06, - "loss": 0.2304, - "step": 890 - }, - { - "epoch": 4.3463414634146345, - "grad_norm": 4.142312049865723, - "learning_rate": 3.0135317227276247e-06, - "loss": 0.2864, - "step": 891 - }, - { - "epoch": 4.351219512195122, - "grad_norm": 3.008504867553711, - "learning_rate": 3.0097816094847104e-06, - "loss": 0.2045, - "step": 892 - }, - { - "epoch": 4.35609756097561, - "grad_norm": 3.1674623489379883, - "learning_rate": 3.0060302990158984e-06, - "loss": 0.0864, - "step": 893 - }, - { - "epoch": 4.360975609756098, - "grad_norm": 3.3412492275238037, - "learning_rate": 3.002277800131171e-06, - "loss": 0.076, - "step": 894 - }, - { - "epoch": 4.365853658536586, - "grad_norm": 3.067330837249756, - "learning_rate": 2.998524121643298e-06, - "loss": 0.1724, - "step": 895 - }, - { - "epoch": 4.3707317073170735, - "grad_norm": 3.9015982151031494, - "learning_rate": 2.994769272367822e-06, - "loss": 0.2, - "step": 896 - }, - { - "epoch": 4.375609756097561, - "grad_norm": 3.0136911869049072, - "learning_rate": 2.991013261123035e-06, - "loss": 0.0852, - "step": 897 - }, - { - "epoch": 4.380487804878049, - "grad_norm": 3.6834237575531006, - "learning_rate": 2.9872560967299554e-06, - "loss": 0.1449, - "step": 898 - }, - { - "epoch": 4.385365853658537, - "grad_norm": 3.3486039638519287, - "learning_rate": 2.9834977880123132e-06, - "loss": 0.0659, - "step": 899 - }, - { - "epoch": 4.390243902439025, - "grad_norm": 2.971315622329712, - "learning_rate": 2.9797383437965243e-06, - "loss": 0.1114, - "step": 900 - }, - { - "epoch": 4.3951219512195125, - "grad_norm": 2.683359146118164, - "learning_rate": 2.975977772911671e-06, - "loss": 0.0822, - "step": 901 - }, - { - "epoch": 4.4, - "grad_norm": 2.9941935539245605, - "learning_rate": 2.972216084189482e-06, - "loss": 0.0858, - "step": 902 - }, - { - "epoch": 4.404878048780488, - "grad_norm": 2.4938626289367676, - "learning_rate": 2.9684532864643123e-06, - "loss": 0.1162, - "step": 903 - }, - { - "epoch": 4.409756097560976, - "grad_norm": 2.9364712238311768, - "learning_rate": 2.964689388573118e-06, - "loss": 0.0821, - "step": 904 - }, - { - "epoch": 4.414634146341464, - "grad_norm": 3.3638134002685547, - "learning_rate": 2.9609243993554434e-06, - "loss": 0.25, - "step": 905 - }, - { - "epoch": 4.419512195121952, - "grad_norm": 3.657277822494507, - "learning_rate": 2.9571583276533923e-06, - "loss": 0.0852, - "step": 906 - }, - { - "epoch": 4.424390243902439, - "grad_norm": 5.486263275146484, - "learning_rate": 2.9533911823116124e-06, - "loss": 0.5123, - "step": 907 - }, - { - "epoch": 4.429268292682927, - "grad_norm": 5.194574356079102, - "learning_rate": 2.9496229721772734e-06, - "loss": 0.1854, - "step": 908 - }, - { - "epoch": 4.434146341463415, - "grad_norm": 3.520110845565796, - "learning_rate": 2.9458537061000435e-06, - "loss": 0.1785, - "step": 909 - }, - { - "epoch": 4.439024390243903, - "grad_norm": 3.417991876602173, - "learning_rate": 2.9420833929320726e-06, - "loss": 0.1603, - "step": 910 - }, - { - "epoch": 4.443902439024391, - "grad_norm": 5.225805282592773, - "learning_rate": 2.93831204152797e-06, - "loss": 0.3046, - "step": 911 - }, - { - "epoch": 4.4487804878048784, - "grad_norm": 3.541433572769165, - "learning_rate": 2.9345396607447807e-06, - "loss": 0.0631, - "step": 912 - }, - { - "epoch": 4.453658536585366, - "grad_norm": 3.909377098083496, - "learning_rate": 2.9307662594419704e-06, - "loss": 0.125, - "step": 913 - }, - { - "epoch": 4.458536585365854, - "grad_norm": 3.6604416370391846, - "learning_rate": 2.9269918464814e-06, - "loss": 0.156, - "step": 914 - }, - { - "epoch": 4.463414634146342, - "grad_norm": 3.7413833141326904, - "learning_rate": 2.923216430727306e-06, - "loss": 0.3334, - "step": 915 - }, - { - "epoch": 4.46829268292683, - "grad_norm": 3.531996011734009, - "learning_rate": 2.9194400210462808e-06, - "loss": 0.2534, - "step": 916 - }, - { - "epoch": 4.473170731707317, - "grad_norm": 4.163621425628662, - "learning_rate": 2.91566262630725e-06, - "loss": 0.352, - "step": 917 - }, - { - "epoch": 4.478048780487805, - "grad_norm": 3.923635482788086, - "learning_rate": 2.9118842553814526e-06, - "loss": 0.1132, - "step": 918 - }, - { - "epoch": 4.482926829268292, - "grad_norm": 2.833768844604492, - "learning_rate": 2.9081049171424223e-06, - "loss": 0.086, - "step": 919 - }, - { - "epoch": 4.487804878048781, - "grad_norm": 2.9006292819976807, - "learning_rate": 2.9043246204659624e-06, - "loss": 0.0693, - "step": 920 - }, - { - "epoch": 4.492682926829268, - "grad_norm": 3.699376344680786, - "learning_rate": 2.9005433742301274e-06, - "loss": 0.2463, - "step": 921 - }, - { - "epoch": 4.4975609756097565, - "grad_norm": 4.882141590118408, - "learning_rate": 2.8967611873152037e-06, - "loss": 0.2275, - "step": 922 - }, - { - "epoch": 4.5024390243902435, - "grad_norm": 3.0554678440093994, - "learning_rate": 2.892978068603683e-06, - "loss": 0.0752, - "step": 923 - }, - { - "epoch": 4.507317073170732, - "grad_norm": 3.1225268840789795, - "learning_rate": 2.889194026980249e-06, - "loss": 0.1649, - "step": 924 - }, - { - "epoch": 4.512195121951219, - "grad_norm": 17.75234031677246, - "learning_rate": 2.8854090713317514e-06, - "loss": 0.0437, - "step": 925 - }, - { - "epoch": 4.517073170731708, - "grad_norm": 3.011223554611206, - "learning_rate": 2.8816232105471864e-06, - "loss": 0.0747, - "step": 926 - }, - { - "epoch": 4.521951219512195, - "grad_norm": 4.327573299407959, - "learning_rate": 2.877836453517677e-06, - "loss": 0.3884, - "step": 927 - }, - { - "epoch": 4.526829268292683, - "grad_norm": 3.8694965839385986, - "learning_rate": 2.8740488091364492e-06, - "loss": 0.2741, - "step": 928 - }, - { - "epoch": 4.53170731707317, - "grad_norm": 5.375877380371094, - "learning_rate": 2.870260286298814e-06, - "loss": 0.364, - "step": 929 - }, - { - "epoch": 4.536585365853659, - "grad_norm": 3.380891799926758, - "learning_rate": 2.866470893902147e-06, - "loss": 0.1495, - "step": 930 - }, - { - "epoch": 4.541463414634146, - "grad_norm": 3.723992109298706, - "learning_rate": 2.8626806408458626e-06, - "loss": 0.1403, - "step": 931 - }, - { - "epoch": 4.546341463414635, - "grad_norm": 3.0534417629241943, - "learning_rate": 2.8588895360313983e-06, - "loss": 0.0946, - "step": 932 - }, - { - "epoch": 4.5512195121951216, - "grad_norm": 2.8875234127044678, - "learning_rate": 2.8550975883621935e-06, - "loss": 0.1851, - "step": 933 - }, - { - "epoch": 4.55609756097561, - "grad_norm": 3.532166004180908, - "learning_rate": 2.8513048067436644e-06, - "loss": 0.178, - "step": 934 - }, - { - "epoch": 4.560975609756097, - "grad_norm": 2.942798376083374, - "learning_rate": 2.847511200083187e-06, - "loss": 0.1131, - "step": 935 - }, - { - "epoch": 4.565853658536585, - "grad_norm": 2.926874876022339, - "learning_rate": 2.843716777290074e-06, - "loss": 0.1251, - "step": 936 - }, - { - "epoch": 4.570731707317073, - "grad_norm": 3.525895357131958, - "learning_rate": 2.839921547275556e-06, - "loss": 0.0946, - "step": 937 - }, - { - "epoch": 4.575609756097561, - "grad_norm": 3.7033681869506836, - "learning_rate": 2.836125518952759e-06, - "loss": 0.1529, - "step": 938 - }, - { - "epoch": 4.580487804878048, - "grad_norm": 3.235154867172241, - "learning_rate": 2.8323287012366845e-06, - "loss": 0.2511, - "step": 939 - }, - { - "epoch": 4.585365853658536, - "grad_norm": 3.5275583267211914, - "learning_rate": 2.828531103044186e-06, - "loss": 0.1474, - "step": 940 - }, - { - "epoch": 4.590243902439024, - "grad_norm": 3.1356353759765625, - "learning_rate": 2.8247327332939512e-06, - "loss": 0.2249, - "step": 941 - }, - { - "epoch": 4.595121951219512, - "grad_norm": 3.789210081100464, - "learning_rate": 2.82093360090648e-06, - "loss": 0.2258, - "step": 942 - }, - { - "epoch": 4.6, - "grad_norm": 4.841623306274414, - "learning_rate": 2.8171337148040636e-06, - "loss": 0.2235, - "step": 943 - }, - { - "epoch": 4.6048780487804875, - "grad_norm": 3.161630630493164, - "learning_rate": 2.813333083910761e-06, - "loss": 0.1562, - "step": 944 - }, - { - "epoch": 4.609756097560975, - "grad_norm": 2.8718132972717285, - "learning_rate": 2.8095317171523835e-06, - "loss": 0.0625, - "step": 945 - }, - { - "epoch": 4.614634146341463, - "grad_norm": 3.6432454586029053, - "learning_rate": 2.805729623456469e-06, - "loss": 0.2205, - "step": 946 - }, - { - "epoch": 4.619512195121951, - "grad_norm": 4.382034778594971, - "learning_rate": 2.8019268117522624e-06, - "loss": 0.3241, - "step": 947 - }, - { - "epoch": 4.624390243902439, - "grad_norm": 3.2998175621032715, - "learning_rate": 2.798123290970695e-06, - "loss": 0.1983, - "step": 948 - }, - { - "epoch": 4.6292682926829265, - "grad_norm": 3.8665990829467773, - "learning_rate": 2.794319070044365e-06, - "loss": 0.3391, - "step": 949 - }, - { - "epoch": 4.634146341463414, - "grad_norm": 3.628403425216675, - "learning_rate": 2.790514157907512e-06, - "loss": 0.1329, - "step": 950 - }, - { - "epoch": 4.639024390243902, - "grad_norm": 2.8889615535736084, - "learning_rate": 2.786708563496002e-06, - "loss": 0.141, - "step": 951 - }, - { - "epoch": 4.64390243902439, - "grad_norm": 4.07351541519165, - "learning_rate": 2.782902295747299e-06, - "loss": 0.2935, - "step": 952 - }, - { - "epoch": 4.648780487804878, - "grad_norm": 4.220067024230957, - "learning_rate": 2.7790953636004536e-06, - "loss": 0.318, - "step": 953 - }, - { - "epoch": 4.6536585365853655, - "grad_norm": 3.8444325923919678, - "learning_rate": 2.775287775996074e-06, - "loss": 0.3388, - "step": 954 - }, - { - "epoch": 4.658536585365853, - "grad_norm": 3.197313070297241, - "learning_rate": 2.7714795418763067e-06, - "loss": 0.0925, - "step": 955 - }, - { - "epoch": 4.663414634146341, - "grad_norm": 4.0050811767578125, - "learning_rate": 2.7676706701848187e-06, - "loss": 0.2811, - "step": 956 - }, - { - "epoch": 4.668292682926829, - "grad_norm": 3.217160224914551, - "learning_rate": 2.763861169866774e-06, - "loss": 0.311, - "step": 957 - }, - { - "epoch": 4.673170731707317, - "grad_norm": 2.9892494678497314, - "learning_rate": 2.7600510498688104e-06, - "loss": 0.0582, - "step": 958 - }, - { - "epoch": 4.678048780487805, - "grad_norm": 3.954805374145508, - "learning_rate": 2.7562403191390246e-06, - "loss": 0.1238, - "step": 959 - }, - { - "epoch": 4.682926829268292, - "grad_norm": 2.9582695960998535, - "learning_rate": 2.7524289866269467e-06, - "loss": 0.1243, - "step": 960 - }, - { - "epoch": 4.68780487804878, - "grad_norm": 2.807002544403076, - "learning_rate": 2.748617061283518e-06, - "loss": 0.1388, - "step": 961 - }, - { - "epoch": 4.692682926829268, - "grad_norm": 3.980499505996704, - "learning_rate": 2.744804552061074e-06, - "loss": 0.1144, - "step": 962 - }, - { - "epoch": 4.697560975609756, - "grad_norm": 3.6389007568359375, - "learning_rate": 2.740991467913321e-06, - "loss": 0.2155, - "step": 963 - }, - { - "epoch": 4.702439024390244, - "grad_norm": 3.0950801372528076, - "learning_rate": 2.737177817795315e-06, - "loss": 0.0983, - "step": 964 - }, - { - "epoch": 4.7073170731707314, - "grad_norm": 3.1723053455352783, - "learning_rate": 2.7333636106634414e-06, - "loss": 0.1365, - "step": 965 - }, - { - "epoch": 4.712195121951219, - "grad_norm": 3.83921217918396, - "learning_rate": 2.7295488554753957e-06, - "loss": 0.1977, - "step": 966 - }, - { - "epoch": 4.717073170731707, - "grad_norm": 3.348057746887207, - "learning_rate": 2.725733561190157e-06, - "loss": 0.1311, - "step": 967 - }, - { - "epoch": 4.721951219512195, - "grad_norm": 3.828483819961548, - "learning_rate": 2.721917736767973e-06, - "loss": 0.2464, - "step": 968 - }, - { - "epoch": 4.726829268292683, - "grad_norm": 2.6004624366760254, - "learning_rate": 2.7181013911703357e-06, - "loss": 0.1088, - "step": 969 - }, - { - "epoch": 4.7317073170731705, - "grad_norm": 3.316990852355957, - "learning_rate": 2.714284533359961e-06, - "loss": 0.1492, - "step": 970 - }, - { - "epoch": 4.736585365853658, - "grad_norm": 3.8770010471343994, - "learning_rate": 2.710467172300768e-06, - "loss": 0.218, - "step": 971 - }, - { - "epoch": 4.741463414634146, - "grad_norm": 4.456376552581787, - "learning_rate": 2.706649316957857e-06, - "loss": 0.2199, - "step": 972 - }, - { - "epoch": 4.746341463414634, - "grad_norm": 3.3376309871673584, - "learning_rate": 2.7028309762974897e-06, - "loss": 0.0595, - "step": 973 - }, - { - "epoch": 4.751219512195122, - "grad_norm": 3.6755495071411133, - "learning_rate": 2.699012159287069e-06, - "loss": 0.1653, - "step": 974 - }, - { - "epoch": 4.7560975609756095, - "grad_norm": 2.939887046813965, - "learning_rate": 2.6951928748951125e-06, - "loss": 0.0681, - "step": 975 - }, - { - "epoch": 4.760975609756097, - "grad_norm": 3.4101195335388184, - "learning_rate": 2.69137313209124e-06, - "loss": 0.2046, - "step": 976 - }, - { - "epoch": 4.765853658536585, - "grad_norm": 3.9811208248138428, - "learning_rate": 2.687552939846145e-06, - "loss": 0.2255, - "step": 977 - }, - { - "epoch": 4.770731707317073, - "grad_norm": 3.484255313873291, - "learning_rate": 2.6837323071315766e-06, - "loss": 0.0512, - "step": 978 - }, - { - "epoch": 4.775609756097561, - "grad_norm": 3.9005143642425537, - "learning_rate": 2.679911242920321e-06, - "loss": 0.162, - "step": 979 - }, - { - "epoch": 4.780487804878049, - "grad_norm": 4.933374881744385, - "learning_rate": 2.6760897561861742e-06, - "loss": 0.398, - "step": 980 - }, - { - "epoch": 4.785365853658536, - "grad_norm": 3.0741539001464844, - "learning_rate": 2.672267855903927e-06, - "loss": 0.0507, - "step": 981 - }, - { - "epoch": 4.790243902439024, - "grad_norm": 3.023772716522217, - "learning_rate": 2.6684455510493413e-06, - "loss": 0.2066, - "step": 982 - }, - { - "epoch": 4.795121951219512, - "grad_norm": 3.0102407932281494, - "learning_rate": 2.6646228505991267e-06, - "loss": 0.2296, - "step": 983 - }, - { - "epoch": 4.8, - "grad_norm": 3.902200222015381, - "learning_rate": 2.6607997635309246e-06, - "loss": 0.14, - "step": 984 - }, - { - "epoch": 4.804878048780488, - "grad_norm": 3.836185932159424, - "learning_rate": 2.6569762988232838e-06, - "loss": 0.1583, - "step": 985 - }, - { - "epoch": 4.809756097560975, - "grad_norm": 3.539628744125366, - "learning_rate": 2.653152465455639e-06, - "loss": 0.2619, - "step": 986 - }, - { - "epoch": 4.814634146341463, - "grad_norm": 4.716914653778076, - "learning_rate": 2.6493282724082913e-06, - "loss": 0.3029, - "step": 987 - }, - { - "epoch": 4.819512195121951, - "grad_norm": 3.466914176940918, - "learning_rate": 2.6455037286623864e-06, - "loss": 0.095, - "step": 988 - }, - { - "epoch": 4.824390243902439, - "grad_norm": 2.1798667907714844, - "learning_rate": 2.6416788431998935e-06, - "loss": 0.1232, - "step": 989 - }, - { - "epoch": 4.829268292682927, - "grad_norm": 3.309039354324341, - "learning_rate": 2.637853625003585e-06, - "loss": 0.3671, - "step": 990 - }, - { - "epoch": 4.8341463414634145, - "grad_norm": 3.2619435787200928, - "learning_rate": 2.6340280830570142e-06, - "loss": 0.194, - "step": 991 - }, - { - "epoch": 4.839024390243902, - "grad_norm": 3.601161003112793, - "learning_rate": 2.6302022263444947e-06, - "loss": 0.1214, - "step": 992 - }, - { - "epoch": 4.84390243902439, - "grad_norm": 4.13787841796875, - "learning_rate": 2.6263760638510793e-06, - "loss": 0.311, - "step": 993 - }, - { - "epoch": 4.848780487804878, - "grad_norm": 3.0474166870117188, - "learning_rate": 2.6225496045625394e-06, - "loss": 0.1853, - "step": 994 - }, - { - "epoch": 4.853658536585366, - "grad_norm": 4.481237411499023, - "learning_rate": 2.6187228574653428e-06, - "loss": 0.2088, - "step": 995 - }, - { - "epoch": 4.8585365853658535, - "grad_norm": 3.235966444015503, - "learning_rate": 2.614895831546633e-06, - "loss": 0.1439, - "step": 996 - }, - { - "epoch": 4.863414634146341, - "grad_norm": 4.103270053863525, - "learning_rate": 2.6110685357942096e-06, - "loss": 0.2823, - "step": 997 - }, - { - "epoch": 4.868292682926829, - "grad_norm": 4.134536266326904, - "learning_rate": 2.6072409791965048e-06, - "loss": 0.2963, - "step": 998 - }, - { - "epoch": 4.873170731707317, - "grad_norm": 4.124892711639404, - "learning_rate": 2.6034131707425638e-06, - "loss": 0.4127, - "step": 999 - }, - { - "epoch": 4.878048780487805, - "grad_norm": 3.565139055252075, - "learning_rate": 2.5995851194220223e-06, - "loss": 0.1601, - "step": 1000 - }, - { - "epoch": 4.882926829268293, - "grad_norm": 2.7548017501831055, - "learning_rate": 2.595756834225089e-06, - "loss": 0.161, - "step": 1001 - }, - { - "epoch": 4.88780487804878, - "grad_norm": 3.9297611713409424, - "learning_rate": 2.5919283241425188e-06, - "loss": 0.1013, - "step": 1002 - }, - { - "epoch": 4.892682926829268, - "grad_norm": 2.4904236793518066, - "learning_rate": 2.5880995981655965e-06, - "loss": 0.1177, - "step": 1003 - }, - { - "epoch": 4.897560975609756, - "grad_norm": 3.513308048248291, - "learning_rate": 2.584270665286113e-06, - "loss": 0.0682, - "step": 1004 - }, - { - "epoch": 4.902439024390244, - "grad_norm": 4.221067428588867, - "learning_rate": 2.580441534496346e-06, - "loss": 0.1502, - "step": 1005 - }, - { - "epoch": 4.907317073170732, - "grad_norm": 3.4298903942108154, - "learning_rate": 2.576612214789039e-06, - "loss": 0.1772, - "step": 1006 - }, - { - "epoch": 4.912195121951219, - "grad_norm": 4.402887344360352, - "learning_rate": 2.5727827151573747e-06, - "loss": 0.2029, - "step": 1007 - }, - { - "epoch": 4.917073170731707, - "grad_norm": 4.194999694824219, - "learning_rate": 2.568953044594964e-06, - "loss": 0.1269, - "step": 1008 - }, - { - "epoch": 4.921951219512195, - "grad_norm": 3.657607078552246, - "learning_rate": 2.5651232120958157e-06, - "loss": 0.1311, - "step": 1009 - }, - { - "epoch": 4.926829268292683, - "grad_norm": 4.092184543609619, - "learning_rate": 2.56129322665432e-06, - "loss": 0.1085, - "step": 1010 - }, - { - "epoch": 4.931707317073171, - "grad_norm": 3.3648242950439453, - "learning_rate": 2.5574630972652263e-06, - "loss": 0.0782, - "step": 1011 - }, - { - "epoch": 4.9365853658536585, - "grad_norm": 3.7215166091918945, - "learning_rate": 2.553632832923622e-06, - "loss": 0.1391, - "step": 1012 - }, - { - "epoch": 4.941463414634146, - "grad_norm": 4.045740127563477, - "learning_rate": 2.5498024426249107e-06, - "loss": 0.3141, - "step": 1013 - }, - { - "epoch": 4.946341463414634, - "grad_norm": 3.2363107204437256, - "learning_rate": 2.545971935364794e-06, - "loss": 0.0679, - "step": 1014 - }, - { - "epoch": 4.951219512195122, - "grad_norm": 3.057283639907837, - "learning_rate": 2.5421413201392443e-06, - "loss": 0.1382, - "step": 1015 - }, - { - "epoch": 4.95609756097561, - "grad_norm": 3.591535806655884, - "learning_rate": 2.538310605944491e-06, - "loss": 0.112, - "step": 1016 - }, - { - "epoch": 4.9609756097560975, - "grad_norm": 3.1629281044006348, - "learning_rate": 2.534479801776996e-06, - "loss": 0.1261, - "step": 1017 - }, - { - "epoch": 4.965853658536585, - "grad_norm": 2.691740036010742, - "learning_rate": 2.53064891663343e-06, - "loss": 0.2328, - "step": 1018 - }, - { - "epoch": 4.970731707317073, - "grad_norm": 3.2620503902435303, - "learning_rate": 2.526817959510655e-06, - "loss": 0.193, - "step": 1019 - }, - { - "epoch": 4.975609756097561, - "grad_norm": 3.0721535682678223, - "learning_rate": 2.5229869394057038e-06, - "loss": 0.2444, - "step": 1020 - }, - { - "epoch": 4.980487804878049, - "grad_norm": 2.6279208660125732, - "learning_rate": 2.5191558653157542e-06, - "loss": 0.1103, - "step": 1021 - }, - { - "epoch": 4.985365853658537, - "grad_norm": 2.9295670986175537, - "learning_rate": 2.515324746238113e-06, - "loss": 0.0553, - "step": 1022 - }, - { - "epoch": 4.990243902439024, - "grad_norm": 3.3960084915161133, - "learning_rate": 2.511493591170191e-06, - "loss": 0.1686, - "step": 1023 - }, - { - "epoch": 4.995121951219512, - "grad_norm": 4.138705253601074, - "learning_rate": 2.5076624091094846e-06, - "loss": 0.1208, - "step": 1024 - }, - { - "epoch": 5.0, - "grad_norm": 2.603870391845703, - "learning_rate": 2.503831209053554e-06, - "loss": 0.1216, - "step": 1025 - }, - { - "epoch": 5.004878048780488, - "grad_norm": 2.525205612182617, - "learning_rate": 2.5e-06, - "loss": 0.0984, - "step": 1026 - }, - { - "epoch": 5.009756097560976, - "grad_norm": 3.2502501010894775, - "learning_rate": 2.4961687909464462e-06, - "loss": 0.1323, - "step": 1027 - }, - { - "epoch": 5.014634146341463, - "grad_norm": 5.363409519195557, - "learning_rate": 2.492337590890516e-06, - "loss": 0.3516, - "step": 1028 - }, - { - "epoch": 5.019512195121951, - "grad_norm": 2.887723445892334, - "learning_rate": 2.4885064088298097e-06, - "loss": 0.1931, - "step": 1029 - }, - { - "epoch": 5.024390243902439, - "grad_norm": 3.4529435634613037, - "learning_rate": 2.4846752537618875e-06, - "loss": 0.0675, - "step": 1030 - }, - { - "epoch": 5.029268292682927, - "grad_norm": 4.202361106872559, - "learning_rate": 2.480844134684246e-06, - "loss": 0.1643, - "step": 1031 - }, - { - "epoch": 5.034146341463415, - "grad_norm": 2.910275459289551, - "learning_rate": 2.4770130605942966e-06, - "loss": 0.11, - "step": 1032 - }, - { - "epoch": 5.0390243902439025, - "grad_norm": 3.5430362224578857, - "learning_rate": 2.4731820404893457e-06, - "loss": 0.0614, - "step": 1033 - }, - { - "epoch": 5.04390243902439, - "grad_norm": 4.501879692077637, - "learning_rate": 2.469351083366571e-06, - "loss": 0.0954, - "step": 1034 - }, - { - "epoch": 5.048780487804878, - "grad_norm": 2.732261896133423, - "learning_rate": 2.4655201982230044e-06, - "loss": 0.0275, - "step": 1035 - }, - { - "epoch": 5.053658536585366, - "grad_norm": 3.5926437377929688, - "learning_rate": 2.4616893940555094e-06, - "loss": 0.0661, - "step": 1036 - }, - { - "epoch": 5.058536585365854, - "grad_norm": 4.790312767028809, - "learning_rate": 2.457858679860757e-06, - "loss": 0.2976, - "step": 1037 - }, - { - "epoch": 5.0634146341463415, - "grad_norm": 4.453246116638184, - "learning_rate": 2.4540280646352072e-06, - "loss": 0.1216, - "step": 1038 - }, - { - "epoch": 5.068292682926829, - "grad_norm": 3.288011074066162, - "learning_rate": 2.45019755737509e-06, - "loss": 0.0877, - "step": 1039 - }, - { - "epoch": 5.073170731707317, - "grad_norm": 3.566927671432495, - "learning_rate": 2.4463671670763787e-06, - "loss": 0.1661, - "step": 1040 - }, - { - "epoch": 5.078048780487805, - "grad_norm": 3.250047206878662, - "learning_rate": 2.4425369027347746e-06, - "loss": 0.211, - "step": 1041 - }, - { - "epoch": 5.082926829268293, - "grad_norm": 3.0214977264404297, - "learning_rate": 2.4387067733456804e-06, - "loss": 0.093, - "step": 1042 - }, - { - "epoch": 5.087804878048781, - "grad_norm": 3.8162097930908203, - "learning_rate": 2.4348767879041847e-06, - "loss": 0.0777, - "step": 1043 - }, - { - "epoch": 5.092682926829268, - "grad_norm": 3.8071560859680176, - "learning_rate": 2.4310469554050366e-06, - "loss": 0.087, - "step": 1044 - }, - { - "epoch": 5.097560975609756, - "grad_norm": 3.1032073497772217, - "learning_rate": 2.4272172848426257e-06, - "loss": 0.1105, - "step": 1045 - }, - { - "epoch": 5.102439024390244, - "grad_norm": 2.8980185985565186, - "learning_rate": 2.423387785210962e-06, - "loss": 0.0704, - "step": 1046 - }, - { - "epoch": 5.107317073170732, - "grad_norm": 3.9110755920410156, - "learning_rate": 2.4195584655036544e-06, - "loss": 0.2118, - "step": 1047 - }, - { - "epoch": 5.11219512195122, - "grad_norm": 2.678884506225586, - "learning_rate": 2.4157293347138877e-06, - "loss": 0.0664, - "step": 1048 - }, - { - "epoch": 5.117073170731707, - "grad_norm": 3.183046340942383, - "learning_rate": 2.4119004018344043e-06, - "loss": 0.1767, - "step": 1049 - }, - { - "epoch": 5.121951219512195, - "grad_norm": 3.9198925495147705, - "learning_rate": 2.408071675857482e-06, - "loss": 0.1288, - "step": 1050 - }, - { - "epoch": 5.126829268292683, - "grad_norm": 4.378621578216553, - "learning_rate": 2.404243165774912e-06, - "loss": 0.1724, - "step": 1051 - }, - { - "epoch": 5.131707317073171, - "grad_norm": 2.5509133338928223, - "learning_rate": 2.4004148805779785e-06, - "loss": 0.0382, - "step": 1052 - }, - { - "epoch": 5.136585365853659, - "grad_norm": 3.692396402359009, - "learning_rate": 2.3965868292574375e-06, - "loss": 0.0942, - "step": 1053 - }, - { - "epoch": 5.1414634146341465, - "grad_norm": 3.8537800312042236, - "learning_rate": 2.392759020803496e-06, - "loss": 0.0819, - "step": 1054 - }, - { - "epoch": 5.146341463414634, - "grad_norm": 4.02876091003418, - "learning_rate": 2.3889314642057916e-06, - "loss": 0.0866, - "step": 1055 - }, - { - "epoch": 5.151219512195122, - "grad_norm": 3.531857490539551, - "learning_rate": 2.3851041684533677e-06, - "loss": 0.1557, - "step": 1056 - }, - { - "epoch": 5.15609756097561, - "grad_norm": 2.231265068054199, - "learning_rate": 2.381277142534658e-06, - "loss": 0.0421, - "step": 1057 - }, - { - "epoch": 5.160975609756098, - "grad_norm": 3.159226894378662, - "learning_rate": 2.3774503954374614e-06, - "loss": 0.0395, - "step": 1058 - }, - { - "epoch": 5.1658536585365855, - "grad_norm": 3.0375123023986816, - "learning_rate": 2.373623936148921e-06, - "loss": 0.1869, - "step": 1059 - }, - { - "epoch": 5.170731707317073, - "grad_norm": 5.4905900955200195, - "learning_rate": 2.369797773655506e-06, - "loss": 0.1426, - "step": 1060 - }, - { - "epoch": 5.175609756097561, - "grad_norm": 2.8739638328552246, - "learning_rate": 2.3659719169429866e-06, - "loss": 0.0788, - "step": 1061 - }, - { - "epoch": 5.180487804878049, - "grad_norm": 2.612183094024658, - "learning_rate": 2.3621463749964153e-06, - "loss": 0.0449, - "step": 1062 - }, - { - "epoch": 5.185365853658537, - "grad_norm": 2.0573198795318604, - "learning_rate": 2.3583211568001073e-06, - "loss": 0.0264, - "step": 1063 - }, - { - "epoch": 5.190243902439025, - "grad_norm": 2.3667244911193848, - "learning_rate": 2.3544962713376144e-06, - "loss": 0.0507, - "step": 1064 - }, - { - "epoch": 5.195121951219512, - "grad_norm": 2.1223740577697754, - "learning_rate": 2.3506717275917095e-06, - "loss": 0.0576, - "step": 1065 - }, - { - "epoch": 5.2, - "grad_norm": 2.2630319595336914, - "learning_rate": 2.346847534544362e-06, - "loss": 0.0523, - "step": 1066 - }, - { - "epoch": 5.204878048780488, - "grad_norm": 3.201913595199585, - "learning_rate": 2.3430237011767166e-06, - "loss": 0.0847, - "step": 1067 - }, - { - "epoch": 5.209756097560976, - "grad_norm": 2.2149481773376465, - "learning_rate": 2.3392002364690762e-06, - "loss": 0.0215, - "step": 1068 - }, - { - "epoch": 5.214634146341464, - "grad_norm": 4.425244331359863, - "learning_rate": 2.335377149400874e-06, - "loss": 0.1018, - "step": 1069 - }, - { - "epoch": 5.219512195121951, - "grad_norm": 4.548358917236328, - "learning_rate": 2.3315544489506596e-06, - "loss": 0.1485, - "step": 1070 - }, - { - "epoch": 5.224390243902439, - "grad_norm": 3.635796546936035, - "learning_rate": 2.3277321440960733e-06, - "loss": 0.111, - "step": 1071 - }, - { - "epoch": 5.229268292682927, - "grad_norm": 2.3180043697357178, - "learning_rate": 2.323910243813826e-06, - "loss": 0.0267, - "step": 1072 - }, - { - "epoch": 5.234146341463415, - "grad_norm": 3.675490379333496, - "learning_rate": 2.3200887570796798e-06, - "loss": 0.153, - "step": 1073 - }, - { - "epoch": 5.239024390243903, - "grad_norm": 2.883225202560425, - "learning_rate": 2.316267692868424e-06, - "loss": 0.0968, - "step": 1074 - }, - { - "epoch": 5.2439024390243905, - "grad_norm": 3.0320188999176025, - "learning_rate": 2.312447060153856e-06, - "loss": 0.0786, - "step": 1075 - }, - { - "epoch": 5.248780487804878, - "grad_norm": 2.682695150375366, - "learning_rate": 2.308626867908761e-06, - "loss": 0.0677, - "step": 1076 - }, - { - "epoch": 5.253658536585366, - "grad_norm": 3.941967010498047, - "learning_rate": 2.3048071251048884e-06, - "loss": 0.1059, - "step": 1077 - }, - { - "epoch": 5.258536585365854, - "grad_norm": 6.485599517822266, - "learning_rate": 2.300987840712932e-06, - "loss": 0.1331, - "step": 1078 - }, - { - "epoch": 5.263414634146342, - "grad_norm": 3.809269905090332, - "learning_rate": 2.297169023702511e-06, - "loss": 0.169, - "step": 1079 - }, - { - "epoch": 5.2682926829268295, - "grad_norm": 3.115626573562622, - "learning_rate": 2.2933506830421436e-06, - "loss": 0.1349, - "step": 1080 - }, - { - "epoch": 5.273170731707317, - "grad_norm": 2.2234909534454346, - "learning_rate": 2.2895328276992325e-06, - "loss": 0.0191, - "step": 1081 - }, - { - "epoch": 5.278048780487805, - "grad_norm": 3.896925926208496, - "learning_rate": 2.28571546664004e-06, - "loss": 0.1961, - "step": 1082 - }, - { - "epoch": 5.282926829268293, - "grad_norm": 2.4134509563446045, - "learning_rate": 2.281898608829665e-06, - "loss": 0.02, - "step": 1083 - }, - { - "epoch": 5.287804878048781, - "grad_norm": 2.7599191665649414, - "learning_rate": 2.2780822632320273e-06, - "loss": 0.0763, - "step": 1084 - }, - { - "epoch": 5.2926829268292686, - "grad_norm": 2.465637683868408, - "learning_rate": 2.2742664388098435e-06, - "loss": 0.0403, - "step": 1085 - }, - { - "epoch": 5.297560975609756, - "grad_norm": 2.4026618003845215, - "learning_rate": 2.270451144524605e-06, - "loss": 0.0982, - "step": 1086 - }, - { - "epoch": 5.302439024390244, - "grad_norm": 3.3339459896087646, - "learning_rate": 2.266636389336559e-06, - "loss": 0.09, - "step": 1087 - }, - { - "epoch": 5.307317073170732, - "grad_norm": 2.113255023956299, - "learning_rate": 2.262822182204686e-06, - "loss": 0.0267, - "step": 1088 - }, - { - "epoch": 5.31219512195122, - "grad_norm": 3.1760852336883545, - "learning_rate": 2.2590085320866798e-06, - "loss": 0.0295, - "step": 1089 - }, - { - "epoch": 5.317073170731708, - "grad_norm": 2.9674434661865234, - "learning_rate": 2.255195447938927e-06, - "loss": 0.0261, - "step": 1090 - }, - { - "epoch": 5.321951219512195, - "grad_norm": 3.4384074211120605, - "learning_rate": 2.251382938716482e-06, - "loss": 0.0936, - "step": 1091 - }, - { - "epoch": 5.326829268292683, - "grad_norm": 3.3814568519592285, - "learning_rate": 2.2475710133730533e-06, - "loss": 0.0426, - "step": 1092 - }, - { - "epoch": 5.331707317073171, - "grad_norm": 3.081317663192749, - "learning_rate": 2.243759680860975e-06, - "loss": 0.0799, - "step": 1093 - }, - { - "epoch": 5.336585365853659, - "grad_norm": 3.5608482360839844, - "learning_rate": 2.2399489501311896e-06, - "loss": 0.0906, - "step": 1094 - }, - { - "epoch": 5.341463414634147, - "grad_norm": 3.7886314392089844, - "learning_rate": 2.2361388301332265e-06, - "loss": 0.2152, - "step": 1095 - }, - { - "epoch": 5.3463414634146345, - "grad_norm": 1.9531102180480957, - "learning_rate": 2.2323293298151817e-06, - "loss": 0.0359, - "step": 1096 - }, - { - "epoch": 5.351219512195122, - "grad_norm": 2.2828023433685303, - "learning_rate": 2.2285204581236937e-06, - "loss": 0.0368, - "step": 1097 - }, - { - "epoch": 5.35609756097561, - "grad_norm": 3.110262870788574, - "learning_rate": 2.2247122240039268e-06, - "loss": 0.0426, - "step": 1098 - }, - { - "epoch": 5.360975609756098, - "grad_norm": 2.3293566703796387, - "learning_rate": 2.2209046363995464e-06, - "loss": 0.0223, - "step": 1099 - }, - { - "epoch": 5.365853658536586, - "grad_norm": 2.990884780883789, - "learning_rate": 2.217097704252701e-06, - "loss": 0.1276, - "step": 1100 - }, - { - "epoch": 5.3707317073170735, - "grad_norm": 2.568014144897461, - "learning_rate": 2.2132914365039993e-06, - "loss": 0.0639, - "step": 1101 - }, - { - "epoch": 5.375609756097561, - "grad_norm": 2.618478536605835, - "learning_rate": 2.2094858420924882e-06, - "loss": 0.0166, - "step": 1102 - }, - { - "epoch": 5.380487804878049, - "grad_norm": 4.526919364929199, - "learning_rate": 2.205680929955635e-06, - "loss": 0.144, - "step": 1103 - }, - { - "epoch": 5.385365853658537, - "grad_norm": 2.7236886024475098, - "learning_rate": 2.201876709029305e-06, - "loss": 0.1004, - "step": 1104 - }, - { - "epoch": 5.390243902439025, - "grad_norm": 2.1577632427215576, - "learning_rate": 2.198073188247738e-06, - "loss": 0.0453, - "step": 1105 - }, - { - "epoch": 5.3951219512195125, - "grad_norm": 2.5170321464538574, - "learning_rate": 2.1942703765435317e-06, - "loss": 0.0195, - "step": 1106 - }, - { - "epoch": 5.4, - "grad_norm": 3.962658643722534, - "learning_rate": 2.190468282847617e-06, - "loss": 0.1512, - "step": 1107 - }, - { - "epoch": 5.404878048780488, - "grad_norm": 4.297860622406006, - "learning_rate": 2.186666916089239e-06, - "loss": 0.2572, - "step": 1108 - }, - { - "epoch": 5.409756097560976, - "grad_norm": 2.8933565616607666, - "learning_rate": 2.1828662851959377e-06, - "loss": 0.0536, - "step": 1109 - }, - { - "epoch": 5.414634146341464, - "grad_norm": 2.9397451877593994, - "learning_rate": 2.1790663990935203e-06, - "loss": 0.0778, - "step": 1110 - }, - { - "epoch": 5.419512195121952, - "grad_norm": 3.5210094451904297, - "learning_rate": 2.1752672667060488e-06, - "loss": 0.0558, - "step": 1111 - }, - { - "epoch": 5.424390243902439, - "grad_norm": 2.9027626514434814, - "learning_rate": 2.1714688969558146e-06, - "loss": 0.041, - "step": 1112 - }, - { - "epoch": 5.429268292682927, - "grad_norm": 3.7691168785095215, - "learning_rate": 2.167671298763316e-06, - "loss": 0.1644, - "step": 1113 - }, - { - "epoch": 5.434146341463415, - "grad_norm": 3.493008852005005, - "learning_rate": 2.1638744810472414e-06, - "loss": 0.1587, - "step": 1114 - }, - { - "epoch": 5.439024390243903, - "grad_norm": 2.711196184158325, - "learning_rate": 2.1600784527244445e-06, - "loss": 0.0605, - "step": 1115 - }, - { - "epoch": 5.443902439024391, - "grad_norm": 4.365038871765137, - "learning_rate": 2.1562832227099266e-06, - "loss": 0.1897, - "step": 1116 - }, - { - "epoch": 5.4487804878048784, - "grad_norm": 4.621466159820557, - "learning_rate": 2.152488799916814e-06, - "loss": 0.1525, - "step": 1117 - }, - { - "epoch": 5.453658536585366, - "grad_norm": 4.8721089363098145, - "learning_rate": 2.148695193256336e-06, - "loss": 0.189, - "step": 1118 - }, - { - "epoch": 5.458536585365854, - "grad_norm": 2.8999173641204834, - "learning_rate": 2.1449024116378064e-06, - "loss": 0.095, - "step": 1119 - }, - { - "epoch": 5.463414634146342, - "grad_norm": 2.4865314960479736, - "learning_rate": 2.1411104639686013e-06, - "loss": 0.0432, - "step": 1120 - }, - { - "epoch": 5.46829268292683, - "grad_norm": 3.8497228622436523, - "learning_rate": 2.137319359154138e-06, - "loss": 0.0954, - "step": 1121 - }, - { - "epoch": 5.473170731707317, - "grad_norm": 2.3643507957458496, - "learning_rate": 2.133529106097853e-06, - "loss": 0.0362, - "step": 1122 - }, - { - "epoch": 5.478048780487805, - "grad_norm": 3.017826795578003, - "learning_rate": 2.1297397137011862e-06, - "loss": 0.0875, - "step": 1123 - }, - { - "epoch": 5.482926829268292, - "grad_norm": 3.239320755004883, - "learning_rate": 2.125951190863551e-06, - "loss": 0.0758, - "step": 1124 - }, - { - "epoch": 5.487804878048781, - "grad_norm": 2.566241979598999, - "learning_rate": 2.1221635464823237e-06, - "loss": 0.0605, - "step": 1125 - }, - { - "epoch": 5.492682926829268, - "grad_norm": 4.810088157653809, - "learning_rate": 2.1183767894528135e-06, - "loss": 0.2403, - "step": 1126 - }, - { - "epoch": 5.4975609756097565, - "grad_norm": 2.083263397216797, - "learning_rate": 2.114590928668249e-06, - "loss": 0.0223, - "step": 1127 - }, - { - "epoch": 5.5024390243902435, - "grad_norm": 2.6812374591827393, - "learning_rate": 2.1108059730197517e-06, - "loss": 0.0617, - "step": 1128 - }, - { - "epoch": 5.507317073170732, - "grad_norm": 3.196735143661499, - "learning_rate": 2.1070219313963173e-06, - "loss": 0.043, - "step": 1129 - }, - { - "epoch": 5.512195121951219, - "grad_norm": 2.775470495223999, - "learning_rate": 2.1032388126847967e-06, - "loss": 0.0595, - "step": 1130 - }, - { - "epoch": 5.517073170731708, - "grad_norm": 2.8632407188415527, - "learning_rate": 2.099456625769872e-06, - "loss": 0.0186, - "step": 1131 - }, - { - "epoch": 5.521951219512195, - "grad_norm": 4.075018405914307, - "learning_rate": 2.0956753795340376e-06, - "loss": 0.0616, - "step": 1132 - }, - { - "epoch": 5.526829268292683, - "grad_norm": 3.206327199935913, - "learning_rate": 2.091895082857578e-06, - "loss": 0.1895, - "step": 1133 - }, - { - "epoch": 5.53170731707317, - "grad_norm": 2.967588186264038, - "learning_rate": 2.0881157446185474e-06, - "loss": 0.0484, - "step": 1134 - }, - { - "epoch": 5.536585365853659, - "grad_norm": 2.850929021835327, - "learning_rate": 2.0843373736927506e-06, - "loss": 0.037, - "step": 1135 - }, - { - "epoch": 5.541463414634146, - "grad_norm": 2.2505147457122803, - "learning_rate": 2.08055997895372e-06, - "loss": 0.0227, - "step": 1136 - }, - { - "epoch": 5.546341463414635, - "grad_norm": 2.5258476734161377, - "learning_rate": 2.0767835692726944e-06, - "loss": 0.0296, - "step": 1137 - }, - { - "epoch": 5.5512195121951216, - "grad_norm": 3.498741388320923, - "learning_rate": 2.0730081535186e-06, - "loss": 0.16, - "step": 1138 - }, - { - "epoch": 5.55609756097561, - "grad_norm": 2.8635222911834717, - "learning_rate": 2.06923374055803e-06, - "loss": 0.0725, - "step": 1139 - }, - { - "epoch": 5.560975609756097, - "grad_norm": 2.2779290676116943, - "learning_rate": 2.0654603392552193e-06, - "loss": 0.0198, - "step": 1140 - }, - { - "epoch": 5.565853658536585, - "grad_norm": 3.1651058197021484, - "learning_rate": 2.0616879584720305e-06, - "loss": 0.1144, - "step": 1141 - }, - { - "epoch": 5.570731707317073, - "grad_norm": 2.4238595962524414, - "learning_rate": 2.057916607067928e-06, - "loss": 0.0491, - "step": 1142 - }, - { - "epoch": 5.575609756097561, - "grad_norm": 2.3248515129089355, - "learning_rate": 2.054146293899957e-06, - "loss": 0.035, - "step": 1143 - }, - { - "epoch": 5.580487804878048, - "grad_norm": 2.9506516456604004, - "learning_rate": 2.0503770278227274e-06, - "loss": 0.0639, - "step": 1144 - }, - { - "epoch": 5.585365853658536, - "grad_norm": 2.6403958797454834, - "learning_rate": 2.0466088176883876e-06, - "loss": 0.0258, - "step": 1145 - }, - { - "epoch": 5.590243902439024, - "grad_norm": 3.150115728378296, - "learning_rate": 2.042841672346608e-06, - "loss": 0.0634, - "step": 1146 - }, - { - "epoch": 5.595121951219512, - "grad_norm": 2.742691993713379, - "learning_rate": 2.039075600644557e-06, - "loss": 0.0464, - "step": 1147 - }, - { - "epoch": 5.6, - "grad_norm": 2.733694076538086, - "learning_rate": 2.0353106114268824e-06, - "loss": 0.0829, - "step": 1148 - }, - { - "epoch": 5.6048780487804875, - "grad_norm": 2.511229991912842, - "learning_rate": 2.031546713535688e-06, - "loss": 0.0321, - "step": 1149 - }, - { - "epoch": 5.609756097560975, - "grad_norm": 3.019669532775879, - "learning_rate": 2.027783915810518e-06, - "loss": 0.05, - "step": 1150 - }, - { - "epoch": 5.614634146341463, - "grad_norm": 3.497159242630005, - "learning_rate": 2.024022227088329e-06, - "loss": 0.1984, - "step": 1151 - }, - { - "epoch": 5.619512195121951, - "grad_norm": 3.4637508392333984, - "learning_rate": 2.020261656203476e-06, - "loss": 0.1673, - "step": 1152 - }, - { - "epoch": 5.624390243902439, - "grad_norm": 2.4312477111816406, - "learning_rate": 2.016502211987687e-06, - "loss": 0.1106, - "step": 1153 - }, - { - "epoch": 5.6292682926829265, - "grad_norm": 2.7801673412323, - "learning_rate": 2.0127439032700446e-06, - "loss": 0.0374, - "step": 1154 - }, - { - "epoch": 5.634146341463414, - "grad_norm": 2.9346680641174316, - "learning_rate": 2.0089867388769664e-06, - "loss": 0.0674, - "step": 1155 - }, - { - "epoch": 5.639024390243902, - "grad_norm": 2.274888277053833, - "learning_rate": 2.0052307276321793e-06, - "loss": 0.0365, - "step": 1156 - }, - { - "epoch": 5.64390243902439, - "grad_norm": 3.069890022277832, - "learning_rate": 2.001475878356703e-06, - "loss": 0.0758, - "step": 1157 - }, - { - "epoch": 5.648780487804878, - "grad_norm": 3.8594915866851807, - "learning_rate": 1.99772219986883e-06, - "loss": 0.176, - "step": 1158 - }, - { - "epoch": 5.6536585365853655, - "grad_norm": 3.4886410236358643, - "learning_rate": 1.9939697009841024e-06, - "loss": 0.0491, - "step": 1159 - }, - { - "epoch": 5.658536585365853, - "grad_norm": 2.697946786880493, - "learning_rate": 1.990218390515291e-06, - "loss": 0.0741, - "step": 1160 - }, - { - "epoch": 5.663414634146341, - "grad_norm": 3.5290887355804443, - "learning_rate": 1.9864682772723757e-06, - "loss": 0.0826, - "step": 1161 - }, - { - "epoch": 5.668292682926829, - "grad_norm": 2.0601298809051514, - "learning_rate": 1.9827193700625274e-06, - "loss": 0.0378, - "step": 1162 - }, - { - "epoch": 5.673170731707317, - "grad_norm": 3.8458635807037354, - "learning_rate": 1.978971677690081e-06, - "loss": 0.2466, - "step": 1163 - }, - { - "epoch": 5.678048780487805, - "grad_norm": 2.788210153579712, - "learning_rate": 1.97522520895652e-06, - "loss": 0.0205, - "step": 1164 - }, - { - "epoch": 5.682926829268292, - "grad_norm": 3.1904587745666504, - "learning_rate": 1.971479972660454e-06, - "loss": 0.0998, - "step": 1165 - }, - { - "epoch": 5.68780487804878, - "grad_norm": 2.4664318561553955, - "learning_rate": 1.967735977597598e-06, - "loss": 0.0217, - "step": 1166 - }, - { - "epoch": 5.692682926829268, - "grad_norm": 2.1392667293548584, - "learning_rate": 1.9639932325607538e-06, - "loss": 0.048, - "step": 1167 - }, - { - "epoch": 5.697560975609756, - "grad_norm": 3.7127058506011963, - "learning_rate": 1.9602517463397845e-06, - "loss": 0.0302, - "step": 1168 - }, - { - "epoch": 5.702439024390244, - "grad_norm": 2.916168689727783, - "learning_rate": 1.9565115277215978e-06, - "loss": 0.0724, - "step": 1169 - }, - { - "epoch": 5.7073170731707314, - "grad_norm": 2.4352428913116455, - "learning_rate": 1.952772585490127e-06, - "loss": 0.0464, - "step": 1170 - }, - { - "epoch": 5.712195121951219, - "grad_norm": 2.8311455249786377, - "learning_rate": 1.9490349284263036e-06, - "loss": 0.0239, - "step": 1171 - }, - { - "epoch": 5.717073170731707, - "grad_norm": 3.3592801094055176, - "learning_rate": 1.9452985653080443e-06, - "loss": 0.0719, - "step": 1172 - }, - { - "epoch": 5.721951219512195, - "grad_norm": 2.450922966003418, - "learning_rate": 1.9415635049102245e-06, - "loss": 0.0408, - "step": 1173 - }, - { - "epoch": 5.726829268292683, - "grad_norm": 4.750118255615234, - "learning_rate": 1.937829756004662e-06, - "loss": 0.2049, - "step": 1174 - }, - { - "epoch": 5.7317073170731705, - "grad_norm": 3.0643811225891113, - "learning_rate": 1.9340973273600944e-06, - "loss": 0.0636, - "step": 1175 - }, - { - "epoch": 5.736585365853658, - "grad_norm": 3.313904047012329, - "learning_rate": 1.930366227742157e-06, - "loss": 0.1252, - "step": 1176 - }, - { - "epoch": 5.741463414634146, - "grad_norm": 3.8996808528900146, - "learning_rate": 1.9266364659133653e-06, - "loss": 0.0687, - "step": 1177 - }, - { - "epoch": 5.746341463414634, - "grad_norm": 2.727555274963379, - "learning_rate": 1.922908050633093e-06, - "loss": 0.0333, - "step": 1178 - }, - { - "epoch": 5.751219512195122, - "grad_norm": 3.270087718963623, - "learning_rate": 1.919180990657551e-06, - "loss": 0.0792, - "step": 1179 - }, - { - "epoch": 5.7560975609756095, - "grad_norm": 2.6631274223327637, - "learning_rate": 1.9154552947397668e-06, - "loss": 0.069, - "step": 1180 - }, - { - "epoch": 5.760975609756097, - "grad_norm": 4.4460554122924805, - "learning_rate": 1.9117309716295658e-06, - "loss": 0.115, - "step": 1181 - }, - { - "epoch": 5.765853658536585, - "grad_norm": 2.5652341842651367, - "learning_rate": 1.9080080300735478e-06, - "loss": 0.0537, - "step": 1182 - }, - { - "epoch": 5.770731707317073, - "grad_norm": 3.046436071395874, - "learning_rate": 1.9042864788150695e-06, - "loss": 0.0817, - "step": 1183 - }, - { - "epoch": 5.775609756097561, - "grad_norm": 2.121629238128662, - "learning_rate": 1.9005663265942206e-06, - "loss": 0.0289, - "step": 1184 - }, - { - "epoch": 5.780487804878049, - "grad_norm": 2.271918535232544, - "learning_rate": 1.8968475821478066e-06, - "loss": 0.0357, - "step": 1185 - }, - { - "epoch": 5.785365853658536, - "grad_norm": 2.582473039627075, - "learning_rate": 1.8931302542093274e-06, - "loss": 0.0584, - "step": 1186 - }, - { - "epoch": 5.790243902439024, - "grad_norm": 2.502952814102173, - "learning_rate": 1.8894143515089539e-06, - "loss": 0.0324, - "step": 1187 - }, - { - "epoch": 5.795121951219512, - "grad_norm": 1.9735453128814697, - "learning_rate": 1.8856998827735118e-06, - "loss": 0.0338, - "step": 1188 - }, - { - "epoch": 5.8, - "grad_norm": 4.441845893859863, - "learning_rate": 1.8819868567264588e-06, - "loss": 0.1706, - "step": 1189 - }, - { - "epoch": 5.804878048780488, - "grad_norm": 2.5450692176818848, - "learning_rate": 1.8782752820878636e-06, - "loss": 0.0463, - "step": 1190 - }, - { - "epoch": 5.809756097560975, - "grad_norm": 3.718183755874634, - "learning_rate": 1.8745651675743876e-06, - "loss": 0.1188, - "step": 1191 - }, - { - "epoch": 5.814634146341463, - "grad_norm": 3.246532678604126, - "learning_rate": 1.870856521899261e-06, - "loss": 0.0984, - "step": 1192 - }, - { - "epoch": 5.819512195121951, - "grad_norm": 2.9522783756256104, - "learning_rate": 1.867149353772267e-06, - "loss": 0.0195, - "step": 1193 - }, - { - "epoch": 5.824390243902439, - "grad_norm": 2.3266429901123047, - "learning_rate": 1.863443671899717e-06, - "loss": 0.0236, - "step": 1194 - }, - { - "epoch": 5.829268292682927, - "grad_norm": 3.696749448776245, - "learning_rate": 1.8597394849844319e-06, - "loss": 0.1108, - "step": 1195 - }, - { - "epoch": 5.8341463414634145, - "grad_norm": 2.375624179840088, - "learning_rate": 1.8560368017257229e-06, - "loss": 0.0388, - "step": 1196 - }, - { - "epoch": 5.839024390243902, - "grad_norm": 4.0437092781066895, - "learning_rate": 1.8523356308193696e-06, - "loss": 0.3098, - "step": 1197 - }, - { - "epoch": 5.84390243902439, - "grad_norm": 3.165165424346924, - "learning_rate": 1.8486359809575977e-06, - "loss": 0.0775, - "step": 1198 - }, - { - "epoch": 5.848780487804878, - "grad_norm": 4.1991190910339355, - "learning_rate": 1.8449378608290638e-06, - "loss": 0.1222, - "step": 1199 - }, - { - "epoch": 5.853658536585366, - "grad_norm": 4.6657819747924805, - "learning_rate": 1.8412412791188306e-06, - "loss": 0.1146, - "step": 1200 - }, - { - "epoch": 5.8585365853658535, - "grad_norm": 4.569516181945801, - "learning_rate": 1.8375462445083464e-06, - "loss": 0.1113, - "step": 1201 - }, - { - "epoch": 5.863414634146341, - "grad_norm": 3.1565654277801514, - "learning_rate": 1.8338527656754285e-06, - "loss": 0.0416, - "step": 1202 - }, - { - "epoch": 5.868292682926829, - "grad_norm": 3.3474619388580322, - "learning_rate": 1.830160851294239e-06, - "loss": 0.0613, - "step": 1203 - }, - { - "epoch": 5.873170731707317, - "grad_norm": 4.30797004699707, - "learning_rate": 1.8264705100352662e-06, - "loss": 0.197, - "step": 1204 - }, - { - "epoch": 5.878048780487805, - "grad_norm": 2.7259573936462402, - "learning_rate": 1.8227817505653045e-06, - "loss": 0.0821, - "step": 1205 - }, - { - "epoch": 5.882926829268293, - "grad_norm": 3.515812873840332, - "learning_rate": 1.8190945815474323e-06, - "loss": 0.1246, - "step": 1206 - }, - { - "epoch": 5.88780487804878, - "grad_norm": 2.9223313331604004, - "learning_rate": 1.8154090116409934e-06, - "loss": 0.0703, - "step": 1207 - }, - { - "epoch": 5.892682926829268, - "grad_norm": 3.9529640674591064, - "learning_rate": 1.811725049501577e-06, - "loss": 0.1078, - "step": 1208 - }, - { - "epoch": 5.897560975609756, - "grad_norm": 4.1674580574035645, - "learning_rate": 1.8080427037809941e-06, - "loss": 0.1648, - "step": 1209 - }, - { - "epoch": 5.902439024390244, - "grad_norm": 3.1308021545410156, - "learning_rate": 1.8043619831272623e-06, - "loss": 0.061, - "step": 1210 - }, - { - "epoch": 5.907317073170732, - "grad_norm": 3.9667179584503174, - "learning_rate": 1.8006828961845807e-06, - "loss": 0.1863, - "step": 1211 - }, - { - "epoch": 5.912195121951219, - "grad_norm": 5.438168048858643, - "learning_rate": 1.7970054515933124e-06, - "loss": 0.2387, - "step": 1212 - }, - { - "epoch": 5.917073170731707, - "grad_norm": 5.505797863006592, - "learning_rate": 1.793329657989964e-06, - "loss": 0.2053, - "step": 1213 - }, - { - "epoch": 5.921951219512195, - "grad_norm": 2.8043150901794434, - "learning_rate": 1.7896555240071627e-06, - "loss": 0.026, - "step": 1214 - }, - { - "epoch": 5.926829268292683, - "grad_norm": 2.836164712905884, - "learning_rate": 1.7859830582736406e-06, - "loss": 0.0735, - "step": 1215 - }, - { - "epoch": 5.931707317073171, - "grad_norm": 2.8286306858062744, - "learning_rate": 1.782312269414211e-06, - "loss": 0.0586, - "step": 1216 - }, - { - "epoch": 5.9365853658536585, - "grad_norm": 4.4354329109191895, - "learning_rate": 1.7786431660497474e-06, - "loss": 0.3086, - "step": 1217 - }, - { - "epoch": 5.941463414634146, - "grad_norm": 4.0963640213012695, - "learning_rate": 1.7749757567971678e-06, - "loss": 0.0978, - "step": 1218 - }, - { - "epoch": 5.946341463414634, - "grad_norm": 2.726062536239624, - "learning_rate": 1.7713100502694091e-06, - "loss": 0.0976, - "step": 1219 - }, - { - "epoch": 5.951219512195122, - "grad_norm": 2.6566951274871826, - "learning_rate": 1.7676460550754104e-06, - "loss": 0.02, - "step": 1220 - }, - { - "epoch": 5.95609756097561, - "grad_norm": 2.7710952758789062, - "learning_rate": 1.7639837798200923e-06, - "loss": 0.0741, - "step": 1221 - }, - { - "epoch": 5.9609756097560975, - "grad_norm": 2.3678600788116455, - "learning_rate": 1.7603232331043346e-06, - "loss": 0.0542, - "step": 1222 - }, - { - "epoch": 5.965853658536585, - "grad_norm": 6.45259428024292, - "learning_rate": 1.7566644235249591e-06, - "loss": 0.3552, - "step": 1223 - }, - { - "epoch": 5.970731707317073, - "grad_norm": 1.8916475772857666, - "learning_rate": 1.7530073596747072e-06, - "loss": 0.0405, - "step": 1224 - }, - { - "epoch": 5.975609756097561, - "grad_norm": 2.1637566089630127, - "learning_rate": 1.74935205014222e-06, - "loss": 0.0178, - "step": 1225 - }, - { - "epoch": 5.980487804878049, - "grad_norm": 2.5959200859069824, - "learning_rate": 1.7456985035120194e-06, - "loss": 0.0264, - "step": 1226 - }, - { - "epoch": 5.985365853658537, - "grad_norm": 2.50264573097229, - "learning_rate": 1.7420467283644877e-06, - "loss": 0.0555, - "step": 1227 - }, - { - "epoch": 5.990243902439024, - "grad_norm": 2.4692020416259766, - "learning_rate": 1.738396733275844e-06, - "loss": 0.0546, - "step": 1228 - }, - { - "epoch": 5.995121951219512, - "grad_norm": 5.540846824645996, - "learning_rate": 1.7347485268181309e-06, - "loss": 0.1967, - "step": 1229 - }, - { - "epoch": 6.0, - "grad_norm": 1.8322839736938477, - "learning_rate": 1.7311021175591868e-06, - "loss": 0.0491, - "step": 1230 - }, - { - "epoch": 6.004878048780488, - "grad_norm": 2.719622850418091, - "learning_rate": 1.7274575140626318e-06, - "loss": 0.0359, - "step": 1231 - }, - { - "epoch": 6.009756097560976, - "grad_norm": 2.859675884246826, - "learning_rate": 1.7238147248878444e-06, - "loss": 0.0585, - "step": 1232 - }, - { - "epoch": 6.014634146341463, - "grad_norm": 1.6761114597320557, - "learning_rate": 1.7201737585899415e-06, - "loss": 0.0188, - "step": 1233 - }, - { - "epoch": 6.019512195121951, - "grad_norm": 2.1588776111602783, - "learning_rate": 1.7165346237197594e-06, - "loss": 0.0484, - "step": 1234 - }, - { - "epoch": 6.024390243902439, - "grad_norm": 4.209983825683594, - "learning_rate": 1.7128973288238344e-06, - "loss": 0.0776, - "step": 1235 - }, - { - "epoch": 6.029268292682927, - "grad_norm": 2.3979365825653076, - "learning_rate": 1.709261882444379e-06, - "loss": 0.0338, - "step": 1236 - }, - { - "epoch": 6.034146341463415, - "grad_norm": 3.0030531883239746, - "learning_rate": 1.705628293119268e-06, - "loss": 0.0385, - "step": 1237 - }, - { - "epoch": 6.0390243902439025, - "grad_norm": 9.65616512298584, - "learning_rate": 1.701996569382011e-06, - "loss": 0.2601, - "step": 1238 - }, - { - "epoch": 6.04390243902439, - "grad_norm": 3.0590052604675293, - "learning_rate": 1.6983667197617386e-06, - "loss": 0.034, - "step": 1239 - }, - { - "epoch": 6.048780487804878, - "grad_norm": 3.6949822902679443, - "learning_rate": 1.6947387527831813e-06, - "loss": 0.0155, - "step": 1240 - }, - { - "epoch": 6.053658536585366, - "grad_norm": 1.2870460748672485, - "learning_rate": 1.6911126769666442e-06, - "loss": 0.0078, - "step": 1241 - }, - { - "epoch": 6.058536585365854, - "grad_norm": 4.307460784912109, - "learning_rate": 1.6874885008279945e-06, - "loss": 0.1429, - "step": 1242 - }, - { - "epoch": 6.0634146341463415, - "grad_norm": 2.334972858428955, - "learning_rate": 1.683866232878637e-06, - "loss": 0.0123, - "step": 1243 - }, - { - "epoch": 6.068292682926829, - "grad_norm": 2.4121835231781006, - "learning_rate": 1.6802458816254941e-06, - "loss": 0.0139, - "step": 1244 - }, - { - "epoch": 6.073170731707317, - "grad_norm": 1.9224514961242676, - "learning_rate": 1.676627455570988e-06, - "loss": 0.0312, - "step": 1245 - }, - { - "epoch": 6.078048780487805, - "grad_norm": 2.8293309211730957, - "learning_rate": 1.6730109632130199e-06, - "loss": 0.0464, - "step": 1246 - }, - { - "epoch": 6.082926829268293, - "grad_norm": 1.6368179321289062, - "learning_rate": 1.6693964130449472e-06, - "loss": 0.0085, - "step": 1247 - }, - { - "epoch": 6.087804878048781, - "grad_norm": 2.5535073280334473, - "learning_rate": 1.6657838135555696e-06, - "loss": 0.0482, - "step": 1248 - }, - { - "epoch": 6.092682926829268, - "grad_norm": 3.7743096351623535, - "learning_rate": 1.6621731732291024e-06, - "loss": 0.0235, - "step": 1249 - }, - { - "epoch": 6.097560975609756, - "grad_norm": 2.9921820163726807, - "learning_rate": 1.6585645005451623e-06, - "loss": 0.0455, - "step": 1250 - }, - { - "epoch": 6.102439024390244, - "grad_norm": 2.369581937789917, - "learning_rate": 1.6549578039787436e-06, - "loss": 0.0499, - "step": 1251 - }, - { - "epoch": 6.107317073170732, - "grad_norm": 2.163815498352051, - "learning_rate": 1.6513530920001998e-06, - "loss": 0.0118, - "step": 1252 - }, - { - "epoch": 6.11219512195122, - "grad_norm": 2.034928560256958, - "learning_rate": 1.6477503730752237e-06, - "loss": 0.0189, - "step": 1253 - }, - { - "epoch": 6.117073170731707, - "grad_norm": 2.7306160926818848, - "learning_rate": 1.6441496556648278e-06, - "loss": 0.0492, - "step": 1254 - }, - { - "epoch": 6.121951219512195, - "grad_norm": 3.7521040439605713, - "learning_rate": 1.6405509482253234e-06, - "loss": 0.1717, - "step": 1255 - }, - { - "epoch": 6.126829268292683, - "grad_norm": 1.8965831995010376, - "learning_rate": 1.636954259208302e-06, - "loss": 0.0194, - "step": 1256 - }, - { - "epoch": 6.131707317073171, - "grad_norm": 3.010024070739746, - "learning_rate": 1.6333595970606143e-06, - "loss": 0.0334, - "step": 1257 - }, - { - "epoch": 6.136585365853659, - "grad_norm": 3.7091450691223145, - "learning_rate": 1.62976697022435e-06, - "loss": 0.0705, - "step": 1258 - }, - { - "epoch": 6.1414634146341465, - "grad_norm": 3.5719785690307617, - "learning_rate": 1.6261763871368225e-06, - "loss": 0.0322, - "step": 1259 - }, - { - "epoch": 6.146341463414634, - "grad_norm": 3.3224213123321533, - "learning_rate": 1.6225878562305403e-06, - "loss": 0.0653, - "step": 1260 - }, - { - "epoch": 6.151219512195122, - "grad_norm": 3.78924822807312, - "learning_rate": 1.6190013859331958e-06, - "loss": 0.0557, - "step": 1261 - }, - { - "epoch": 6.15609756097561, - "grad_norm": 2.429412841796875, - "learning_rate": 1.6154169846676415e-06, - "loss": 0.0277, - "step": 1262 - }, - { - "epoch": 6.160975609756098, - "grad_norm": 2.626167058944702, - "learning_rate": 1.6118346608518698e-06, - "loss": 0.0305, - "step": 1263 - }, - { - "epoch": 6.1658536585365855, - "grad_norm": 2.44846248626709, - "learning_rate": 1.6082544228989958e-06, - "loss": 0.0093, - "step": 1264 - }, - { - "epoch": 6.170731707317073, - "grad_norm": 2.9345643520355225, - "learning_rate": 1.6046762792172336e-06, - "loss": 0.0198, - "step": 1265 - }, - { - "epoch": 6.175609756097561, - "grad_norm": 3.224313497543335, - "learning_rate": 1.6011002382098806e-06, - "loss": 0.0673, - "step": 1266 - }, - { - "epoch": 6.180487804878049, - "grad_norm": 1.9066869020462036, - "learning_rate": 1.5975263082752968e-06, - "loss": 0.0115, - "step": 1267 - }, - { - "epoch": 6.185365853658537, - "grad_norm": 2.7153308391571045, - "learning_rate": 1.5939544978068816e-06, - "loss": 0.0529, - "step": 1268 - }, - { - "epoch": 6.190243902439025, - "grad_norm": 2.2173709869384766, - "learning_rate": 1.590384815193059e-06, - "loss": 0.0643, - "step": 1269 - }, - { - "epoch": 6.195121951219512, - "grad_norm": 3.1238555908203125, - "learning_rate": 1.5868172688172559e-06, - "loss": 0.064, - "step": 1270 - }, - { - "epoch": 6.2, - "grad_norm": 2.7765870094299316, - "learning_rate": 1.5832518670578802e-06, - "loss": 0.0676, - "step": 1271 - }, - { - "epoch": 6.204878048780488, - "grad_norm": 2.9892525672912598, - "learning_rate": 1.5796886182883053e-06, - "loss": 0.074, - "step": 1272 - }, - { - "epoch": 6.209756097560976, - "grad_norm": 2.0955512523651123, - "learning_rate": 1.5761275308768476e-06, - "loss": 0.0311, - "step": 1273 - }, - { - "epoch": 6.214634146341464, - "grad_norm": 1.8085861206054688, - "learning_rate": 1.5725686131867462e-06, - "loss": 0.0108, - "step": 1274 - }, - { - "epoch": 6.219512195121951, - "grad_norm": 3.026421308517456, - "learning_rate": 1.569011873576147e-06, - "loss": 0.0464, - "step": 1275 - }, - { - "epoch": 6.224390243902439, - "grad_norm": 2.3395111560821533, - "learning_rate": 1.5654573203980782e-06, - "loss": 0.0221, - "step": 1276 - }, - { - "epoch": 6.229268292682927, - "grad_norm": 3.6158692836761475, - "learning_rate": 1.5619049620004354e-06, - "loss": 0.0693, - "step": 1277 - }, - { - "epoch": 6.234146341463415, - "grad_norm": 1.6186567544937134, - "learning_rate": 1.5583548067259584e-06, - "loss": 0.0198, - "step": 1278 - }, - { - "epoch": 6.239024390243903, - "grad_norm": 2.7193195819854736, - "learning_rate": 1.5548068629122126e-06, - "loss": 0.0687, - "step": 1279 - }, - { - "epoch": 6.2439024390243905, - "grad_norm": 2.7472658157348633, - "learning_rate": 1.5512611388915711e-06, - "loss": 0.053, - "step": 1280 - }, - { - "epoch": 6.248780487804878, - "grad_norm": 4.694706439971924, - "learning_rate": 1.5477176429911934e-06, - "loss": 0.2076, - "step": 1281 - }, - { - "epoch": 6.253658536585366, - "grad_norm": 1.609309434890747, - "learning_rate": 1.5441763835330048e-06, - "loss": 0.0108, - "step": 1282 - }, - { - "epoch": 6.258536585365854, - "grad_norm": 1.7064504623413086, - "learning_rate": 1.5406373688336807e-06, - "loss": 0.0114, - "step": 1283 - }, - { - "epoch": 6.263414634146342, - "grad_norm": 1.967726469039917, - "learning_rate": 1.5371006072046225e-06, - "loss": 0.0209, - "step": 1284 - }, - { - "epoch": 6.2682926829268295, - "grad_norm": 2.4065544605255127, - "learning_rate": 1.5335661069519408e-06, - "loss": 0.0741, - "step": 1285 - }, - { - "epoch": 6.273170731707317, - "grad_norm": 2.2167603969573975, - "learning_rate": 1.5300338763764371e-06, - "loss": 0.0121, - "step": 1286 - }, - { - "epoch": 6.278048780487805, - "grad_norm": 3.229228973388672, - "learning_rate": 1.5265039237735804e-06, - "loss": 0.0226, - "step": 1287 - }, - { - "epoch": 6.282926829268293, - "grad_norm": 1.889419674873352, - "learning_rate": 1.5229762574334903e-06, - "loss": 0.0116, - "step": 1288 - }, - { - "epoch": 6.287804878048781, - "grad_norm": 3.7595815658569336, - "learning_rate": 1.5194508856409181e-06, - "loss": 0.0775, - "step": 1289 - }, - { - "epoch": 6.2926829268292686, - "grad_norm": 2.527560234069824, - "learning_rate": 1.515927816675225e-06, - "loss": 0.0355, - "step": 1290 - }, - { - "epoch": 6.297560975609756, - "grad_norm": 1.9718955755233765, - "learning_rate": 1.5124070588103648e-06, - "loss": 0.0127, - "step": 1291 - }, - { - "epoch": 6.302439024390244, - "grad_norm": 1.9010120630264282, - "learning_rate": 1.5088886203148643e-06, - "loss": 0.0188, - "step": 1292 - }, - { - "epoch": 6.307317073170732, - "grad_norm": 3.2093472480773926, - "learning_rate": 1.505372509451801e-06, - "loss": 0.0845, - "step": 1293 - }, - { - "epoch": 6.31219512195122, - "grad_norm": 1.6723257303237915, - "learning_rate": 1.5018587344787888e-06, - "loss": 0.0265, - "step": 1294 - }, - { - "epoch": 6.317073170731708, - "grad_norm": 3.246812343597412, - "learning_rate": 1.498347303647953e-06, - "loss": 0.0833, - "step": 1295 - }, - { - "epoch": 6.321951219512195, - "grad_norm": 2.887834072113037, - "learning_rate": 1.4948382252059158e-06, - "loss": 0.0416, - "step": 1296 - }, - { - "epoch": 6.326829268292683, - "grad_norm": 2.5762557983398438, - "learning_rate": 1.4913315073937742e-06, - "loss": 0.0614, - "step": 1297 - }, - { - "epoch": 6.331707317073171, - "grad_norm": 3.3746497631073, - "learning_rate": 1.4878271584470805e-06, - "loss": 0.0601, - "step": 1298 - }, - { - "epoch": 6.336585365853659, - "grad_norm": 2.4984664916992188, - "learning_rate": 1.4843251865958242e-06, - "loss": 0.0189, - "step": 1299 - }, - { - "epoch": 6.341463414634147, - "grad_norm": 3.178300619125366, - "learning_rate": 1.4808256000644128e-06, - "loss": 0.038, - "step": 1300 - }, - { - "epoch": 6.3463414634146345, - "grad_norm": 2.6362273693084717, - "learning_rate": 1.4773284070716504e-06, - "loss": 0.041, - "step": 1301 - }, - { - "epoch": 6.351219512195122, - "grad_norm": 2.1512129306793213, - "learning_rate": 1.473833615830722e-06, - "loss": 0.0227, - "step": 1302 - }, - { - "epoch": 6.35609756097561, - "grad_norm": 2.2898178100585938, - "learning_rate": 1.4703412345491692e-06, - "loss": 0.039, - "step": 1303 - }, - { - "epoch": 6.360975609756098, - "grad_norm": 2.6641080379486084, - "learning_rate": 1.4668512714288763e-06, - "loss": 0.0431, - "step": 1304 - }, - { - "epoch": 6.365853658536586, - "grad_norm": 1.7466667890548706, - "learning_rate": 1.4633637346660478e-06, - "loss": 0.013, - "step": 1305 - }, - { - "epoch": 6.3707317073170735, - "grad_norm": 2.437889575958252, - "learning_rate": 1.4598786324511892e-06, - "loss": 0.0181, - "step": 1306 - }, - { - "epoch": 6.375609756097561, - "grad_norm": 2.5054142475128174, - "learning_rate": 1.456395972969089e-06, - "loss": 0.0248, - "step": 1307 - }, - { - "epoch": 6.380487804878049, - "grad_norm": 3.2294511795043945, - "learning_rate": 1.4529157643987995e-06, - "loss": 0.0561, - "step": 1308 - }, - { - "epoch": 6.385365853658537, - "grad_norm": 2.260188341140747, - "learning_rate": 1.4494380149136162e-06, - "loss": 0.0593, - "step": 1309 - }, - { - "epoch": 6.390243902439025, - "grad_norm": 2.4961163997650146, - "learning_rate": 1.4459627326810576e-06, - "loss": 0.0257, - "step": 1310 - }, - { - "epoch": 6.3951219512195125, - "grad_norm": 3.4153239727020264, - "learning_rate": 1.4424899258628533e-06, - "loss": 0.0223, - "step": 1311 - }, - { - "epoch": 6.4, - "grad_norm": 2.6308839321136475, - "learning_rate": 1.439019602614914e-06, - "loss": 0.0112, - "step": 1312 - }, - { - "epoch": 6.404878048780488, - "grad_norm": 2.754530191421509, - "learning_rate": 1.4355517710873184e-06, - "loss": 0.068, - "step": 1313 - }, - { - "epoch": 6.409756097560976, - "grad_norm": 4.473151683807373, - "learning_rate": 1.432086439424297e-06, - "loss": 0.0825, - "step": 1314 - }, - { - "epoch": 6.414634146341464, - "grad_norm": 4.85701322555542, - "learning_rate": 1.428623615764206e-06, - "loss": 0.1812, - "step": 1315 - }, - { - "epoch": 6.419512195121952, - "grad_norm": 1.6678224802017212, - "learning_rate": 1.4251633082395117e-06, - "loss": 0.0207, - "step": 1316 - }, - { - "epoch": 6.424390243902439, - "grad_norm": 2.9730937480926514, - "learning_rate": 1.4217055249767734e-06, - "loss": 0.0617, - "step": 1317 - }, - { - "epoch": 6.429268292682927, - "grad_norm": 2.503786563873291, - "learning_rate": 1.4182502740966203e-06, - "loss": 0.0137, - "step": 1318 - }, - { - "epoch": 6.434146341463415, - "grad_norm": 3.0798017978668213, - "learning_rate": 1.4147975637137334e-06, - "loss": 0.0329, - "step": 1319 - }, - { - "epoch": 6.439024390243903, - "grad_norm": 3.008155345916748, - "learning_rate": 1.411347401936831e-06, - "loss": 0.0487, - "step": 1320 - }, - { - "epoch": 6.443902439024391, - "grad_norm": 2.5451765060424805, - "learning_rate": 1.4078997968686425e-06, - "loss": 0.0582, - "step": 1321 - }, - { - "epoch": 6.4487804878048784, - "grad_norm": 2.042696475982666, - "learning_rate": 1.404454756605893e-06, - "loss": 0.0336, - "step": 1322 - }, - { - "epoch": 6.453658536585366, - "grad_norm": 3.0421411991119385, - "learning_rate": 1.4010122892392872e-06, - "loss": 0.1372, - "step": 1323 - }, - { - "epoch": 6.458536585365854, - "grad_norm": 2.0793251991271973, - "learning_rate": 1.3975724028534842e-06, - "loss": 0.0452, - "step": 1324 - }, - { - "epoch": 6.463414634146342, - "grad_norm": 2.6149914264678955, - "learning_rate": 1.394135105527083e-06, - "loss": 0.0431, - "step": 1325 - }, - { - "epoch": 6.46829268292683, - "grad_norm": 2.818507671356201, - "learning_rate": 1.3907004053326006e-06, - "loss": 0.0242, - "step": 1326 - }, - { - "epoch": 6.473170731707317, - "grad_norm": 2.328993558883667, - "learning_rate": 1.387268310336458e-06, - "loss": 0.0293, - "step": 1327 - }, - { - "epoch": 6.478048780487805, - "grad_norm": 2.2032642364501953, - "learning_rate": 1.3838388285989552e-06, - "loss": 0.0232, - "step": 1328 - }, - { - "epoch": 6.482926829268292, - "grad_norm": 2.039983034133911, - "learning_rate": 1.380411968174254e-06, - "loss": 0.0256, - "step": 1329 - }, - { - "epoch": 6.487804878048781, - "grad_norm": 3.7261271476745605, - "learning_rate": 1.3769877371103635e-06, - "loss": 0.1285, - "step": 1330 - }, - { - "epoch": 6.492682926829268, - "grad_norm": 3.7156264781951904, - "learning_rate": 1.373566143449115e-06, - "loss": 0.1621, - "step": 1331 - }, - { - "epoch": 6.4975609756097565, - "grad_norm": 1.5905455350875854, - "learning_rate": 1.3701471952261457e-06, - "loss": 0.0126, - "step": 1332 - }, - { - "epoch": 6.5024390243902435, - "grad_norm": 2.8808465003967285, - "learning_rate": 1.3667309004708832e-06, - "loss": 0.0211, - "step": 1333 - }, - { - "epoch": 6.507317073170732, - "grad_norm": 3.9190757274627686, - "learning_rate": 1.3633172672065195e-06, - "loss": 0.062, - "step": 1334 - }, - { - "epoch": 6.512195121951219, - "grad_norm": 1.6948635578155518, - "learning_rate": 1.359906303449997e-06, - "loss": 0.0126, - "step": 1335 - }, - { - "epoch": 6.517073170731708, - "grad_norm": 2.3967642784118652, - "learning_rate": 1.3564980172119913e-06, - "loss": 0.0111, - "step": 1336 - }, - { - "epoch": 6.521951219512195, - "grad_norm": 3.5275399684906006, - "learning_rate": 1.3530924164968873e-06, - "loss": 0.1024, - "step": 1337 - }, - { - "epoch": 6.526829268292683, - "grad_norm": 2.0768814086914062, - "learning_rate": 1.3496895093027617e-06, - "loss": 0.0254, - "step": 1338 - }, - { - "epoch": 6.53170731707317, - "grad_norm": 1.8964029550552368, - "learning_rate": 1.3462893036213706e-06, - "loss": 0.0188, - "step": 1339 - }, - { - "epoch": 6.536585365853659, - "grad_norm": 1.679545283317566, - "learning_rate": 1.3428918074381203e-06, - "loss": 0.0195, - "step": 1340 - }, - { - "epoch": 6.541463414634146, - "grad_norm": 2.204637050628662, - "learning_rate": 1.3394970287320553e-06, - "loss": 0.0317, - "step": 1341 - }, - { - "epoch": 6.546341463414635, - "grad_norm": 2.014052629470825, - "learning_rate": 1.3361049754758404e-06, - "loss": 0.0191, - "step": 1342 - }, - { - "epoch": 6.5512195121951216, - "grad_norm": 1.4630589485168457, - "learning_rate": 1.3327156556357369e-06, - "loss": 0.0079, - "step": 1343 - }, - { - "epoch": 6.55609756097561, - "grad_norm": 2.876132011413574, - "learning_rate": 1.3293290771715875e-06, - "loss": 0.0345, - "step": 1344 - }, - { - "epoch": 6.560975609756097, - "grad_norm": 1.793338656425476, - "learning_rate": 1.3259452480367963e-06, - "loss": 0.0409, - "step": 1345 - }, - { - "epoch": 6.565853658536585, - "grad_norm": 2.2791552543640137, - "learning_rate": 1.3225641761783126e-06, - "loss": 0.0494, - "step": 1346 - }, - { - "epoch": 6.570731707317073, - "grad_norm": 4.255206108093262, - "learning_rate": 1.3191858695366084e-06, - "loss": 0.0842, - "step": 1347 - }, - { - "epoch": 6.575609756097561, - "grad_norm": 2.449460506439209, - "learning_rate": 1.3158103360456603e-06, - "loss": 0.0399, - "step": 1348 - }, - { - "epoch": 6.580487804878048, - "grad_norm": 2.780730724334717, - "learning_rate": 1.3124375836329362e-06, - "loss": 0.0272, - "step": 1349 - }, - { - "epoch": 6.585365853658536, - "grad_norm": 1.925681233406067, - "learning_rate": 1.3090676202193692e-06, - "loss": 0.007, - "step": 1350 - }, - { - "epoch": 6.590243902439024, - "grad_norm": 2.069791555404663, - "learning_rate": 1.3057004537193424e-06, - "loss": 0.016, - "step": 1351 - }, - { - "epoch": 6.595121951219512, - "grad_norm": 1.863872766494751, - "learning_rate": 1.302336092040673e-06, - "loss": 0.016, - "step": 1352 - }, - { - "epoch": 6.6, - "grad_norm": 2.351259231567383, - "learning_rate": 1.298974543084589e-06, - "loss": 0.0172, - "step": 1353 - }, - { - "epoch": 6.6048780487804875, - "grad_norm": 1.848115086555481, - "learning_rate": 1.2956158147457116e-06, - "loss": 0.0412, - "step": 1354 - }, - { - "epoch": 6.609756097560975, - "grad_norm": 1.6395928859710693, - "learning_rate": 1.2922599149120412e-06, - "loss": 0.0181, - "step": 1355 - }, - { - "epoch": 6.614634146341463, - "grad_norm": 2.1267426013946533, - "learning_rate": 1.2889068514649328e-06, - "loss": 0.04, - "step": 1356 - }, - { - "epoch": 6.619512195121951, - "grad_norm": 1.6603496074676514, - "learning_rate": 1.2855566322790796e-06, - "loss": 0.0108, - "step": 1357 - }, - { - "epoch": 6.624390243902439, - "grad_norm": 2.2724838256835938, - "learning_rate": 1.2822092652224989e-06, - "loss": 0.0284, - "step": 1358 - }, - { - "epoch": 6.6292682926829265, - "grad_norm": 2.222623825073242, - "learning_rate": 1.2788647581565048e-06, - "loss": 0.0128, - "step": 1359 - }, - { - "epoch": 6.634146341463414, - "grad_norm": 2.710681676864624, - "learning_rate": 1.275523118935697e-06, - "loss": 0.0184, - "step": 1360 - }, - { - "epoch": 6.639024390243902, - "grad_norm": 2.354264736175537, - "learning_rate": 1.2721843554079418e-06, - "loss": 0.0313, - "step": 1361 - }, - { - "epoch": 6.64390243902439, - "grad_norm": 3.886909008026123, - "learning_rate": 1.2688484754143493e-06, - "loss": 0.1184, - "step": 1362 - }, - { - "epoch": 6.648780487804878, - "grad_norm": 3.088468313217163, - "learning_rate": 1.2655154867892577e-06, - "loss": 0.0353, - "step": 1363 - }, - { - "epoch": 6.6536585365853655, - "grad_norm": 2.987576484680176, - "learning_rate": 1.2621853973602158e-06, - "loss": 0.0349, - "step": 1364 - }, - { - "epoch": 6.658536585365853, - "grad_norm": 1.719212293624878, - "learning_rate": 1.2588582149479645e-06, - "loss": 0.0081, - "step": 1365 - }, - { - "epoch": 6.663414634146341, - "grad_norm": 2.1641178131103516, - "learning_rate": 1.2555339473664151e-06, - "loss": 0.0279, - "step": 1366 - }, - { - "epoch": 6.668292682926829, - "grad_norm": 2.9424984455108643, - "learning_rate": 1.2522126024226347e-06, - "loss": 0.0492, - "step": 1367 - }, - { - "epoch": 6.673170731707317, - "grad_norm": 1.961077332496643, - "learning_rate": 1.2488941879168278e-06, - "loss": 0.0084, - "step": 1368 - }, - { - "epoch": 6.678048780487805, - "grad_norm": 2.302565097808838, - "learning_rate": 1.2455787116423148e-06, - "loss": 0.0486, - "step": 1369 - }, - { - "epoch": 6.682926829268292, - "grad_norm": 2.187194347381592, - "learning_rate": 1.2422661813855158e-06, - "loss": 0.0319, - "step": 1370 - }, - { - "epoch": 6.68780487804878, - "grad_norm": 2.0076377391815186, - "learning_rate": 1.238956604925934e-06, - "loss": 0.016, - "step": 1371 - }, - { - "epoch": 6.692682926829268, - "grad_norm": 4.137681484222412, - "learning_rate": 1.2356499900361333e-06, - "loss": 0.0557, - "step": 1372 - }, - { - "epoch": 6.697560975609756, - "grad_norm": 2.0039637088775635, - "learning_rate": 1.2323463444817227e-06, - "loss": 0.0219, - "step": 1373 - }, - { - "epoch": 6.702439024390244, - "grad_norm": 2.943314552307129, - "learning_rate": 1.2290456760213405e-06, - "loss": 0.0849, - "step": 1374 - }, - { - "epoch": 6.7073170731707314, - "grad_norm": 2.715120553970337, - "learning_rate": 1.2257479924066296e-06, - "loss": 0.0857, - "step": 1375 - }, - { - "epoch": 6.712195121951219, - "grad_norm": 3.144104480743408, - "learning_rate": 1.2224533013822237e-06, - "loss": 0.0648, - "step": 1376 - }, - { - "epoch": 6.717073170731707, - "grad_norm": 2.830066680908203, - "learning_rate": 1.2191616106857312e-06, - "loss": 0.0426, - "step": 1377 - }, - { - "epoch": 6.721951219512195, - "grad_norm": 3.1005899906158447, - "learning_rate": 1.2158729280477112e-06, - "loss": 0.0478, - "step": 1378 - }, - { - "epoch": 6.726829268292683, - "grad_norm": 2.2102460861206055, - "learning_rate": 1.2125872611916578e-06, - "loss": 0.0273, - "step": 1379 - }, - { - "epoch": 6.7317073170731705, - "grad_norm": 2.860288619995117, - "learning_rate": 1.2093046178339869e-06, - "loss": 0.0201, - "step": 1380 - }, - { - "epoch": 6.736585365853658, - "grad_norm": 1.5914067029953003, - "learning_rate": 1.206025005684009e-06, - "loss": 0.0148, - "step": 1381 - }, - { - "epoch": 6.741463414634146, - "grad_norm": 1.8609223365783691, - "learning_rate": 1.202748432443918e-06, - "loss": 0.0073, - "step": 1382 - }, - { - "epoch": 6.746341463414634, - "grad_norm": 3.0532407760620117, - "learning_rate": 1.1994749058087695e-06, - "loss": 0.0344, - "step": 1383 - }, - { - "epoch": 6.751219512195122, - "grad_norm": 4.0601677894592285, - "learning_rate": 1.196204433466467e-06, - "loss": 0.0837, - "step": 1384 - }, - { - "epoch": 6.7560975609756095, - "grad_norm": 2.6982672214508057, - "learning_rate": 1.192937023097738e-06, - "loss": 0.0425, - "step": 1385 - }, - { - "epoch": 6.760975609756097, - "grad_norm": 1.431360125541687, - "learning_rate": 1.1896726823761195e-06, - "loss": 0.0065, - "step": 1386 - }, - { - "epoch": 6.765853658536585, - "grad_norm": 2.116907835006714, - "learning_rate": 1.1864114189679413e-06, - "loss": 0.0133, - "step": 1387 - }, - { - "epoch": 6.770731707317073, - "grad_norm": 2.6869874000549316, - "learning_rate": 1.183153240532304e-06, - "loss": 0.0188, - "step": 1388 - }, - { - "epoch": 6.775609756097561, - "grad_norm": 2.0294089317321777, - "learning_rate": 1.179898154721063e-06, - "loss": 0.0234, - "step": 1389 - }, - { - "epoch": 6.780487804878049, - "grad_norm": 2.3081958293914795, - "learning_rate": 1.1766461691788137e-06, - "loss": 0.0208, - "step": 1390 - }, - { - "epoch": 6.785365853658536, - "grad_norm": 3.4795000553131104, - "learning_rate": 1.1733972915428665e-06, - "loss": 0.0728, - "step": 1391 - }, - { - "epoch": 6.790243902439024, - "grad_norm": 2.5121219158172607, - "learning_rate": 1.1701515294432348e-06, - "loss": 0.0291, - "step": 1392 - }, - { - "epoch": 6.795121951219512, - "grad_norm": 5.1100172996521, - "learning_rate": 1.1669088905026156e-06, - "loss": 0.0988, - "step": 1393 - }, - { - "epoch": 6.8, - "grad_norm": 2.5434396266937256, - "learning_rate": 1.163669382336371e-06, - "loss": 0.0399, - "step": 1394 - }, - { - "epoch": 6.804878048780488, - "grad_norm": 2.7811660766601562, - "learning_rate": 1.160433012552508e-06, - "loss": 0.0134, - "step": 1395 - }, - { - "epoch": 6.809756097560975, - "grad_norm": 3.2409870624542236, - "learning_rate": 1.1571997887516672e-06, - "loss": 0.0795, - "step": 1396 - }, - { - "epoch": 6.814634146341463, - "grad_norm": 2.5300986766815186, - "learning_rate": 1.1539697185270982e-06, - "loss": 0.0329, - "step": 1397 - }, - { - "epoch": 6.819512195121951, - "grad_norm": 1.8510549068450928, - "learning_rate": 1.1507428094646448e-06, - "loss": 0.0213, - "step": 1398 - }, - { - "epoch": 6.824390243902439, - "grad_norm": 1.8820618391036987, - "learning_rate": 1.1475190691427255e-06, - "loss": 0.0172, - "step": 1399 - }, - { - "epoch": 6.829268292682927, - "grad_norm": 1.3415460586547852, - "learning_rate": 1.1442985051323205e-06, - "loss": 0.0029, - "step": 1400 - }, - { - "epoch": 6.8341463414634145, - "grad_norm": 6.033786296844482, - "learning_rate": 1.1410811249969475e-06, - "loss": 0.1638, - "step": 1401 - }, - { - "epoch": 6.839024390243902, - "grad_norm": 2.990328311920166, - "learning_rate": 1.1378669362926468e-06, - "loss": 0.0779, - "step": 1402 - }, - { - "epoch": 6.84390243902439, - "grad_norm": 3.2766308784484863, - "learning_rate": 1.1346559465679656e-06, - "loss": 0.0528, - "step": 1403 - }, - { - "epoch": 6.848780487804878, - "grad_norm": 1.266032338142395, - "learning_rate": 1.1314481633639374e-06, - "loss": 0.0057, - "step": 1404 - }, - { - "epoch": 6.853658536585366, - "grad_norm": 3.1048431396484375, - "learning_rate": 1.1282435942140632e-06, - "loss": 0.1772, - "step": 1405 - }, - { - "epoch": 6.8585365853658535, - "grad_norm": 2.264822483062744, - "learning_rate": 1.1250422466442992e-06, - "loss": 0.0176, - "step": 1406 - }, - { - "epoch": 6.863414634146341, - "grad_norm": 2.0890846252441406, - "learning_rate": 1.1218441281730334e-06, - "loss": 0.0184, - "step": 1407 - }, - { - "epoch": 6.868292682926829, - "grad_norm": 1.8351202011108398, - "learning_rate": 1.1186492463110696e-06, - "loss": 0.0127, - "step": 1408 - }, - { - "epoch": 6.873170731707317, - "grad_norm": 1.447196125984192, - "learning_rate": 1.1154576085616135e-06, - "loss": 0.0094, - "step": 1409 - }, - { - "epoch": 6.878048780487805, - "grad_norm": 1.6414039134979248, - "learning_rate": 1.1122692224202491e-06, - "loss": 0.0138, - "step": 1410 - }, - { - "epoch": 6.882926829268293, - "grad_norm": 2.87068772315979, - "learning_rate": 1.1090840953749253e-06, - "loss": 0.0821, - "step": 1411 - }, - { - "epoch": 6.88780487804878, - "grad_norm": 2.0476415157318115, - "learning_rate": 1.1059022349059362e-06, - "loss": 0.0222, - "step": 1412 - }, - { - "epoch": 6.892682926829268, - "grad_norm": 4.169386863708496, - "learning_rate": 1.102723648485905e-06, - "loss": 0.1183, - "step": 1413 - }, - { - "epoch": 6.897560975609756, - "grad_norm": 4.47883415222168, - "learning_rate": 1.0995483435797643e-06, - "loss": 0.0528, - "step": 1414 - }, - { - "epoch": 6.902439024390244, - "grad_norm": 2.0025508403778076, - "learning_rate": 1.0963763276447435e-06, - "loss": 0.0106, - "step": 1415 - }, - { - "epoch": 6.907317073170732, - "grad_norm": 2.4212136268615723, - "learning_rate": 1.0932076081303442e-06, - "loss": 0.0454, - "step": 1416 - }, - { - "epoch": 6.912195121951219, - "grad_norm": 1.7873961925506592, - "learning_rate": 1.0900421924783272e-06, - "loss": 0.022, - "step": 1417 - }, - { - "epoch": 6.917073170731707, - "grad_norm": 2.0345218181610107, - "learning_rate": 1.0868800881226962e-06, - "loss": 0.0261, - "step": 1418 - }, - { - "epoch": 6.921951219512195, - "grad_norm": 3.086538314819336, - "learning_rate": 1.0837213024896764e-06, - "loss": 0.0257, - "step": 1419 - }, - { - "epoch": 6.926829268292683, - "grad_norm": 2.9401397705078125, - "learning_rate": 1.080565842997698e-06, - "loss": 0.087, - "step": 1420 - }, - { - "epoch": 6.931707317073171, - "grad_norm": 1.305415153503418, - "learning_rate": 1.0774137170573826e-06, - "loss": 0.0147, - "step": 1421 - }, - { - "epoch": 6.9365853658536585, - "grad_norm": 3.0256683826446533, - "learning_rate": 1.074264932071521e-06, - "loss": 0.1183, - "step": 1422 - }, - { - "epoch": 6.941463414634146, - "grad_norm": 2.3618743419647217, - "learning_rate": 1.0711194954350568e-06, - "loss": 0.0186, - "step": 1423 - }, - { - "epoch": 6.946341463414634, - "grad_norm": 2.004451036453247, - "learning_rate": 1.0679774145350735e-06, - "loss": 0.0222, - "step": 1424 - }, - { - "epoch": 6.951219512195122, - "grad_norm": 3.089723587036133, - "learning_rate": 1.0648386967507703e-06, - "loss": 0.0824, - "step": 1425 - }, - { - "epoch": 6.95609756097561, - "grad_norm": 1.9310235977172852, - "learning_rate": 1.0617033494534486e-06, - "loss": 0.0247, - "step": 1426 - }, - { - "epoch": 6.9609756097560975, - "grad_norm": 1.973836898803711, - "learning_rate": 1.0585713800064964e-06, - "loss": 0.0142, - "step": 1427 - }, - { - "epoch": 6.965853658536585, - "grad_norm": 2.9914112091064453, - "learning_rate": 1.0554427957653663e-06, - "loss": 0.0681, - "step": 1428 - }, - { - "epoch": 6.970731707317073, - "grad_norm": 3.356689691543579, - "learning_rate": 1.0523176040775615e-06, - "loss": 0.0916, - "step": 1429 - }, - { - "epoch": 6.975609756097561, - "grad_norm": 2.3305246829986572, - "learning_rate": 1.0491958122826173e-06, - "loss": 0.0611, - "step": 1430 - }, - { - "epoch": 6.980487804878049, - "grad_norm": 1.7383835315704346, - "learning_rate": 1.0460774277120866e-06, - "loss": 0.0182, - "step": 1431 - }, - { - "epoch": 6.985365853658537, - "grad_norm": 2.585674524307251, - "learning_rate": 1.0429624576895177e-06, - "loss": 0.0084, - "step": 1432 - }, - { - "epoch": 6.990243902439024, - "grad_norm": 3.023864269256592, - "learning_rate": 1.03985090953044e-06, - "loss": 0.0411, - "step": 1433 - }, - { - "epoch": 6.995121951219512, - "grad_norm": 2.281674861907959, - "learning_rate": 1.0367427905423497e-06, - "loss": 0.0464, - "step": 1434 - }, - { - "epoch": 7.0, - "grad_norm": 1.4372339248657227, - "learning_rate": 1.0336381080246858e-06, - "loss": 0.0124, - "step": 1435 - } - ], - "logging_steps": 1, - "max_steps": 2050, - "num_input_tokens_seen": 0, - "num_train_epochs": 10, - "save_steps": 206, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 4.128784138507387e+17, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/metallama3_8b/limo/checkpoint-1640/chat_template.jinja b/metallama3_8b/limo/checkpoint-1640/chat_template.jinja deleted file mode 100644 index 39bd0c9f7fe30aea14eda194fee17703da4a4dbf..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1640/chat_template.jinja +++ /dev/null @@ -1,5 +0,0 @@ -{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> - -'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> - -' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo/checkpoint-1640/config.json b/metallama3_8b/limo/checkpoint-1640/config.json deleted file mode 100644 index ec5612543540085e09eed37e81b17ae51d1a6973..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1640/config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 128000, - "eos_token_id": 128009, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 8192, - "mlp_bias": false, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 500000.0, - "tie_word_embeddings": false, - "torch_dtype": "float32", - "transformers_version": "4.55.0", - "use_cache": false, - "vocab_size": 128256 -} diff --git a/metallama3_8b/limo/checkpoint-1640/generation_config.json b/metallama3_8b/limo/checkpoint-1640/generation_config.json deleted file mode 100644 index f53ccb516e57388491adda6b9950bcfa872e93ae..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1640/generation_config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 128000, - "eos_token_id": 128009, - "transformers_version": "4.55.0", - "use_cache": false -} diff --git a/metallama3_8b/limo/checkpoint-1640/model-00001-of-00007.safetensors b/metallama3_8b/limo/checkpoint-1640/model-00001-of-00007.safetensors deleted file mode 100644 index 5fbffddf0c9076ab0c334b57ebe5b05287e4ba6c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1640/model-00001-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8e4fd7142eaa037d10ec7de58d8f35d24249cb9bd33556d1f2ffc919e3f881d9 -size 4886466168 diff --git a/metallama3_8b/limo/checkpoint-1640/model-00002-of-00007.safetensors b/metallama3_8b/limo/checkpoint-1640/model-00002-of-00007.safetensors deleted file mode 100644 index 3549af844f5d6564cb3f6299542aec4b31c11e04..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1640/model-00002-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3053d71cda17091f2eaa4870f4ce4d34ab4327125372fb3efd27eafa8c8715bf -size 4832007448 diff --git a/metallama3_8b/limo/checkpoint-1640/model-00003-of-00007.safetensors b/metallama3_8b/limo/checkpoint-1640/model-00003-of-00007.safetensors deleted file mode 100644 index 295424b33af575ed9e046db3162ded0b79351f76..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1640/model-00003-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c8b95aa74f561c831fff2aba7262e881b710f1d1e890add916f333b757cf9d14 -size 4999813112 diff --git a/metallama3_8b/limo/checkpoint-1640/model-00004-of-00007.safetensors b/metallama3_8b/limo/checkpoint-1640/model-00004-of-00007.safetensors deleted file mode 100644 index b305f80c662f41879f3e5a5216ed6d013b0acb50..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1640/model-00004-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ed8473da3dc6a6b2db40e9b2cefd6362cb72bfb810b0be5c8138f17f149f5dd7 -size 4999813128 diff --git a/metallama3_8b/limo/checkpoint-1640/model-00005-of-00007.safetensors b/metallama3_8b/limo/checkpoint-1640/model-00005-of-00007.safetensors deleted file mode 100644 index 78e599eeb1f8141d8389a41958fba746feec8bfa..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1640/model-00005-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ff33ce7a4b62dd160bf680aa1c9e948db1350b576ab34a33e036cc3cf42e4f18 -size 4832007496 diff --git a/metallama3_8b/limo/checkpoint-1640/model-00006-of-00007.safetensors b/metallama3_8b/limo/checkpoint-1640/model-00006-of-00007.safetensors deleted file mode 100644 index a917fbd9223fbd5f5c6c3c05cb21500ee4f6e644..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1640/model-00006-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d189eea2dce475d47a0239e215cf452520c1bc27e31edd76f0de9a8025d7b703 -size 4999813120 diff --git a/metallama3_8b/limo/checkpoint-1640/model-00007-of-00007.safetensors b/metallama3_8b/limo/checkpoint-1640/model-00007-of-00007.safetensors deleted file mode 100644 index fb2e680082d33095f89711bca93a7a6f305a0374..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1640/model-00007-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2a088378ea3e67711baabe092130a7be55e899791d831aa6893ac7d9caab34f3 -size 2571158184 diff --git a/metallama3_8b/limo/checkpoint-1640/model.safetensors.index.json b/metallama3_8b/limo/checkpoint-1640/model.safetensors.index.json deleted file mode 100644 index 30d31d54f352f0c71ad48745af612a088822fa48..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1640/model.safetensors.index.json +++ /dev/null @@ -1,299 +0,0 @@ -{ - "metadata": { - "total_parameters": 2007565312, - "total_size": 32121044992 - }, - "weight_map": { - "lm_head.weight": "model-00007-of-00007.safetensors", - "model.embed_tokens.weight": "model-00001-of-00007.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.norm.weight": "model-00007-of-00007.safetensors" - } -} diff --git a/metallama3_8b/limo/checkpoint-1640/rng_state_0.pth b/metallama3_8b/limo/checkpoint-1640/rng_state_0.pth deleted file mode 100644 index b4f7aff8787e77abdd3de7299719c4c21fc26258..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1640/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ee97cd82dba4d425fdd8dfdb88d4a43d0d4b1979b5c81ab4a24914fb00d4f332 -size 15024 diff --git a/metallama3_8b/limo/checkpoint-1640/rng_state_1.pth b/metallama3_8b/limo/checkpoint-1640/rng_state_1.pth deleted file mode 100644 index 60e171edb0868d2d1932468dd935beea673dfb02..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1640/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:91dad95440fb85dc4a31745642117165c1a72173b2e389679ea8c0b2b6fcd7e2 -size 15024 diff --git a/metallama3_8b/limo/checkpoint-1640/rng_state_2.pth b/metallama3_8b/limo/checkpoint-1640/rng_state_2.pth deleted file mode 100644 index 719d1d591f4eba9f3f0ae8eb275150361dde6d12..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1640/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:98698326b023c2af02c94f18726ce52c7f7a6fe290734dd7edbe99bc807fcfa0 -size 15024 diff --git a/metallama3_8b/limo/checkpoint-1640/rng_state_3.pth b/metallama3_8b/limo/checkpoint-1640/rng_state_3.pth deleted file mode 100644 index 45dc07c6b18b85ced4b0a4155cac795581cc18a5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1640/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:708e7c6b5bf8a327e688779ebc08830ce249928bcb1ff5c82b1b1d0bf6d2660b -size 15024 diff --git a/metallama3_8b/limo/checkpoint-1640/scheduler.pt b/metallama3_8b/limo/checkpoint-1640/scheduler.pt deleted file mode 100644 index 1227d095e218ae9d7ef2c5f3c05922b6adaeb9dc..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1640/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:903c9c346dd4582f21a75bbe2d3bc832fd6d72ad0e56c2d9716143f026720edf -size 1064 diff --git a/metallama3_8b/limo/checkpoint-1640/special_tokens_map.json b/metallama3_8b/limo/checkpoint-1640/special_tokens_map.json deleted file mode 100644 index 14daf4588e61b4e4983af0fccaba4d5500c0977c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1640/special_tokens_map.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "additional_special_tokens": [ - { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } - ], - "bos_token": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": "<|eot_id|>" -} diff --git a/metallama3_8b/limo/checkpoint-1640/tokenizer.json b/metallama3_8b/limo/checkpoint-1640/tokenizer.json deleted file mode 100644 index 172311123ab62378f1f6d90f3068a676b7d939ed..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1640/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c1dcab308e7cf5970ea38815e0a62887d705c5b436f869ca27a5dcdd40c36a6 -size 17210148 diff --git a/metallama3_8b/limo/checkpoint-1640/tokenizer_config.json b/metallama3_8b/limo/checkpoint-1640/tokenizer_config.json deleted file mode 100644 index 6739fcd129e717b71b64001dcb25a03c143d66f5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1640/tokenizer_config.json +++ /dev/null @@ -1,2076 +0,0 @@ -{ - "added_tokens_decoder": { - "128000": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128001": { - "content": "<|end_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128002": { - "content": "<|reserved_special_token_0|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128003": { - "content": "<|reserved_special_token_1|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128004": { - "content": "<|reserved_special_token_2|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128005": { - "content": "<|reserved_special_token_3|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128006": { - "content": "<|start_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128007": { - "content": "<|end_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128008": { - "content": "<|reserved_special_token_4|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128009": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128010": { - "content": "<|reserved_special_token_5|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128011": { - "content": "<|reserved_special_token_6|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128012": { - "content": "<|reserved_special_token_7|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128013": { - "content": "<|reserved_special_token_8|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128014": { - "content": "<|reserved_special_token_9|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128015": { - "content": "<|reserved_special_token_10|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128016": { - "content": "<|reserved_special_token_11|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128017": { - "content": "<|reserved_special_token_12|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128018": { - "content": "<|reserved_special_token_13|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128019": { - "content": "<|reserved_special_token_14|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128020": { - "content": "<|reserved_special_token_15|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128021": { - "content": "<|reserved_special_token_16|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128022": { - "content": "<|reserved_special_token_17|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128023": { - "content": "<|reserved_special_token_18|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128024": { - "content": "<|reserved_special_token_19|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128025": { - "content": "<|reserved_special_token_20|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128026": { - "content": "<|reserved_special_token_21|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128027": { - "content": "<|reserved_special_token_22|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128028": { - "content": "<|reserved_special_token_23|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128029": { - "content": "<|reserved_special_token_24|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128030": { - "content": "<|reserved_special_token_25|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128031": { - "content": "<|reserved_special_token_26|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128032": { - "content": "<|reserved_special_token_27|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128033": { - "content": "<|reserved_special_token_28|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128034": { - "content": "<|reserved_special_token_29|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128035": { - "content": "<|reserved_special_token_30|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128036": { - "content": "<|reserved_special_token_31|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128037": { - "content": "<|reserved_special_token_32|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128038": { - "content": "<|reserved_special_token_33|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128039": { - "content": "<|reserved_special_token_34|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128040": { - "content": "<|reserved_special_token_35|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128041": { - "content": "<|reserved_special_token_36|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128042": { - "content": "<|reserved_special_token_37|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128043": { - "content": "<|reserved_special_token_38|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128044": { - "content": "<|reserved_special_token_39|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128045": { - "content": "<|reserved_special_token_40|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128046": { - "content": "<|reserved_special_token_41|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128047": { - "content": "<|reserved_special_token_42|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128048": { - "content": "<|reserved_special_token_43|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128049": { - "content": "<|reserved_special_token_44|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128050": { - "content": "<|reserved_special_token_45|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128051": { - "content": "<|reserved_special_token_46|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128052": { - "content": "<|reserved_special_token_47|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128053": { - "content": "<|reserved_special_token_48|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128054": { - "content": "<|reserved_special_token_49|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128055": { - "content": "<|reserved_special_token_50|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128056": { - "content": "<|reserved_special_token_51|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128057": { - "content": "<|reserved_special_token_52|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128058": { - "content": "<|reserved_special_token_53|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128059": { - "content": "<|reserved_special_token_54|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128060": { - "content": "<|reserved_special_token_55|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128061": { - "content": "<|reserved_special_token_56|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128062": { - "content": "<|reserved_special_token_57|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128063": { - "content": "<|reserved_special_token_58|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128064": { - "content": "<|reserved_special_token_59|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128065": { - "content": "<|reserved_special_token_60|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128066": { - "content": "<|reserved_special_token_61|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128067": { - "content": "<|reserved_special_token_62|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128068": { - "content": "<|reserved_special_token_63|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128069": { - "content": "<|reserved_special_token_64|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128070": { - "content": "<|reserved_special_token_65|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128071": { - "content": "<|reserved_special_token_66|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128072": { - "content": "<|reserved_special_token_67|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128073": { - "content": "<|reserved_special_token_68|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128074": { - "content": "<|reserved_special_token_69|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128075": { - "content": "<|reserved_special_token_70|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128076": { - "content": "<|reserved_special_token_71|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128077": { - "content": "<|reserved_special_token_72|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128078": { - "content": "<|reserved_special_token_73|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128079": { - "content": "<|reserved_special_token_74|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128080": { - "content": "<|reserved_special_token_75|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128081": { - "content": "<|reserved_special_token_76|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128082": { - "content": "<|reserved_special_token_77|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128083": { - "content": "<|reserved_special_token_78|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128084": { - "content": "<|reserved_special_token_79|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128085": { - "content": "<|reserved_special_token_80|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128086": { - "content": "<|reserved_special_token_81|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128087": { - "content": "<|reserved_special_token_82|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128088": { - "content": "<|reserved_special_token_83|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128089": { - "content": "<|reserved_special_token_84|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128090": { - "content": "<|reserved_special_token_85|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128091": { - "content": "<|reserved_special_token_86|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128092": { - "content": "<|reserved_special_token_87|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128093": { - "content": "<|reserved_special_token_88|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128094": { - "content": "<|reserved_special_token_89|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128095": { - "content": "<|reserved_special_token_90|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128096": { - "content": "<|reserved_special_token_91|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128097": { - "content": "<|reserved_special_token_92|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128098": { - "content": "<|reserved_special_token_93|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128099": { - "content": "<|reserved_special_token_94|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128100": { - "content": "<|reserved_special_token_95|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128101": { - "content": "<|reserved_special_token_96|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128102": { - "content": "<|reserved_special_token_97|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128103": { - "content": "<|reserved_special_token_98|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128104": { - "content": "<|reserved_special_token_99|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128105": { - "content": "<|reserved_special_token_100|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128106": { - "content": "<|reserved_special_token_101|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128107": { - "content": "<|reserved_special_token_102|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128108": { - "content": "<|reserved_special_token_103|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128109": { - "content": "<|reserved_special_token_104|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128110": { - "content": "<|reserved_special_token_105|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128111": { - "content": "<|reserved_special_token_106|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128112": { - "content": "<|reserved_special_token_107|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128113": { - "content": "<|reserved_special_token_108|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128114": { - "content": "<|reserved_special_token_109|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128115": { - "content": "<|reserved_special_token_110|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128116": { - "content": "<|reserved_special_token_111|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128117": { - "content": "<|reserved_special_token_112|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128118": { - "content": "<|reserved_special_token_113|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128119": { - "content": "<|reserved_special_token_114|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128120": { - "content": "<|reserved_special_token_115|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128121": { - "content": "<|reserved_special_token_116|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128122": { - "content": "<|reserved_special_token_117|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128123": { - "content": "<|reserved_special_token_118|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128124": { - "content": "<|reserved_special_token_119|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128125": { - "content": "<|reserved_special_token_120|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128126": { - "content": "<|reserved_special_token_121|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128127": { - "content": "<|reserved_special_token_122|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128128": { - "content": "<|reserved_special_token_123|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128129": { - "content": "<|reserved_special_token_124|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128130": { - "content": "<|reserved_special_token_125|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128131": { - "content": "<|reserved_special_token_126|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128132": { - "content": "<|reserved_special_token_127|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128133": { - "content": "<|reserved_special_token_128|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128134": { - "content": "<|reserved_special_token_129|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128135": { - "content": "<|reserved_special_token_130|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128136": { - "content": "<|reserved_special_token_131|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128137": { - "content": "<|reserved_special_token_132|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128138": { - "content": "<|reserved_special_token_133|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128139": { - "content": "<|reserved_special_token_134|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128140": { - "content": "<|reserved_special_token_135|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128141": { - "content": "<|reserved_special_token_136|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128142": { - "content": "<|reserved_special_token_137|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128143": { - "content": "<|reserved_special_token_138|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128144": { - "content": "<|reserved_special_token_139|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128145": { - "content": "<|reserved_special_token_140|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128146": { - "content": "<|reserved_special_token_141|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128147": { - "content": "<|reserved_special_token_142|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128148": { - "content": "<|reserved_special_token_143|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128149": { - "content": "<|reserved_special_token_144|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128150": { - "content": "<|reserved_special_token_145|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128151": { - "content": "<|reserved_special_token_146|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128152": { - "content": "<|reserved_special_token_147|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128153": { - "content": "<|reserved_special_token_148|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128154": { - "content": "<|reserved_special_token_149|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128155": { - "content": "<|reserved_special_token_150|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128156": { - "content": "<|reserved_special_token_151|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128157": { - "content": "<|reserved_special_token_152|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128158": { - "content": "<|reserved_special_token_153|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128159": { - "content": "<|reserved_special_token_154|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128160": { - "content": "<|reserved_special_token_155|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128161": { - "content": "<|reserved_special_token_156|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128162": { - "content": "<|reserved_special_token_157|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128163": { - "content": "<|reserved_special_token_158|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128164": { - "content": "<|reserved_special_token_159|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128165": { - "content": "<|reserved_special_token_160|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128166": { - "content": "<|reserved_special_token_161|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128167": { - "content": "<|reserved_special_token_162|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128168": { - "content": "<|reserved_special_token_163|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128169": { - "content": "<|reserved_special_token_164|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128170": { - "content": "<|reserved_special_token_165|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128171": { - "content": "<|reserved_special_token_166|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128172": { - "content": "<|reserved_special_token_167|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128173": { - "content": "<|reserved_special_token_168|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128174": { - "content": "<|reserved_special_token_169|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128175": { - "content": "<|reserved_special_token_170|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128176": { - "content": "<|reserved_special_token_171|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128177": { - "content": "<|reserved_special_token_172|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128178": { - "content": "<|reserved_special_token_173|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128179": { - "content": "<|reserved_special_token_174|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128180": { - "content": "<|reserved_special_token_175|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128181": { - "content": "<|reserved_special_token_176|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128182": { - "content": "<|reserved_special_token_177|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128183": { - "content": "<|reserved_special_token_178|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128184": { - "content": "<|reserved_special_token_179|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128185": { - "content": "<|reserved_special_token_180|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128186": { - "content": "<|reserved_special_token_181|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128187": { - "content": "<|reserved_special_token_182|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128188": { - "content": "<|reserved_special_token_183|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128189": { - "content": "<|reserved_special_token_184|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128190": { - "content": "<|reserved_special_token_185|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128191": { - "content": "<|reserved_special_token_186|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128192": { - "content": "<|reserved_special_token_187|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128193": { - "content": "<|reserved_special_token_188|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128194": { - "content": "<|reserved_special_token_189|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128195": { - "content": "<|reserved_special_token_190|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128196": { - "content": "<|reserved_special_token_191|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128197": { - "content": "<|reserved_special_token_192|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128198": { - "content": "<|reserved_special_token_193|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128199": { - "content": "<|reserved_special_token_194|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128200": { - "content": "<|reserved_special_token_195|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128201": { - "content": "<|reserved_special_token_196|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128202": { - "content": "<|reserved_special_token_197|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128203": { - "content": "<|reserved_special_token_198|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128204": { - "content": "<|reserved_special_token_199|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128205": { - "content": "<|reserved_special_token_200|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128206": { - "content": "<|reserved_special_token_201|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128207": { - "content": "<|reserved_special_token_202|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128208": { - "content": "<|reserved_special_token_203|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128209": { - "content": "<|reserved_special_token_204|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128210": { - "content": "<|reserved_special_token_205|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128211": { - "content": "<|reserved_special_token_206|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128212": { - "content": "<|reserved_special_token_207|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128213": { - "content": "<|reserved_special_token_208|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128214": { - "content": "<|reserved_special_token_209|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128215": { - "content": "<|reserved_special_token_210|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128216": { - "content": "<|reserved_special_token_211|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128217": { - "content": "<|reserved_special_token_212|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128218": { - "content": "<|reserved_special_token_213|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128219": { - "content": "<|reserved_special_token_214|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128220": { - "content": "<|reserved_special_token_215|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128221": { - "content": "<|reserved_special_token_216|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128222": { - "content": "<|reserved_special_token_217|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128223": { - "content": "<|reserved_special_token_218|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128224": { - "content": "<|reserved_special_token_219|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128225": { - "content": "<|reserved_special_token_220|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128226": { - "content": "<|reserved_special_token_221|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128227": { - "content": "<|reserved_special_token_222|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128228": { - "content": "<|reserved_special_token_223|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128229": { - "content": "<|reserved_special_token_224|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128230": { - "content": "<|reserved_special_token_225|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128231": { - "content": "<|reserved_special_token_226|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128232": { - "content": "<|reserved_special_token_227|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128233": { - "content": "<|reserved_special_token_228|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128234": { - "content": "<|reserved_special_token_229|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128235": { - "content": "<|reserved_special_token_230|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128236": { - "content": "<|reserved_special_token_231|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128237": { - "content": "<|reserved_special_token_232|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128238": { - "content": "<|reserved_special_token_233|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128239": { - "content": "<|reserved_special_token_234|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128240": { - "content": "<|reserved_special_token_235|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128241": { - "content": "<|reserved_special_token_236|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128242": { - "content": "<|reserved_special_token_237|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128243": { - "content": "<|reserved_special_token_238|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128244": { - "content": "<|reserved_special_token_239|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128245": { - "content": "<|reserved_special_token_240|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128246": { - "content": "<|reserved_special_token_241|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128247": { - "content": "<|reserved_special_token_242|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128248": { - "content": "<|reserved_special_token_243|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128249": { - "content": "<|reserved_special_token_244|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128250": { - "content": "<|reserved_special_token_245|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128251": { - "content": "<|reserved_special_token_246|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128252": { - "content": "<|reserved_special_token_247|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128253": { - "content": "<|reserved_special_token_248|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128254": { - "content": "<|reserved_special_token_249|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128255": { - "content": "<|reserved_special_token_250|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128256": { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - } - }, - "additional_special_tokens": [ - "<|eom_id|>" - ], - "bos_token": "<|begin_of_text|>", - "clean_up_tokenization_spaces": true, - "eos_token": "<|eot_id|>", - "extra_special_tokens": {}, - "model_input_names": [ - "input_ids", - "attention_mask" - ], - "model_max_length": 1000000000000000019884624838656, - "pad_token": "<|eot_id|>", - "padding_side": "right", - "split_special_tokens": false, - "tokenizer_class": "PreTrainedTokenizerFast" -} diff --git a/metallama3_8b/limo/checkpoint-1640/trainer_state.json b/metallama3_8b/limo/checkpoint-1640/trainer_state.json deleted file mode 100644 index 3701406c9628adb1edfac34614fe5c88d4d9f771..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1640/trainer_state.json +++ /dev/null @@ -1,11514 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 8.0, - "eval_steps": 500, - "global_step": 1640, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.004878048780487805, - "grad_norm": 27.79998016357422, - "learning_rate": 5e-06, - "loss": 1.4179, - "step": 1 - }, - { - "epoch": 0.00975609756097561, - "grad_norm": 4.086409091949463, - "learning_rate": 4.999997064365715e-06, - "loss": 1.1405, - "step": 2 - }, - { - "epoch": 0.014634146341463415, - "grad_norm": 4.499151229858398, - "learning_rate": 4.999988257469751e-06, - "loss": 0.8682, - "step": 3 - }, - { - "epoch": 0.01951219512195122, - "grad_norm": 4.555822849273682, - "learning_rate": 4.999973579332793e-06, - "loss": 0.9961, - "step": 4 - }, - { - "epoch": 0.024390243902439025, - "grad_norm": 5.6235246658325195, - "learning_rate": 4.999953029989312e-06, - "loss": 1.0173, - "step": 5 - }, - { - "epoch": 0.02926829268292683, - "grad_norm": 3.9943182468414307, - "learning_rate": 4.999926609487568e-06, - "loss": 1.1083, - "step": 6 - }, - { - "epoch": 0.03414634146341464, - "grad_norm": 5.685941219329834, - "learning_rate": 4.9998943178896106e-06, - "loss": 1.1109, - "step": 7 - }, - { - "epoch": 0.03902439024390244, - "grad_norm": 15.914257049560547, - "learning_rate": 4.999856155271276e-06, - "loss": 1.821, - "step": 8 - }, - { - "epoch": 0.04390243902439024, - "grad_norm": 4.147185325622559, - "learning_rate": 4.999812121722191e-06, - "loss": 1.0417, - "step": 9 - }, - { - "epoch": 0.04878048780487805, - "grad_norm": 11.123332977294922, - "learning_rate": 4.999762217345766e-06, - "loss": 1.5672, - "step": 10 - }, - { - "epoch": 0.05365853658536585, - "grad_norm": 2.842331886291504, - "learning_rate": 4.999706442259205e-06, - "loss": 0.7297, - "step": 11 - }, - { - "epoch": 0.05853658536585366, - "grad_norm": 37.685062408447266, - "learning_rate": 4.999644796593492e-06, - "loss": 0.9112, - "step": 12 - }, - { - "epoch": 0.06341463414634146, - "grad_norm": 11.214252471923828, - "learning_rate": 4.999577280493407e-06, - "loss": 0.7854, - "step": 13 - }, - { - "epoch": 0.06829268292682927, - "grad_norm": 5.10387659072876, - "learning_rate": 4.99950389411751e-06, - "loss": 1.1317, - "step": 14 - }, - { - "epoch": 0.07317073170731707, - "grad_norm": 3.685403347015381, - "learning_rate": 4.999424637638148e-06, - "loss": 0.7864, - "step": 15 - }, - { - "epoch": 0.07804878048780488, - "grad_norm": 2.9567184448242188, - "learning_rate": 4.999339511241458e-06, - "loss": 0.8494, - "step": 16 - }, - { - "epoch": 0.08292682926829269, - "grad_norm": 11.396956443786621, - "learning_rate": 4.9992485151273584e-06, - "loss": 1.2189, - "step": 17 - }, - { - "epoch": 0.08780487804878048, - "grad_norm": 7.007385730743408, - "learning_rate": 4.999151649509554e-06, - "loss": 1.0532, - "step": 18 - }, - { - "epoch": 0.09268292682926829, - "grad_norm": 3.4347329139709473, - "learning_rate": 4.9990489146155356e-06, - "loss": 1.088, - "step": 19 - }, - { - "epoch": 0.0975609756097561, - "grad_norm": 3.1865031719207764, - "learning_rate": 4.9989403106865765e-06, - "loss": 1.0414, - "step": 20 - }, - { - "epoch": 0.1024390243902439, - "grad_norm": 3.4605791568756104, - "learning_rate": 4.9988258379777334e-06, - "loss": 0.8878, - "step": 21 - }, - { - "epoch": 0.1073170731707317, - "grad_norm": 2.860478639602661, - "learning_rate": 4.998705496757846e-06, - "loss": 0.9151, - "step": 22 - }, - { - "epoch": 0.11219512195121951, - "grad_norm": 9.101946830749512, - "learning_rate": 4.998579287309538e-06, - "loss": 1.4304, - "step": 23 - }, - { - "epoch": 0.11707317073170732, - "grad_norm": 24.21122169494629, - "learning_rate": 4.998447209929211e-06, - "loss": 1.0858, - "step": 24 - }, - { - "epoch": 0.12195121951219512, - "grad_norm": 3.286980152130127, - "learning_rate": 4.998309264927053e-06, - "loss": 0.6571, - "step": 25 - }, - { - "epoch": 0.12682926829268293, - "grad_norm": 4.0232062339782715, - "learning_rate": 4.998165452627025e-06, - "loss": 0.8493, - "step": 26 - }, - { - "epoch": 0.13170731707317074, - "grad_norm": 3.7688663005828857, - "learning_rate": 4.998015773366874e-06, - "loss": 0.9224, - "step": 27 - }, - { - "epoch": 0.13658536585365855, - "grad_norm": 2.9382026195526123, - "learning_rate": 4.997860227498122e-06, - "loss": 0.7588, - "step": 28 - }, - { - "epoch": 0.14146341463414633, - "grad_norm": 4.327457904815674, - "learning_rate": 4.99769881538607e-06, - "loss": 1.1817, - "step": 29 - }, - { - "epoch": 0.14634146341463414, - "grad_norm": 3.47487735748291, - "learning_rate": 4.997531537409794e-06, - "loss": 1.0737, - "step": 30 - }, - { - "epoch": 0.15121951219512195, - "grad_norm": 3.0616214275360107, - "learning_rate": 4.99735839396215e-06, - "loss": 0.7899, - "step": 31 - }, - { - "epoch": 0.15609756097560976, - "grad_norm": 3.065070152282715, - "learning_rate": 4.9971793854497655e-06, - "loss": 0.7745, - "step": 32 - }, - { - "epoch": 0.16097560975609757, - "grad_norm": 3.5202279090881348, - "learning_rate": 4.996994512293042e-06, - "loss": 0.984, - "step": 33 - }, - { - "epoch": 0.16585365853658537, - "grad_norm": 3.421769142150879, - "learning_rate": 4.996803774926157e-06, - "loss": 0.8235, - "step": 34 - }, - { - "epoch": 0.17073170731707318, - "grad_norm": 4.6582207679748535, - "learning_rate": 4.996607173797059e-06, - "loss": 1.3227, - "step": 35 - }, - { - "epoch": 0.17560975609756097, - "grad_norm": 2.9829282760620117, - "learning_rate": 4.996404709367466e-06, - "loss": 0.8854, - "step": 36 - }, - { - "epoch": 0.18048780487804877, - "grad_norm": 2.5982632637023926, - "learning_rate": 4.996196382112868e-06, - "loss": 0.6786, - "step": 37 - }, - { - "epoch": 0.18536585365853658, - "grad_norm": 2.9807393550872803, - "learning_rate": 4.9959821925225235e-06, - "loss": 0.9344, - "step": 38 - }, - { - "epoch": 0.1902439024390244, - "grad_norm": 2.7364351749420166, - "learning_rate": 4.995762141099456e-06, - "loss": 0.814, - "step": 39 - }, - { - "epoch": 0.1951219512195122, - "grad_norm": 3.4324638843536377, - "learning_rate": 4.995536228360461e-06, - "loss": 1.0276, - "step": 40 - }, - { - "epoch": 0.2, - "grad_norm": 2.911834716796875, - "learning_rate": 4.995304454836095e-06, - "loss": 0.9291, - "step": 41 - }, - { - "epoch": 0.2048780487804878, - "grad_norm": 3.0294723510742188, - "learning_rate": 4.9950668210706795e-06, - "loss": 0.8145, - "step": 42 - }, - { - "epoch": 0.2097560975609756, - "grad_norm": 4.681829452514648, - "learning_rate": 4.994823327622299e-06, - "loss": 0.8779, - "step": 43 - }, - { - "epoch": 0.2146341463414634, - "grad_norm": 3.643914222717285, - "learning_rate": 4.9945739750628e-06, - "loss": 0.8196, - "step": 44 - }, - { - "epoch": 0.21951219512195122, - "grad_norm": 2.7542076110839844, - "learning_rate": 4.994318763977789e-06, - "loss": 0.8443, - "step": 45 - }, - { - "epoch": 0.22439024390243903, - "grad_norm": 6.873605728149414, - "learning_rate": 4.994057694966632e-06, - "loss": 1.0328, - "step": 46 - }, - { - "epoch": 0.22926829268292684, - "grad_norm": 3.11810040473938, - "learning_rate": 4.993790768642449e-06, - "loss": 1.0673, - "step": 47 - }, - { - "epoch": 0.23414634146341465, - "grad_norm": 4.360548496246338, - "learning_rate": 4.99351798563212e-06, - "loss": 1.3198, - "step": 48 - }, - { - "epoch": 0.23902439024390243, - "grad_norm": 2.6894314289093018, - "learning_rate": 4.993239346576278e-06, - "loss": 0.8743, - "step": 49 - }, - { - "epoch": 0.24390243902439024, - "grad_norm": 3.2640421390533447, - "learning_rate": 4.99295485212931e-06, - "loss": 1.109, - "step": 50 - }, - { - "epoch": 0.24878048780487805, - "grad_norm": 3.1565866470336914, - "learning_rate": 4.992664502959351e-06, - "loss": 0.9291, - "step": 51 - }, - { - "epoch": 0.25365853658536586, - "grad_norm": 3.4829447269439697, - "learning_rate": 4.99236829974829e-06, - "loss": 0.8159, - "step": 52 - }, - { - "epoch": 0.25853658536585367, - "grad_norm": 2.7535626888275146, - "learning_rate": 4.992066243191762e-06, - "loss": 1.0359, - "step": 53 - }, - { - "epoch": 0.2634146341463415, - "grad_norm": 2.482935905456543, - "learning_rate": 4.991758333999148e-06, - "loss": 0.8091, - "step": 54 - }, - { - "epoch": 0.2682926829268293, - "grad_norm": 2.917445659637451, - "learning_rate": 4.991444572893575e-06, - "loss": 0.6925, - "step": 55 - }, - { - "epoch": 0.2731707317073171, - "grad_norm": 2.9802236557006836, - "learning_rate": 4.991124960611916e-06, - "loss": 0.6329, - "step": 56 - }, - { - "epoch": 0.2780487804878049, - "grad_norm": 2.9677224159240723, - "learning_rate": 4.99079949790478e-06, - "loss": 0.8069, - "step": 57 - }, - { - "epoch": 0.28292682926829266, - "grad_norm": 2.8304293155670166, - "learning_rate": 4.99046818553652e-06, - "loss": 0.8682, - "step": 58 - }, - { - "epoch": 0.28780487804878047, - "grad_norm": 5.253443717956543, - "learning_rate": 4.9901310242852246e-06, - "loss": 1.1069, - "step": 59 - }, - { - "epoch": 0.2926829268292683, - "grad_norm": 3.686016082763672, - "learning_rate": 4.9897880149427206e-06, - "loss": 0.9465, - "step": 60 - }, - { - "epoch": 0.2975609756097561, - "grad_norm": 3.6372263431549072, - "learning_rate": 4.989439158314566e-06, - "loss": 0.9738, - "step": 61 - }, - { - "epoch": 0.3024390243902439, - "grad_norm": 3.0756819248199463, - "learning_rate": 4.989084455220056e-06, - "loss": 0.6417, - "step": 62 - }, - { - "epoch": 0.3073170731707317, - "grad_norm": 3.379222869873047, - "learning_rate": 4.988723906492212e-06, - "loss": 1.0092, - "step": 63 - }, - { - "epoch": 0.3121951219512195, - "grad_norm": 3.4571032524108887, - "learning_rate": 4.988357512977785e-06, - "loss": 0.6691, - "step": 64 - }, - { - "epoch": 0.3170731707317073, - "grad_norm": 3.1982104778289795, - "learning_rate": 4.987985275537252e-06, - "loss": 0.6651, - "step": 65 - }, - { - "epoch": 0.32195121951219513, - "grad_norm": 2.9723124504089355, - "learning_rate": 4.9876071950448185e-06, - "loss": 0.9227, - "step": 66 - }, - { - "epoch": 0.32682926829268294, - "grad_norm": 2.5521399974823, - "learning_rate": 4.987223272388407e-06, - "loss": 0.6664, - "step": 67 - }, - { - "epoch": 0.33170731707317075, - "grad_norm": 2.8934121131896973, - "learning_rate": 4.986833508469663e-06, - "loss": 0.997, - "step": 68 - }, - { - "epoch": 0.33658536585365856, - "grad_norm": 4.7546586990356445, - "learning_rate": 4.98643790420395e-06, - "loss": 0.8551, - "step": 69 - }, - { - "epoch": 0.34146341463414637, - "grad_norm": 3.091616153717041, - "learning_rate": 4.986036460520348e-06, - "loss": 0.8874, - "step": 70 - }, - { - "epoch": 0.3463414634146341, - "grad_norm": 4.1724677085876465, - "learning_rate": 4.98562917836165e-06, - "loss": 1.1393, - "step": 71 - }, - { - "epoch": 0.35121951219512193, - "grad_norm": 2.6568572521209717, - "learning_rate": 4.985216058684362e-06, - "loss": 0.6379, - "step": 72 - }, - { - "epoch": 0.35609756097560974, - "grad_norm": 2.396416187286377, - "learning_rate": 4.984797102458697e-06, - "loss": 1.0292, - "step": 73 - }, - { - "epoch": 0.36097560975609755, - "grad_norm": 3.0667319297790527, - "learning_rate": 4.984372310668579e-06, - "loss": 0.7048, - "step": 74 - }, - { - "epoch": 0.36585365853658536, - "grad_norm": 2.4820518493652344, - "learning_rate": 4.983941684311633e-06, - "loss": 1.2353, - "step": 75 - }, - { - "epoch": 0.37073170731707317, - "grad_norm": 4.062836647033691, - "learning_rate": 4.983505224399188e-06, - "loss": 0.8933, - "step": 76 - }, - { - "epoch": 0.375609756097561, - "grad_norm": 2.4480767250061035, - "learning_rate": 4.983062931956275e-06, - "loss": 0.8221, - "step": 77 - }, - { - "epoch": 0.3804878048780488, - "grad_norm": 3.134138822555542, - "learning_rate": 4.9826148080216195e-06, - "loss": 0.8899, - "step": 78 - }, - { - "epoch": 0.3853658536585366, - "grad_norm": 2.8165836334228516, - "learning_rate": 4.9821608536476445e-06, - "loss": 1.2451, - "step": 79 - }, - { - "epoch": 0.3902439024390244, - "grad_norm": 3.734433650970459, - "learning_rate": 4.981701069900465e-06, - "loss": 0.8536, - "step": 80 - }, - { - "epoch": 0.3951219512195122, - "grad_norm": 2.853421449661255, - "learning_rate": 4.9812354578598876e-06, - "loss": 0.7857, - "step": 81 - }, - { - "epoch": 0.4, - "grad_norm": 2.541687250137329, - "learning_rate": 4.980764018619405e-06, - "loss": 0.8332, - "step": 82 - }, - { - "epoch": 0.40487804878048783, - "grad_norm": 4.405911445617676, - "learning_rate": 4.980286753286196e-06, - "loss": 0.9927, - "step": 83 - }, - { - "epoch": 0.4097560975609756, - "grad_norm": 3.3034985065460205, - "learning_rate": 4.97980366298112e-06, - "loss": 0.8161, - "step": 84 - }, - { - "epoch": 0.4146341463414634, - "grad_norm": 2.6678085327148438, - "learning_rate": 4.97931474883872e-06, - "loss": 0.8017, - "step": 85 - }, - { - "epoch": 0.4195121951219512, - "grad_norm": 2.58524227142334, - "learning_rate": 4.978820012007213e-06, - "loss": 0.8811, - "step": 86 - }, - { - "epoch": 0.424390243902439, - "grad_norm": 2.482597827911377, - "learning_rate": 4.978319453648495e-06, - "loss": 0.9461, - "step": 87 - }, - { - "epoch": 0.4292682926829268, - "grad_norm": 2.5731301307678223, - "learning_rate": 4.977813074938128e-06, - "loss": 0.8835, - "step": 88 - }, - { - "epoch": 0.43414634146341463, - "grad_norm": 2.7914488315582275, - "learning_rate": 4.977300877065347e-06, - "loss": 0.8466, - "step": 89 - }, - { - "epoch": 0.43902439024390244, - "grad_norm": 2.416043758392334, - "learning_rate": 4.976782861233053e-06, - "loss": 0.7132, - "step": 90 - }, - { - "epoch": 0.44390243902439025, - "grad_norm": 3.7616264820098877, - "learning_rate": 4.976259028657812e-06, - "loss": 0.7639, - "step": 91 - }, - { - "epoch": 0.44878048780487806, - "grad_norm": 2.6081621646881104, - "learning_rate": 4.975729380569845e-06, - "loss": 0.8055, - "step": 92 - }, - { - "epoch": 0.45365853658536587, - "grad_norm": 3.3343570232391357, - "learning_rate": 4.975193918213035e-06, - "loss": 0.6042, - "step": 93 - }, - { - "epoch": 0.4585365853658537, - "grad_norm": 2.517544746398926, - "learning_rate": 4.974652642844921e-06, - "loss": 0.7672, - "step": 94 - }, - { - "epoch": 0.4634146341463415, - "grad_norm": 4.173468589782715, - "learning_rate": 4.974105555736693e-06, - "loss": 1.0682, - "step": 95 - }, - { - "epoch": 0.4682926829268293, - "grad_norm": 2.8422317504882812, - "learning_rate": 4.973552658173186e-06, - "loss": 0.7841, - "step": 96 - }, - { - "epoch": 0.47317073170731705, - "grad_norm": 5.042182445526123, - "learning_rate": 4.972993951452887e-06, - "loss": 0.8851, - "step": 97 - }, - { - "epoch": 0.47804878048780486, - "grad_norm": 5.977590560913086, - "learning_rate": 4.9724294368879214e-06, - "loss": 0.9059, - "step": 98 - }, - { - "epoch": 0.48292682926829267, - "grad_norm": 4.227641582489014, - "learning_rate": 4.971859115804055e-06, - "loss": 1.0152, - "step": 99 - }, - { - "epoch": 0.4878048780487805, - "grad_norm": 3.180952548980713, - "learning_rate": 4.9712829895406935e-06, - "loss": 0.8092, - "step": 100 - }, - { - "epoch": 0.4926829268292683, - "grad_norm": 11.220394134521484, - "learning_rate": 4.970701059450872e-06, - "loss": 0.8239, - "step": 101 - }, - { - "epoch": 0.4975609756097561, - "grad_norm": 2.346975088119507, - "learning_rate": 4.970113326901258e-06, - "loss": 0.9283, - "step": 102 - }, - { - "epoch": 0.5024390243902439, - "grad_norm": 2.9470982551574707, - "learning_rate": 4.9695197932721455e-06, - "loss": 0.9429, - "step": 103 - }, - { - "epoch": 0.5073170731707317, - "grad_norm": 3.6048219203948975, - "learning_rate": 4.968920459957453e-06, - "loss": 0.9231, - "step": 104 - }, - { - "epoch": 0.5121951219512195, - "grad_norm": 2.8181886672973633, - "learning_rate": 4.968315328364719e-06, - "loss": 1.0005, - "step": 105 - }, - { - "epoch": 0.5170731707317073, - "grad_norm": 3.114147424697876, - "learning_rate": 4.9677043999151e-06, - "loss": 1.1326, - "step": 106 - }, - { - "epoch": 0.5219512195121951, - "grad_norm": 2.965885639190674, - "learning_rate": 4.967087676043366e-06, - "loss": 0.541, - "step": 107 - }, - { - "epoch": 0.526829268292683, - "grad_norm": 3.098677635192871, - "learning_rate": 4.966465158197897e-06, - "loss": 0.9473, - "step": 108 - }, - { - "epoch": 0.5317073170731708, - "grad_norm": 2.8640191555023193, - "learning_rate": 4.965836847840681e-06, - "loss": 0.6678, - "step": 109 - }, - { - "epoch": 0.5365853658536586, - "grad_norm": 3.0950934886932373, - "learning_rate": 4.96520274644731e-06, - "loss": 0.9251, - "step": 110 - }, - { - "epoch": 0.5414634146341464, - "grad_norm": 2.99444317817688, - "learning_rate": 4.964562855506976e-06, - "loss": 0.7807, - "step": 111 - }, - { - "epoch": 0.5463414634146342, - "grad_norm": 2.348639726638794, - "learning_rate": 4.963917176522466e-06, - "loss": 0.6395, - "step": 112 - }, - { - "epoch": 0.551219512195122, - "grad_norm": 3.5988354682922363, - "learning_rate": 4.963265711010164e-06, - "loss": 1.0658, - "step": 113 - }, - { - "epoch": 0.5560975609756098, - "grad_norm": 3.3423564434051514, - "learning_rate": 4.9626084605000395e-06, - "loss": 0.8974, - "step": 114 - }, - { - "epoch": 0.5609756097560976, - "grad_norm": 2.8353331089019775, - "learning_rate": 4.961945426535652e-06, - "loss": 0.6144, - "step": 115 - }, - { - "epoch": 0.5658536585365853, - "grad_norm": 2.752387046813965, - "learning_rate": 4.961276610674141e-06, - "loss": 0.9083, - "step": 116 - }, - { - "epoch": 0.5707317073170731, - "grad_norm": 2.2654404640197754, - "learning_rate": 4.960602014486225e-06, - "loss": 1.0101, - "step": 117 - }, - { - "epoch": 0.5756097560975609, - "grad_norm": 3.344377040863037, - "learning_rate": 4.959921639556199e-06, - "loss": 0.8391, - "step": 118 - }, - { - "epoch": 0.5804878048780487, - "grad_norm": 3.1620500087738037, - "learning_rate": 4.959235487481928e-06, - "loss": 1.0431, - "step": 119 - }, - { - "epoch": 0.5853658536585366, - "grad_norm": 2.857048273086548, - "learning_rate": 4.958543559874846e-06, - "loss": 0.5864, - "step": 120 - }, - { - "epoch": 0.5902439024390244, - "grad_norm": 3.1736063957214355, - "learning_rate": 4.9578458583599495e-06, - "loss": 0.7868, - "step": 121 - }, - { - "epoch": 0.5951219512195122, - "grad_norm": 3.5520827770233154, - "learning_rate": 4.957142384575795e-06, - "loss": 0.7901, - "step": 122 - }, - { - "epoch": 0.6, - "grad_norm": 3.265103578567505, - "learning_rate": 4.956433140174498e-06, - "loss": 0.9067, - "step": 123 - }, - { - "epoch": 0.6048780487804878, - "grad_norm": 3.1181187629699707, - "learning_rate": 4.9557181268217225e-06, - "loss": 0.8971, - "step": 124 - }, - { - "epoch": 0.6097560975609756, - "grad_norm": 2.4123694896698, - "learning_rate": 4.954997346196683e-06, - "loss": 1.2123, - "step": 125 - }, - { - "epoch": 0.6146341463414634, - "grad_norm": 2.9646875858306885, - "learning_rate": 4.954270799992138e-06, - "loss": 0.7696, - "step": 126 - }, - { - "epoch": 0.6195121951219512, - "grad_norm": 2.7457995414733887, - "learning_rate": 4.953538489914387e-06, - "loss": 0.7919, - "step": 127 - }, - { - "epoch": 0.624390243902439, - "grad_norm": 5.096850395202637, - "learning_rate": 4.9528004176832654e-06, - "loss": 0.6494, - "step": 128 - }, - { - "epoch": 0.6292682926829268, - "grad_norm": 3.124955177307129, - "learning_rate": 4.952056585032142e-06, - "loss": 1.0546, - "step": 129 - }, - { - "epoch": 0.6341463414634146, - "grad_norm": 2.4860167503356934, - "learning_rate": 4.951306993707913e-06, - "loss": 0.7907, - "step": 130 - }, - { - "epoch": 0.6390243902439025, - "grad_norm": 2.3380239009857178, - "learning_rate": 4.950551645470998e-06, - "loss": 0.7433, - "step": 131 - }, - { - "epoch": 0.6439024390243903, - "grad_norm": 2.8945236206054688, - "learning_rate": 4.9497905420953406e-06, - "loss": 0.7682, - "step": 132 - }, - { - "epoch": 0.6487804878048781, - "grad_norm": 3.429776430130005, - "learning_rate": 4.949023685368395e-06, - "loss": 0.8411, - "step": 133 - }, - { - "epoch": 0.6536585365853659, - "grad_norm": 2.8853516578674316, - "learning_rate": 4.948251077091131e-06, - "loss": 1.0792, - "step": 134 - }, - { - "epoch": 0.6585365853658537, - "grad_norm": 2.145598888397217, - "learning_rate": 4.947472719078025e-06, - "loss": 0.8033, - "step": 135 - }, - { - "epoch": 0.6634146341463415, - "grad_norm": 2.5064377784729004, - "learning_rate": 4.9466886131570565e-06, - "loss": 0.939, - "step": 136 - }, - { - "epoch": 0.6682926829268293, - "grad_norm": 2.5700225830078125, - "learning_rate": 4.945898761169704e-06, - "loss": 1.0418, - "step": 137 - }, - { - "epoch": 0.6731707317073171, - "grad_norm": 2.3390917778015137, - "learning_rate": 4.945103164970941e-06, - "loss": 0.6158, - "step": 138 - }, - { - "epoch": 0.6780487804878049, - "grad_norm": 2.1538751125335693, - "learning_rate": 4.9443018264292304e-06, - "loss": 0.6995, - "step": 139 - }, - { - "epoch": 0.6829268292682927, - "grad_norm": 5.255710601806641, - "learning_rate": 4.9434947474265225e-06, - "loss": 1.0382, - "step": 140 - }, - { - "epoch": 0.6878048780487804, - "grad_norm": 2.5547356605529785, - "learning_rate": 4.942681929858249e-06, - "loss": 1.037, - "step": 141 - }, - { - "epoch": 0.6926829268292682, - "grad_norm": 2.613280773162842, - "learning_rate": 4.941863375633315e-06, - "loss": 0.9071, - "step": 142 - }, - { - "epoch": 0.697560975609756, - "grad_norm": 2.9957327842712402, - "learning_rate": 4.9410390866741056e-06, - "loss": 0.7908, - "step": 143 - }, - { - "epoch": 0.7024390243902439, - "grad_norm": 2.410107374191284, - "learning_rate": 4.9402090649164655e-06, - "loss": 0.7739, - "step": 144 - }, - { - "epoch": 0.7073170731707317, - "grad_norm": 2.352013349533081, - "learning_rate": 4.9393733123097085e-06, - "loss": 0.939, - "step": 145 - }, - { - "epoch": 0.7121951219512195, - "grad_norm": 2.5164194107055664, - "learning_rate": 4.9385318308166065e-06, - "loss": 0.8729, - "step": 146 - }, - { - "epoch": 0.7170731707317073, - "grad_norm": 4.213881015777588, - "learning_rate": 4.937684622413385e-06, - "loss": 0.6124, - "step": 147 - }, - { - "epoch": 0.7219512195121951, - "grad_norm": 2.7950191497802734, - "learning_rate": 4.9368316890897185e-06, - "loss": 0.975, - "step": 148 - }, - { - "epoch": 0.7268292682926829, - "grad_norm": 2.8618874549865723, - "learning_rate": 4.9359730328487264e-06, - "loss": 0.5832, - "step": 149 - }, - { - "epoch": 0.7317073170731707, - "grad_norm": 2.6943812370300293, - "learning_rate": 4.935108655706972e-06, - "loss": 0.8124, - "step": 150 - }, - { - "epoch": 0.7365853658536585, - "grad_norm": 3.2164082527160645, - "learning_rate": 4.934238559694448e-06, - "loss": 1.1446, - "step": 151 - }, - { - "epoch": 0.7414634146341463, - "grad_norm": 3.05002498626709, - "learning_rate": 4.9333627468545845e-06, - "loss": 0.7884, - "step": 152 - }, - { - "epoch": 0.7463414634146341, - "grad_norm": 2.863351583480835, - "learning_rate": 4.932481219244231e-06, - "loss": 0.7918, - "step": 153 - }, - { - "epoch": 0.751219512195122, - "grad_norm": 2.4947102069854736, - "learning_rate": 4.931593978933666e-06, - "loss": 0.775, - "step": 154 - }, - { - "epoch": 0.7560975609756098, - "grad_norm": 2.918886184692383, - "learning_rate": 4.930701028006577e-06, - "loss": 0.993, - "step": 155 - }, - { - "epoch": 0.7609756097560976, - "grad_norm": 2.835956573486328, - "learning_rate": 4.929802368560066e-06, - "loss": 0.7911, - "step": 156 - }, - { - "epoch": 0.7658536585365854, - "grad_norm": 3.3073575496673584, - "learning_rate": 4.928898002704642e-06, - "loss": 0.9346, - "step": 157 - }, - { - "epoch": 0.7707317073170732, - "grad_norm": 3.086146354675293, - "learning_rate": 4.927987932564215e-06, - "loss": 0.817, - "step": 158 - }, - { - "epoch": 0.775609756097561, - "grad_norm": 2.5419743061065674, - "learning_rate": 4.927072160276092e-06, - "loss": 0.7918, - "step": 159 - }, - { - "epoch": 0.7804878048780488, - "grad_norm": 3.984297275543213, - "learning_rate": 4.926150687990969e-06, - "loss": 0.7153, - "step": 160 - }, - { - "epoch": 0.7853658536585366, - "grad_norm": 2.4703335762023926, - "learning_rate": 4.925223517872934e-06, - "loss": 0.8982, - "step": 161 - }, - { - "epoch": 0.7902439024390244, - "grad_norm": 2.81785249710083, - "learning_rate": 4.9242906520994484e-06, - "loss": 0.9839, - "step": 162 - }, - { - "epoch": 0.7951219512195122, - "grad_norm": 2.3304924964904785, - "learning_rate": 4.923352092861358e-06, - "loss": 0.8406, - "step": 163 - }, - { - "epoch": 0.8, - "grad_norm": 2.339498519897461, - "learning_rate": 4.922407842362875e-06, - "loss": 0.6602, - "step": 164 - }, - { - "epoch": 0.8048780487804879, - "grad_norm": 3.488255262374878, - "learning_rate": 4.921457902821578e-06, - "loss": 0.9779, - "step": 165 - }, - { - "epoch": 0.8097560975609757, - "grad_norm": 2.8528945446014404, - "learning_rate": 4.920502276468408e-06, - "loss": 0.8821, - "step": 166 - }, - { - "epoch": 0.8146341463414634, - "grad_norm": 3.4649784564971924, - "learning_rate": 4.9195409655476605e-06, - "loss": 0.7539, - "step": 167 - }, - { - "epoch": 0.8195121951219512, - "grad_norm": 2.3109042644500732, - "learning_rate": 4.918573972316982e-06, - "loss": 0.9807, - "step": 168 - }, - { - "epoch": 0.824390243902439, - "grad_norm": 2.678666353225708, - "learning_rate": 4.917601299047361e-06, - "loss": 0.8318, - "step": 169 - }, - { - "epoch": 0.8292682926829268, - "grad_norm": 2.730614185333252, - "learning_rate": 4.916622948023129e-06, - "loss": 0.7816, - "step": 170 - }, - { - "epoch": 0.8341463414634146, - "grad_norm": 2.9835665225982666, - "learning_rate": 4.915638921541952e-06, - "loss": 0.6633, - "step": 171 - }, - { - "epoch": 0.8390243902439024, - "grad_norm": 3.31217360496521, - "learning_rate": 4.914649221914822e-06, - "loss": 0.9296, - "step": 172 - }, - { - "epoch": 0.8439024390243902, - "grad_norm": 2.9021658897399902, - "learning_rate": 4.913653851466057e-06, - "loss": 0.6864, - "step": 173 - }, - { - "epoch": 0.848780487804878, - "grad_norm": 3.3672914505004883, - "learning_rate": 4.912652812533291e-06, - "loss": 0.8599, - "step": 174 - }, - { - "epoch": 0.8536585365853658, - "grad_norm": 2.4871644973754883, - "learning_rate": 4.911646107467472e-06, - "loss": 0.8949, - "step": 175 - }, - { - "epoch": 0.8585365853658536, - "grad_norm": 2.728022813796997, - "learning_rate": 4.9106337386328524e-06, - "loss": 0.9758, - "step": 176 - }, - { - "epoch": 0.8634146341463415, - "grad_norm": 2.704252243041992, - "learning_rate": 4.909615708406991e-06, - "loss": 0.8954, - "step": 177 - }, - { - "epoch": 0.8682926829268293, - "grad_norm": 2.4002223014831543, - "learning_rate": 4.908592019180738e-06, - "loss": 0.7157, - "step": 178 - }, - { - "epoch": 0.8731707317073171, - "grad_norm": 2.1927788257598877, - "learning_rate": 4.907562673358234e-06, - "loss": 0.6358, - "step": 179 - }, - { - "epoch": 0.8780487804878049, - "grad_norm": 2.458500623703003, - "learning_rate": 4.906527673356907e-06, - "loss": 0.6685, - "step": 180 - }, - { - "epoch": 0.8829268292682927, - "grad_norm": 2.5924787521362305, - "learning_rate": 4.905487021607462e-06, - "loss": 0.5686, - "step": 181 - }, - { - "epoch": 0.8878048780487805, - "grad_norm": 3.0923380851745605, - "learning_rate": 4.904440720553876e-06, - "loss": 0.8538, - "step": 182 - }, - { - "epoch": 0.8926829268292683, - "grad_norm": 2.8001527786254883, - "learning_rate": 4.903388772653396e-06, - "loss": 0.8292, - "step": 183 - }, - { - "epoch": 0.8975609756097561, - "grad_norm": 2.4344072341918945, - "learning_rate": 4.902331180376529e-06, - "loss": 0.7946, - "step": 184 - }, - { - "epoch": 0.9024390243902439, - "grad_norm": 2.6313226222991943, - "learning_rate": 4.901267946207038e-06, - "loss": 0.9269, - "step": 185 - }, - { - "epoch": 0.9073170731707317, - "grad_norm": 2.4776692390441895, - "learning_rate": 4.900199072641937e-06, - "loss": 0.7433, - "step": 186 - }, - { - "epoch": 0.9121951219512195, - "grad_norm": 2.339869260787964, - "learning_rate": 4.899124562191484e-06, - "loss": 0.6577, - "step": 187 - }, - { - "epoch": 0.9170731707317074, - "grad_norm": 3.076890468597412, - "learning_rate": 4.8980444173791735e-06, - "loss": 0.5989, - "step": 188 - }, - { - "epoch": 0.9219512195121952, - "grad_norm": 2.83957839012146, - "learning_rate": 4.896958640741735e-06, - "loss": 0.9364, - "step": 189 - }, - { - "epoch": 0.926829268292683, - "grad_norm": 2.770867347717285, - "learning_rate": 4.895867234829121e-06, - "loss": 1.0328, - "step": 190 - }, - { - "epoch": 0.9317073170731708, - "grad_norm": 2.7819619178771973, - "learning_rate": 4.894770202204509e-06, - "loss": 0.772, - "step": 191 - }, - { - "epoch": 0.9365853658536586, - "grad_norm": 3.925703763961792, - "learning_rate": 4.893667545444285e-06, - "loss": 0.8128, - "step": 192 - }, - { - "epoch": 0.9414634146341463, - "grad_norm": 3.034944534301758, - "learning_rate": 4.8925592671380495e-06, - "loss": 0.7418, - "step": 193 - }, - { - "epoch": 0.9463414634146341, - "grad_norm": 2.3350143432617188, - "learning_rate": 4.891445369888601e-06, - "loss": 0.5979, - "step": 194 - }, - { - "epoch": 0.9512195121951219, - "grad_norm": 2.6433160305023193, - "learning_rate": 4.890325856311936e-06, - "loss": 0.9664, - "step": 195 - }, - { - "epoch": 0.9560975609756097, - "grad_norm": 2.715142011642456, - "learning_rate": 4.889200729037241e-06, - "loss": 0.8482, - "step": 196 - }, - { - "epoch": 0.9609756097560975, - "grad_norm": 2.6157352924346924, - "learning_rate": 4.888069990706884e-06, - "loss": 0.7173, - "step": 197 - }, - { - "epoch": 0.9658536585365853, - "grad_norm": 3.7308952808380127, - "learning_rate": 4.886933643976414e-06, - "loss": 0.5433, - "step": 198 - }, - { - "epoch": 0.9707317073170731, - "grad_norm": 3.1134045124053955, - "learning_rate": 4.885791691514548e-06, - "loss": 0.5997, - "step": 199 - }, - { - "epoch": 0.975609756097561, - "grad_norm": 2.421365976333618, - "learning_rate": 4.884644136003172e-06, - "loss": 0.6477, - "step": 200 - }, - { - "epoch": 0.9804878048780488, - "grad_norm": 2.8676180839538574, - "learning_rate": 4.883490980137327e-06, - "loss": 1.3465, - "step": 201 - }, - { - "epoch": 0.9853658536585366, - "grad_norm": 2.236189603805542, - "learning_rate": 4.882332226625208e-06, - "loss": 0.7533, - "step": 202 - }, - { - "epoch": 0.9902439024390244, - "grad_norm": 2.2514970302581787, - "learning_rate": 4.881167878188158e-06, - "loss": 0.8555, - "step": 203 - }, - { - "epoch": 0.9951219512195122, - "grad_norm": 2.6856095790863037, - "learning_rate": 4.8799979375606565e-06, - "loss": 0.7634, - "step": 204 - }, - { - "epoch": 1.0, - "grad_norm": 2.5563852787017822, - "learning_rate": 4.878822407490319e-06, - "loss": 0.66, - "step": 205 - }, - { - "epoch": 1.0048780487804878, - "grad_norm": 4.7092814445495605, - "learning_rate": 4.8776412907378845e-06, - "loss": 0.7429, - "step": 206 - }, - { - "epoch": 1.0097560975609756, - "grad_norm": 2.9133448600769043, - "learning_rate": 4.876454590077216e-06, - "loss": 0.5735, - "step": 207 - }, - { - "epoch": 1.0146341463414634, - "grad_norm": 2.7012641429901123, - "learning_rate": 4.875262308295289e-06, - "loss": 0.8065, - "step": 208 - }, - { - "epoch": 1.0195121951219512, - "grad_norm": 3.703998327255249, - "learning_rate": 4.874064448192185e-06, - "loss": 0.7148, - "step": 209 - }, - { - "epoch": 1.024390243902439, - "grad_norm": 3.044930934906006, - "learning_rate": 4.872861012581088e-06, - "loss": 0.5606, - "step": 210 - }, - { - "epoch": 1.0292682926829269, - "grad_norm": 3.661381244659424, - "learning_rate": 4.871652004288275e-06, - "loss": 0.6492, - "step": 211 - }, - { - "epoch": 1.0341463414634147, - "grad_norm": 3.18344783782959, - "learning_rate": 4.870437426153113e-06, - "loss": 0.633, - "step": 212 - }, - { - "epoch": 1.0390243902439025, - "grad_norm": 4.596707820892334, - "learning_rate": 4.869217281028045e-06, - "loss": 0.842, - "step": 213 - }, - { - "epoch": 1.0439024390243903, - "grad_norm": 4.116331577301025, - "learning_rate": 4.867991571778592e-06, - "loss": 0.8371, - "step": 214 - }, - { - "epoch": 1.048780487804878, - "grad_norm": 3.152939558029175, - "learning_rate": 4.866760301283342e-06, - "loss": 0.4728, - "step": 215 - }, - { - "epoch": 1.053658536585366, - "grad_norm": 2.8732805252075195, - "learning_rate": 4.865523472433942e-06, - "loss": 0.651, - "step": 216 - }, - { - "epoch": 1.0585365853658537, - "grad_norm": 2.967480421066284, - "learning_rate": 4.8642810881350935e-06, - "loss": 0.6361, - "step": 217 - }, - { - "epoch": 1.0634146341463415, - "grad_norm": 2.816798210144043, - "learning_rate": 4.863033151304546e-06, - "loss": 0.6206, - "step": 218 - }, - { - "epoch": 1.0682926829268293, - "grad_norm": 3.168349027633667, - "learning_rate": 4.861779664873088e-06, - "loss": 0.7782, - "step": 219 - }, - { - "epoch": 1.0731707317073171, - "grad_norm": 3.7496471405029297, - "learning_rate": 4.8605206317845425e-06, - "loss": 0.8504, - "step": 220 - }, - { - "epoch": 1.078048780487805, - "grad_norm": 2.7087056636810303, - "learning_rate": 4.859256054995758e-06, - "loss": 0.7771, - "step": 221 - }, - { - "epoch": 1.0829268292682928, - "grad_norm": 2.803703546524048, - "learning_rate": 4.8579859374766e-06, - "loss": 0.4308, - "step": 222 - }, - { - "epoch": 1.0878048780487806, - "grad_norm": 2.4199142456054688, - "learning_rate": 4.856710282209952e-06, - "loss": 0.3739, - "step": 223 - }, - { - "epoch": 1.0926829268292684, - "grad_norm": 2.384037494659424, - "learning_rate": 4.855429092191698e-06, - "loss": 0.6548, - "step": 224 - }, - { - "epoch": 1.0975609756097562, - "grad_norm": 3.0230021476745605, - "learning_rate": 4.854142370430725e-06, - "loss": 0.6932, - "step": 225 - }, - { - "epoch": 1.102439024390244, - "grad_norm": 3.0248661041259766, - "learning_rate": 4.8528501199489045e-06, - "loss": 0.6491, - "step": 226 - }, - { - "epoch": 1.1073170731707318, - "grad_norm": 4.046666145324707, - "learning_rate": 4.851552343781099e-06, - "loss": 0.7946, - "step": 227 - }, - { - "epoch": 1.1121951219512196, - "grad_norm": 2.8751168251037598, - "learning_rate": 4.850249044975145e-06, - "loss": 0.7629, - "step": 228 - }, - { - "epoch": 1.1170731707317074, - "grad_norm": 2.8649816513061523, - "learning_rate": 4.848940226591849e-06, - "loss": 0.9114, - "step": 229 - }, - { - "epoch": 1.1219512195121952, - "grad_norm": 3.2590744495391846, - "learning_rate": 4.847625891704982e-06, - "loss": 0.535, - "step": 230 - }, - { - "epoch": 1.126829268292683, - "grad_norm": 3.230659008026123, - "learning_rate": 4.846306043401268e-06, - "loss": 0.7134, - "step": 231 - }, - { - "epoch": 1.1317073170731708, - "grad_norm": 3.5220088958740234, - "learning_rate": 4.844980684780381e-06, - "loss": 0.5375, - "step": 232 - }, - { - "epoch": 1.1365853658536587, - "grad_norm": 3.074052095413208, - "learning_rate": 4.8436498189549345e-06, - "loss": 0.5486, - "step": 233 - }, - { - "epoch": 1.1414634146341462, - "grad_norm": 2.511216163635254, - "learning_rate": 4.842313449050477e-06, - "loss": 0.5203, - "step": 234 - }, - { - "epoch": 1.146341463414634, - "grad_norm": 2.6082136631011963, - "learning_rate": 4.840971578205486e-06, - "loss": 0.4978, - "step": 235 - }, - { - "epoch": 1.1512195121951219, - "grad_norm": 2.4481778144836426, - "learning_rate": 4.839624209571352e-06, - "loss": 0.348, - "step": 236 - }, - { - "epoch": 1.1560975609756097, - "grad_norm": 2.7532148361206055, - "learning_rate": 4.838271346312381e-06, - "loss": 0.8068, - "step": 237 - }, - { - "epoch": 1.1609756097560975, - "grad_norm": 2.6562349796295166, - "learning_rate": 4.836912991605782e-06, - "loss": 0.8823, - "step": 238 - }, - { - "epoch": 1.1658536585365853, - "grad_norm": 3.032168388366699, - "learning_rate": 4.835549148641663e-06, - "loss": 0.501, - "step": 239 - }, - { - "epoch": 1.170731707317073, - "grad_norm": 3.4816956520080566, - "learning_rate": 4.834179820623018e-06, - "loss": 0.6406, - "step": 240 - }, - { - "epoch": 1.175609756097561, - "grad_norm": 2.480642318725586, - "learning_rate": 4.832805010765724e-06, - "loss": 0.537, - "step": 241 - }, - { - "epoch": 1.1804878048780487, - "grad_norm": 2.7662222385406494, - "learning_rate": 4.831424722298531e-06, - "loss": 0.6464, - "step": 242 - }, - { - "epoch": 1.1853658536585365, - "grad_norm": 3.2929866313934326, - "learning_rate": 4.830038958463061e-06, - "loss": 0.6888, - "step": 243 - }, - { - "epoch": 1.1902439024390243, - "grad_norm": 5.094089031219482, - "learning_rate": 4.828647722513785e-06, - "loss": 0.8342, - "step": 244 - }, - { - "epoch": 1.1951219512195121, - "grad_norm": 3.6679818630218506, - "learning_rate": 4.827251017718034e-06, - "loss": 0.7849, - "step": 245 - }, - { - "epoch": 1.2, - "grad_norm": 3.97290301322937, - "learning_rate": 4.8258488473559794e-06, - "loss": 0.7995, - "step": 246 - }, - { - "epoch": 1.2048780487804878, - "grad_norm": 3.3555023670196533, - "learning_rate": 4.824441214720629e-06, - "loss": 0.8718, - "step": 247 - }, - { - "epoch": 1.2097560975609756, - "grad_norm": 2.309361219406128, - "learning_rate": 4.823028123117818e-06, - "loss": 0.3731, - "step": 248 - }, - { - "epoch": 1.2146341463414634, - "grad_norm": 2.607269763946533, - "learning_rate": 4.8216095758662015e-06, - "loss": 0.7321, - "step": 249 - }, - { - "epoch": 1.2195121951219512, - "grad_norm": 2.5667428970336914, - "learning_rate": 4.82018557629725e-06, - "loss": 0.7561, - "step": 250 - }, - { - "epoch": 1.224390243902439, - "grad_norm": 2.7664871215820312, - "learning_rate": 4.8187561277552376e-06, - "loss": 0.638, - "step": 251 - }, - { - "epoch": 1.2292682926829268, - "grad_norm": 2.2880401611328125, - "learning_rate": 4.817321233597232e-06, - "loss": 0.6996, - "step": 252 - }, - { - "epoch": 1.2341463414634146, - "grad_norm": 2.7615559101104736, - "learning_rate": 4.815880897193095e-06, - "loss": 0.5432, - "step": 253 - }, - { - "epoch": 1.2390243902439024, - "grad_norm": 2.9052155017852783, - "learning_rate": 4.814435121925466e-06, - "loss": 0.781, - "step": 254 - }, - { - "epoch": 1.2439024390243902, - "grad_norm": 3.2035205364227295, - "learning_rate": 4.812983911189761e-06, - "loss": 0.6884, - "step": 255 - }, - { - "epoch": 1.248780487804878, - "grad_norm": 2.8139917850494385, - "learning_rate": 4.811527268394157e-06, - "loss": 0.4984, - "step": 256 - }, - { - "epoch": 1.2536585365853659, - "grad_norm": 2.849602699279785, - "learning_rate": 4.810065196959591e-06, - "loss": 0.553, - "step": 257 - }, - { - "epoch": 1.2585365853658537, - "grad_norm": 2.8745057582855225, - "learning_rate": 4.8085977003197496e-06, - "loss": 0.7955, - "step": 258 - }, - { - "epoch": 1.2634146341463415, - "grad_norm": 3.4053122997283936, - "learning_rate": 4.807124781921059e-06, - "loss": 0.9715, - "step": 259 - }, - { - "epoch": 1.2682926829268293, - "grad_norm": 3.1741702556610107, - "learning_rate": 4.805646445222679e-06, - "loss": 0.6306, - "step": 260 - }, - { - "epoch": 1.273170731707317, - "grad_norm": 2.5348331928253174, - "learning_rate": 4.804162693696494e-06, - "loss": 0.5192, - "step": 261 - }, - { - "epoch": 1.278048780487805, - "grad_norm": 3.2491304874420166, - "learning_rate": 4.802673530827105e-06, - "loss": 0.5369, - "step": 262 - }, - { - "epoch": 1.2829268292682927, - "grad_norm": 2.670273780822754, - "learning_rate": 4.801178960111823e-06, - "loss": 0.5864, - "step": 263 - }, - { - "epoch": 1.2878048780487805, - "grad_norm": 2.5655579566955566, - "learning_rate": 4.799678985060658e-06, - "loss": 0.7864, - "step": 264 - }, - { - "epoch": 1.2926829268292683, - "grad_norm": 2.6352531909942627, - "learning_rate": 4.798173609196314e-06, - "loss": 0.8198, - "step": 265 - }, - { - "epoch": 1.2975609756097561, - "grad_norm": 3.028343677520752, - "learning_rate": 4.796662836054176e-06, - "loss": 0.4621, - "step": 266 - }, - { - "epoch": 1.302439024390244, - "grad_norm": 2.757690191268921, - "learning_rate": 4.795146669182304e-06, - "loss": 0.6237, - "step": 267 - }, - { - "epoch": 1.3073170731707318, - "grad_norm": 2.564842462539673, - "learning_rate": 4.793625112141431e-06, - "loss": 0.4981, - "step": 268 - }, - { - "epoch": 1.3121951219512196, - "grad_norm": 2.69234299659729, - "learning_rate": 4.792098168504943e-06, - "loss": 0.5384, - "step": 269 - }, - { - "epoch": 1.3170731707317074, - "grad_norm": 2.794144868850708, - "learning_rate": 4.790565841858879e-06, - "loss": 0.5535, - "step": 270 - }, - { - "epoch": 1.3219512195121952, - "grad_norm": 2.850296974182129, - "learning_rate": 4.789028135801919e-06, - "loss": 0.7492, - "step": 271 - }, - { - "epoch": 1.326829268292683, - "grad_norm": 3.287806987762451, - "learning_rate": 4.787485053945377e-06, - "loss": 0.8367, - "step": 272 - }, - { - "epoch": 1.3317073170731708, - "grad_norm": 2.479343891143799, - "learning_rate": 4.785936599913193e-06, - "loss": 0.6875, - "step": 273 - }, - { - "epoch": 1.3365853658536586, - "grad_norm": 3.171198844909668, - "learning_rate": 4.784382777341922e-06, - "loss": 0.733, - "step": 274 - }, - { - "epoch": 1.3414634146341464, - "grad_norm": 2.866610050201416, - "learning_rate": 4.782823589880729e-06, - "loss": 0.9719, - "step": 275 - }, - { - "epoch": 1.346341463414634, - "grad_norm": 2.3714404106140137, - "learning_rate": 4.7812590411913755e-06, - "loss": 0.6979, - "step": 276 - }, - { - "epoch": 1.3512195121951218, - "grad_norm": 2.3838706016540527, - "learning_rate": 4.779689134948217e-06, - "loss": 0.9697, - "step": 277 - }, - { - "epoch": 1.3560975609756096, - "grad_norm": 3.2992005348205566, - "learning_rate": 4.77811387483819e-06, - "loss": 0.4799, - "step": 278 - }, - { - "epoch": 1.3609756097560974, - "grad_norm": 3.403024435043335, - "learning_rate": 4.776533264560804e-06, - "loss": 0.7478, - "step": 279 - }, - { - "epoch": 1.3658536585365852, - "grad_norm": 2.669820785522461, - "learning_rate": 4.774947307828134e-06, - "loss": 0.8622, - "step": 280 - }, - { - "epoch": 1.370731707317073, - "grad_norm": 2.4695041179656982, - "learning_rate": 4.773356008364812e-06, - "loss": 0.5792, - "step": 281 - }, - { - "epoch": 1.3756097560975609, - "grad_norm": 3.1744325160980225, - "learning_rate": 4.771759369908017e-06, - "loss": 0.4368, - "step": 282 - }, - { - "epoch": 1.3804878048780487, - "grad_norm": 2.8564929962158203, - "learning_rate": 4.7701573962074635e-06, - "loss": 0.6337, - "step": 283 - }, - { - "epoch": 1.3853658536585365, - "grad_norm": 2.4109890460968018, - "learning_rate": 4.7685500910254015e-06, - "loss": 0.5042, - "step": 284 - }, - { - "epoch": 1.3902439024390243, - "grad_norm": 2.389765977859497, - "learning_rate": 4.766937458136598e-06, - "loss": 0.7427, - "step": 285 - }, - { - "epoch": 1.395121951219512, - "grad_norm": 2.412153720855713, - "learning_rate": 4.765319501328332e-06, - "loss": 0.6956, - "step": 286 - }, - { - "epoch": 1.4, - "grad_norm": 2.6756227016448975, - "learning_rate": 4.763696224400391e-06, - "loss": 0.5152, - "step": 287 - }, - { - "epoch": 1.4048780487804877, - "grad_norm": 2.4644389152526855, - "learning_rate": 4.762067631165049e-06, - "loss": 0.5583, - "step": 288 - }, - { - "epoch": 1.4097560975609755, - "grad_norm": 2.6496896743774414, - "learning_rate": 4.760433725447071e-06, - "loss": 0.6824, - "step": 289 - }, - { - "epoch": 1.4146341463414633, - "grad_norm": 2.9843268394470215, - "learning_rate": 4.758794511083697e-06, - "loss": 0.7914, - "step": 290 - }, - { - "epoch": 1.4195121951219511, - "grad_norm": 3.639101266860962, - "learning_rate": 4.757149991924633e-06, - "loss": 0.6827, - "step": 291 - }, - { - "epoch": 1.424390243902439, - "grad_norm": 3.2047319412231445, - "learning_rate": 4.755500171832045e-06, - "loss": 0.5908, - "step": 292 - }, - { - "epoch": 1.4292682926829268, - "grad_norm": 2.463202953338623, - "learning_rate": 4.753845054680548e-06, - "loss": 0.6469, - "step": 293 - }, - { - "epoch": 1.4341463414634146, - "grad_norm": 2.711195945739746, - "learning_rate": 4.752184644357197e-06, - "loss": 0.5412, - "step": 294 - }, - { - "epoch": 1.4390243902439024, - "grad_norm": 2.239082098007202, - "learning_rate": 4.750518944761477e-06, - "loss": 0.5324, - "step": 295 - }, - { - "epoch": 1.4439024390243902, - "grad_norm": 2.711050271987915, - "learning_rate": 4.748847959805297e-06, - "loss": 0.5317, - "step": 296 - }, - { - "epoch": 1.448780487804878, - "grad_norm": 2.4389946460723877, - "learning_rate": 4.7471716934129774e-06, - "loss": 0.5199, - "step": 297 - }, - { - "epoch": 1.4536585365853658, - "grad_norm": 2.6532390117645264, - "learning_rate": 4.745490149521242e-06, - "loss": 0.4874, - "step": 298 - }, - { - "epoch": 1.4585365853658536, - "grad_norm": 2.2970616817474365, - "learning_rate": 4.743803332079209e-06, - "loss": 0.5416, - "step": 299 - }, - { - "epoch": 1.4634146341463414, - "grad_norm": 2.4206762313842773, - "learning_rate": 4.742111245048382e-06, - "loss": 0.5628, - "step": 300 - }, - { - "epoch": 1.4682926829268292, - "grad_norm": 2.7086844444274902, - "learning_rate": 4.740413892402639e-06, - "loss": 0.5847, - "step": 301 - }, - { - "epoch": 1.473170731707317, - "grad_norm": 2.848602771759033, - "learning_rate": 4.738711278128228e-06, - "loss": 0.5889, - "step": 302 - }, - { - "epoch": 1.4780487804878049, - "grad_norm": 3.5257909297943115, - "learning_rate": 4.7370034062237476e-06, - "loss": 0.3917, - "step": 303 - }, - { - "epoch": 1.4829268292682927, - "grad_norm": 6.47664213180542, - "learning_rate": 4.73529028070015e-06, - "loss": 0.5592, - "step": 304 - }, - { - "epoch": 1.4878048780487805, - "grad_norm": 2.8833930492401123, - "learning_rate": 4.733571905580723e-06, - "loss": 0.843, - "step": 305 - }, - { - "epoch": 1.4926829268292683, - "grad_norm": 2.9924156665802, - "learning_rate": 4.731848284901082e-06, - "loss": 0.7041, - "step": 306 - }, - { - "epoch": 1.497560975609756, - "grad_norm": 2.9858405590057373, - "learning_rate": 4.730119422709165e-06, - "loss": 0.4914, - "step": 307 - }, - { - "epoch": 1.502439024390244, - "grad_norm": 3.4032366275787354, - "learning_rate": 4.728385323065215e-06, - "loss": 0.644, - "step": 308 - }, - { - "epoch": 1.5073170731707317, - "grad_norm": 2.86360502243042, - "learning_rate": 4.7266459900417815e-06, - "loss": 0.5335, - "step": 309 - }, - { - "epoch": 1.5121951219512195, - "grad_norm": 3.183012008666992, - "learning_rate": 4.724901427723698e-06, - "loss": 0.8275, - "step": 310 - }, - { - "epoch": 1.5170731707317073, - "grad_norm": 3.4128706455230713, - "learning_rate": 4.723151640208084e-06, - "loss": 0.4091, - "step": 311 - }, - { - "epoch": 1.5219512195121951, - "grad_norm": 2.765897512435913, - "learning_rate": 4.721396631604327e-06, - "loss": 0.4414, - "step": 312 - }, - { - "epoch": 1.526829268292683, - "grad_norm": 3.2348268032073975, - "learning_rate": 4.7196364060340785e-06, - "loss": 0.5423, - "step": 313 - }, - { - "epoch": 1.5317073170731708, - "grad_norm": 2.7270045280456543, - "learning_rate": 4.7178709676312416e-06, - "loss": 0.8072, - "step": 314 - }, - { - "epoch": 1.5365853658536586, - "grad_norm": 2.525298833847046, - "learning_rate": 4.716100320541961e-06, - "loss": 1.0254, - "step": 315 - }, - { - "epoch": 1.5414634146341464, - "grad_norm": 2.371321678161621, - "learning_rate": 4.714324468924614e-06, - "loss": 0.6541, - "step": 316 - }, - { - "epoch": 1.5463414634146342, - "grad_norm": 3.0820438861846924, - "learning_rate": 4.712543416949803e-06, - "loss": 0.7519, - "step": 317 - }, - { - "epoch": 1.551219512195122, - "grad_norm": 2.710369348526001, - "learning_rate": 4.71075716880034e-06, - "loss": 0.7232, - "step": 318 - }, - { - "epoch": 1.5560975609756098, - "grad_norm": 2.4568352699279785, - "learning_rate": 4.708965728671243e-06, - "loss": 0.8059, - "step": 319 - }, - { - "epoch": 1.5609756097560976, - "grad_norm": 2.7511191368103027, - "learning_rate": 4.7071691007697214e-06, - "loss": 0.6579, - "step": 320 - }, - { - "epoch": 1.5658536585365854, - "grad_norm": 2.6519858837127686, - "learning_rate": 4.705367289315172e-06, - "loss": 0.6989, - "step": 321 - }, - { - "epoch": 1.5707317073170732, - "grad_norm": 2.763019323348999, - "learning_rate": 4.703560298539158e-06, - "loss": 0.4916, - "step": 322 - }, - { - "epoch": 1.575609756097561, - "grad_norm": 2.6480252742767334, - "learning_rate": 4.701748132685415e-06, - "loss": 0.5076, - "step": 323 - }, - { - "epoch": 1.5804878048780489, - "grad_norm": 2.4289543628692627, - "learning_rate": 4.699930796009825e-06, - "loss": 0.559, - "step": 324 - }, - { - "epoch": 1.5853658536585367, - "grad_norm": 4.0515899658203125, - "learning_rate": 4.698108292780418e-06, - "loss": 0.7388, - "step": 325 - }, - { - "epoch": 1.5902439024390245, - "grad_norm": 2.5959129333496094, - "learning_rate": 4.696280627277356e-06, - "loss": 0.5469, - "step": 326 - }, - { - "epoch": 1.5951219512195123, - "grad_norm": 2.3453526496887207, - "learning_rate": 4.6944478037929255e-06, - "loss": 0.5494, - "step": 327 - }, - { - "epoch": 1.6, - "grad_norm": 3.7527170181274414, - "learning_rate": 4.692609826631525e-06, - "loss": 0.7536, - "step": 328 - }, - { - "epoch": 1.604878048780488, - "grad_norm": 3.423588275909424, - "learning_rate": 4.690766700109659e-06, - "loss": 0.4586, - "step": 329 - }, - { - "epoch": 1.6097560975609757, - "grad_norm": 2.620429754257202, - "learning_rate": 4.6889184285559234e-06, - "loss": 0.4799, - "step": 330 - }, - { - "epoch": 1.6146341463414635, - "grad_norm": 6.416718006134033, - "learning_rate": 4.687065016310996e-06, - "loss": 0.7502, - "step": 331 - }, - { - "epoch": 1.6195121951219513, - "grad_norm": 2.7324717044830322, - "learning_rate": 4.685206467727631e-06, - "loss": 0.5923, - "step": 332 - }, - { - "epoch": 1.6243902439024391, - "grad_norm": 2.582935333251953, - "learning_rate": 4.683342787170644e-06, - "loss": 0.5619, - "step": 333 - }, - { - "epoch": 1.629268292682927, - "grad_norm": 2.8339877128601074, - "learning_rate": 4.6814739790169006e-06, - "loss": 0.55, - "step": 334 - }, - { - "epoch": 1.6341463414634148, - "grad_norm": 2.733982563018799, - "learning_rate": 4.679600047655313e-06, - "loss": 0.7243, - "step": 335 - }, - { - "epoch": 1.6390243902439026, - "grad_norm": 3.192747116088867, - "learning_rate": 4.6777209974868194e-06, - "loss": 1.132, - "step": 336 - }, - { - "epoch": 1.6439024390243904, - "grad_norm": 2.5185582637786865, - "learning_rate": 4.675836832924387e-06, - "loss": 0.55, - "step": 337 - }, - { - "epoch": 1.6487804878048782, - "grad_norm": 2.7306225299835205, - "learning_rate": 4.673947558392989e-06, - "loss": 0.4418, - "step": 338 - }, - { - "epoch": 1.653658536585366, - "grad_norm": 2.7026166915893555, - "learning_rate": 4.6720531783296e-06, - "loss": 0.5897, - "step": 339 - }, - { - "epoch": 1.6585365853658538, - "grad_norm": 2.5981674194335938, - "learning_rate": 4.670153697183185e-06, - "loss": 0.5889, - "step": 340 - }, - { - "epoch": 1.6634146341463416, - "grad_norm": 3.0985405445098877, - "learning_rate": 4.668249119414692e-06, - "loss": 0.5607, - "step": 341 - }, - { - "epoch": 1.6682926829268294, - "grad_norm": 2.7609124183654785, - "learning_rate": 4.666339449497033e-06, - "loss": 0.6284, - "step": 342 - }, - { - "epoch": 1.6731707317073172, - "grad_norm": 3.186077356338501, - "learning_rate": 4.664424691915084e-06, - "loss": 0.5751, - "step": 343 - }, - { - "epoch": 1.678048780487805, - "grad_norm": 3.644227981567383, - "learning_rate": 4.6625048511656675e-06, - "loss": 0.586, - "step": 344 - }, - { - "epoch": 1.6829268292682928, - "grad_norm": 3.196373462677002, - "learning_rate": 4.660579931757543e-06, - "loss": 0.5086, - "step": 345 - }, - { - "epoch": 1.6878048780487804, - "grad_norm": 2.7773900032043457, - "learning_rate": 4.6586499382113985e-06, - "loss": 0.5934, - "step": 346 - }, - { - "epoch": 1.6926829268292682, - "grad_norm": 2.3397631645202637, - "learning_rate": 4.6567148750598375e-06, - "loss": 0.7654, - "step": 347 - }, - { - "epoch": 1.697560975609756, - "grad_norm": 2.5567805767059326, - "learning_rate": 4.6547747468473705e-06, - "loss": 0.8908, - "step": 348 - }, - { - "epoch": 1.7024390243902439, - "grad_norm": 2.9218900203704834, - "learning_rate": 4.652829558130404e-06, - "loss": 0.4383, - "step": 349 - }, - { - "epoch": 1.7073170731707317, - "grad_norm": 2.962965250015259, - "learning_rate": 4.6508793134772265e-06, - "loss": 0.6031, - "step": 350 - }, - { - "epoch": 1.7121951219512195, - "grad_norm": 2.487739324569702, - "learning_rate": 4.648924017468003e-06, - "loss": 0.533, - "step": 351 - }, - { - "epoch": 1.7170731707317073, - "grad_norm": 2.769474506378174, - "learning_rate": 4.646963674694761e-06, - "loss": 0.8125, - "step": 352 - }, - { - "epoch": 1.721951219512195, - "grad_norm": 2.678243398666382, - "learning_rate": 4.64499828976138e-06, - "loss": 0.386, - "step": 353 - }, - { - "epoch": 1.726829268292683, - "grad_norm": 3.2764477729797363, - "learning_rate": 4.64302786728358e-06, - "loss": 0.4792, - "step": 354 - }, - { - "epoch": 1.7317073170731707, - "grad_norm": 2.6092708110809326, - "learning_rate": 4.641052411888913e-06, - "loss": 0.5031, - "step": 355 - }, - { - "epoch": 1.7365853658536585, - "grad_norm": 3.4002952575683594, - "learning_rate": 4.6390719282167515e-06, - "loss": 0.4726, - "step": 356 - }, - { - "epoch": 1.7414634146341463, - "grad_norm": 2.7558157444000244, - "learning_rate": 4.637086420918276e-06, - "loss": 0.7794, - "step": 357 - }, - { - "epoch": 1.7463414634146341, - "grad_norm": 2.239021062850952, - "learning_rate": 4.635095894656465e-06, - "loss": 0.6202, - "step": 358 - }, - { - "epoch": 1.751219512195122, - "grad_norm": 2.0502119064331055, - "learning_rate": 4.633100354106085e-06, - "loss": 0.3743, - "step": 359 - }, - { - "epoch": 1.7560975609756098, - "grad_norm": 2.842203140258789, - "learning_rate": 4.631099803953677e-06, - "loss": 0.8143, - "step": 360 - }, - { - "epoch": 1.7609756097560976, - "grad_norm": 2.8408772945404053, - "learning_rate": 4.629094248897546e-06, - "loss": 0.4986, - "step": 361 - }, - { - "epoch": 1.7658536585365854, - "grad_norm": 2.755530595779419, - "learning_rate": 4.627083693647757e-06, - "loss": 0.5833, - "step": 362 - }, - { - "epoch": 1.7707317073170732, - "grad_norm": 2.717116355895996, - "learning_rate": 4.625068142926111e-06, - "loss": 0.885, - "step": 363 - }, - { - "epoch": 1.775609756097561, - "grad_norm": 2.2784435749053955, - "learning_rate": 4.623047601466144e-06, - "loss": 0.7351, - "step": 364 - }, - { - "epoch": 1.7804878048780488, - "grad_norm": 2.3133914470672607, - "learning_rate": 4.621022074013114e-06, - "loss": 0.6426, - "step": 365 - }, - { - "epoch": 1.7853658536585366, - "grad_norm": 3.13562273979187, - "learning_rate": 4.618991565323987e-06, - "loss": 0.5588, - "step": 366 - }, - { - "epoch": 1.7902439024390244, - "grad_norm": 2.458186388015747, - "learning_rate": 4.616956080167426e-06, - "loss": 0.5424, - "step": 367 - }, - { - "epoch": 1.7951219512195122, - "grad_norm": 2.4780080318450928, - "learning_rate": 4.614915623323786e-06, - "loss": 0.8664, - "step": 368 - }, - { - "epoch": 1.8, - "grad_norm": 2.623966932296753, - "learning_rate": 4.612870199585092e-06, - "loss": 0.4495, - "step": 369 - }, - { - "epoch": 1.8048780487804879, - "grad_norm": 2.7326242923736572, - "learning_rate": 4.610819813755038e-06, - "loss": 0.5099, - "step": 370 - }, - { - "epoch": 1.8097560975609757, - "grad_norm": 2.951014757156372, - "learning_rate": 4.608764470648971e-06, - "loss": 0.4322, - "step": 371 - }, - { - "epoch": 1.8146341463414632, - "grad_norm": 2.869870185852051, - "learning_rate": 4.606704175093879e-06, - "loss": 0.4744, - "step": 372 - }, - { - "epoch": 1.819512195121951, - "grad_norm": 2.686054229736328, - "learning_rate": 4.604638931928383e-06, - "loss": 0.797, - "step": 373 - }, - { - "epoch": 1.8243902439024389, - "grad_norm": 2.6421749591827393, - "learning_rate": 4.602568746002718e-06, - "loss": 0.4904, - "step": 374 - }, - { - "epoch": 1.8292682926829267, - "grad_norm": 2.949144124984741, - "learning_rate": 4.600493622178734e-06, - "loss": 0.8682, - "step": 375 - }, - { - "epoch": 1.8341463414634145, - "grad_norm": 2.554733991622925, - "learning_rate": 4.598413565329876e-06, - "loss": 0.5426, - "step": 376 - }, - { - "epoch": 1.8390243902439023, - "grad_norm": 2.3334367275238037, - "learning_rate": 4.596328580341169e-06, - "loss": 0.5628, - "step": 377 - }, - { - "epoch": 1.84390243902439, - "grad_norm": 2.577664613723755, - "learning_rate": 4.5942386721092195e-06, - "loss": 0.7073, - "step": 378 - }, - { - "epoch": 1.848780487804878, - "grad_norm": 3.1247141361236572, - "learning_rate": 4.592143845542189e-06, - "loss": 0.6526, - "step": 379 - }, - { - "epoch": 1.8536585365853657, - "grad_norm": 2.7015256881713867, - "learning_rate": 4.590044105559797e-06, - "loss": 0.8377, - "step": 380 - }, - { - "epoch": 1.8585365853658535, - "grad_norm": 2.573819398880005, - "learning_rate": 4.587939457093296e-06, - "loss": 0.5485, - "step": 381 - }, - { - "epoch": 1.8634146341463413, - "grad_norm": 2.8607687950134277, - "learning_rate": 4.585829905085468e-06, - "loss": 0.6065, - "step": 382 - }, - { - "epoch": 1.8682926829268292, - "grad_norm": 2.526625394821167, - "learning_rate": 4.5837154544906135e-06, - "loss": 0.7812, - "step": 383 - }, - { - "epoch": 1.873170731707317, - "grad_norm": 2.4161314964294434, - "learning_rate": 4.581596110274535e-06, - "loss": 0.7061, - "step": 384 - }, - { - "epoch": 1.8780487804878048, - "grad_norm": 2.34195876121521, - "learning_rate": 4.579471877414527e-06, - "loss": 0.9446, - "step": 385 - }, - { - "epoch": 1.8829268292682926, - "grad_norm": 3.7710156440734863, - "learning_rate": 4.577342760899368e-06, - "loss": 0.78, - "step": 386 - }, - { - "epoch": 1.8878048780487804, - "grad_norm": 2.5192313194274902, - "learning_rate": 4.575208765729302e-06, - "loss": 0.5205, - "step": 387 - }, - { - "epoch": 1.8926829268292682, - "grad_norm": 2.467484951019287, - "learning_rate": 4.573069896916035e-06, - "loss": 0.7827, - "step": 388 - }, - { - "epoch": 1.897560975609756, - "grad_norm": 2.640676259994507, - "learning_rate": 4.5709261594827125e-06, - "loss": 0.6512, - "step": 389 - }, - { - "epoch": 1.9024390243902438, - "grad_norm": 2.976623296737671, - "learning_rate": 4.568777558463922e-06, - "loss": 0.5548, - "step": 390 - }, - { - "epoch": 1.9073170731707316, - "grad_norm": 2.289722442626953, - "learning_rate": 4.566624098905665e-06, - "loss": 0.7038, - "step": 391 - }, - { - "epoch": 1.9121951219512194, - "grad_norm": 2.9512040615081787, - "learning_rate": 4.564465785865359e-06, - "loss": 0.5416, - "step": 392 - }, - { - "epoch": 1.9170731707317072, - "grad_norm": 2.394874095916748, - "learning_rate": 4.56230262441182e-06, - "loss": 0.4068, - "step": 393 - }, - { - "epoch": 1.921951219512195, - "grad_norm": 6.885486602783203, - "learning_rate": 4.560134619625247e-06, - "loss": 0.6197, - "step": 394 - }, - { - "epoch": 1.9268292682926829, - "grad_norm": 2.311272144317627, - "learning_rate": 4.5579617765972155e-06, - "loss": 0.5692, - "step": 395 - }, - { - "epoch": 1.9317073170731707, - "grad_norm": 2.4662933349609375, - "learning_rate": 4.555784100430662e-06, - "loss": 0.4836, - "step": 396 - }, - { - "epoch": 1.9365853658536585, - "grad_norm": 2.602741241455078, - "learning_rate": 4.553601596239877e-06, - "loss": 0.4594, - "step": 397 - }, - { - "epoch": 1.9414634146341463, - "grad_norm": 3.443909168243408, - "learning_rate": 4.551414269150489e-06, - "loss": 0.6053, - "step": 398 - }, - { - "epoch": 1.946341463414634, - "grad_norm": 2.5391502380371094, - "learning_rate": 4.54922212429945e-06, - "loss": 0.5133, - "step": 399 - }, - { - "epoch": 1.951219512195122, - "grad_norm": 2.7105700969696045, - "learning_rate": 4.547025166835027e-06, - "loss": 0.6984, - "step": 400 - }, - { - "epoch": 1.9560975609756097, - "grad_norm": 2.6098098754882812, - "learning_rate": 4.544823401916794e-06, - "loss": 0.7944, - "step": 401 - }, - { - "epoch": 1.9609756097560975, - "grad_norm": 2.7527425289154053, - "learning_rate": 4.542616834715612e-06, - "loss": 0.639, - "step": 402 - }, - { - "epoch": 1.9658536585365853, - "grad_norm": 2.760303258895874, - "learning_rate": 4.540405470413618e-06, - "loss": 0.4229, - "step": 403 - }, - { - "epoch": 1.9707317073170731, - "grad_norm": 2.4989006519317627, - "learning_rate": 4.53818931420422e-06, - "loss": 0.7482, - "step": 404 - }, - { - "epoch": 1.975609756097561, - "grad_norm": 2.3687169551849365, - "learning_rate": 4.535968371292076e-06, - "loss": 0.6146, - "step": 405 - }, - { - "epoch": 1.9804878048780488, - "grad_norm": 2.4285244941711426, - "learning_rate": 4.533742646893086e-06, - "loss": 0.6964, - "step": 406 - }, - { - "epoch": 1.9853658536585366, - "grad_norm": 2.337266206741333, - "learning_rate": 4.531512146234383e-06, - "loss": 0.6248, - "step": 407 - }, - { - "epoch": 1.9902439024390244, - "grad_norm": 2.704972743988037, - "learning_rate": 4.529276874554312e-06, - "loss": 0.8715, - "step": 408 - }, - { - "epoch": 1.9951219512195122, - "grad_norm": 2.2151944637298584, - "learning_rate": 4.527036837102426e-06, - "loss": 0.4945, - "step": 409 - }, - { - "epoch": 2.0, - "grad_norm": 2.691330671310425, - "learning_rate": 4.524792039139471e-06, - "loss": 0.7085, - "step": 410 - }, - { - "epoch": 2.004878048780488, - "grad_norm": 2.9423086643218994, - "learning_rate": 4.522542485937369e-06, - "loss": 0.3178, - "step": 411 - }, - { - "epoch": 2.0097560975609756, - "grad_norm": 2.860677719116211, - "learning_rate": 4.520288182779214e-06, - "loss": 0.5092, - "step": 412 - }, - { - "epoch": 2.0146341463414634, - "grad_norm": 2.7503843307495117, - "learning_rate": 4.518029134959253e-06, - "loss": 0.314, - "step": 413 - }, - { - "epoch": 2.0195121951219512, - "grad_norm": 4.541809558868408, - "learning_rate": 4.515765347782878e-06, - "loss": 0.5287, - "step": 414 - }, - { - "epoch": 2.024390243902439, - "grad_norm": 9.126826286315918, - "learning_rate": 4.5134968265666085e-06, - "loss": 0.8221, - "step": 415 - }, - { - "epoch": 2.029268292682927, - "grad_norm": 4.4358229637146, - "learning_rate": 4.511223576638084e-06, - "loss": 0.5402, - "step": 416 - }, - { - "epoch": 2.0341463414634147, - "grad_norm": 3.1090731620788574, - "learning_rate": 4.508945603336049e-06, - "loss": 0.617, - "step": 417 - }, - { - "epoch": 2.0390243902439025, - "grad_norm": 2.6933369636535645, - "learning_rate": 4.50666291201034e-06, - "loss": 0.3541, - "step": 418 - }, - { - "epoch": 2.0439024390243903, - "grad_norm": 5.898099899291992, - "learning_rate": 4.504375508021876e-06, - "loss": 0.4842, - "step": 419 - }, - { - "epoch": 2.048780487804878, - "grad_norm": 2.950939178466797, - "learning_rate": 4.50208339674264e-06, - "loss": 0.6168, - "step": 420 - }, - { - "epoch": 2.053658536585366, - "grad_norm": 3.2513322830200195, - "learning_rate": 4.499786583555675e-06, - "loss": 0.6425, - "step": 421 - }, - { - "epoch": 2.0585365853658537, - "grad_norm": 2.911562442779541, - "learning_rate": 4.497485073855061e-06, - "loss": 0.364, - "step": 422 - }, - { - "epoch": 2.0634146341463415, - "grad_norm": 4.2179274559021, - "learning_rate": 4.495178873045913e-06, - "loss": 0.3687, - "step": 423 - }, - { - "epoch": 2.0682926829268293, - "grad_norm": 3.2010395526885986, - "learning_rate": 4.4928679865443605e-06, - "loss": 0.4068, - "step": 424 - }, - { - "epoch": 2.073170731707317, - "grad_norm": 3.2425589561462402, - "learning_rate": 4.4905524197775366e-06, - "loss": 0.4759, - "step": 425 - }, - { - "epoch": 2.078048780487805, - "grad_norm": 2.9252519607543945, - "learning_rate": 4.4882321781835666e-06, - "loss": 0.4197, - "step": 426 - }, - { - "epoch": 2.0829268292682928, - "grad_norm": 2.7859911918640137, - "learning_rate": 4.4859072672115565e-06, - "loss": 0.2294, - "step": 427 - }, - { - "epoch": 2.0878048780487806, - "grad_norm": 3.138796091079712, - "learning_rate": 4.483577692321577e-06, - "loss": 0.7572, - "step": 428 - }, - { - "epoch": 2.0926829268292684, - "grad_norm": 3.1447339057922363, - "learning_rate": 4.481243458984651e-06, - "loss": 0.4035, - "step": 429 - }, - { - "epoch": 2.097560975609756, - "grad_norm": 3.1876862049102783, - "learning_rate": 4.478904572682743e-06, - "loss": 0.5776, - "step": 430 - }, - { - "epoch": 2.102439024390244, - "grad_norm": 2.934257745742798, - "learning_rate": 4.476561038908745e-06, - "loss": 0.4005, - "step": 431 - }, - { - "epoch": 2.107317073170732, - "grad_norm": 2.904954433441162, - "learning_rate": 4.474212863166464e-06, - "loss": 0.5689, - "step": 432 - }, - { - "epoch": 2.1121951219512196, - "grad_norm": 3.6023731231689453, - "learning_rate": 4.471860050970608e-06, - "loss": 0.5068, - "step": 433 - }, - { - "epoch": 2.1170731707317074, - "grad_norm": 4.073422431945801, - "learning_rate": 4.469502607846774e-06, - "loss": 0.8349, - "step": 434 - }, - { - "epoch": 2.1219512195121952, - "grad_norm": 2.813789129257202, - "learning_rate": 4.467140539331434e-06, - "loss": 0.3641, - "step": 435 - }, - { - "epoch": 2.126829268292683, - "grad_norm": 3.874516248703003, - "learning_rate": 4.464773850971924e-06, - "loss": 0.222, - "step": 436 - }, - { - "epoch": 2.131707317073171, - "grad_norm": 3.1221084594726562, - "learning_rate": 4.46240254832643e-06, - "loss": 0.3799, - "step": 437 - }, - { - "epoch": 2.1365853658536587, - "grad_norm": 3.298933267593384, - "learning_rate": 4.460026636963971e-06, - "loss": 0.4759, - "step": 438 - }, - { - "epoch": 2.1414634146341465, - "grad_norm": 2.456233024597168, - "learning_rate": 4.4576461224643965e-06, - "loss": 0.384, - "step": 439 - }, - { - "epoch": 2.1463414634146343, - "grad_norm": 2.8427460193634033, - "learning_rate": 4.455261010418359e-06, - "loss": 0.391, - "step": 440 - }, - { - "epoch": 2.151219512195122, - "grad_norm": 3.0267624855041504, - "learning_rate": 4.452871306427314e-06, - "loss": 0.6177, - "step": 441 - }, - { - "epoch": 2.15609756097561, - "grad_norm": 3.437302827835083, - "learning_rate": 4.450477016103498e-06, - "loss": 0.5143, - "step": 442 - }, - { - "epoch": 2.1609756097560977, - "grad_norm": 3.152210235595703, - "learning_rate": 4.4480781450699205e-06, - "loss": 0.3783, - "step": 443 - }, - { - "epoch": 2.1658536585365855, - "grad_norm": 3.507753372192383, - "learning_rate": 4.4456746989603464e-06, - "loss": 0.3574, - "step": 444 - }, - { - "epoch": 2.1707317073170733, - "grad_norm": 2.8855366706848145, - "learning_rate": 4.443266683419289e-06, - "loss": 0.5088, - "step": 445 - }, - { - "epoch": 2.175609756097561, - "grad_norm": 2.7776072025299072, - "learning_rate": 4.440854104101988e-06, - "loss": 0.3773, - "step": 446 - }, - { - "epoch": 2.180487804878049, - "grad_norm": 3.019484281539917, - "learning_rate": 4.438436966674406e-06, - "loss": 0.5002, - "step": 447 - }, - { - "epoch": 2.1853658536585368, - "grad_norm": 3.6962451934814453, - "learning_rate": 4.436015276813208e-06, - "loss": 0.4601, - "step": 448 - }, - { - "epoch": 2.1902439024390246, - "grad_norm": 3.1288888454437256, - "learning_rate": 4.4335890402057505e-06, - "loss": 0.5422, - "step": 449 - }, - { - "epoch": 2.1951219512195124, - "grad_norm": 3.7083234786987305, - "learning_rate": 4.431158262550067e-06, - "loss": 0.4684, - "step": 450 - }, - { - "epoch": 2.2, - "grad_norm": 3.1714789867401123, - "learning_rate": 4.428722949554858e-06, - "loss": 0.2528, - "step": 451 - }, - { - "epoch": 2.204878048780488, - "grad_norm": 3.0773637294769287, - "learning_rate": 4.426283106939474e-06, - "loss": 0.4061, - "step": 452 - }, - { - "epoch": 2.209756097560976, - "grad_norm": 2.604093551635742, - "learning_rate": 4.423838740433903e-06, - "loss": 0.4779, - "step": 453 - }, - { - "epoch": 2.2146341463414636, - "grad_norm": 2.9293880462646484, - "learning_rate": 4.4213898557787586e-06, - "loss": 0.233, - "step": 454 - }, - { - "epoch": 2.2195121951219514, - "grad_norm": 2.9195125102996826, - "learning_rate": 4.4189364587252636e-06, - "loss": 0.7756, - "step": 455 - }, - { - "epoch": 2.2243902439024392, - "grad_norm": 3.2263920307159424, - "learning_rate": 4.416478555035241e-06, - "loss": 0.2806, - "step": 456 - }, - { - "epoch": 2.229268292682927, - "grad_norm": 2.8109211921691895, - "learning_rate": 4.4140161504810935e-06, - "loss": 0.3923, - "step": 457 - }, - { - "epoch": 2.234146341463415, - "grad_norm": 2.645853281021118, - "learning_rate": 4.4115492508457986e-06, - "loss": 0.289, - "step": 458 - }, - { - "epoch": 2.2390243902439027, - "grad_norm": 3.3712451457977295, - "learning_rate": 4.409077861922887e-06, - "loss": 0.5053, - "step": 459 - }, - { - "epoch": 2.2439024390243905, - "grad_norm": 2.6892387866973877, - "learning_rate": 4.406601989516435e-06, - "loss": 0.3363, - "step": 460 - }, - { - "epoch": 2.2487804878048783, - "grad_norm": 2.3195693492889404, - "learning_rate": 4.404121639441047e-06, - "loss": 0.2367, - "step": 461 - }, - { - "epoch": 2.253658536585366, - "grad_norm": 3.0115339756011963, - "learning_rate": 4.401636817521843e-06, - "loss": 0.4942, - "step": 462 - }, - { - "epoch": 2.258536585365854, - "grad_norm": 2.9528865814208984, - "learning_rate": 4.399147529594447e-06, - "loss": 0.3328, - "step": 463 - }, - { - "epoch": 2.2634146341463417, - "grad_norm": 3.110799551010132, - "learning_rate": 4.3966537815049686e-06, - "loss": 0.3917, - "step": 464 - }, - { - "epoch": 2.2682926829268295, - "grad_norm": 3.2973792552948, - "learning_rate": 4.394155579109994e-06, - "loss": 0.5203, - "step": 465 - }, - { - "epoch": 2.2731707317073173, - "grad_norm": 4.7184038162231445, - "learning_rate": 4.391652928276572e-06, - "loss": 0.729, - "step": 466 - }, - { - "epoch": 2.278048780487805, - "grad_norm": 3.1992053985595703, - "learning_rate": 4.389145834882195e-06, - "loss": 0.4822, - "step": 467 - }, - { - "epoch": 2.2829268292682925, - "grad_norm": 4.320055961608887, - "learning_rate": 4.386634304814789e-06, - "loss": 0.3962, - "step": 468 - }, - { - "epoch": 2.2878048780487803, - "grad_norm": 3.704524517059326, - "learning_rate": 4.384118343972704e-06, - "loss": 0.5996, - "step": 469 - }, - { - "epoch": 2.292682926829268, - "grad_norm": 2.8172974586486816, - "learning_rate": 4.381597958264692e-06, - "loss": 0.6328, - "step": 470 - }, - { - "epoch": 2.297560975609756, - "grad_norm": 2.7418763637542725, - "learning_rate": 4.379073153609896e-06, - "loss": 0.6254, - "step": 471 - }, - { - "epoch": 2.3024390243902437, - "grad_norm": 5.364504337310791, - "learning_rate": 4.37654393593784e-06, - "loss": 0.6793, - "step": 472 - }, - { - "epoch": 2.3073170731707315, - "grad_norm": 2.935291290283203, - "learning_rate": 4.3740103111884096e-06, - "loss": 0.4161, - "step": 473 - }, - { - "epoch": 2.3121951219512193, - "grad_norm": 3.085155963897705, - "learning_rate": 4.371472285311842e-06, - "loss": 0.3329, - "step": 474 - }, - { - "epoch": 2.317073170731707, - "grad_norm": 2.2218778133392334, - "learning_rate": 4.368929864268709e-06, - "loss": 0.2687, - "step": 475 - }, - { - "epoch": 2.321951219512195, - "grad_norm": 3.3985276222229004, - "learning_rate": 4.366383054029907e-06, - "loss": 0.5934, - "step": 476 - }, - { - "epoch": 2.3268292682926828, - "grad_norm": 3.0726048946380615, - "learning_rate": 4.363831860576638e-06, - "loss": 0.5033, - "step": 477 - }, - { - "epoch": 2.3317073170731706, - "grad_norm": 2.728628635406494, - "learning_rate": 4.361276289900396e-06, - "loss": 0.4492, - "step": 478 - }, - { - "epoch": 2.3365853658536584, - "grad_norm": 3.1294424533843994, - "learning_rate": 4.358716348002962e-06, - "loss": 0.619, - "step": 479 - }, - { - "epoch": 2.341463414634146, - "grad_norm": 3.5564961433410645, - "learning_rate": 4.356152040896376e-06, - "loss": 0.4018, - "step": 480 - }, - { - "epoch": 2.346341463414634, - "grad_norm": 2.9329910278320312, - "learning_rate": 4.3535833746029335e-06, - "loss": 0.3062, - "step": 481 - }, - { - "epoch": 2.351219512195122, - "grad_norm": 3.744480848312378, - "learning_rate": 4.351010355155165e-06, - "loss": 0.3387, - "step": 482 - }, - { - "epoch": 2.3560975609756096, - "grad_norm": 2.537912130355835, - "learning_rate": 4.348432988595828e-06, - "loss": 0.3103, - "step": 483 - }, - { - "epoch": 2.3609756097560974, - "grad_norm": 3.232128858566284, - "learning_rate": 4.345851280977885e-06, - "loss": 0.6782, - "step": 484 - }, - { - "epoch": 2.3658536585365852, - "grad_norm": 3.601463794708252, - "learning_rate": 4.343265238364496e-06, - "loss": 0.3195, - "step": 485 - }, - { - "epoch": 2.370731707317073, - "grad_norm": 4.05529260635376, - "learning_rate": 4.340674866829001e-06, - "loss": 0.4639, - "step": 486 - }, - { - "epoch": 2.375609756097561, - "grad_norm": 4.128161430358887, - "learning_rate": 4.338080172454908e-06, - "loss": 0.7229, - "step": 487 - }, - { - "epoch": 2.3804878048780487, - "grad_norm": 2.665430784225464, - "learning_rate": 4.335481161335875e-06, - "loss": 0.4334, - "step": 488 - }, - { - "epoch": 2.3853658536585365, - "grad_norm": 3.777899742126465, - "learning_rate": 4.332877839575699e-06, - "loss": 0.3409, - "step": 489 - }, - { - "epoch": 2.3902439024390243, - "grad_norm": 2.9942116737365723, - "learning_rate": 4.330270213288301e-06, - "loss": 0.5221, - "step": 490 - }, - { - "epoch": 2.395121951219512, - "grad_norm": 3.518601417541504, - "learning_rate": 4.32765828859771e-06, - "loss": 0.7078, - "step": 491 - }, - { - "epoch": 2.4, - "grad_norm": 3.452350378036499, - "learning_rate": 4.325042071638051e-06, - "loss": 0.5902, - "step": 492 - }, - { - "epoch": 2.4048780487804877, - "grad_norm": 3.072655200958252, - "learning_rate": 4.322421568553529e-06, - "loss": 0.3746, - "step": 493 - }, - { - "epoch": 2.4097560975609755, - "grad_norm": 2.8621394634246826, - "learning_rate": 4.319796785498416e-06, - "loss": 0.3474, - "step": 494 - }, - { - "epoch": 2.4146341463414633, - "grad_norm": 3.3891537189483643, - "learning_rate": 4.317167728637032e-06, - "loss": 0.5171, - "step": 495 - }, - { - "epoch": 2.419512195121951, - "grad_norm": 2.505720376968384, - "learning_rate": 4.314534404143738e-06, - "loss": 0.4263, - "step": 496 - }, - { - "epoch": 2.424390243902439, - "grad_norm": 2.6280455589294434, - "learning_rate": 4.3118968182029155e-06, - "loss": 0.5072, - "step": 497 - }, - { - "epoch": 2.4292682926829268, - "grad_norm": 2.703711748123169, - "learning_rate": 4.3092549770089566e-06, - "loss": 0.2742, - "step": 498 - }, - { - "epoch": 2.4341463414634146, - "grad_norm": 3.0358169078826904, - "learning_rate": 4.306608886766243e-06, - "loss": 0.4814, - "step": 499 - }, - { - "epoch": 2.4390243902439024, - "grad_norm": 3.263326406478882, - "learning_rate": 4.303958553689137e-06, - "loss": 0.4188, - "step": 500 - }, - { - "epoch": 2.44390243902439, - "grad_norm": 2.833951950073242, - "learning_rate": 4.3013039840019675e-06, - "loss": 0.6436, - "step": 501 - }, - { - "epoch": 2.448780487804878, - "grad_norm": 3.6790921688079834, - "learning_rate": 4.2986451839390105e-06, - "loss": 0.2862, - "step": 502 - }, - { - "epoch": 2.453658536585366, - "grad_norm": 2.7376418113708496, - "learning_rate": 4.295982159744476e-06, - "loss": 0.4926, - "step": 503 - }, - { - "epoch": 2.4585365853658536, - "grad_norm": 3.575244665145874, - "learning_rate": 4.293314917672498e-06, - "loss": 0.5717, - "step": 504 - }, - { - "epoch": 2.4634146341463414, - "grad_norm": 2.8722269535064697, - "learning_rate": 4.290643463987114e-06, - "loss": 0.2707, - "step": 505 - }, - { - "epoch": 2.4682926829268292, - "grad_norm": 2.8118090629577637, - "learning_rate": 4.287967804962252e-06, - "loss": 0.347, - "step": 506 - }, - { - "epoch": 2.473170731707317, - "grad_norm": 3.345698356628418, - "learning_rate": 4.285287946881718e-06, - "loss": 0.2103, - "step": 507 - }, - { - "epoch": 2.478048780487805, - "grad_norm": 3.0156590938568115, - "learning_rate": 4.282603896039178e-06, - "loss": 0.6405, - "step": 508 - }, - { - "epoch": 2.4829268292682927, - "grad_norm": 3.102205753326416, - "learning_rate": 4.279915658738145e-06, - "loss": 0.4027, - "step": 509 - }, - { - "epoch": 2.4878048780487805, - "grad_norm": 2.8665261268615723, - "learning_rate": 4.277223241291966e-06, - "loss": 0.6503, - "step": 510 - }, - { - "epoch": 2.4926829268292683, - "grad_norm": 2.5396728515625, - "learning_rate": 4.274526650023801e-06, - "loss": 0.5006, - "step": 511 - }, - { - "epoch": 2.497560975609756, - "grad_norm": 3.4846577644348145, - "learning_rate": 4.271825891266617e-06, - "loss": 0.479, - "step": 512 - }, - { - "epoch": 2.502439024390244, - "grad_norm": 4.5995612144470215, - "learning_rate": 4.269120971363164e-06, - "loss": 0.6667, - "step": 513 - }, - { - "epoch": 2.5073170731707317, - "grad_norm": 3.2117559909820557, - "learning_rate": 4.266411896665967e-06, - "loss": 0.2977, - "step": 514 - }, - { - "epoch": 2.5121951219512195, - "grad_norm": 2.798161268234253, - "learning_rate": 4.263698673537309e-06, - "loss": 0.3912, - "step": 515 - }, - { - "epoch": 2.5170731707317073, - "grad_norm": 3.593287944793701, - "learning_rate": 4.260981308349214e-06, - "loss": 0.615, - "step": 516 - }, - { - "epoch": 2.521951219512195, - "grad_norm": 3.06075119972229, - "learning_rate": 4.258259807483434e-06, - "loss": 0.4559, - "step": 517 - }, - { - "epoch": 2.526829268292683, - "grad_norm": 2.893202543258667, - "learning_rate": 4.255534177331435e-06, - "loss": 0.4993, - "step": 518 - }, - { - "epoch": 2.5317073170731708, - "grad_norm": 3.613308906555176, - "learning_rate": 4.252804424294378e-06, - "loss": 0.4581, - "step": 519 - }, - { - "epoch": 2.5365853658536586, - "grad_norm": 3.1191842555999756, - "learning_rate": 4.25007055478311e-06, - "loss": 0.5403, - "step": 520 - }, - { - "epoch": 2.5414634146341464, - "grad_norm": 3.653355836868286, - "learning_rate": 4.247332575218144e-06, - "loss": 0.3658, - "step": 521 - }, - { - "epoch": 2.546341463414634, - "grad_norm": 3.1386306285858154, - "learning_rate": 4.244590492029643e-06, - "loss": 0.6342, - "step": 522 - }, - { - "epoch": 2.551219512195122, - "grad_norm": 3.0894742012023926, - "learning_rate": 4.241844311657411e-06, - "loss": 0.3411, - "step": 523 - }, - { - "epoch": 2.55609756097561, - "grad_norm": 3.205916404724121, - "learning_rate": 4.239094040550875e-06, - "loss": 0.2829, - "step": 524 - }, - { - "epoch": 2.5609756097560976, - "grad_norm": 2.378857374191284, - "learning_rate": 4.236339685169065e-06, - "loss": 0.4749, - "step": 525 - }, - { - "epoch": 2.5658536585365854, - "grad_norm": 3.8657875061035156, - "learning_rate": 4.233581251980604e-06, - "loss": 0.2485, - "step": 526 - }, - { - "epoch": 2.5707317073170732, - "grad_norm": 3.565807580947876, - "learning_rate": 4.230818747463696e-06, - "loss": 0.4488, - "step": 527 - }, - { - "epoch": 2.575609756097561, - "grad_norm": 2.6909685134887695, - "learning_rate": 4.228052178106101e-06, - "loss": 0.4495, - "step": 528 - }, - { - "epoch": 2.580487804878049, - "grad_norm": 2.937680244445801, - "learning_rate": 4.2252815504051285e-06, - "loss": 0.2396, - "step": 529 - }, - { - "epoch": 2.5853658536585367, - "grad_norm": 5.55731201171875, - "learning_rate": 4.222506870867618e-06, - "loss": 0.6784, - "step": 530 - }, - { - "epoch": 2.5902439024390245, - "grad_norm": 2.7388782501220703, - "learning_rate": 4.2197281460099245e-06, - "loss": 0.5543, - "step": 531 - }, - { - "epoch": 2.5951219512195123, - "grad_norm": 3.311134099960327, - "learning_rate": 4.216945382357905e-06, - "loss": 0.5281, - "step": 532 - }, - { - "epoch": 2.6, - "grad_norm": 3.511232376098633, - "learning_rate": 4.214158586446901e-06, - "loss": 0.8019, - "step": 533 - }, - { - "epoch": 2.604878048780488, - "grad_norm": 4.416641712188721, - "learning_rate": 4.211367764821722e-06, - "loss": 0.7769, - "step": 534 - }, - { - "epoch": 2.6097560975609757, - "grad_norm": 2.9849908351898193, - "learning_rate": 4.208572924036634e-06, - "loss": 0.4077, - "step": 535 - }, - { - "epoch": 2.6146341463414635, - "grad_norm": 2.8512160778045654, - "learning_rate": 4.2057740706553415e-06, - "loss": 0.433, - "step": 536 - }, - { - "epoch": 2.6195121951219513, - "grad_norm": 2.6729629039764404, - "learning_rate": 4.202971211250971e-06, - "loss": 0.5957, - "step": 537 - }, - { - "epoch": 2.624390243902439, - "grad_norm": 2.4570281505584717, - "learning_rate": 4.200164352406061e-06, - "loss": 0.3013, - "step": 538 - }, - { - "epoch": 2.629268292682927, - "grad_norm": 3.3771679401397705, - "learning_rate": 4.197353500712539e-06, - "loss": 0.5646, - "step": 539 - }, - { - "epoch": 2.6341463414634148, - "grad_norm": 3.163496494293213, - "learning_rate": 4.1945386627717115e-06, - "loss": 0.4529, - "step": 540 - }, - { - "epoch": 2.6390243902439026, - "grad_norm": 8.32056713104248, - "learning_rate": 4.191719845194246e-06, - "loss": 0.6076, - "step": 541 - }, - { - "epoch": 2.6439024390243904, - "grad_norm": 2.7657363414764404, - "learning_rate": 4.188897054600156e-06, - "loss": 0.4855, - "step": 542 - }, - { - "epoch": 2.648780487804878, - "grad_norm": 3.299283504486084, - "learning_rate": 4.186070297618787e-06, - "loss": 0.5836, - "step": 543 - }, - { - "epoch": 2.653658536585366, - "grad_norm": 2.3928205966949463, - "learning_rate": 4.183239580888799e-06, - "loss": 0.6266, - "step": 544 - }, - { - "epoch": 2.658536585365854, - "grad_norm": 3.395251750946045, - "learning_rate": 4.18040491105815e-06, - "loss": 0.429, - "step": 545 - }, - { - "epoch": 2.6634146341463416, - "grad_norm": 2.690936803817749, - "learning_rate": 4.177566294784085e-06, - "loss": 0.391, - "step": 546 - }, - { - "epoch": 2.6682926829268294, - "grad_norm": 3.7687628269195557, - "learning_rate": 4.174723738733114e-06, - "loss": 0.6548, - "step": 547 - }, - { - "epoch": 2.6731707317073172, - "grad_norm": 2.7884976863861084, - "learning_rate": 4.171877249581001e-06, - "loss": 0.5188, - "step": 548 - }, - { - "epoch": 2.678048780487805, - "grad_norm": 3.0811641216278076, - "learning_rate": 4.169026834012748e-06, - "loss": 0.3494, - "step": 549 - }, - { - "epoch": 2.682926829268293, - "grad_norm": 3.090078592300415, - "learning_rate": 4.166172498722577e-06, - "loss": 0.3621, - "step": 550 - }, - { - "epoch": 2.68780487804878, - "grad_norm": 3.925424098968506, - "learning_rate": 4.163314250413913e-06, - "loss": 0.7187, - "step": 551 - }, - { - "epoch": 2.692682926829268, - "grad_norm": 3.3590312004089355, - "learning_rate": 4.160452095799378e-06, - "loss": 0.428, - "step": 552 - }, - { - "epoch": 2.697560975609756, - "grad_norm": 3.08093523979187, - "learning_rate": 4.157586041600759e-06, - "loss": 0.202, - "step": 553 - }, - { - "epoch": 2.7024390243902436, - "grad_norm": 2.9391448497772217, - "learning_rate": 4.154716094549008e-06, - "loss": 0.5238, - "step": 554 - }, - { - "epoch": 2.7073170731707314, - "grad_norm": 2.9869461059570312, - "learning_rate": 4.151842261384217e-06, - "loss": 0.3073, - "step": 555 - }, - { - "epoch": 2.7121951219512193, - "grad_norm": 3.8973608016967773, - "learning_rate": 4.148964548855603e-06, - "loss": 0.8435, - "step": 556 - }, - { - "epoch": 2.717073170731707, - "grad_norm": 2.3596479892730713, - "learning_rate": 4.146082963721496e-06, - "loss": 0.2562, - "step": 557 - }, - { - "epoch": 2.721951219512195, - "grad_norm": 3.4964873790740967, - "learning_rate": 4.143197512749322e-06, - "loss": 1.0144, - "step": 558 - }, - { - "epoch": 2.7268292682926827, - "grad_norm": 2.8925280570983887, - "learning_rate": 4.140308202715581e-06, - "loss": 0.7581, - "step": 559 - }, - { - "epoch": 2.7317073170731705, - "grad_norm": 2.622724771499634, - "learning_rate": 4.13741504040584e-06, - "loss": 0.3114, - "step": 560 - }, - { - "epoch": 2.7365853658536583, - "grad_norm": 3.775834321975708, - "learning_rate": 4.134518032614713e-06, - "loss": 0.4384, - "step": 561 - }, - { - "epoch": 2.741463414634146, - "grad_norm": 2.691236972808838, - "learning_rate": 4.1316171861458445e-06, - "loss": 0.3141, - "step": 562 - }, - { - "epoch": 2.746341463414634, - "grad_norm": 3.059152841567993, - "learning_rate": 4.128712507811893e-06, - "loss": 0.5777, - "step": 563 - }, - { - "epoch": 2.7512195121951217, - "grad_norm": 2.867432117462158, - "learning_rate": 4.125804004434517e-06, - "loss": 0.5542, - "step": 564 - }, - { - "epoch": 2.7560975609756095, - "grad_norm": 2.796438694000244, - "learning_rate": 4.12289168284436e-06, - "loss": 0.3442, - "step": 565 - }, - { - "epoch": 2.7609756097560973, - "grad_norm": 3.052199125289917, - "learning_rate": 4.119975549881029e-06, - "loss": 0.4754, - "step": 566 - }, - { - "epoch": 2.765853658536585, - "grad_norm": 2.5463602542877197, - "learning_rate": 4.1170556123930846e-06, - "loss": 0.2988, - "step": 567 - }, - { - "epoch": 2.770731707317073, - "grad_norm": 3.003124475479126, - "learning_rate": 4.114131877238021e-06, - "loss": 0.4642, - "step": 568 - }, - { - "epoch": 2.7756097560975608, - "grad_norm": 2.4988298416137695, - "learning_rate": 4.111204351282254e-06, - "loss": 0.3493, - "step": 569 - }, - { - "epoch": 2.7804878048780486, - "grad_norm": 2.7403693199157715, - "learning_rate": 4.108273041401098e-06, - "loss": 0.4007, - "step": 570 - }, - { - "epoch": 2.7853658536585364, - "grad_norm": 4.101940155029297, - "learning_rate": 4.105337954478756e-06, - "loss": 0.7815, - "step": 571 - }, - { - "epoch": 2.790243902439024, - "grad_norm": 3.229969024658203, - "learning_rate": 4.102399097408304e-06, - "loss": 0.6099, - "step": 572 - }, - { - "epoch": 2.795121951219512, - "grad_norm": 3.234693765640259, - "learning_rate": 4.099456477091667e-06, - "loss": 0.2478, - "step": 573 - }, - { - "epoch": 2.8, - "grad_norm": 2.9824702739715576, - "learning_rate": 4.096510100439611e-06, - "loss": 0.6403, - "step": 574 - }, - { - "epoch": 2.8048780487804876, - "grad_norm": 2.8012478351593018, - "learning_rate": 4.093559974371725e-06, - "loss": 0.2509, - "step": 575 - }, - { - "epoch": 2.8097560975609754, - "grad_norm": 2.915400743484497, - "learning_rate": 4.0906061058164e-06, - "loss": 0.7552, - "step": 576 - }, - { - "epoch": 2.8146341463414632, - "grad_norm": 3.467665672302246, - "learning_rate": 4.087648501710819e-06, - "loss": 0.3146, - "step": 577 - }, - { - "epoch": 2.819512195121951, - "grad_norm": 3.1628401279449463, - "learning_rate": 4.084687169000938e-06, - "loss": 0.507, - "step": 578 - }, - { - "epoch": 2.824390243902439, - "grad_norm": 2.4069066047668457, - "learning_rate": 4.081722114641469e-06, - "loss": 0.4116, - "step": 579 - }, - { - "epoch": 2.8292682926829267, - "grad_norm": 3.698174238204956, - "learning_rate": 4.0787533455958626e-06, - "loss": 0.2264, - "step": 580 - }, - { - "epoch": 2.8341463414634145, - "grad_norm": 3.0896191596984863, - "learning_rate": 4.075780868836296e-06, - "loss": 0.3197, - "step": 581 - }, - { - "epoch": 2.8390243902439023, - "grad_norm": 3.098562240600586, - "learning_rate": 4.072804691343653e-06, - "loss": 0.4045, - "step": 582 - }, - { - "epoch": 2.84390243902439, - "grad_norm": 3.9232118129730225, - "learning_rate": 4.069824820107507e-06, - "loss": 0.9564, - "step": 583 - }, - { - "epoch": 2.848780487804878, - "grad_norm": 2.7176268100738525, - "learning_rate": 4.06684126212611e-06, - "loss": 0.2703, - "step": 584 - }, - { - "epoch": 2.8536585365853657, - "grad_norm": 2.4905827045440674, - "learning_rate": 4.063854024406369e-06, - "loss": 0.4828, - "step": 585 - }, - { - "epoch": 2.8585365853658535, - "grad_norm": 2.848784923553467, - "learning_rate": 4.060863113963835e-06, - "loss": 0.4131, - "step": 586 - }, - { - "epoch": 2.8634146341463413, - "grad_norm": 2.599665403366089, - "learning_rate": 4.057868537822683e-06, - "loss": 0.4464, - "step": 587 - }, - { - "epoch": 2.868292682926829, - "grad_norm": 3.1770827770233154, - "learning_rate": 4.054870303015695e-06, - "loss": 0.2825, - "step": 588 - }, - { - "epoch": 2.873170731707317, - "grad_norm": 3.18332839012146, - "learning_rate": 4.05186841658425e-06, - "loss": 0.4438, - "step": 589 - }, - { - "epoch": 2.8780487804878048, - "grad_norm": 2.7485718727111816, - "learning_rate": 4.048862885578301e-06, - "loss": 0.4817, - "step": 590 - }, - { - "epoch": 2.8829268292682926, - "grad_norm": 2.9712934494018555, - "learning_rate": 4.045853717056358e-06, - "loss": 0.5157, - "step": 591 - }, - { - "epoch": 2.8878048780487804, - "grad_norm": 2.246858835220337, - "learning_rate": 4.0428409180854775e-06, - "loss": 0.4029, - "step": 592 - }, - { - "epoch": 2.892682926829268, - "grad_norm": 2.683434247970581, - "learning_rate": 4.039824495741238e-06, - "loss": 0.3796, - "step": 593 - }, - { - "epoch": 2.897560975609756, - "grad_norm": 2.6297569274902344, - "learning_rate": 4.036804457107733e-06, - "loss": 0.4467, - "step": 594 - }, - { - "epoch": 2.902439024390244, - "grad_norm": 5.318776607513428, - "learning_rate": 4.0337808092775435e-06, - "loss": 0.7007, - "step": 595 - }, - { - "epoch": 2.9073170731707316, - "grad_norm": 3.069889783859253, - "learning_rate": 4.030753559351728e-06, - "loss": 0.3219, - "step": 596 - }, - { - "epoch": 2.9121951219512194, - "grad_norm": 1.9730123281478882, - "learning_rate": 4.027722714439808e-06, - "loss": 0.3038, - "step": 597 - }, - { - "epoch": 2.9170731707317072, - "grad_norm": 3.7959916591644287, - "learning_rate": 4.024688281659743e-06, - "loss": 0.7768, - "step": 598 - }, - { - "epoch": 2.921951219512195, - "grad_norm": 3.900886297225952, - "learning_rate": 4.021650268137924e-06, - "loss": 0.4667, - "step": 599 - }, - { - "epoch": 2.926829268292683, - "grad_norm": 2.6155691146850586, - "learning_rate": 4.018608681009143e-06, - "loss": 0.3852, - "step": 600 - }, - { - "epoch": 2.9317073170731707, - "grad_norm": 3.2715704441070557, - "learning_rate": 4.015563527416596e-06, - "loss": 0.4804, - "step": 601 - }, - { - "epoch": 2.9365853658536585, - "grad_norm": 3.001425266265869, - "learning_rate": 4.012514814511844e-06, - "loss": 0.4152, - "step": 602 - }, - { - "epoch": 2.9414634146341463, - "grad_norm": 2.685360908508301, - "learning_rate": 4.009462549454816e-06, - "loss": 0.5029, - "step": 603 - }, - { - "epoch": 2.946341463414634, - "grad_norm": 3.4670183658599854, - "learning_rate": 4.006406739413775e-06, - "loss": 0.4857, - "step": 604 - }, - { - "epoch": 2.951219512195122, - "grad_norm": 3.0613298416137695, - "learning_rate": 4.003347391565317e-06, - "loss": 0.4449, - "step": 605 - }, - { - "epoch": 2.9560975609756097, - "grad_norm": 3.207186698913574, - "learning_rate": 4.000284513094342e-06, - "loss": 0.4808, - "step": 606 - }, - { - "epoch": 2.9609756097560975, - "grad_norm": 2.910578727722168, - "learning_rate": 3.997218111194042e-06, - "loss": 0.4395, - "step": 607 - }, - { - "epoch": 2.9658536585365853, - "grad_norm": 2.581918954849243, - "learning_rate": 3.994148193065886e-06, - "loss": 0.3264, - "step": 608 - }, - { - "epoch": 2.970731707317073, - "grad_norm": 2.6517748832702637, - "learning_rate": 3.991074765919598e-06, - "loss": 0.3285, - "step": 609 - }, - { - "epoch": 2.975609756097561, - "grad_norm": 3.509756088256836, - "learning_rate": 3.987997836973147e-06, - "loss": 0.3638, - "step": 610 - }, - { - "epoch": 2.9804878048780488, - "grad_norm": 2.7382352352142334, - "learning_rate": 3.984917413452721e-06, - "loss": 0.3853, - "step": 611 - }, - { - "epoch": 2.9853658536585366, - "grad_norm": 3.998974323272705, - "learning_rate": 3.981833502592717e-06, - "loss": 0.6411, - "step": 612 - }, - { - "epoch": 2.9902439024390244, - "grad_norm": 3.305126428604126, - "learning_rate": 3.978746111635725e-06, - "loss": 0.2759, - "step": 613 - }, - { - "epoch": 2.995121951219512, - "grad_norm": 3.137300968170166, - "learning_rate": 3.9756552478325045e-06, - "loss": 0.4566, - "step": 614 - }, - { - "epoch": 3.0, - "grad_norm": 2.617291212081909, - "learning_rate": 3.972560918441972e-06, - "loss": 0.2221, - "step": 615 - }, - { - "epoch": 3.004878048780488, - "grad_norm": 2.787429094314575, - "learning_rate": 3.969463130731183e-06, - "loss": 0.2403, - "step": 616 - }, - { - "epoch": 3.0097560975609756, - "grad_norm": 3.0412075519561768, - "learning_rate": 3.966361891975316e-06, - "loss": 0.2635, - "step": 617 - }, - { - "epoch": 3.0146341463414634, - "grad_norm": 2.9949851036071777, - "learning_rate": 3.963257209457652e-06, - "loss": 0.3294, - "step": 618 - }, - { - "epoch": 3.0195121951219512, - "grad_norm": 3.0510809421539307, - "learning_rate": 3.960149090469561e-06, - "loss": 0.1338, - "step": 619 - }, - { - "epoch": 3.024390243902439, - "grad_norm": 3.669482707977295, - "learning_rate": 3.957037542310484e-06, - "loss": 0.1469, - "step": 620 - }, - { - "epoch": 3.029268292682927, - "grad_norm": 4.677116870880127, - "learning_rate": 3.953922572287915e-06, - "loss": 0.2788, - "step": 621 - }, - { - "epoch": 3.0341463414634147, - "grad_norm": 4.33144474029541, - "learning_rate": 3.950804187717384e-06, - "loss": 0.4521, - "step": 622 - }, - { - "epoch": 3.0390243902439025, - "grad_norm": 3.466639757156372, - "learning_rate": 3.947682395922439e-06, - "loss": 0.5113, - "step": 623 - }, - { - "epoch": 3.0439024390243903, - "grad_norm": 3.2332122325897217, - "learning_rate": 3.9445572042346346e-06, - "loss": 0.0968, - "step": 624 - }, - { - "epoch": 3.048780487804878, - "grad_norm": 2.6108055114746094, - "learning_rate": 3.941428619993505e-06, - "loss": 0.2462, - "step": 625 - }, - { - "epoch": 3.053658536585366, - "grad_norm": 3.2512595653533936, - "learning_rate": 3.938296650546552e-06, - "loss": 0.1782, - "step": 626 - }, - { - "epoch": 3.0585365853658537, - "grad_norm": 3.4350366592407227, - "learning_rate": 3.935161303249231e-06, - "loss": 0.2955, - "step": 627 - }, - { - "epoch": 3.0634146341463415, - "grad_norm": 3.42012619972229, - "learning_rate": 3.932022585464928e-06, - "loss": 0.3259, - "step": 628 - }, - { - "epoch": 3.0682926829268293, - "grad_norm": 3.458043336868286, - "learning_rate": 3.928880504564943e-06, - "loss": 0.2306, - "step": 629 - }, - { - "epoch": 3.073170731707317, - "grad_norm": 2.646616220474243, - "learning_rate": 3.92573506792848e-06, - "loss": 0.2197, - "step": 630 - }, - { - "epoch": 3.078048780487805, - "grad_norm": 3.5558857917785645, - "learning_rate": 3.9225862829426184e-06, - "loss": 0.1607, - "step": 631 - }, - { - "epoch": 3.0829268292682928, - "grad_norm": 3.6011338233947754, - "learning_rate": 3.919434157002303e-06, - "loss": 0.3087, - "step": 632 - }, - { - "epoch": 3.0878048780487806, - "grad_norm": 2.339879035949707, - "learning_rate": 3.916278697510325e-06, - "loss": 0.2213, - "step": 633 - }, - { - "epoch": 3.0926829268292684, - "grad_norm": 3.268162488937378, - "learning_rate": 3.913119911877305e-06, - "loss": 0.318, - "step": 634 - }, - { - "epoch": 3.097560975609756, - "grad_norm": 4.062571048736572, - "learning_rate": 3.909957807521674e-06, - "loss": 0.1757, - "step": 635 - }, - { - "epoch": 3.102439024390244, - "grad_norm": 2.997659683227539, - "learning_rate": 3.906792391869657e-06, - "loss": 0.2391, - "step": 636 - }, - { - "epoch": 3.107317073170732, - "grad_norm": 3.7037394046783447, - "learning_rate": 3.903623672355258e-06, - "loss": 0.2548, - "step": 637 - }, - { - "epoch": 3.1121951219512196, - "grad_norm": 3.110579252243042, - "learning_rate": 3.900451656420237e-06, - "loss": 0.2389, - "step": 638 - }, - { - "epoch": 3.1170731707317074, - "grad_norm": 3.3332321643829346, - "learning_rate": 3.897276351514097e-06, - "loss": 0.1371, - "step": 639 - }, - { - "epoch": 3.1219512195121952, - "grad_norm": 3.8275935649871826, - "learning_rate": 3.894097765094065e-06, - "loss": 0.3363, - "step": 640 - }, - { - "epoch": 3.126829268292683, - "grad_norm": 2.3731374740600586, - "learning_rate": 3.890915904625075e-06, - "loss": 0.1314, - "step": 641 - }, - { - "epoch": 3.131707317073171, - "grad_norm": 3.1511282920837402, - "learning_rate": 3.887730777579751e-06, - "loss": 0.3563, - "step": 642 - }, - { - "epoch": 3.1365853658536587, - "grad_norm": 4.2254862785339355, - "learning_rate": 3.884542391438387e-06, - "loss": 0.5053, - "step": 643 - }, - { - "epoch": 3.1414634146341465, - "grad_norm": 4.579670429229736, - "learning_rate": 3.88135075368893e-06, - "loss": 0.6259, - "step": 644 - }, - { - "epoch": 3.1463414634146343, - "grad_norm": 3.2102746963500977, - "learning_rate": 3.878155871826968e-06, - "loss": 0.2599, - "step": 645 - }, - { - "epoch": 3.151219512195122, - "grad_norm": 2.5569686889648438, - "learning_rate": 3.874957753355701e-06, - "loss": 0.2075, - "step": 646 - }, - { - "epoch": 3.15609756097561, - "grad_norm": 3.588925838470459, - "learning_rate": 3.8717564057859365e-06, - "loss": 0.4577, - "step": 647 - }, - { - "epoch": 3.1609756097560977, - "grad_norm": 3.6163878440856934, - "learning_rate": 3.868551836636063e-06, - "loss": 0.4023, - "step": 648 - }, - { - "epoch": 3.1658536585365855, - "grad_norm": 3.8688390254974365, - "learning_rate": 3.865344053432035e-06, - "loss": 0.1669, - "step": 649 - }, - { - "epoch": 3.1707317073170733, - "grad_norm": 3.419734001159668, - "learning_rate": 3.862133063707353e-06, - "loss": 0.2766, - "step": 650 - }, - { - "epoch": 3.175609756097561, - "grad_norm": 2.9860243797302246, - "learning_rate": 3.858918875003053e-06, - "loss": 0.1788, - "step": 651 - }, - { - "epoch": 3.180487804878049, - "grad_norm": 3.0619022846221924, - "learning_rate": 3.855701494867679e-06, - "loss": 0.224, - "step": 652 - }, - { - "epoch": 3.1853658536585368, - "grad_norm": 3.3668978214263916, - "learning_rate": 3.852480930857275e-06, - "loss": 0.4029, - "step": 653 - }, - { - "epoch": 3.1902439024390246, - "grad_norm": 3.543147563934326, - "learning_rate": 3.849257190535356e-06, - "loss": 0.2096, - "step": 654 - }, - { - "epoch": 3.1951219512195124, - "grad_norm": 3.793619155883789, - "learning_rate": 3.846030281472902e-06, - "loss": 0.5574, - "step": 655 - }, - { - "epoch": 3.2, - "grad_norm": 3.021289110183716, - "learning_rate": 3.842800211248333e-06, - "loss": 0.2233, - "step": 656 - }, - { - "epoch": 3.204878048780488, - "grad_norm": 4.582934856414795, - "learning_rate": 3.839566987447492e-06, - "loss": 0.3871, - "step": 657 - }, - { - "epoch": 3.209756097560976, - "grad_norm": 2.996340274810791, - "learning_rate": 3.8363306176636296e-06, - "loss": 0.4325, - "step": 658 - }, - { - "epoch": 3.2146341463414636, - "grad_norm": 3.3190877437591553, - "learning_rate": 3.833091109497384e-06, - "loss": 0.5321, - "step": 659 - }, - { - "epoch": 3.2195121951219514, - "grad_norm": 3.2532856464385986, - "learning_rate": 3.829848470556765e-06, - "loss": 0.1359, - "step": 660 - }, - { - "epoch": 3.2243902439024392, - "grad_norm": 2.7875044345855713, - "learning_rate": 3.8266027084571335e-06, - "loss": 0.3145, - "step": 661 - }, - { - "epoch": 3.229268292682927, - "grad_norm": 3.748253583908081, - "learning_rate": 3.823353830821187e-06, - "loss": 0.1252, - "step": 662 - }, - { - "epoch": 3.234146341463415, - "grad_norm": 2.858293294906616, - "learning_rate": 3.820101845278937e-06, - "loss": 0.2589, - "step": 663 - }, - { - "epoch": 3.2390243902439027, - "grad_norm": 3.7470967769622803, - "learning_rate": 3.816846759467696e-06, - "loss": 0.2594, - "step": 664 - }, - { - "epoch": 3.2439024390243905, - "grad_norm": 3.676196813583374, - "learning_rate": 3.8135885810320587e-06, - "loss": 0.2998, - "step": 665 - }, - { - "epoch": 3.2487804878048783, - "grad_norm": 3.0943140983581543, - "learning_rate": 3.810327317623881e-06, - "loss": 0.2238, - "step": 666 - }, - { - "epoch": 3.253658536585366, - "grad_norm": 3.5907349586486816, - "learning_rate": 3.8070629769022628e-06, - "loss": 0.3381, - "step": 667 - }, - { - "epoch": 3.258536585365854, - "grad_norm": 3.1195285320281982, - "learning_rate": 3.8037955665335335e-06, - "loss": 0.2407, - "step": 668 - }, - { - "epoch": 3.2634146341463417, - "grad_norm": 3.422292947769165, - "learning_rate": 3.800525094191231e-06, - "loss": 0.2957, - "step": 669 - }, - { - "epoch": 3.2682926829268295, - "grad_norm": 2.5264663696289062, - "learning_rate": 3.797251567556083e-06, - "loss": 0.2493, - "step": 670 - }, - { - "epoch": 3.2731707317073173, - "grad_norm": 3.350219964981079, - "learning_rate": 3.793974994315991e-06, - "loss": 0.1186, - "step": 671 - }, - { - "epoch": 3.278048780487805, - "grad_norm": 4.175906181335449, - "learning_rate": 3.790695382166013e-06, - "loss": 0.3453, - "step": 672 - }, - { - "epoch": 3.2829268292682925, - "grad_norm": 3.006072521209717, - "learning_rate": 3.7874127388083415e-06, - "loss": 0.1981, - "step": 673 - }, - { - "epoch": 3.2878048780487803, - "grad_norm": 3.368561029434204, - "learning_rate": 3.7841270719522895e-06, - "loss": 0.2934, - "step": 674 - }, - { - "epoch": 3.292682926829268, - "grad_norm": 4.374331951141357, - "learning_rate": 3.7808383893142692e-06, - "loss": 0.1359, - "step": 675 - }, - { - "epoch": 3.297560975609756, - "grad_norm": 3.297102451324463, - "learning_rate": 3.7775466986177763e-06, - "loss": 0.2498, - "step": 676 - }, - { - "epoch": 3.3024390243902437, - "grad_norm": 2.8914761543273926, - "learning_rate": 3.774252007593371e-06, - "loss": 0.1308, - "step": 677 - }, - { - "epoch": 3.3073170731707315, - "grad_norm": 3.1550722122192383, - "learning_rate": 3.7709543239786593e-06, - "loss": 0.3915, - "step": 678 - }, - { - "epoch": 3.3121951219512193, - "grad_norm": 3.2302658557891846, - "learning_rate": 3.767653655518277e-06, - "loss": 0.2558, - "step": 679 - }, - { - "epoch": 3.317073170731707, - "grad_norm": 4.4321770668029785, - "learning_rate": 3.7643500099638673e-06, - "loss": 0.1988, - "step": 680 - }, - { - "epoch": 3.321951219512195, - "grad_norm": 2.970566749572754, - "learning_rate": 3.7610433950740667e-06, - "loss": 0.4908, - "step": 681 - }, - { - "epoch": 3.3268292682926828, - "grad_norm": 3.5516228675842285, - "learning_rate": 3.757733818614485e-06, - "loss": 0.304, - "step": 682 - }, - { - "epoch": 3.3317073170731706, - "grad_norm": 2.7555387020111084, - "learning_rate": 3.7544212883576856e-06, - "loss": 0.2533, - "step": 683 - }, - { - "epoch": 3.3365853658536584, - "grad_norm": 3.61226749420166, - "learning_rate": 3.751105812083172e-06, - "loss": 0.1771, - "step": 684 - }, - { - "epoch": 3.341463414634146, - "grad_norm": 3.0466206073760986, - "learning_rate": 3.7477873975773655e-06, - "loss": 0.4213, - "step": 685 - }, - { - "epoch": 3.346341463414634, - "grad_norm": 3.6091527938842773, - "learning_rate": 3.7444660526335853e-06, - "loss": 0.3808, - "step": 686 - }, - { - "epoch": 3.351219512195122, - "grad_norm": 3.8443002700805664, - "learning_rate": 3.741141785052036e-06, - "loss": 0.6438, - "step": 687 - }, - { - "epoch": 3.3560975609756096, - "grad_norm": 3.845909833908081, - "learning_rate": 3.737814602639784e-06, - "loss": 0.3686, - "step": 688 - }, - { - "epoch": 3.3609756097560974, - "grad_norm": 2.904892921447754, - "learning_rate": 3.7344845132107427e-06, - "loss": 0.2934, - "step": 689 - }, - { - "epoch": 3.3658536585365852, - "grad_norm": 3.4766387939453125, - "learning_rate": 3.731151524585651e-06, - "loss": 0.3299, - "step": 690 - }, - { - "epoch": 3.370731707317073, - "grad_norm": 4.236767768859863, - "learning_rate": 3.7278156445920584e-06, - "loss": 0.6303, - "step": 691 - }, - { - "epoch": 3.375609756097561, - "grad_norm": 3.1122591495513916, - "learning_rate": 3.724476881064303e-06, - "loss": 0.2432, - "step": 692 - }, - { - "epoch": 3.3804878048780487, - "grad_norm": 3.0971457958221436, - "learning_rate": 3.721135241843496e-06, - "loss": 0.3131, - "step": 693 - }, - { - "epoch": 3.3853658536585365, - "grad_norm": 3.9365804195404053, - "learning_rate": 3.7177907347775016e-06, - "loss": 0.3372, - "step": 694 - }, - { - "epoch": 3.3902439024390243, - "grad_norm": 3.760373115539551, - "learning_rate": 3.71444336772092e-06, - "loss": 0.5055, - "step": 695 - }, - { - "epoch": 3.395121951219512, - "grad_norm": 4.360848426818848, - "learning_rate": 3.711093148535068e-06, - "loss": 0.6183, - "step": 696 - }, - { - "epoch": 3.4, - "grad_norm": 3.7713537216186523, - "learning_rate": 3.707740085087959e-06, - "loss": 0.1568, - "step": 697 - }, - { - "epoch": 3.4048780487804877, - "grad_norm": 3.8532230854034424, - "learning_rate": 3.7043841852542884e-06, - "loss": 0.2826, - "step": 698 - }, - { - "epoch": 3.4097560975609755, - "grad_norm": 3.0548605918884277, - "learning_rate": 3.701025456915411e-06, - "loss": 0.1918, - "step": 699 - }, - { - "epoch": 3.4146341463414633, - "grad_norm": 3.2431821823120117, - "learning_rate": 3.697663907959327e-06, - "loss": 0.2493, - "step": 700 - }, - { - "epoch": 3.419512195121951, - "grad_norm": 3.7301864624023438, - "learning_rate": 3.6942995462806574e-06, - "loss": 0.4913, - "step": 701 - }, - { - "epoch": 3.424390243902439, - "grad_norm": 2.5468900203704834, - "learning_rate": 3.6909323797806314e-06, - "loss": 0.1788, - "step": 702 - }, - { - "epoch": 3.4292682926829268, - "grad_norm": 3.3719515800476074, - "learning_rate": 3.6875624163670635e-06, - "loss": 0.4162, - "step": 703 - }, - { - "epoch": 3.4341463414634146, - "grad_norm": 3.528010368347168, - "learning_rate": 3.6841896639543394e-06, - "loss": 0.1924, - "step": 704 - }, - { - "epoch": 3.4390243902439024, - "grad_norm": 3.3636631965637207, - "learning_rate": 3.6808141304633924e-06, - "loss": 0.3177, - "step": 705 - }, - { - "epoch": 3.44390243902439, - "grad_norm": 3.418705463409424, - "learning_rate": 3.6774358238216878e-06, - "loss": 0.2301, - "step": 706 - }, - { - "epoch": 3.448780487804878, - "grad_norm": 4.720373630523682, - "learning_rate": 3.6740547519632048e-06, - "loss": 0.1894, - "step": 707 - }, - { - "epoch": 3.453658536585366, - "grad_norm": 2.9635703563690186, - "learning_rate": 3.670670922828414e-06, - "loss": 0.2642, - "step": 708 - }, - { - "epoch": 3.4585365853658536, - "grad_norm": 4.934754371643066, - "learning_rate": 3.667284344364264e-06, - "loss": 0.2275, - "step": 709 - }, - { - "epoch": 3.4634146341463414, - "grad_norm": 3.090585231781006, - "learning_rate": 3.6638950245241604e-06, - "loss": 0.4447, - "step": 710 - }, - { - "epoch": 3.4682926829268292, - "grad_norm": 4.360495090484619, - "learning_rate": 3.660502971267945e-06, - "loss": 0.2415, - "step": 711 - }, - { - "epoch": 3.473170731707317, - "grad_norm": 3.4893476963043213, - "learning_rate": 3.65710819256188e-06, - "loss": 0.0921, - "step": 712 - }, - { - "epoch": 3.478048780487805, - "grad_norm": 3.2423770427703857, - "learning_rate": 3.65371069637863e-06, - "loss": 0.2371, - "step": 713 - }, - { - "epoch": 3.4829268292682927, - "grad_norm": 3.0775890350341797, - "learning_rate": 3.650310490697238e-06, - "loss": 0.4026, - "step": 714 - }, - { - "epoch": 3.4878048780487805, - "grad_norm": 3.906625270843506, - "learning_rate": 3.646907583503114e-06, - "loss": 0.4312, - "step": 715 - }, - { - "epoch": 3.4926829268292683, - "grad_norm": 3.2140414714813232, - "learning_rate": 3.6435019827880093e-06, - "loss": 0.2309, - "step": 716 - }, - { - "epoch": 3.497560975609756, - "grad_norm": 3.048523426055908, - "learning_rate": 3.640093696550003e-06, - "loss": 0.296, - "step": 717 - }, - { - "epoch": 3.502439024390244, - "grad_norm": 2.9669039249420166, - "learning_rate": 3.6366827327934817e-06, - "loss": 0.2723, - "step": 718 - }, - { - "epoch": 3.5073170731707317, - "grad_norm": 3.6941726207733154, - "learning_rate": 3.6332690995291176e-06, - "loss": 0.3797, - "step": 719 - }, - { - "epoch": 3.5121951219512195, - "grad_norm": 5.135766506195068, - "learning_rate": 3.6298528047738545e-06, - "loss": 0.9868, - "step": 720 - }, - { - "epoch": 3.5170731707317073, - "grad_norm": 3.2021052837371826, - "learning_rate": 3.626433856550886e-06, - "loss": 0.4069, - "step": 721 - }, - { - "epoch": 3.521951219512195, - "grad_norm": 3.094444513320923, - "learning_rate": 3.623012262889637e-06, - "loss": 0.3368, - "step": 722 - }, - { - "epoch": 3.526829268292683, - "grad_norm": 3.609285354614258, - "learning_rate": 3.6195880318257465e-06, - "loss": 0.3972, - "step": 723 - }, - { - "epoch": 3.5317073170731708, - "grad_norm": 4.236501216888428, - "learning_rate": 3.616161171401046e-06, - "loss": 0.52, - "step": 724 - }, - { - "epoch": 3.5365853658536586, - "grad_norm": 3.504526376724243, - "learning_rate": 3.612731689663542e-06, - "loss": 0.23, - "step": 725 - }, - { - "epoch": 3.5414634146341464, - "grad_norm": 3.233591079711914, - "learning_rate": 3.6092995946673996e-06, - "loss": 0.4151, - "step": 726 - }, - { - "epoch": 3.546341463414634, - "grad_norm": 3.6701886653900146, - "learning_rate": 3.605864894472918e-06, - "loss": 0.2798, - "step": 727 - }, - { - "epoch": 3.551219512195122, - "grad_norm": 3.8713181018829346, - "learning_rate": 3.602427597146516e-06, - "loss": 0.4336, - "step": 728 - }, - { - "epoch": 3.55609756097561, - "grad_norm": 5.49612283706665, - "learning_rate": 3.5989877107607134e-06, - "loss": 0.4803, - "step": 729 - }, - { - "epoch": 3.5609756097560976, - "grad_norm": 3.771005392074585, - "learning_rate": 3.5955452433941075e-06, - "loss": 0.3698, - "step": 730 - }, - { - "epoch": 3.5658536585365854, - "grad_norm": 2.970822334289551, - "learning_rate": 3.5921002031313586e-06, - "loss": 0.2373, - "step": 731 - }, - { - "epoch": 3.5707317073170732, - "grad_norm": 3.517249584197998, - "learning_rate": 3.58865259806317e-06, - "loss": 0.1908, - "step": 732 - }, - { - "epoch": 3.575609756097561, - "grad_norm": 3.6825428009033203, - "learning_rate": 3.585202436286267e-06, - "loss": 0.3993, - "step": 733 - }, - { - "epoch": 3.580487804878049, - "grad_norm": 3.387479066848755, - "learning_rate": 3.581749725903381e-06, - "loss": 0.4237, - "step": 734 - }, - { - "epoch": 3.5853658536585367, - "grad_norm": 3.5004806518554688, - "learning_rate": 3.5782944750232274e-06, - "loss": 0.3011, - "step": 735 - }, - { - "epoch": 3.5902439024390245, - "grad_norm": 3.461731433868408, - "learning_rate": 3.574836691760489e-06, - "loss": 0.0896, - "step": 736 - }, - { - "epoch": 3.5951219512195123, - "grad_norm": 3.9598381519317627, - "learning_rate": 3.571376384235795e-06, - "loss": 0.2751, - "step": 737 - }, - { - "epoch": 3.6, - "grad_norm": 4.053933143615723, - "learning_rate": 3.5679135605757035e-06, - "loss": 0.2086, - "step": 738 - }, - { - "epoch": 3.604878048780488, - "grad_norm": 2.9683544635772705, - "learning_rate": 3.564448228912682e-06, - "loss": 0.1659, - "step": 739 - }, - { - "epoch": 3.6097560975609757, - "grad_norm": 3.6598448753356934, - "learning_rate": 3.5609803973850877e-06, - "loss": 0.2469, - "step": 740 - }, - { - "epoch": 3.6146341463414635, - "grad_norm": 3.449335813522339, - "learning_rate": 3.557510074137147e-06, - "loss": 0.375, - "step": 741 - }, - { - "epoch": 3.6195121951219513, - "grad_norm": 2.7666923999786377, - "learning_rate": 3.554037267318942e-06, - "loss": 0.3133, - "step": 742 - }, - { - "epoch": 3.624390243902439, - "grad_norm": 2.8951869010925293, - "learning_rate": 3.5505619850863847e-06, - "loss": 0.2243, - "step": 743 - }, - { - "epoch": 3.629268292682927, - "grad_norm": 3.477747678756714, - "learning_rate": 3.5470842356012007e-06, - "loss": 0.1321, - "step": 744 - }, - { - "epoch": 3.6341463414634148, - "grad_norm": 3.810480833053589, - "learning_rate": 3.5436040270309113e-06, - "loss": 0.361, - "step": 745 - }, - { - "epoch": 3.6390243902439026, - "grad_norm": 3.0730793476104736, - "learning_rate": 3.540121367548811e-06, - "loss": 0.1523, - "step": 746 - }, - { - "epoch": 3.6439024390243904, - "grad_norm": 3.6878390312194824, - "learning_rate": 3.5366362653339524e-06, - "loss": 0.4898, - "step": 747 - }, - { - "epoch": 3.648780487804878, - "grad_norm": 3.6432242393493652, - "learning_rate": 3.533148728571124e-06, - "loss": 0.1397, - "step": 748 - }, - { - "epoch": 3.653658536585366, - "grad_norm": 3.7047760486602783, - "learning_rate": 3.5296587654508317e-06, - "loss": 0.323, - "step": 749 - }, - { - "epoch": 3.658536585365854, - "grad_norm": 3.777132749557495, - "learning_rate": 3.526166384169279e-06, - "loss": 0.5577, - "step": 750 - }, - { - "epoch": 3.6634146341463416, - "grad_norm": 3.7970924377441406, - "learning_rate": 3.5226715929283507e-06, - "loss": 0.245, - "step": 751 - }, - { - "epoch": 3.6682926829268294, - "grad_norm": 2.8203537464141846, - "learning_rate": 3.519174399935588e-06, - "loss": 0.1619, - "step": 752 - }, - { - "epoch": 3.6731707317073172, - "grad_norm": 3.4040987491607666, - "learning_rate": 3.5156748134041767e-06, - "loss": 0.1047, - "step": 753 - }, - { - "epoch": 3.678048780487805, - "grad_norm": 3.927960157394409, - "learning_rate": 3.5121728415529203e-06, - "loss": 0.5713, - "step": 754 - }, - { - "epoch": 3.682926829268293, - "grad_norm": 3.3833277225494385, - "learning_rate": 3.5086684926062266e-06, - "loss": 0.2174, - "step": 755 - }, - { - "epoch": 3.68780487804878, - "grad_norm": 3.989307403564453, - "learning_rate": 3.505161774794085e-06, - "loss": 0.285, - "step": 756 - }, - { - "epoch": 3.692682926829268, - "grad_norm": 2.742429494857788, - "learning_rate": 3.5016526963520474e-06, - "loss": 0.1602, - "step": 757 - }, - { - "epoch": 3.697560975609756, - "grad_norm": 3.7082698345184326, - "learning_rate": 3.498141265521212e-06, - "loss": 0.666, - "step": 758 - }, - { - "epoch": 3.7024390243902436, - "grad_norm": 3.033196210861206, - "learning_rate": 3.4946274905481997e-06, - "loss": 0.2024, - "step": 759 - }, - { - "epoch": 3.7073170731707314, - "grad_norm": 3.7145371437072754, - "learning_rate": 3.4911113796851364e-06, - "loss": 0.2719, - "step": 760 - }, - { - "epoch": 3.7121951219512193, - "grad_norm": 3.580298900604248, - "learning_rate": 3.487592941189636e-06, - "loss": 0.1537, - "step": 761 - }, - { - "epoch": 3.717073170731707, - "grad_norm": 4.753757953643799, - "learning_rate": 3.484072183324776e-06, - "loss": 0.6149, - "step": 762 - }, - { - "epoch": 3.721951219512195, - "grad_norm": 3.5575687885284424, - "learning_rate": 3.4805491143590823e-06, - "loss": 0.4241, - "step": 763 - }, - { - "epoch": 3.7268292682926827, - "grad_norm": 3.215224266052246, - "learning_rate": 3.4770237425665103e-06, - "loss": 0.3037, - "step": 764 - }, - { - "epoch": 3.7317073170731705, - "grad_norm": 2.9899685382843018, - "learning_rate": 3.4734960762264204e-06, - "loss": 0.4854, - "step": 765 - }, - { - "epoch": 3.7365853658536583, - "grad_norm": 3.5880227088928223, - "learning_rate": 3.469966123623563e-06, - "loss": 0.3849, - "step": 766 - }, - { - "epoch": 3.741463414634146, - "grad_norm": 3.472750186920166, - "learning_rate": 3.46643389304806e-06, - "loss": 0.3159, - "step": 767 - }, - { - "epoch": 3.746341463414634, - "grad_norm": 4.355650901794434, - "learning_rate": 3.4628993927953786e-06, - "loss": 0.7527, - "step": 768 - }, - { - "epoch": 3.7512195121951217, - "grad_norm": 2.94575834274292, - "learning_rate": 3.45936263116632e-06, - "loss": 0.1716, - "step": 769 - }, - { - "epoch": 3.7560975609756095, - "grad_norm": 2.991525173187256, - "learning_rate": 3.4558236164669957e-06, - "loss": 0.2061, - "step": 770 - }, - { - "epoch": 3.7609756097560973, - "grad_norm": 3.134000301361084, - "learning_rate": 3.4522823570088073e-06, - "loss": 0.1338, - "step": 771 - }, - { - "epoch": 3.765853658536585, - "grad_norm": 3.722140312194824, - "learning_rate": 3.4487388611084295e-06, - "loss": 0.2615, - "step": 772 - }, - { - "epoch": 3.770731707317073, - "grad_norm": 3.7941153049468994, - "learning_rate": 3.445193137087788e-06, - "loss": 0.1401, - "step": 773 - }, - { - "epoch": 3.7756097560975608, - "grad_norm": 2.872941732406616, - "learning_rate": 3.4416451932740424e-06, - "loss": 0.2934, - "step": 774 - }, - { - "epoch": 3.7804878048780486, - "grad_norm": 4.5019941329956055, - "learning_rate": 3.4380950379995652e-06, - "loss": 0.4579, - "step": 775 - }, - { - "epoch": 3.7853658536585364, - "grad_norm": 2.682884931564331, - "learning_rate": 3.434542679601922e-06, - "loss": 0.2979, - "step": 776 - }, - { - "epoch": 3.790243902439024, - "grad_norm": 3.3044273853302, - "learning_rate": 3.4309881264238538e-06, - "loss": 0.1196, - "step": 777 - }, - { - "epoch": 3.795121951219512, - "grad_norm": 3.102760076522827, - "learning_rate": 3.4274313868132547e-06, - "loss": 0.2026, - "step": 778 - }, - { - "epoch": 3.8, - "grad_norm": 3.3304500579833984, - "learning_rate": 3.4238724691231534e-06, - "loss": 0.2135, - "step": 779 - }, - { - "epoch": 3.8048780487804876, - "grad_norm": 3.295119047164917, - "learning_rate": 3.4203113817116955e-06, - "loss": 0.4418, - "step": 780 - }, - { - "epoch": 3.8097560975609754, - "grad_norm": 3.6655640602111816, - "learning_rate": 3.4167481329421204e-06, - "loss": 0.203, - "step": 781 - }, - { - "epoch": 3.8146341463414632, - "grad_norm": 3.387830972671509, - "learning_rate": 3.4131827311827447e-06, - "loss": 0.3225, - "step": 782 - }, - { - "epoch": 3.819512195121951, - "grad_norm": 2.621633529663086, - "learning_rate": 3.4096151848069416e-06, - "loss": 0.1704, - "step": 783 - }, - { - "epoch": 3.824390243902439, - "grad_norm": 2.974344491958618, - "learning_rate": 3.4060455021931195e-06, - "loss": 0.2785, - "step": 784 - }, - { - "epoch": 3.8292682926829267, - "grad_norm": 3.452131748199463, - "learning_rate": 3.402473691724704e-06, - "loss": 0.223, - "step": 785 - }, - { - "epoch": 3.8341463414634145, - "grad_norm": 2.6373705863952637, - "learning_rate": 3.39889976179012e-06, - "loss": 0.2368, - "step": 786 - }, - { - "epoch": 3.8390243902439023, - "grad_norm": 2.863184928894043, - "learning_rate": 3.3953237207827673e-06, - "loss": 0.3294, - "step": 787 - }, - { - "epoch": 3.84390243902439, - "grad_norm": 5.104704856872559, - "learning_rate": 3.391745577101005e-06, - "loss": 0.5431, - "step": 788 - }, - { - "epoch": 3.848780487804878, - "grad_norm": 3.951310634613037, - "learning_rate": 3.3881653391481306e-06, - "loss": 0.2546, - "step": 789 - }, - { - "epoch": 3.8536585365853657, - "grad_norm": 3.9903225898742676, - "learning_rate": 3.384583015332359e-06, - "loss": 0.3293, - "step": 790 - }, - { - "epoch": 3.8585365853658535, - "grad_norm": 3.3149220943450928, - "learning_rate": 3.380998614066805e-06, - "loss": 0.1861, - "step": 791 - }, - { - "epoch": 3.8634146341463413, - "grad_norm": 3.6755223274230957, - "learning_rate": 3.3774121437694606e-06, - "loss": 0.2498, - "step": 792 - }, - { - "epoch": 3.868292682926829, - "grad_norm": 3.192918300628662, - "learning_rate": 3.3738236128631786e-06, - "loss": 0.1525, - "step": 793 - }, - { - "epoch": 3.873170731707317, - "grad_norm": 3.5358777046203613, - "learning_rate": 3.3702330297756503e-06, - "loss": 0.3622, - "step": 794 - }, - { - "epoch": 3.8780487804878048, - "grad_norm": 3.619878053665161, - "learning_rate": 3.366640402939387e-06, - "loss": 0.1051, - "step": 795 - }, - { - "epoch": 3.8829268292682926, - "grad_norm": 7.085352420806885, - "learning_rate": 3.363045740791698e-06, - "loss": 0.4606, - "step": 796 - }, - { - "epoch": 3.8878048780487804, - "grad_norm": 2.523165464401245, - "learning_rate": 3.3594490517746774e-06, - "loss": 0.2267, - "step": 797 - }, - { - "epoch": 3.892682926829268, - "grad_norm": 2.7026922702789307, - "learning_rate": 3.3558503443351733e-06, - "loss": 0.2792, - "step": 798 - }, - { - "epoch": 3.897560975609756, - "grad_norm": 2.9232428073883057, - "learning_rate": 3.352249626924777e-06, - "loss": 0.2579, - "step": 799 - }, - { - "epoch": 3.902439024390244, - "grad_norm": 4.760788440704346, - "learning_rate": 3.348646907999801e-06, - "loss": 0.6983, - "step": 800 - }, - { - "epoch": 3.9073170731707316, - "grad_norm": 3.198249578475952, - "learning_rate": 3.345042196021257e-06, - "loss": 0.3265, - "step": 801 - }, - { - "epoch": 3.9121951219512194, - "grad_norm": 4.069286823272705, - "learning_rate": 3.3414354994548385e-06, - "loss": 0.497, - "step": 802 - }, - { - "epoch": 3.9170731707317072, - "grad_norm": 3.4435410499572754, - "learning_rate": 3.337826826770898e-06, - "loss": 0.2812, - "step": 803 - }, - { - "epoch": 3.921951219512195, - "grad_norm": 3.9805212020874023, - "learning_rate": 3.3342161864444312e-06, - "loss": 0.2277, - "step": 804 - }, - { - "epoch": 3.926829268292683, - "grad_norm": 3.348925828933716, - "learning_rate": 3.3306035869550534e-06, - "loss": 0.1614, - "step": 805 - }, - { - "epoch": 3.9317073170731707, - "grad_norm": 4.7613701820373535, - "learning_rate": 3.326989036786981e-06, - "loss": 0.3269, - "step": 806 - }, - { - "epoch": 3.9365853658536585, - "grad_norm": 3.807502508163452, - "learning_rate": 3.3233725444290126e-06, - "loss": 0.2619, - "step": 807 - }, - { - "epoch": 3.9414634146341463, - "grad_norm": 3.2690203189849854, - "learning_rate": 3.3197541183745065e-06, - "loss": 0.4334, - "step": 808 - }, - { - "epoch": 3.946341463414634, - "grad_norm": 3.396993398666382, - "learning_rate": 3.3161337671213634e-06, - "loss": 0.2738, - "step": 809 - }, - { - "epoch": 3.951219512195122, - "grad_norm": 3.086669921875, - "learning_rate": 3.312511499172006e-06, - "loss": 0.1597, - "step": 810 - }, - { - "epoch": 3.9560975609756097, - "grad_norm": 3.5688745975494385, - "learning_rate": 3.3088873230333562e-06, - "loss": 0.3195, - "step": 811 - }, - { - "epoch": 3.9609756097560975, - "grad_norm": 3.4843621253967285, - "learning_rate": 3.3052612472168193e-06, - "loss": 0.1865, - "step": 812 - }, - { - "epoch": 3.9658536585365853, - "grad_norm": 2.8479580879211426, - "learning_rate": 3.3016332802382618e-06, - "loss": 0.3108, - "step": 813 - }, - { - "epoch": 3.970731707317073, - "grad_norm": 3.3241543769836426, - "learning_rate": 3.2980034306179897e-06, - "loss": 0.2099, - "step": 814 - }, - { - "epoch": 3.975609756097561, - "grad_norm": 2.817675828933716, - "learning_rate": 3.294371706880733e-06, - "loss": 0.3073, - "step": 815 - }, - { - "epoch": 3.9804878048780488, - "grad_norm": 2.9535388946533203, - "learning_rate": 3.290738117555622e-06, - "loss": 0.2024, - "step": 816 - }, - { - "epoch": 3.9853658536585366, - "grad_norm": 5.021281719207764, - "learning_rate": 3.2871026711761666e-06, - "loss": 0.508, - "step": 817 - }, - { - "epoch": 3.9902439024390244, - "grad_norm": 3.3377649784088135, - "learning_rate": 3.2834653762802414e-06, - "loss": 0.2116, - "step": 818 - }, - { - "epoch": 3.995121951219512, - "grad_norm": 4.412073135375977, - "learning_rate": 3.2798262414100594e-06, - "loss": 0.2177, - "step": 819 - }, - { - "epoch": 4.0, - "grad_norm": 3.174323797225952, - "learning_rate": 3.2761852751121566e-06, - "loss": 0.1737, - "step": 820 - }, - { - "epoch": 4.004878048780488, - "grad_norm": 2.921494960784912, - "learning_rate": 3.272542485937369e-06, - "loss": 0.2569, - "step": 821 - }, - { - "epoch": 4.009756097560976, - "grad_norm": 2.693495512008667, - "learning_rate": 3.2688978824408136e-06, - "loss": 0.1621, - "step": 822 - }, - { - "epoch": 4.014634146341463, - "grad_norm": 2.705796718597412, - "learning_rate": 3.2652514731818698e-06, - "loss": 0.1121, - "step": 823 - }, - { - "epoch": 4.019512195121951, - "grad_norm": 3.2621448040008545, - "learning_rate": 3.2616032667241564e-06, - "loss": 0.0835, - "step": 824 - }, - { - "epoch": 4.024390243902439, - "grad_norm": 3.6205084323883057, - "learning_rate": 3.257953271635513e-06, - "loss": 0.3731, - "step": 825 - }, - { - "epoch": 4.029268292682927, - "grad_norm": 3.2600371837615967, - "learning_rate": 3.2543014964879814e-06, - "loss": 0.1051, - "step": 826 - }, - { - "epoch": 4.034146341463415, - "grad_norm": 3.865178346633911, - "learning_rate": 3.250647949857781e-06, - "loss": 0.0916, - "step": 827 - }, - { - "epoch": 4.0390243902439025, - "grad_norm": 6.9700927734375, - "learning_rate": 3.2469926403252932e-06, - "loss": 0.4037, - "step": 828 - }, - { - "epoch": 4.04390243902439, - "grad_norm": 3.658712148666382, - "learning_rate": 3.2433355764750417e-06, - "loss": 0.0523, - "step": 829 - }, - { - "epoch": 4.048780487804878, - "grad_norm": 4.911301612854004, - "learning_rate": 3.2396767668956656e-06, - "loss": 0.2616, - "step": 830 - }, - { - "epoch": 4.053658536585366, - "grad_norm": 5.019360542297363, - "learning_rate": 3.2360162201799085e-06, - "loss": 0.195, - "step": 831 - }, - { - "epoch": 4.058536585365854, - "grad_norm": 3.493767261505127, - "learning_rate": 3.2323539449245906e-06, - "loss": 0.1245, - "step": 832 - }, - { - "epoch": 4.0634146341463415, - "grad_norm": 4.246248722076416, - "learning_rate": 3.2286899497305917e-06, - "loss": 0.1147, - "step": 833 - }, - { - "epoch": 4.068292682926829, - "grad_norm": 2.993704319000244, - "learning_rate": 3.2250242432028335e-06, - "loss": 0.2189, - "step": 834 - }, - { - "epoch": 4.073170731707317, - "grad_norm": 4.695023059844971, - "learning_rate": 3.221356833950254e-06, - "loss": 0.4685, - "step": 835 - }, - { - "epoch": 4.078048780487805, - "grad_norm": 2.777644634246826, - "learning_rate": 3.21768773058579e-06, - "loss": 0.1245, - "step": 836 - }, - { - "epoch": 4.082926829268293, - "grad_norm": 3.3545901775360107, - "learning_rate": 3.21401694172636e-06, - "loss": 0.1342, - "step": 837 - }, - { - "epoch": 4.087804878048781, - "grad_norm": 2.2222652435302734, - "learning_rate": 3.2103444759928383e-06, - "loss": 0.0484, - "step": 838 - }, - { - "epoch": 4.092682926829268, - "grad_norm": 2.580345630645752, - "learning_rate": 3.2066703420100377e-06, - "loss": 0.0592, - "step": 839 - }, - { - "epoch": 4.097560975609756, - "grad_norm": 3.8652923107147217, - "learning_rate": 3.2029945484066883e-06, - "loss": 0.2536, - "step": 840 - }, - { - "epoch": 4.102439024390244, - "grad_norm": 3.0441582202911377, - "learning_rate": 3.1993171038154203e-06, - "loss": 0.1221, - "step": 841 - }, - { - "epoch": 4.107317073170732, - "grad_norm": 2.2795114517211914, - "learning_rate": 3.1956380168727385e-06, - "loss": 0.1231, - "step": 842 - }, - { - "epoch": 4.11219512195122, - "grad_norm": 3.701009750366211, - "learning_rate": 3.191957296219007e-06, - "loss": 0.2144, - "step": 843 - }, - { - "epoch": 4.117073170731707, - "grad_norm": 3.452637195587158, - "learning_rate": 3.1882749504984247e-06, - "loss": 0.1026, - "step": 844 - }, - { - "epoch": 4.121951219512195, - "grad_norm": 2.4208810329437256, - "learning_rate": 3.1845909883590076e-06, - "loss": 0.1124, - "step": 845 - }, - { - "epoch": 4.126829268292683, - "grad_norm": 4.353063583374023, - "learning_rate": 3.180905418452569e-06, - "loss": 0.2804, - "step": 846 - }, - { - "epoch": 4.131707317073171, - "grad_norm": 3.1151084899902344, - "learning_rate": 3.1772182494346963e-06, - "loss": 0.1748, - "step": 847 - }, - { - "epoch": 4.136585365853659, - "grad_norm": 3.457940101623535, - "learning_rate": 3.1735294899647344e-06, - "loss": 0.1984, - "step": 848 - }, - { - "epoch": 4.1414634146341465, - "grad_norm": 3.3556935787200928, - "learning_rate": 3.169839148705762e-06, - "loss": 0.1332, - "step": 849 - }, - { - "epoch": 4.146341463414634, - "grad_norm": 3.5510823726654053, - "learning_rate": 3.1661472343245725e-06, - "loss": 0.4788, - "step": 850 - }, - { - "epoch": 4.151219512195122, - "grad_norm": 4.036712646484375, - "learning_rate": 3.162453755491655e-06, - "loss": 0.2437, - "step": 851 - }, - { - "epoch": 4.15609756097561, - "grad_norm": 4.417062282562256, - "learning_rate": 3.158758720881171e-06, - "loss": 0.203, - "step": 852 - }, - { - "epoch": 4.160975609756098, - "grad_norm": 3.920558214187622, - "learning_rate": 3.155062139170937e-06, - "loss": 0.1462, - "step": 853 - }, - { - "epoch": 4.1658536585365855, - "grad_norm": 6.472081661224365, - "learning_rate": 3.1513640190424034e-06, - "loss": 0.0972, - "step": 854 - }, - { - "epoch": 4.170731707317073, - "grad_norm": 3.975947141647339, - "learning_rate": 3.147664369180632e-06, - "loss": 0.1092, - "step": 855 - }, - { - "epoch": 4.175609756097561, - "grad_norm": 4.977376937866211, - "learning_rate": 3.143963198274278e-06, - "loss": 0.2215, - "step": 856 - }, - { - "epoch": 4.180487804878049, - "grad_norm": 3.595460891723633, - "learning_rate": 3.140260515015569e-06, - "loss": 0.1771, - "step": 857 - }, - { - "epoch": 4.185365853658537, - "grad_norm": 3.1085658073425293, - "learning_rate": 3.136556328100284e-06, - "loss": 0.1995, - "step": 858 - }, - { - "epoch": 4.190243902439025, - "grad_norm": 4.355626583099365, - "learning_rate": 3.132850646227734e-06, - "loss": 0.4048, - "step": 859 - }, - { - "epoch": 4.195121951219512, - "grad_norm": 3.8079614639282227, - "learning_rate": 3.12914347810074e-06, - "loss": 0.1914, - "step": 860 - }, - { - "epoch": 4.2, - "grad_norm": 3.725804328918457, - "learning_rate": 3.125434832425613e-06, - "loss": 0.1579, - "step": 861 - }, - { - "epoch": 4.204878048780488, - "grad_norm": 2.974649667739868, - "learning_rate": 3.121724717912138e-06, - "loss": 0.1814, - "step": 862 - }, - { - "epoch": 4.209756097560976, - "grad_norm": 3.6391279697418213, - "learning_rate": 3.118013143273542e-06, - "loss": 0.1481, - "step": 863 - }, - { - "epoch": 4.214634146341464, - "grad_norm": 3.216643810272217, - "learning_rate": 3.1143001172264893e-06, - "loss": 0.113, - "step": 864 - }, - { - "epoch": 4.219512195121951, - "grad_norm": 3.605855941772461, - "learning_rate": 3.1105856484910474e-06, - "loss": 0.1405, - "step": 865 - }, - { - "epoch": 4.224390243902439, - "grad_norm": 2.7186765670776367, - "learning_rate": 3.1068697457906736e-06, - "loss": 0.097, - "step": 866 - }, - { - "epoch": 4.229268292682927, - "grad_norm": 3.980973243713379, - "learning_rate": 3.1031524178521938e-06, - "loss": 0.2207, - "step": 867 - }, - { - "epoch": 4.234146341463415, - "grad_norm": 3.4623806476593018, - "learning_rate": 3.0994336734057804e-06, - "loss": 0.0552, - "step": 868 - }, - { - "epoch": 4.239024390243903, - "grad_norm": 3.7556748390197754, - "learning_rate": 3.0957135211849315e-06, - "loss": 0.1743, - "step": 869 - }, - { - "epoch": 4.2439024390243905, - "grad_norm": 3.3547914028167725, - "learning_rate": 3.0919919699264535e-06, - "loss": 0.1195, - "step": 870 - }, - { - "epoch": 4.248780487804878, - "grad_norm": 4.392014503479004, - "learning_rate": 3.0882690283704355e-06, - "loss": 0.6174, - "step": 871 - }, - { - "epoch": 4.253658536585366, - "grad_norm": 2.7031409740448, - "learning_rate": 3.084544705260234e-06, - "loss": 0.1359, - "step": 872 - }, - { - "epoch": 4.258536585365854, - "grad_norm": 2.3518481254577637, - "learning_rate": 3.080819009342451e-06, - "loss": 0.0786, - "step": 873 - }, - { - "epoch": 4.263414634146342, - "grad_norm": 2.636204481124878, - "learning_rate": 3.077091949366908e-06, - "loss": 0.0677, - "step": 874 - }, - { - "epoch": 4.2682926829268295, - "grad_norm": 2.8670942783355713, - "learning_rate": 3.073363534086636e-06, - "loss": 0.1084, - "step": 875 - }, - { - "epoch": 4.273170731707317, - "grad_norm": 2.7044737339019775, - "learning_rate": 3.0696337722578444e-06, - "loss": 0.0681, - "step": 876 - }, - { - "epoch": 4.278048780487805, - "grad_norm": 3.481539487838745, - "learning_rate": 3.0659026726399072e-06, - "loss": 0.2262, - "step": 877 - }, - { - "epoch": 4.282926829268293, - "grad_norm": 3.7746224403381348, - "learning_rate": 3.0621702439953393e-06, - "loss": 0.2169, - "step": 878 - }, - { - "epoch": 4.287804878048781, - "grad_norm": 3.6386263370513916, - "learning_rate": 3.0584364950897768e-06, - "loss": 0.0581, - "step": 879 - }, - { - "epoch": 4.2926829268292686, - "grad_norm": 3.389408588409424, - "learning_rate": 3.0547014346919574e-06, - "loss": 0.1687, - "step": 880 - }, - { - "epoch": 4.297560975609756, - "grad_norm": 3.6510157585144043, - "learning_rate": 3.0509650715736977e-06, - "loss": 0.1362, - "step": 881 - }, - { - "epoch": 4.302439024390244, - "grad_norm": 3.334210157394409, - "learning_rate": 3.0472274145098744e-06, - "loss": 0.1865, - "step": 882 - }, - { - "epoch": 4.307317073170732, - "grad_norm": 4.747341632843018, - "learning_rate": 3.0434884722784026e-06, - "loss": 0.2385, - "step": 883 - }, - { - "epoch": 4.31219512195122, - "grad_norm": 3.9266858100891113, - "learning_rate": 3.0397482536602168e-06, - "loss": 0.1004, - "step": 884 - }, - { - "epoch": 4.317073170731708, - "grad_norm": 2.984821081161499, - "learning_rate": 3.0360067674392475e-06, - "loss": 0.1469, - "step": 885 - }, - { - "epoch": 4.321951219512195, - "grad_norm": 2.6379380226135254, - "learning_rate": 3.0322640224024024e-06, - "loss": 0.0829, - "step": 886 - }, - { - "epoch": 4.326829268292683, - "grad_norm": 3.885495185852051, - "learning_rate": 3.0285200273395478e-06, - "loss": 0.2256, - "step": 887 - }, - { - "epoch": 4.331707317073171, - "grad_norm": 3.950394868850708, - "learning_rate": 3.024774791043481e-06, - "loss": 0.2402, - "step": 888 - }, - { - "epoch": 4.336585365853659, - "grad_norm": 4.147830963134766, - "learning_rate": 3.021028322309921e-06, - "loss": 0.2198, - "step": 889 - }, - { - "epoch": 4.341463414634147, - "grad_norm": 4.0821638107299805, - "learning_rate": 3.0172806299374734e-06, - "loss": 0.2304, - "step": 890 - }, - { - "epoch": 4.3463414634146345, - "grad_norm": 4.142312049865723, - "learning_rate": 3.0135317227276247e-06, - "loss": 0.2864, - "step": 891 - }, - { - "epoch": 4.351219512195122, - "grad_norm": 3.008504867553711, - "learning_rate": 3.0097816094847104e-06, - "loss": 0.2045, - "step": 892 - }, - { - "epoch": 4.35609756097561, - "grad_norm": 3.1674623489379883, - "learning_rate": 3.0060302990158984e-06, - "loss": 0.0864, - "step": 893 - }, - { - "epoch": 4.360975609756098, - "grad_norm": 3.3412492275238037, - "learning_rate": 3.002277800131171e-06, - "loss": 0.076, - "step": 894 - }, - { - "epoch": 4.365853658536586, - "grad_norm": 3.067330837249756, - "learning_rate": 2.998524121643298e-06, - "loss": 0.1724, - "step": 895 - }, - { - "epoch": 4.3707317073170735, - "grad_norm": 3.9015982151031494, - "learning_rate": 2.994769272367822e-06, - "loss": 0.2, - "step": 896 - }, - { - "epoch": 4.375609756097561, - "grad_norm": 3.0136911869049072, - "learning_rate": 2.991013261123035e-06, - "loss": 0.0852, - "step": 897 - }, - { - "epoch": 4.380487804878049, - "grad_norm": 3.6834237575531006, - "learning_rate": 2.9872560967299554e-06, - "loss": 0.1449, - "step": 898 - }, - { - "epoch": 4.385365853658537, - "grad_norm": 3.3486039638519287, - "learning_rate": 2.9834977880123132e-06, - "loss": 0.0659, - "step": 899 - }, - { - "epoch": 4.390243902439025, - "grad_norm": 2.971315622329712, - "learning_rate": 2.9797383437965243e-06, - "loss": 0.1114, - "step": 900 - }, - { - "epoch": 4.3951219512195125, - "grad_norm": 2.683359146118164, - "learning_rate": 2.975977772911671e-06, - "loss": 0.0822, - "step": 901 - }, - { - "epoch": 4.4, - "grad_norm": 2.9941935539245605, - "learning_rate": 2.972216084189482e-06, - "loss": 0.0858, - "step": 902 - }, - { - "epoch": 4.404878048780488, - "grad_norm": 2.4938626289367676, - "learning_rate": 2.9684532864643123e-06, - "loss": 0.1162, - "step": 903 - }, - { - "epoch": 4.409756097560976, - "grad_norm": 2.9364712238311768, - "learning_rate": 2.964689388573118e-06, - "loss": 0.0821, - "step": 904 - }, - { - "epoch": 4.414634146341464, - "grad_norm": 3.3638134002685547, - "learning_rate": 2.9609243993554434e-06, - "loss": 0.25, - "step": 905 - }, - { - "epoch": 4.419512195121952, - "grad_norm": 3.657277822494507, - "learning_rate": 2.9571583276533923e-06, - "loss": 0.0852, - "step": 906 - }, - { - "epoch": 4.424390243902439, - "grad_norm": 5.486263275146484, - "learning_rate": 2.9533911823116124e-06, - "loss": 0.5123, - "step": 907 - }, - { - "epoch": 4.429268292682927, - "grad_norm": 5.194574356079102, - "learning_rate": 2.9496229721772734e-06, - "loss": 0.1854, - "step": 908 - }, - { - "epoch": 4.434146341463415, - "grad_norm": 3.520110845565796, - "learning_rate": 2.9458537061000435e-06, - "loss": 0.1785, - "step": 909 - }, - { - "epoch": 4.439024390243903, - "grad_norm": 3.417991876602173, - "learning_rate": 2.9420833929320726e-06, - "loss": 0.1603, - "step": 910 - }, - { - "epoch": 4.443902439024391, - "grad_norm": 5.225805282592773, - "learning_rate": 2.93831204152797e-06, - "loss": 0.3046, - "step": 911 - }, - { - "epoch": 4.4487804878048784, - "grad_norm": 3.541433572769165, - "learning_rate": 2.9345396607447807e-06, - "loss": 0.0631, - "step": 912 - }, - { - "epoch": 4.453658536585366, - "grad_norm": 3.909377098083496, - "learning_rate": 2.9307662594419704e-06, - "loss": 0.125, - "step": 913 - }, - { - "epoch": 4.458536585365854, - "grad_norm": 3.6604416370391846, - "learning_rate": 2.9269918464814e-06, - "loss": 0.156, - "step": 914 - }, - { - "epoch": 4.463414634146342, - "grad_norm": 3.7413833141326904, - "learning_rate": 2.923216430727306e-06, - "loss": 0.3334, - "step": 915 - }, - { - "epoch": 4.46829268292683, - "grad_norm": 3.531996011734009, - "learning_rate": 2.9194400210462808e-06, - "loss": 0.2534, - "step": 916 - }, - { - "epoch": 4.473170731707317, - "grad_norm": 4.163621425628662, - "learning_rate": 2.91566262630725e-06, - "loss": 0.352, - "step": 917 - }, - { - "epoch": 4.478048780487805, - "grad_norm": 3.923635482788086, - "learning_rate": 2.9118842553814526e-06, - "loss": 0.1132, - "step": 918 - }, - { - "epoch": 4.482926829268292, - "grad_norm": 2.833768844604492, - "learning_rate": 2.9081049171424223e-06, - "loss": 0.086, - "step": 919 - }, - { - "epoch": 4.487804878048781, - "grad_norm": 2.9006292819976807, - "learning_rate": 2.9043246204659624e-06, - "loss": 0.0693, - "step": 920 - }, - { - "epoch": 4.492682926829268, - "grad_norm": 3.699376344680786, - "learning_rate": 2.9005433742301274e-06, - "loss": 0.2463, - "step": 921 - }, - { - "epoch": 4.4975609756097565, - "grad_norm": 4.882141590118408, - "learning_rate": 2.8967611873152037e-06, - "loss": 0.2275, - "step": 922 - }, - { - "epoch": 4.5024390243902435, - "grad_norm": 3.0554678440093994, - "learning_rate": 2.892978068603683e-06, - "loss": 0.0752, - "step": 923 - }, - { - "epoch": 4.507317073170732, - "grad_norm": 3.1225268840789795, - "learning_rate": 2.889194026980249e-06, - "loss": 0.1649, - "step": 924 - }, - { - "epoch": 4.512195121951219, - "grad_norm": 17.75234031677246, - "learning_rate": 2.8854090713317514e-06, - "loss": 0.0437, - "step": 925 - }, - { - "epoch": 4.517073170731708, - "grad_norm": 3.011223554611206, - "learning_rate": 2.8816232105471864e-06, - "loss": 0.0747, - "step": 926 - }, - { - "epoch": 4.521951219512195, - "grad_norm": 4.327573299407959, - "learning_rate": 2.877836453517677e-06, - "loss": 0.3884, - "step": 927 - }, - { - "epoch": 4.526829268292683, - "grad_norm": 3.8694965839385986, - "learning_rate": 2.8740488091364492e-06, - "loss": 0.2741, - "step": 928 - }, - { - "epoch": 4.53170731707317, - "grad_norm": 5.375877380371094, - "learning_rate": 2.870260286298814e-06, - "loss": 0.364, - "step": 929 - }, - { - "epoch": 4.536585365853659, - "grad_norm": 3.380891799926758, - "learning_rate": 2.866470893902147e-06, - "loss": 0.1495, - "step": 930 - }, - { - "epoch": 4.541463414634146, - "grad_norm": 3.723992109298706, - "learning_rate": 2.8626806408458626e-06, - "loss": 0.1403, - "step": 931 - }, - { - "epoch": 4.546341463414635, - "grad_norm": 3.0534417629241943, - "learning_rate": 2.8588895360313983e-06, - "loss": 0.0946, - "step": 932 - }, - { - "epoch": 4.5512195121951216, - "grad_norm": 2.8875234127044678, - "learning_rate": 2.8550975883621935e-06, - "loss": 0.1851, - "step": 933 - }, - { - "epoch": 4.55609756097561, - "grad_norm": 3.532166004180908, - "learning_rate": 2.8513048067436644e-06, - "loss": 0.178, - "step": 934 - }, - { - "epoch": 4.560975609756097, - "grad_norm": 2.942798376083374, - "learning_rate": 2.847511200083187e-06, - "loss": 0.1131, - "step": 935 - }, - { - "epoch": 4.565853658536585, - "grad_norm": 2.926874876022339, - "learning_rate": 2.843716777290074e-06, - "loss": 0.1251, - "step": 936 - }, - { - "epoch": 4.570731707317073, - "grad_norm": 3.525895357131958, - "learning_rate": 2.839921547275556e-06, - "loss": 0.0946, - "step": 937 - }, - { - "epoch": 4.575609756097561, - "grad_norm": 3.7033681869506836, - "learning_rate": 2.836125518952759e-06, - "loss": 0.1529, - "step": 938 - }, - { - "epoch": 4.580487804878048, - "grad_norm": 3.235154867172241, - "learning_rate": 2.8323287012366845e-06, - "loss": 0.2511, - "step": 939 - }, - { - "epoch": 4.585365853658536, - "grad_norm": 3.5275583267211914, - "learning_rate": 2.828531103044186e-06, - "loss": 0.1474, - "step": 940 - }, - { - "epoch": 4.590243902439024, - "grad_norm": 3.1356353759765625, - "learning_rate": 2.8247327332939512e-06, - "loss": 0.2249, - "step": 941 - }, - { - "epoch": 4.595121951219512, - "grad_norm": 3.789210081100464, - "learning_rate": 2.82093360090648e-06, - "loss": 0.2258, - "step": 942 - }, - { - "epoch": 4.6, - "grad_norm": 4.841623306274414, - "learning_rate": 2.8171337148040636e-06, - "loss": 0.2235, - "step": 943 - }, - { - "epoch": 4.6048780487804875, - "grad_norm": 3.161630630493164, - "learning_rate": 2.813333083910761e-06, - "loss": 0.1562, - "step": 944 - }, - { - "epoch": 4.609756097560975, - "grad_norm": 2.8718132972717285, - "learning_rate": 2.8095317171523835e-06, - "loss": 0.0625, - "step": 945 - }, - { - "epoch": 4.614634146341463, - "grad_norm": 3.6432454586029053, - "learning_rate": 2.805729623456469e-06, - "loss": 0.2205, - "step": 946 - }, - { - "epoch": 4.619512195121951, - "grad_norm": 4.382034778594971, - "learning_rate": 2.8019268117522624e-06, - "loss": 0.3241, - "step": 947 - }, - { - "epoch": 4.624390243902439, - "grad_norm": 3.2998175621032715, - "learning_rate": 2.798123290970695e-06, - "loss": 0.1983, - "step": 948 - }, - { - "epoch": 4.6292682926829265, - "grad_norm": 3.8665990829467773, - "learning_rate": 2.794319070044365e-06, - "loss": 0.3391, - "step": 949 - }, - { - "epoch": 4.634146341463414, - "grad_norm": 3.628403425216675, - "learning_rate": 2.790514157907512e-06, - "loss": 0.1329, - "step": 950 - }, - { - "epoch": 4.639024390243902, - "grad_norm": 2.8889615535736084, - "learning_rate": 2.786708563496002e-06, - "loss": 0.141, - "step": 951 - }, - { - "epoch": 4.64390243902439, - "grad_norm": 4.07351541519165, - "learning_rate": 2.782902295747299e-06, - "loss": 0.2935, - "step": 952 - }, - { - "epoch": 4.648780487804878, - "grad_norm": 4.220067024230957, - "learning_rate": 2.7790953636004536e-06, - "loss": 0.318, - "step": 953 - }, - { - "epoch": 4.6536585365853655, - "grad_norm": 3.8444325923919678, - "learning_rate": 2.775287775996074e-06, - "loss": 0.3388, - "step": 954 - }, - { - "epoch": 4.658536585365853, - "grad_norm": 3.197313070297241, - "learning_rate": 2.7714795418763067e-06, - "loss": 0.0925, - "step": 955 - }, - { - "epoch": 4.663414634146341, - "grad_norm": 4.0050811767578125, - "learning_rate": 2.7676706701848187e-06, - "loss": 0.2811, - "step": 956 - }, - { - "epoch": 4.668292682926829, - "grad_norm": 3.217160224914551, - "learning_rate": 2.763861169866774e-06, - "loss": 0.311, - "step": 957 - }, - { - "epoch": 4.673170731707317, - "grad_norm": 2.9892494678497314, - "learning_rate": 2.7600510498688104e-06, - "loss": 0.0582, - "step": 958 - }, - { - "epoch": 4.678048780487805, - "grad_norm": 3.954805374145508, - "learning_rate": 2.7562403191390246e-06, - "loss": 0.1238, - "step": 959 - }, - { - "epoch": 4.682926829268292, - "grad_norm": 2.9582695960998535, - "learning_rate": 2.7524289866269467e-06, - "loss": 0.1243, - "step": 960 - }, - { - "epoch": 4.68780487804878, - "grad_norm": 2.807002544403076, - "learning_rate": 2.748617061283518e-06, - "loss": 0.1388, - "step": 961 - }, - { - "epoch": 4.692682926829268, - "grad_norm": 3.980499505996704, - "learning_rate": 2.744804552061074e-06, - "loss": 0.1144, - "step": 962 - }, - { - "epoch": 4.697560975609756, - "grad_norm": 3.6389007568359375, - "learning_rate": 2.740991467913321e-06, - "loss": 0.2155, - "step": 963 - }, - { - "epoch": 4.702439024390244, - "grad_norm": 3.0950801372528076, - "learning_rate": 2.737177817795315e-06, - "loss": 0.0983, - "step": 964 - }, - { - "epoch": 4.7073170731707314, - "grad_norm": 3.1723053455352783, - "learning_rate": 2.7333636106634414e-06, - "loss": 0.1365, - "step": 965 - }, - { - "epoch": 4.712195121951219, - "grad_norm": 3.83921217918396, - "learning_rate": 2.7295488554753957e-06, - "loss": 0.1977, - "step": 966 - }, - { - "epoch": 4.717073170731707, - "grad_norm": 3.348057746887207, - "learning_rate": 2.725733561190157e-06, - "loss": 0.1311, - "step": 967 - }, - { - "epoch": 4.721951219512195, - "grad_norm": 3.828483819961548, - "learning_rate": 2.721917736767973e-06, - "loss": 0.2464, - "step": 968 - }, - { - "epoch": 4.726829268292683, - "grad_norm": 2.6004624366760254, - "learning_rate": 2.7181013911703357e-06, - "loss": 0.1088, - "step": 969 - }, - { - "epoch": 4.7317073170731705, - "grad_norm": 3.316990852355957, - "learning_rate": 2.714284533359961e-06, - "loss": 0.1492, - "step": 970 - }, - { - "epoch": 4.736585365853658, - "grad_norm": 3.8770010471343994, - "learning_rate": 2.710467172300768e-06, - "loss": 0.218, - "step": 971 - }, - { - "epoch": 4.741463414634146, - "grad_norm": 4.456376552581787, - "learning_rate": 2.706649316957857e-06, - "loss": 0.2199, - "step": 972 - }, - { - "epoch": 4.746341463414634, - "grad_norm": 3.3376309871673584, - "learning_rate": 2.7028309762974897e-06, - "loss": 0.0595, - "step": 973 - }, - { - "epoch": 4.751219512195122, - "grad_norm": 3.6755495071411133, - "learning_rate": 2.699012159287069e-06, - "loss": 0.1653, - "step": 974 - }, - { - "epoch": 4.7560975609756095, - "grad_norm": 2.939887046813965, - "learning_rate": 2.6951928748951125e-06, - "loss": 0.0681, - "step": 975 - }, - { - "epoch": 4.760975609756097, - "grad_norm": 3.4101195335388184, - "learning_rate": 2.69137313209124e-06, - "loss": 0.2046, - "step": 976 - }, - { - "epoch": 4.765853658536585, - "grad_norm": 3.9811208248138428, - "learning_rate": 2.687552939846145e-06, - "loss": 0.2255, - "step": 977 - }, - { - "epoch": 4.770731707317073, - "grad_norm": 3.484255313873291, - "learning_rate": 2.6837323071315766e-06, - "loss": 0.0512, - "step": 978 - }, - { - "epoch": 4.775609756097561, - "grad_norm": 3.9005143642425537, - "learning_rate": 2.679911242920321e-06, - "loss": 0.162, - "step": 979 - }, - { - "epoch": 4.780487804878049, - "grad_norm": 4.933374881744385, - "learning_rate": 2.6760897561861742e-06, - "loss": 0.398, - "step": 980 - }, - { - "epoch": 4.785365853658536, - "grad_norm": 3.0741539001464844, - "learning_rate": 2.672267855903927e-06, - "loss": 0.0507, - "step": 981 - }, - { - "epoch": 4.790243902439024, - "grad_norm": 3.023772716522217, - "learning_rate": 2.6684455510493413e-06, - "loss": 0.2066, - "step": 982 - }, - { - "epoch": 4.795121951219512, - "grad_norm": 3.0102407932281494, - "learning_rate": 2.6646228505991267e-06, - "loss": 0.2296, - "step": 983 - }, - { - "epoch": 4.8, - "grad_norm": 3.902200222015381, - "learning_rate": 2.6607997635309246e-06, - "loss": 0.14, - "step": 984 - }, - { - "epoch": 4.804878048780488, - "grad_norm": 3.836185932159424, - "learning_rate": 2.6569762988232838e-06, - "loss": 0.1583, - "step": 985 - }, - { - "epoch": 4.809756097560975, - "grad_norm": 3.539628744125366, - "learning_rate": 2.653152465455639e-06, - "loss": 0.2619, - "step": 986 - }, - { - "epoch": 4.814634146341463, - "grad_norm": 4.716914653778076, - "learning_rate": 2.6493282724082913e-06, - "loss": 0.3029, - "step": 987 - }, - { - "epoch": 4.819512195121951, - "grad_norm": 3.466914176940918, - "learning_rate": 2.6455037286623864e-06, - "loss": 0.095, - "step": 988 - }, - { - "epoch": 4.824390243902439, - "grad_norm": 2.1798667907714844, - "learning_rate": 2.6416788431998935e-06, - "loss": 0.1232, - "step": 989 - }, - { - "epoch": 4.829268292682927, - "grad_norm": 3.309039354324341, - "learning_rate": 2.637853625003585e-06, - "loss": 0.3671, - "step": 990 - }, - { - "epoch": 4.8341463414634145, - "grad_norm": 3.2619435787200928, - "learning_rate": 2.6340280830570142e-06, - "loss": 0.194, - "step": 991 - }, - { - "epoch": 4.839024390243902, - "grad_norm": 3.601161003112793, - "learning_rate": 2.6302022263444947e-06, - "loss": 0.1214, - "step": 992 - }, - { - "epoch": 4.84390243902439, - "grad_norm": 4.13787841796875, - "learning_rate": 2.6263760638510793e-06, - "loss": 0.311, - "step": 993 - }, - { - "epoch": 4.848780487804878, - "grad_norm": 3.0474166870117188, - "learning_rate": 2.6225496045625394e-06, - "loss": 0.1853, - "step": 994 - }, - { - "epoch": 4.853658536585366, - "grad_norm": 4.481237411499023, - "learning_rate": 2.6187228574653428e-06, - "loss": 0.2088, - "step": 995 - }, - { - "epoch": 4.8585365853658535, - "grad_norm": 3.235966444015503, - "learning_rate": 2.614895831546633e-06, - "loss": 0.1439, - "step": 996 - }, - { - "epoch": 4.863414634146341, - "grad_norm": 4.103270053863525, - "learning_rate": 2.6110685357942096e-06, - "loss": 0.2823, - "step": 997 - }, - { - "epoch": 4.868292682926829, - "grad_norm": 4.134536266326904, - "learning_rate": 2.6072409791965048e-06, - "loss": 0.2963, - "step": 998 - }, - { - "epoch": 4.873170731707317, - "grad_norm": 4.124892711639404, - "learning_rate": 2.6034131707425638e-06, - "loss": 0.4127, - "step": 999 - }, - { - "epoch": 4.878048780487805, - "grad_norm": 3.565139055252075, - "learning_rate": 2.5995851194220223e-06, - "loss": 0.1601, - "step": 1000 - }, - { - "epoch": 4.882926829268293, - "grad_norm": 2.7548017501831055, - "learning_rate": 2.595756834225089e-06, - "loss": 0.161, - "step": 1001 - }, - { - "epoch": 4.88780487804878, - "grad_norm": 3.9297611713409424, - "learning_rate": 2.5919283241425188e-06, - "loss": 0.1013, - "step": 1002 - }, - { - "epoch": 4.892682926829268, - "grad_norm": 2.4904236793518066, - "learning_rate": 2.5880995981655965e-06, - "loss": 0.1177, - "step": 1003 - }, - { - "epoch": 4.897560975609756, - "grad_norm": 3.513308048248291, - "learning_rate": 2.584270665286113e-06, - "loss": 0.0682, - "step": 1004 - }, - { - "epoch": 4.902439024390244, - "grad_norm": 4.221067428588867, - "learning_rate": 2.580441534496346e-06, - "loss": 0.1502, - "step": 1005 - }, - { - "epoch": 4.907317073170732, - "grad_norm": 3.4298903942108154, - "learning_rate": 2.576612214789039e-06, - "loss": 0.1772, - "step": 1006 - }, - { - "epoch": 4.912195121951219, - "grad_norm": 4.402887344360352, - "learning_rate": 2.5727827151573747e-06, - "loss": 0.2029, - "step": 1007 - }, - { - "epoch": 4.917073170731707, - "grad_norm": 4.194999694824219, - "learning_rate": 2.568953044594964e-06, - "loss": 0.1269, - "step": 1008 - }, - { - "epoch": 4.921951219512195, - "grad_norm": 3.657607078552246, - "learning_rate": 2.5651232120958157e-06, - "loss": 0.1311, - "step": 1009 - }, - { - "epoch": 4.926829268292683, - "grad_norm": 4.092184543609619, - "learning_rate": 2.56129322665432e-06, - "loss": 0.1085, - "step": 1010 - }, - { - "epoch": 4.931707317073171, - "grad_norm": 3.3648242950439453, - "learning_rate": 2.5574630972652263e-06, - "loss": 0.0782, - "step": 1011 - }, - { - "epoch": 4.9365853658536585, - "grad_norm": 3.7215166091918945, - "learning_rate": 2.553632832923622e-06, - "loss": 0.1391, - "step": 1012 - }, - { - "epoch": 4.941463414634146, - "grad_norm": 4.045740127563477, - "learning_rate": 2.5498024426249107e-06, - "loss": 0.3141, - "step": 1013 - }, - { - "epoch": 4.946341463414634, - "grad_norm": 3.2363107204437256, - "learning_rate": 2.545971935364794e-06, - "loss": 0.0679, - "step": 1014 - }, - { - "epoch": 4.951219512195122, - "grad_norm": 3.057283639907837, - "learning_rate": 2.5421413201392443e-06, - "loss": 0.1382, - "step": 1015 - }, - { - "epoch": 4.95609756097561, - "grad_norm": 3.591535806655884, - "learning_rate": 2.538310605944491e-06, - "loss": 0.112, - "step": 1016 - }, - { - "epoch": 4.9609756097560975, - "grad_norm": 3.1629281044006348, - "learning_rate": 2.534479801776996e-06, - "loss": 0.1261, - "step": 1017 - }, - { - "epoch": 4.965853658536585, - "grad_norm": 2.691740036010742, - "learning_rate": 2.53064891663343e-06, - "loss": 0.2328, - "step": 1018 - }, - { - "epoch": 4.970731707317073, - "grad_norm": 3.2620503902435303, - "learning_rate": 2.526817959510655e-06, - "loss": 0.193, - "step": 1019 - }, - { - "epoch": 4.975609756097561, - "grad_norm": 3.0721535682678223, - "learning_rate": 2.5229869394057038e-06, - "loss": 0.2444, - "step": 1020 - }, - { - "epoch": 4.980487804878049, - "grad_norm": 2.6279208660125732, - "learning_rate": 2.5191558653157542e-06, - "loss": 0.1103, - "step": 1021 - }, - { - "epoch": 4.985365853658537, - "grad_norm": 2.9295670986175537, - "learning_rate": 2.515324746238113e-06, - "loss": 0.0553, - "step": 1022 - }, - { - "epoch": 4.990243902439024, - "grad_norm": 3.3960084915161133, - "learning_rate": 2.511493591170191e-06, - "loss": 0.1686, - "step": 1023 - }, - { - "epoch": 4.995121951219512, - "grad_norm": 4.138705253601074, - "learning_rate": 2.5076624091094846e-06, - "loss": 0.1208, - "step": 1024 - }, - { - "epoch": 5.0, - "grad_norm": 2.603870391845703, - "learning_rate": 2.503831209053554e-06, - "loss": 0.1216, - "step": 1025 - }, - { - "epoch": 5.004878048780488, - "grad_norm": 2.525205612182617, - "learning_rate": 2.5e-06, - "loss": 0.0984, - "step": 1026 - }, - { - "epoch": 5.009756097560976, - "grad_norm": 3.2502501010894775, - "learning_rate": 2.4961687909464462e-06, - "loss": 0.1323, - "step": 1027 - }, - { - "epoch": 5.014634146341463, - "grad_norm": 5.363409519195557, - "learning_rate": 2.492337590890516e-06, - "loss": 0.3516, - "step": 1028 - }, - { - "epoch": 5.019512195121951, - "grad_norm": 2.887723445892334, - "learning_rate": 2.4885064088298097e-06, - "loss": 0.1931, - "step": 1029 - }, - { - "epoch": 5.024390243902439, - "grad_norm": 3.4529435634613037, - "learning_rate": 2.4846752537618875e-06, - "loss": 0.0675, - "step": 1030 - }, - { - "epoch": 5.029268292682927, - "grad_norm": 4.202361106872559, - "learning_rate": 2.480844134684246e-06, - "loss": 0.1643, - "step": 1031 - }, - { - "epoch": 5.034146341463415, - "grad_norm": 2.910275459289551, - "learning_rate": 2.4770130605942966e-06, - "loss": 0.11, - "step": 1032 - }, - { - "epoch": 5.0390243902439025, - "grad_norm": 3.5430362224578857, - "learning_rate": 2.4731820404893457e-06, - "loss": 0.0614, - "step": 1033 - }, - { - "epoch": 5.04390243902439, - "grad_norm": 4.501879692077637, - "learning_rate": 2.469351083366571e-06, - "loss": 0.0954, - "step": 1034 - }, - { - "epoch": 5.048780487804878, - "grad_norm": 2.732261896133423, - "learning_rate": 2.4655201982230044e-06, - "loss": 0.0275, - "step": 1035 - }, - { - "epoch": 5.053658536585366, - "grad_norm": 3.5926437377929688, - "learning_rate": 2.4616893940555094e-06, - "loss": 0.0661, - "step": 1036 - }, - { - "epoch": 5.058536585365854, - "grad_norm": 4.790312767028809, - "learning_rate": 2.457858679860757e-06, - "loss": 0.2976, - "step": 1037 - }, - { - "epoch": 5.0634146341463415, - "grad_norm": 4.453246116638184, - "learning_rate": 2.4540280646352072e-06, - "loss": 0.1216, - "step": 1038 - }, - { - "epoch": 5.068292682926829, - "grad_norm": 3.288011074066162, - "learning_rate": 2.45019755737509e-06, - "loss": 0.0877, - "step": 1039 - }, - { - "epoch": 5.073170731707317, - "grad_norm": 3.566927671432495, - "learning_rate": 2.4463671670763787e-06, - "loss": 0.1661, - "step": 1040 - }, - { - "epoch": 5.078048780487805, - "grad_norm": 3.250047206878662, - "learning_rate": 2.4425369027347746e-06, - "loss": 0.211, - "step": 1041 - }, - { - "epoch": 5.082926829268293, - "grad_norm": 3.0214977264404297, - "learning_rate": 2.4387067733456804e-06, - "loss": 0.093, - "step": 1042 - }, - { - "epoch": 5.087804878048781, - "grad_norm": 3.8162097930908203, - "learning_rate": 2.4348767879041847e-06, - "loss": 0.0777, - "step": 1043 - }, - { - "epoch": 5.092682926829268, - "grad_norm": 3.8071560859680176, - "learning_rate": 2.4310469554050366e-06, - "loss": 0.087, - "step": 1044 - }, - { - "epoch": 5.097560975609756, - "grad_norm": 3.1032073497772217, - "learning_rate": 2.4272172848426257e-06, - "loss": 0.1105, - "step": 1045 - }, - { - "epoch": 5.102439024390244, - "grad_norm": 2.8980185985565186, - "learning_rate": 2.423387785210962e-06, - "loss": 0.0704, - "step": 1046 - }, - { - "epoch": 5.107317073170732, - "grad_norm": 3.9110755920410156, - "learning_rate": 2.4195584655036544e-06, - "loss": 0.2118, - "step": 1047 - }, - { - "epoch": 5.11219512195122, - "grad_norm": 2.678884506225586, - "learning_rate": 2.4157293347138877e-06, - "loss": 0.0664, - "step": 1048 - }, - { - "epoch": 5.117073170731707, - "grad_norm": 3.183046340942383, - "learning_rate": 2.4119004018344043e-06, - "loss": 0.1767, - "step": 1049 - }, - { - "epoch": 5.121951219512195, - "grad_norm": 3.9198925495147705, - "learning_rate": 2.408071675857482e-06, - "loss": 0.1288, - "step": 1050 - }, - { - "epoch": 5.126829268292683, - "grad_norm": 4.378621578216553, - "learning_rate": 2.404243165774912e-06, - "loss": 0.1724, - "step": 1051 - }, - { - "epoch": 5.131707317073171, - "grad_norm": 2.5509133338928223, - "learning_rate": 2.4004148805779785e-06, - "loss": 0.0382, - "step": 1052 - }, - { - "epoch": 5.136585365853659, - "grad_norm": 3.692396402359009, - "learning_rate": 2.3965868292574375e-06, - "loss": 0.0942, - "step": 1053 - }, - { - "epoch": 5.1414634146341465, - "grad_norm": 3.8537800312042236, - "learning_rate": 2.392759020803496e-06, - "loss": 0.0819, - "step": 1054 - }, - { - "epoch": 5.146341463414634, - "grad_norm": 4.02876091003418, - "learning_rate": 2.3889314642057916e-06, - "loss": 0.0866, - "step": 1055 - }, - { - "epoch": 5.151219512195122, - "grad_norm": 3.531857490539551, - "learning_rate": 2.3851041684533677e-06, - "loss": 0.1557, - "step": 1056 - }, - { - "epoch": 5.15609756097561, - "grad_norm": 2.231265068054199, - "learning_rate": 2.381277142534658e-06, - "loss": 0.0421, - "step": 1057 - }, - { - "epoch": 5.160975609756098, - "grad_norm": 3.159226894378662, - "learning_rate": 2.3774503954374614e-06, - "loss": 0.0395, - "step": 1058 - }, - { - "epoch": 5.1658536585365855, - "grad_norm": 3.0375123023986816, - "learning_rate": 2.373623936148921e-06, - "loss": 0.1869, - "step": 1059 - }, - { - "epoch": 5.170731707317073, - "grad_norm": 5.4905900955200195, - "learning_rate": 2.369797773655506e-06, - "loss": 0.1426, - "step": 1060 - }, - { - "epoch": 5.175609756097561, - "grad_norm": 2.8739638328552246, - "learning_rate": 2.3659719169429866e-06, - "loss": 0.0788, - "step": 1061 - }, - { - "epoch": 5.180487804878049, - "grad_norm": 2.612183094024658, - "learning_rate": 2.3621463749964153e-06, - "loss": 0.0449, - "step": 1062 - }, - { - "epoch": 5.185365853658537, - "grad_norm": 2.0573198795318604, - "learning_rate": 2.3583211568001073e-06, - "loss": 0.0264, - "step": 1063 - }, - { - "epoch": 5.190243902439025, - "grad_norm": 2.3667244911193848, - "learning_rate": 2.3544962713376144e-06, - "loss": 0.0507, - "step": 1064 - }, - { - "epoch": 5.195121951219512, - "grad_norm": 2.1223740577697754, - "learning_rate": 2.3506717275917095e-06, - "loss": 0.0576, - "step": 1065 - }, - { - "epoch": 5.2, - "grad_norm": 2.2630319595336914, - "learning_rate": 2.346847534544362e-06, - "loss": 0.0523, - "step": 1066 - }, - { - "epoch": 5.204878048780488, - "grad_norm": 3.201913595199585, - "learning_rate": 2.3430237011767166e-06, - "loss": 0.0847, - "step": 1067 - }, - { - "epoch": 5.209756097560976, - "grad_norm": 2.2149481773376465, - "learning_rate": 2.3392002364690762e-06, - "loss": 0.0215, - "step": 1068 - }, - { - "epoch": 5.214634146341464, - "grad_norm": 4.425244331359863, - "learning_rate": 2.335377149400874e-06, - "loss": 0.1018, - "step": 1069 - }, - { - "epoch": 5.219512195121951, - "grad_norm": 4.548358917236328, - "learning_rate": 2.3315544489506596e-06, - "loss": 0.1485, - "step": 1070 - }, - { - "epoch": 5.224390243902439, - "grad_norm": 3.635796546936035, - "learning_rate": 2.3277321440960733e-06, - "loss": 0.111, - "step": 1071 - }, - { - "epoch": 5.229268292682927, - "grad_norm": 2.3180043697357178, - "learning_rate": 2.323910243813826e-06, - "loss": 0.0267, - "step": 1072 - }, - { - "epoch": 5.234146341463415, - "grad_norm": 3.675490379333496, - "learning_rate": 2.3200887570796798e-06, - "loss": 0.153, - "step": 1073 - }, - { - "epoch": 5.239024390243903, - "grad_norm": 2.883225202560425, - "learning_rate": 2.316267692868424e-06, - "loss": 0.0968, - "step": 1074 - }, - { - "epoch": 5.2439024390243905, - "grad_norm": 3.0320188999176025, - "learning_rate": 2.312447060153856e-06, - "loss": 0.0786, - "step": 1075 - }, - { - "epoch": 5.248780487804878, - "grad_norm": 2.682695150375366, - "learning_rate": 2.308626867908761e-06, - "loss": 0.0677, - "step": 1076 - }, - { - "epoch": 5.253658536585366, - "grad_norm": 3.941967010498047, - "learning_rate": 2.3048071251048884e-06, - "loss": 0.1059, - "step": 1077 - }, - { - "epoch": 5.258536585365854, - "grad_norm": 6.485599517822266, - "learning_rate": 2.300987840712932e-06, - "loss": 0.1331, - "step": 1078 - }, - { - "epoch": 5.263414634146342, - "grad_norm": 3.809269905090332, - "learning_rate": 2.297169023702511e-06, - "loss": 0.169, - "step": 1079 - }, - { - "epoch": 5.2682926829268295, - "grad_norm": 3.115626573562622, - "learning_rate": 2.2933506830421436e-06, - "loss": 0.1349, - "step": 1080 - }, - { - "epoch": 5.273170731707317, - "grad_norm": 2.2234909534454346, - "learning_rate": 2.2895328276992325e-06, - "loss": 0.0191, - "step": 1081 - }, - { - "epoch": 5.278048780487805, - "grad_norm": 3.896925926208496, - "learning_rate": 2.28571546664004e-06, - "loss": 0.1961, - "step": 1082 - }, - { - "epoch": 5.282926829268293, - "grad_norm": 2.4134509563446045, - "learning_rate": 2.281898608829665e-06, - "loss": 0.02, - "step": 1083 - }, - { - "epoch": 5.287804878048781, - "grad_norm": 2.7599191665649414, - "learning_rate": 2.2780822632320273e-06, - "loss": 0.0763, - "step": 1084 - }, - { - "epoch": 5.2926829268292686, - "grad_norm": 2.465637683868408, - "learning_rate": 2.2742664388098435e-06, - "loss": 0.0403, - "step": 1085 - }, - { - "epoch": 5.297560975609756, - "grad_norm": 2.4026618003845215, - "learning_rate": 2.270451144524605e-06, - "loss": 0.0982, - "step": 1086 - }, - { - "epoch": 5.302439024390244, - "grad_norm": 3.3339459896087646, - "learning_rate": 2.266636389336559e-06, - "loss": 0.09, - "step": 1087 - }, - { - "epoch": 5.307317073170732, - "grad_norm": 2.113255023956299, - "learning_rate": 2.262822182204686e-06, - "loss": 0.0267, - "step": 1088 - }, - { - "epoch": 5.31219512195122, - "grad_norm": 3.1760852336883545, - "learning_rate": 2.2590085320866798e-06, - "loss": 0.0295, - "step": 1089 - }, - { - "epoch": 5.317073170731708, - "grad_norm": 2.9674434661865234, - "learning_rate": 2.255195447938927e-06, - "loss": 0.0261, - "step": 1090 - }, - { - "epoch": 5.321951219512195, - "grad_norm": 3.4384074211120605, - "learning_rate": 2.251382938716482e-06, - "loss": 0.0936, - "step": 1091 - }, - { - "epoch": 5.326829268292683, - "grad_norm": 3.3814568519592285, - "learning_rate": 2.2475710133730533e-06, - "loss": 0.0426, - "step": 1092 - }, - { - "epoch": 5.331707317073171, - "grad_norm": 3.081317663192749, - "learning_rate": 2.243759680860975e-06, - "loss": 0.0799, - "step": 1093 - }, - { - "epoch": 5.336585365853659, - "grad_norm": 3.5608482360839844, - "learning_rate": 2.2399489501311896e-06, - "loss": 0.0906, - "step": 1094 - }, - { - "epoch": 5.341463414634147, - "grad_norm": 3.7886314392089844, - "learning_rate": 2.2361388301332265e-06, - "loss": 0.2152, - "step": 1095 - }, - { - "epoch": 5.3463414634146345, - "grad_norm": 1.9531102180480957, - "learning_rate": 2.2323293298151817e-06, - "loss": 0.0359, - "step": 1096 - }, - { - "epoch": 5.351219512195122, - "grad_norm": 2.2828023433685303, - "learning_rate": 2.2285204581236937e-06, - "loss": 0.0368, - "step": 1097 - }, - { - "epoch": 5.35609756097561, - "grad_norm": 3.110262870788574, - "learning_rate": 2.2247122240039268e-06, - "loss": 0.0426, - "step": 1098 - }, - { - "epoch": 5.360975609756098, - "grad_norm": 2.3293566703796387, - "learning_rate": 2.2209046363995464e-06, - "loss": 0.0223, - "step": 1099 - }, - { - "epoch": 5.365853658536586, - "grad_norm": 2.990884780883789, - "learning_rate": 2.217097704252701e-06, - "loss": 0.1276, - "step": 1100 - }, - { - "epoch": 5.3707317073170735, - "grad_norm": 2.568014144897461, - "learning_rate": 2.2132914365039993e-06, - "loss": 0.0639, - "step": 1101 - }, - { - "epoch": 5.375609756097561, - "grad_norm": 2.618478536605835, - "learning_rate": 2.2094858420924882e-06, - "loss": 0.0166, - "step": 1102 - }, - { - "epoch": 5.380487804878049, - "grad_norm": 4.526919364929199, - "learning_rate": 2.205680929955635e-06, - "loss": 0.144, - "step": 1103 - }, - { - "epoch": 5.385365853658537, - "grad_norm": 2.7236886024475098, - "learning_rate": 2.201876709029305e-06, - "loss": 0.1004, - "step": 1104 - }, - { - "epoch": 5.390243902439025, - "grad_norm": 2.1577632427215576, - "learning_rate": 2.198073188247738e-06, - "loss": 0.0453, - "step": 1105 - }, - { - "epoch": 5.3951219512195125, - "grad_norm": 2.5170321464538574, - "learning_rate": 2.1942703765435317e-06, - "loss": 0.0195, - "step": 1106 - }, - { - "epoch": 5.4, - "grad_norm": 3.962658643722534, - "learning_rate": 2.190468282847617e-06, - "loss": 0.1512, - "step": 1107 - }, - { - "epoch": 5.404878048780488, - "grad_norm": 4.297860622406006, - "learning_rate": 2.186666916089239e-06, - "loss": 0.2572, - "step": 1108 - }, - { - "epoch": 5.409756097560976, - "grad_norm": 2.8933565616607666, - "learning_rate": 2.1828662851959377e-06, - "loss": 0.0536, - "step": 1109 - }, - { - "epoch": 5.414634146341464, - "grad_norm": 2.9397451877593994, - "learning_rate": 2.1790663990935203e-06, - "loss": 0.0778, - "step": 1110 - }, - { - "epoch": 5.419512195121952, - "grad_norm": 3.5210094451904297, - "learning_rate": 2.1752672667060488e-06, - "loss": 0.0558, - "step": 1111 - }, - { - "epoch": 5.424390243902439, - "grad_norm": 2.9027626514434814, - "learning_rate": 2.1714688969558146e-06, - "loss": 0.041, - "step": 1112 - }, - { - "epoch": 5.429268292682927, - "grad_norm": 3.7691168785095215, - "learning_rate": 2.167671298763316e-06, - "loss": 0.1644, - "step": 1113 - }, - { - "epoch": 5.434146341463415, - "grad_norm": 3.493008852005005, - "learning_rate": 2.1638744810472414e-06, - "loss": 0.1587, - "step": 1114 - }, - { - "epoch": 5.439024390243903, - "grad_norm": 2.711196184158325, - "learning_rate": 2.1600784527244445e-06, - "loss": 0.0605, - "step": 1115 - }, - { - "epoch": 5.443902439024391, - "grad_norm": 4.365038871765137, - "learning_rate": 2.1562832227099266e-06, - "loss": 0.1897, - "step": 1116 - }, - { - "epoch": 5.4487804878048784, - "grad_norm": 4.621466159820557, - "learning_rate": 2.152488799916814e-06, - "loss": 0.1525, - "step": 1117 - }, - { - "epoch": 5.453658536585366, - "grad_norm": 4.8721089363098145, - "learning_rate": 2.148695193256336e-06, - "loss": 0.189, - "step": 1118 - }, - { - "epoch": 5.458536585365854, - "grad_norm": 2.8999173641204834, - "learning_rate": 2.1449024116378064e-06, - "loss": 0.095, - "step": 1119 - }, - { - "epoch": 5.463414634146342, - "grad_norm": 2.4865314960479736, - "learning_rate": 2.1411104639686013e-06, - "loss": 0.0432, - "step": 1120 - }, - { - "epoch": 5.46829268292683, - "grad_norm": 3.8497228622436523, - "learning_rate": 2.137319359154138e-06, - "loss": 0.0954, - "step": 1121 - }, - { - "epoch": 5.473170731707317, - "grad_norm": 2.3643507957458496, - "learning_rate": 2.133529106097853e-06, - "loss": 0.0362, - "step": 1122 - }, - { - "epoch": 5.478048780487805, - "grad_norm": 3.017826795578003, - "learning_rate": 2.1297397137011862e-06, - "loss": 0.0875, - "step": 1123 - }, - { - "epoch": 5.482926829268292, - "grad_norm": 3.239320755004883, - "learning_rate": 2.125951190863551e-06, - "loss": 0.0758, - "step": 1124 - }, - { - "epoch": 5.487804878048781, - "grad_norm": 2.566241979598999, - "learning_rate": 2.1221635464823237e-06, - "loss": 0.0605, - "step": 1125 - }, - { - "epoch": 5.492682926829268, - "grad_norm": 4.810088157653809, - "learning_rate": 2.1183767894528135e-06, - "loss": 0.2403, - "step": 1126 - }, - { - "epoch": 5.4975609756097565, - "grad_norm": 2.083263397216797, - "learning_rate": 2.114590928668249e-06, - "loss": 0.0223, - "step": 1127 - }, - { - "epoch": 5.5024390243902435, - "grad_norm": 2.6812374591827393, - "learning_rate": 2.1108059730197517e-06, - "loss": 0.0617, - "step": 1128 - }, - { - "epoch": 5.507317073170732, - "grad_norm": 3.196735143661499, - "learning_rate": 2.1070219313963173e-06, - "loss": 0.043, - "step": 1129 - }, - { - "epoch": 5.512195121951219, - "grad_norm": 2.775470495223999, - "learning_rate": 2.1032388126847967e-06, - "loss": 0.0595, - "step": 1130 - }, - { - "epoch": 5.517073170731708, - "grad_norm": 2.8632407188415527, - "learning_rate": 2.099456625769872e-06, - "loss": 0.0186, - "step": 1131 - }, - { - "epoch": 5.521951219512195, - "grad_norm": 4.075018405914307, - "learning_rate": 2.0956753795340376e-06, - "loss": 0.0616, - "step": 1132 - }, - { - "epoch": 5.526829268292683, - "grad_norm": 3.206327199935913, - "learning_rate": 2.091895082857578e-06, - "loss": 0.1895, - "step": 1133 - }, - { - "epoch": 5.53170731707317, - "grad_norm": 2.967588186264038, - "learning_rate": 2.0881157446185474e-06, - "loss": 0.0484, - "step": 1134 - }, - { - "epoch": 5.536585365853659, - "grad_norm": 2.850929021835327, - "learning_rate": 2.0843373736927506e-06, - "loss": 0.037, - "step": 1135 - }, - { - "epoch": 5.541463414634146, - "grad_norm": 2.2505147457122803, - "learning_rate": 2.08055997895372e-06, - "loss": 0.0227, - "step": 1136 - }, - { - "epoch": 5.546341463414635, - "grad_norm": 2.5258476734161377, - "learning_rate": 2.0767835692726944e-06, - "loss": 0.0296, - "step": 1137 - }, - { - "epoch": 5.5512195121951216, - "grad_norm": 3.498741388320923, - "learning_rate": 2.0730081535186e-06, - "loss": 0.16, - "step": 1138 - }, - { - "epoch": 5.55609756097561, - "grad_norm": 2.8635222911834717, - "learning_rate": 2.06923374055803e-06, - "loss": 0.0725, - "step": 1139 - }, - { - "epoch": 5.560975609756097, - "grad_norm": 2.2779290676116943, - "learning_rate": 2.0654603392552193e-06, - "loss": 0.0198, - "step": 1140 - }, - { - "epoch": 5.565853658536585, - "grad_norm": 3.1651058197021484, - "learning_rate": 2.0616879584720305e-06, - "loss": 0.1144, - "step": 1141 - }, - { - "epoch": 5.570731707317073, - "grad_norm": 2.4238595962524414, - "learning_rate": 2.057916607067928e-06, - "loss": 0.0491, - "step": 1142 - }, - { - "epoch": 5.575609756097561, - "grad_norm": 2.3248515129089355, - "learning_rate": 2.054146293899957e-06, - "loss": 0.035, - "step": 1143 - }, - { - "epoch": 5.580487804878048, - "grad_norm": 2.9506516456604004, - "learning_rate": 2.0503770278227274e-06, - "loss": 0.0639, - "step": 1144 - }, - { - "epoch": 5.585365853658536, - "grad_norm": 2.6403958797454834, - "learning_rate": 2.0466088176883876e-06, - "loss": 0.0258, - "step": 1145 - }, - { - "epoch": 5.590243902439024, - "grad_norm": 3.150115728378296, - "learning_rate": 2.042841672346608e-06, - "loss": 0.0634, - "step": 1146 - }, - { - "epoch": 5.595121951219512, - "grad_norm": 2.742691993713379, - "learning_rate": 2.039075600644557e-06, - "loss": 0.0464, - "step": 1147 - }, - { - "epoch": 5.6, - "grad_norm": 2.733694076538086, - "learning_rate": 2.0353106114268824e-06, - "loss": 0.0829, - "step": 1148 - }, - { - "epoch": 5.6048780487804875, - "grad_norm": 2.511229991912842, - "learning_rate": 2.031546713535688e-06, - "loss": 0.0321, - "step": 1149 - }, - { - "epoch": 5.609756097560975, - "grad_norm": 3.019669532775879, - "learning_rate": 2.027783915810518e-06, - "loss": 0.05, - "step": 1150 - }, - { - "epoch": 5.614634146341463, - "grad_norm": 3.497159242630005, - "learning_rate": 2.024022227088329e-06, - "loss": 0.1984, - "step": 1151 - }, - { - "epoch": 5.619512195121951, - "grad_norm": 3.4637508392333984, - "learning_rate": 2.020261656203476e-06, - "loss": 0.1673, - "step": 1152 - }, - { - "epoch": 5.624390243902439, - "grad_norm": 2.4312477111816406, - "learning_rate": 2.016502211987687e-06, - "loss": 0.1106, - "step": 1153 - }, - { - "epoch": 5.6292682926829265, - "grad_norm": 2.7801673412323, - "learning_rate": 2.0127439032700446e-06, - "loss": 0.0374, - "step": 1154 - }, - { - "epoch": 5.634146341463414, - "grad_norm": 2.9346680641174316, - "learning_rate": 2.0089867388769664e-06, - "loss": 0.0674, - "step": 1155 - }, - { - "epoch": 5.639024390243902, - "grad_norm": 2.274888277053833, - "learning_rate": 2.0052307276321793e-06, - "loss": 0.0365, - "step": 1156 - }, - { - "epoch": 5.64390243902439, - "grad_norm": 3.069890022277832, - "learning_rate": 2.001475878356703e-06, - "loss": 0.0758, - "step": 1157 - }, - { - "epoch": 5.648780487804878, - "grad_norm": 3.8594915866851807, - "learning_rate": 1.99772219986883e-06, - "loss": 0.176, - "step": 1158 - }, - { - "epoch": 5.6536585365853655, - "grad_norm": 3.4886410236358643, - "learning_rate": 1.9939697009841024e-06, - "loss": 0.0491, - "step": 1159 - }, - { - "epoch": 5.658536585365853, - "grad_norm": 2.697946786880493, - "learning_rate": 1.990218390515291e-06, - "loss": 0.0741, - "step": 1160 - }, - { - "epoch": 5.663414634146341, - "grad_norm": 3.5290887355804443, - "learning_rate": 1.9864682772723757e-06, - "loss": 0.0826, - "step": 1161 - }, - { - "epoch": 5.668292682926829, - "grad_norm": 2.0601298809051514, - "learning_rate": 1.9827193700625274e-06, - "loss": 0.0378, - "step": 1162 - }, - { - "epoch": 5.673170731707317, - "grad_norm": 3.8458635807037354, - "learning_rate": 1.978971677690081e-06, - "loss": 0.2466, - "step": 1163 - }, - { - "epoch": 5.678048780487805, - "grad_norm": 2.788210153579712, - "learning_rate": 1.97522520895652e-06, - "loss": 0.0205, - "step": 1164 - }, - { - "epoch": 5.682926829268292, - "grad_norm": 3.1904587745666504, - "learning_rate": 1.971479972660454e-06, - "loss": 0.0998, - "step": 1165 - }, - { - "epoch": 5.68780487804878, - "grad_norm": 2.4664318561553955, - "learning_rate": 1.967735977597598e-06, - "loss": 0.0217, - "step": 1166 - }, - { - "epoch": 5.692682926829268, - "grad_norm": 2.1392667293548584, - "learning_rate": 1.9639932325607538e-06, - "loss": 0.048, - "step": 1167 - }, - { - "epoch": 5.697560975609756, - "grad_norm": 3.7127058506011963, - "learning_rate": 1.9602517463397845e-06, - "loss": 0.0302, - "step": 1168 - }, - { - "epoch": 5.702439024390244, - "grad_norm": 2.916168689727783, - "learning_rate": 1.9565115277215978e-06, - "loss": 0.0724, - "step": 1169 - }, - { - "epoch": 5.7073170731707314, - "grad_norm": 2.4352428913116455, - "learning_rate": 1.952772585490127e-06, - "loss": 0.0464, - "step": 1170 - }, - { - "epoch": 5.712195121951219, - "grad_norm": 2.8311455249786377, - "learning_rate": 1.9490349284263036e-06, - "loss": 0.0239, - "step": 1171 - }, - { - "epoch": 5.717073170731707, - "grad_norm": 3.3592801094055176, - "learning_rate": 1.9452985653080443e-06, - "loss": 0.0719, - "step": 1172 - }, - { - "epoch": 5.721951219512195, - "grad_norm": 2.450922966003418, - "learning_rate": 1.9415635049102245e-06, - "loss": 0.0408, - "step": 1173 - }, - { - "epoch": 5.726829268292683, - "grad_norm": 4.750118255615234, - "learning_rate": 1.937829756004662e-06, - "loss": 0.2049, - "step": 1174 - }, - { - "epoch": 5.7317073170731705, - "grad_norm": 3.0643811225891113, - "learning_rate": 1.9340973273600944e-06, - "loss": 0.0636, - "step": 1175 - }, - { - "epoch": 5.736585365853658, - "grad_norm": 3.313904047012329, - "learning_rate": 1.930366227742157e-06, - "loss": 0.1252, - "step": 1176 - }, - { - "epoch": 5.741463414634146, - "grad_norm": 3.8996808528900146, - "learning_rate": 1.9266364659133653e-06, - "loss": 0.0687, - "step": 1177 - }, - { - "epoch": 5.746341463414634, - "grad_norm": 2.727555274963379, - "learning_rate": 1.922908050633093e-06, - "loss": 0.0333, - "step": 1178 - }, - { - "epoch": 5.751219512195122, - "grad_norm": 3.270087718963623, - "learning_rate": 1.919180990657551e-06, - "loss": 0.0792, - "step": 1179 - }, - { - "epoch": 5.7560975609756095, - "grad_norm": 2.6631274223327637, - "learning_rate": 1.9154552947397668e-06, - "loss": 0.069, - "step": 1180 - }, - { - "epoch": 5.760975609756097, - "grad_norm": 4.4460554122924805, - "learning_rate": 1.9117309716295658e-06, - "loss": 0.115, - "step": 1181 - }, - { - "epoch": 5.765853658536585, - "grad_norm": 2.5652341842651367, - "learning_rate": 1.9080080300735478e-06, - "loss": 0.0537, - "step": 1182 - }, - { - "epoch": 5.770731707317073, - "grad_norm": 3.046436071395874, - "learning_rate": 1.9042864788150695e-06, - "loss": 0.0817, - "step": 1183 - }, - { - "epoch": 5.775609756097561, - "grad_norm": 2.121629238128662, - "learning_rate": 1.9005663265942206e-06, - "loss": 0.0289, - "step": 1184 - }, - { - "epoch": 5.780487804878049, - "grad_norm": 2.271918535232544, - "learning_rate": 1.8968475821478066e-06, - "loss": 0.0357, - "step": 1185 - }, - { - "epoch": 5.785365853658536, - "grad_norm": 2.582473039627075, - "learning_rate": 1.8931302542093274e-06, - "loss": 0.0584, - "step": 1186 - }, - { - "epoch": 5.790243902439024, - "grad_norm": 2.502952814102173, - "learning_rate": 1.8894143515089539e-06, - "loss": 0.0324, - "step": 1187 - }, - { - "epoch": 5.795121951219512, - "grad_norm": 1.9735453128814697, - "learning_rate": 1.8856998827735118e-06, - "loss": 0.0338, - "step": 1188 - }, - { - "epoch": 5.8, - "grad_norm": 4.441845893859863, - "learning_rate": 1.8819868567264588e-06, - "loss": 0.1706, - "step": 1189 - }, - { - "epoch": 5.804878048780488, - "grad_norm": 2.5450692176818848, - "learning_rate": 1.8782752820878636e-06, - "loss": 0.0463, - "step": 1190 - }, - { - "epoch": 5.809756097560975, - "grad_norm": 3.718183755874634, - "learning_rate": 1.8745651675743876e-06, - "loss": 0.1188, - "step": 1191 - }, - { - "epoch": 5.814634146341463, - "grad_norm": 3.246532678604126, - "learning_rate": 1.870856521899261e-06, - "loss": 0.0984, - "step": 1192 - }, - { - "epoch": 5.819512195121951, - "grad_norm": 2.9522783756256104, - "learning_rate": 1.867149353772267e-06, - "loss": 0.0195, - "step": 1193 - }, - { - "epoch": 5.824390243902439, - "grad_norm": 2.3266429901123047, - "learning_rate": 1.863443671899717e-06, - "loss": 0.0236, - "step": 1194 - }, - { - "epoch": 5.829268292682927, - "grad_norm": 3.696749448776245, - "learning_rate": 1.8597394849844319e-06, - "loss": 0.1108, - "step": 1195 - }, - { - "epoch": 5.8341463414634145, - "grad_norm": 2.375624179840088, - "learning_rate": 1.8560368017257229e-06, - "loss": 0.0388, - "step": 1196 - }, - { - "epoch": 5.839024390243902, - "grad_norm": 4.0437092781066895, - "learning_rate": 1.8523356308193696e-06, - "loss": 0.3098, - "step": 1197 - }, - { - "epoch": 5.84390243902439, - "grad_norm": 3.165165424346924, - "learning_rate": 1.8486359809575977e-06, - "loss": 0.0775, - "step": 1198 - }, - { - "epoch": 5.848780487804878, - "grad_norm": 4.1991190910339355, - "learning_rate": 1.8449378608290638e-06, - "loss": 0.1222, - "step": 1199 - }, - { - "epoch": 5.853658536585366, - "grad_norm": 4.6657819747924805, - "learning_rate": 1.8412412791188306e-06, - "loss": 0.1146, - "step": 1200 - }, - { - "epoch": 5.8585365853658535, - "grad_norm": 4.569516181945801, - "learning_rate": 1.8375462445083464e-06, - "loss": 0.1113, - "step": 1201 - }, - { - "epoch": 5.863414634146341, - "grad_norm": 3.1565654277801514, - "learning_rate": 1.8338527656754285e-06, - "loss": 0.0416, - "step": 1202 - }, - { - "epoch": 5.868292682926829, - "grad_norm": 3.3474619388580322, - "learning_rate": 1.830160851294239e-06, - "loss": 0.0613, - "step": 1203 - }, - { - "epoch": 5.873170731707317, - "grad_norm": 4.30797004699707, - "learning_rate": 1.8264705100352662e-06, - "loss": 0.197, - "step": 1204 - }, - { - "epoch": 5.878048780487805, - "grad_norm": 2.7259573936462402, - "learning_rate": 1.8227817505653045e-06, - "loss": 0.0821, - "step": 1205 - }, - { - "epoch": 5.882926829268293, - "grad_norm": 3.515812873840332, - "learning_rate": 1.8190945815474323e-06, - "loss": 0.1246, - "step": 1206 - }, - { - "epoch": 5.88780487804878, - "grad_norm": 2.9223313331604004, - "learning_rate": 1.8154090116409934e-06, - "loss": 0.0703, - "step": 1207 - }, - { - "epoch": 5.892682926829268, - "grad_norm": 3.9529640674591064, - "learning_rate": 1.811725049501577e-06, - "loss": 0.1078, - "step": 1208 - }, - { - "epoch": 5.897560975609756, - "grad_norm": 4.1674580574035645, - "learning_rate": 1.8080427037809941e-06, - "loss": 0.1648, - "step": 1209 - }, - { - "epoch": 5.902439024390244, - "grad_norm": 3.1308021545410156, - "learning_rate": 1.8043619831272623e-06, - "loss": 0.061, - "step": 1210 - }, - { - "epoch": 5.907317073170732, - "grad_norm": 3.9667179584503174, - "learning_rate": 1.8006828961845807e-06, - "loss": 0.1863, - "step": 1211 - }, - { - "epoch": 5.912195121951219, - "grad_norm": 5.438168048858643, - "learning_rate": 1.7970054515933124e-06, - "loss": 0.2387, - "step": 1212 - }, - { - "epoch": 5.917073170731707, - "grad_norm": 5.505797863006592, - "learning_rate": 1.793329657989964e-06, - "loss": 0.2053, - "step": 1213 - }, - { - "epoch": 5.921951219512195, - "grad_norm": 2.8043150901794434, - "learning_rate": 1.7896555240071627e-06, - "loss": 0.026, - "step": 1214 - }, - { - "epoch": 5.926829268292683, - "grad_norm": 2.836164712905884, - "learning_rate": 1.7859830582736406e-06, - "loss": 0.0735, - "step": 1215 - }, - { - "epoch": 5.931707317073171, - "grad_norm": 2.8286306858062744, - "learning_rate": 1.782312269414211e-06, - "loss": 0.0586, - "step": 1216 - }, - { - "epoch": 5.9365853658536585, - "grad_norm": 4.4354329109191895, - "learning_rate": 1.7786431660497474e-06, - "loss": 0.3086, - "step": 1217 - }, - { - "epoch": 5.941463414634146, - "grad_norm": 4.0963640213012695, - "learning_rate": 1.7749757567971678e-06, - "loss": 0.0978, - "step": 1218 - }, - { - "epoch": 5.946341463414634, - "grad_norm": 2.726062536239624, - "learning_rate": 1.7713100502694091e-06, - "loss": 0.0976, - "step": 1219 - }, - { - "epoch": 5.951219512195122, - "grad_norm": 2.6566951274871826, - "learning_rate": 1.7676460550754104e-06, - "loss": 0.02, - "step": 1220 - }, - { - "epoch": 5.95609756097561, - "grad_norm": 2.7710952758789062, - "learning_rate": 1.7639837798200923e-06, - "loss": 0.0741, - "step": 1221 - }, - { - "epoch": 5.9609756097560975, - "grad_norm": 2.3678600788116455, - "learning_rate": 1.7603232331043346e-06, - "loss": 0.0542, - "step": 1222 - }, - { - "epoch": 5.965853658536585, - "grad_norm": 6.45259428024292, - "learning_rate": 1.7566644235249591e-06, - "loss": 0.3552, - "step": 1223 - }, - { - "epoch": 5.970731707317073, - "grad_norm": 1.8916475772857666, - "learning_rate": 1.7530073596747072e-06, - "loss": 0.0405, - "step": 1224 - }, - { - "epoch": 5.975609756097561, - "grad_norm": 2.1637566089630127, - "learning_rate": 1.74935205014222e-06, - "loss": 0.0178, - "step": 1225 - }, - { - "epoch": 5.980487804878049, - "grad_norm": 2.5959200859069824, - "learning_rate": 1.7456985035120194e-06, - "loss": 0.0264, - "step": 1226 - }, - { - "epoch": 5.985365853658537, - "grad_norm": 2.50264573097229, - "learning_rate": 1.7420467283644877e-06, - "loss": 0.0555, - "step": 1227 - }, - { - "epoch": 5.990243902439024, - "grad_norm": 2.4692020416259766, - "learning_rate": 1.738396733275844e-06, - "loss": 0.0546, - "step": 1228 - }, - { - "epoch": 5.995121951219512, - "grad_norm": 5.540846824645996, - "learning_rate": 1.7347485268181309e-06, - "loss": 0.1967, - "step": 1229 - }, - { - "epoch": 6.0, - "grad_norm": 1.8322839736938477, - "learning_rate": 1.7311021175591868e-06, - "loss": 0.0491, - "step": 1230 - }, - { - "epoch": 6.004878048780488, - "grad_norm": 2.719622850418091, - "learning_rate": 1.7274575140626318e-06, - "loss": 0.0359, - "step": 1231 - }, - { - "epoch": 6.009756097560976, - "grad_norm": 2.859675884246826, - "learning_rate": 1.7238147248878444e-06, - "loss": 0.0585, - "step": 1232 - }, - { - "epoch": 6.014634146341463, - "grad_norm": 1.6761114597320557, - "learning_rate": 1.7201737585899415e-06, - "loss": 0.0188, - "step": 1233 - }, - { - "epoch": 6.019512195121951, - "grad_norm": 2.1588776111602783, - "learning_rate": 1.7165346237197594e-06, - "loss": 0.0484, - "step": 1234 - }, - { - "epoch": 6.024390243902439, - "grad_norm": 4.209983825683594, - "learning_rate": 1.7128973288238344e-06, - "loss": 0.0776, - "step": 1235 - }, - { - "epoch": 6.029268292682927, - "grad_norm": 2.3979365825653076, - "learning_rate": 1.709261882444379e-06, - "loss": 0.0338, - "step": 1236 - }, - { - "epoch": 6.034146341463415, - "grad_norm": 3.0030531883239746, - "learning_rate": 1.705628293119268e-06, - "loss": 0.0385, - "step": 1237 - }, - { - "epoch": 6.0390243902439025, - "grad_norm": 9.65616512298584, - "learning_rate": 1.701996569382011e-06, - "loss": 0.2601, - "step": 1238 - }, - { - "epoch": 6.04390243902439, - "grad_norm": 3.0590052604675293, - "learning_rate": 1.6983667197617386e-06, - "loss": 0.034, - "step": 1239 - }, - { - "epoch": 6.048780487804878, - "grad_norm": 3.6949822902679443, - "learning_rate": 1.6947387527831813e-06, - "loss": 0.0155, - "step": 1240 - }, - { - "epoch": 6.053658536585366, - "grad_norm": 1.2870460748672485, - "learning_rate": 1.6911126769666442e-06, - "loss": 0.0078, - "step": 1241 - }, - { - "epoch": 6.058536585365854, - "grad_norm": 4.307460784912109, - "learning_rate": 1.6874885008279945e-06, - "loss": 0.1429, - "step": 1242 - }, - { - "epoch": 6.0634146341463415, - "grad_norm": 2.334972858428955, - "learning_rate": 1.683866232878637e-06, - "loss": 0.0123, - "step": 1243 - }, - { - "epoch": 6.068292682926829, - "grad_norm": 2.4121835231781006, - "learning_rate": 1.6802458816254941e-06, - "loss": 0.0139, - "step": 1244 - }, - { - "epoch": 6.073170731707317, - "grad_norm": 1.9224514961242676, - "learning_rate": 1.676627455570988e-06, - "loss": 0.0312, - "step": 1245 - }, - { - "epoch": 6.078048780487805, - "grad_norm": 2.8293309211730957, - "learning_rate": 1.6730109632130199e-06, - "loss": 0.0464, - "step": 1246 - }, - { - "epoch": 6.082926829268293, - "grad_norm": 1.6368179321289062, - "learning_rate": 1.6693964130449472e-06, - "loss": 0.0085, - "step": 1247 - }, - { - "epoch": 6.087804878048781, - "grad_norm": 2.5535073280334473, - "learning_rate": 1.6657838135555696e-06, - "loss": 0.0482, - "step": 1248 - }, - { - "epoch": 6.092682926829268, - "grad_norm": 3.7743096351623535, - "learning_rate": 1.6621731732291024e-06, - "loss": 0.0235, - "step": 1249 - }, - { - "epoch": 6.097560975609756, - "grad_norm": 2.9921820163726807, - "learning_rate": 1.6585645005451623e-06, - "loss": 0.0455, - "step": 1250 - }, - { - "epoch": 6.102439024390244, - "grad_norm": 2.369581937789917, - "learning_rate": 1.6549578039787436e-06, - "loss": 0.0499, - "step": 1251 - }, - { - "epoch": 6.107317073170732, - "grad_norm": 2.163815498352051, - "learning_rate": 1.6513530920001998e-06, - "loss": 0.0118, - "step": 1252 - }, - { - "epoch": 6.11219512195122, - "grad_norm": 2.034928560256958, - "learning_rate": 1.6477503730752237e-06, - "loss": 0.0189, - "step": 1253 - }, - { - "epoch": 6.117073170731707, - "grad_norm": 2.7306160926818848, - "learning_rate": 1.6441496556648278e-06, - "loss": 0.0492, - "step": 1254 - }, - { - "epoch": 6.121951219512195, - "grad_norm": 3.7521040439605713, - "learning_rate": 1.6405509482253234e-06, - "loss": 0.1717, - "step": 1255 - }, - { - "epoch": 6.126829268292683, - "grad_norm": 1.8965831995010376, - "learning_rate": 1.636954259208302e-06, - "loss": 0.0194, - "step": 1256 - }, - { - "epoch": 6.131707317073171, - "grad_norm": 3.010024070739746, - "learning_rate": 1.6333595970606143e-06, - "loss": 0.0334, - "step": 1257 - }, - { - "epoch": 6.136585365853659, - "grad_norm": 3.7091450691223145, - "learning_rate": 1.62976697022435e-06, - "loss": 0.0705, - "step": 1258 - }, - { - "epoch": 6.1414634146341465, - "grad_norm": 3.5719785690307617, - "learning_rate": 1.6261763871368225e-06, - "loss": 0.0322, - "step": 1259 - }, - { - "epoch": 6.146341463414634, - "grad_norm": 3.3224213123321533, - "learning_rate": 1.6225878562305403e-06, - "loss": 0.0653, - "step": 1260 - }, - { - "epoch": 6.151219512195122, - "grad_norm": 3.78924822807312, - "learning_rate": 1.6190013859331958e-06, - "loss": 0.0557, - "step": 1261 - }, - { - "epoch": 6.15609756097561, - "grad_norm": 2.429412841796875, - "learning_rate": 1.6154169846676415e-06, - "loss": 0.0277, - "step": 1262 - }, - { - "epoch": 6.160975609756098, - "grad_norm": 2.626167058944702, - "learning_rate": 1.6118346608518698e-06, - "loss": 0.0305, - "step": 1263 - }, - { - "epoch": 6.1658536585365855, - "grad_norm": 2.44846248626709, - "learning_rate": 1.6082544228989958e-06, - "loss": 0.0093, - "step": 1264 - }, - { - "epoch": 6.170731707317073, - "grad_norm": 2.9345643520355225, - "learning_rate": 1.6046762792172336e-06, - "loss": 0.0198, - "step": 1265 - }, - { - "epoch": 6.175609756097561, - "grad_norm": 3.224313497543335, - "learning_rate": 1.6011002382098806e-06, - "loss": 0.0673, - "step": 1266 - }, - { - "epoch": 6.180487804878049, - "grad_norm": 1.9066869020462036, - "learning_rate": 1.5975263082752968e-06, - "loss": 0.0115, - "step": 1267 - }, - { - "epoch": 6.185365853658537, - "grad_norm": 2.7153308391571045, - "learning_rate": 1.5939544978068816e-06, - "loss": 0.0529, - "step": 1268 - }, - { - "epoch": 6.190243902439025, - "grad_norm": 2.2173709869384766, - "learning_rate": 1.590384815193059e-06, - "loss": 0.0643, - "step": 1269 - }, - { - "epoch": 6.195121951219512, - "grad_norm": 3.1238555908203125, - "learning_rate": 1.5868172688172559e-06, - "loss": 0.064, - "step": 1270 - }, - { - "epoch": 6.2, - "grad_norm": 2.7765870094299316, - "learning_rate": 1.5832518670578802e-06, - "loss": 0.0676, - "step": 1271 - }, - { - "epoch": 6.204878048780488, - "grad_norm": 2.9892525672912598, - "learning_rate": 1.5796886182883053e-06, - "loss": 0.074, - "step": 1272 - }, - { - "epoch": 6.209756097560976, - "grad_norm": 2.0955512523651123, - "learning_rate": 1.5761275308768476e-06, - "loss": 0.0311, - "step": 1273 - }, - { - "epoch": 6.214634146341464, - "grad_norm": 1.8085861206054688, - "learning_rate": 1.5725686131867462e-06, - "loss": 0.0108, - "step": 1274 - }, - { - "epoch": 6.219512195121951, - "grad_norm": 3.026421308517456, - "learning_rate": 1.569011873576147e-06, - "loss": 0.0464, - "step": 1275 - }, - { - "epoch": 6.224390243902439, - "grad_norm": 2.3395111560821533, - "learning_rate": 1.5654573203980782e-06, - "loss": 0.0221, - "step": 1276 - }, - { - "epoch": 6.229268292682927, - "grad_norm": 3.6158692836761475, - "learning_rate": 1.5619049620004354e-06, - "loss": 0.0693, - "step": 1277 - }, - { - "epoch": 6.234146341463415, - "grad_norm": 1.6186567544937134, - "learning_rate": 1.5583548067259584e-06, - "loss": 0.0198, - "step": 1278 - }, - { - "epoch": 6.239024390243903, - "grad_norm": 2.7193195819854736, - "learning_rate": 1.5548068629122126e-06, - "loss": 0.0687, - "step": 1279 - }, - { - "epoch": 6.2439024390243905, - "grad_norm": 2.7472658157348633, - "learning_rate": 1.5512611388915711e-06, - "loss": 0.053, - "step": 1280 - }, - { - "epoch": 6.248780487804878, - "grad_norm": 4.694706439971924, - "learning_rate": 1.5477176429911934e-06, - "loss": 0.2076, - "step": 1281 - }, - { - "epoch": 6.253658536585366, - "grad_norm": 1.609309434890747, - "learning_rate": 1.5441763835330048e-06, - "loss": 0.0108, - "step": 1282 - }, - { - "epoch": 6.258536585365854, - "grad_norm": 1.7064504623413086, - "learning_rate": 1.5406373688336807e-06, - "loss": 0.0114, - "step": 1283 - }, - { - "epoch": 6.263414634146342, - "grad_norm": 1.967726469039917, - "learning_rate": 1.5371006072046225e-06, - "loss": 0.0209, - "step": 1284 - }, - { - "epoch": 6.2682926829268295, - "grad_norm": 2.4065544605255127, - "learning_rate": 1.5335661069519408e-06, - "loss": 0.0741, - "step": 1285 - }, - { - "epoch": 6.273170731707317, - "grad_norm": 2.2167603969573975, - "learning_rate": 1.5300338763764371e-06, - "loss": 0.0121, - "step": 1286 - }, - { - "epoch": 6.278048780487805, - "grad_norm": 3.229228973388672, - "learning_rate": 1.5265039237735804e-06, - "loss": 0.0226, - "step": 1287 - }, - { - "epoch": 6.282926829268293, - "grad_norm": 1.889419674873352, - "learning_rate": 1.5229762574334903e-06, - "loss": 0.0116, - "step": 1288 - }, - { - "epoch": 6.287804878048781, - "grad_norm": 3.7595815658569336, - "learning_rate": 1.5194508856409181e-06, - "loss": 0.0775, - "step": 1289 - }, - { - "epoch": 6.2926829268292686, - "grad_norm": 2.527560234069824, - "learning_rate": 1.515927816675225e-06, - "loss": 0.0355, - "step": 1290 - }, - { - "epoch": 6.297560975609756, - "grad_norm": 1.9718955755233765, - "learning_rate": 1.5124070588103648e-06, - "loss": 0.0127, - "step": 1291 - }, - { - "epoch": 6.302439024390244, - "grad_norm": 1.9010120630264282, - "learning_rate": 1.5088886203148643e-06, - "loss": 0.0188, - "step": 1292 - }, - { - "epoch": 6.307317073170732, - "grad_norm": 3.2093472480773926, - "learning_rate": 1.505372509451801e-06, - "loss": 0.0845, - "step": 1293 - }, - { - "epoch": 6.31219512195122, - "grad_norm": 1.6723257303237915, - "learning_rate": 1.5018587344787888e-06, - "loss": 0.0265, - "step": 1294 - }, - { - "epoch": 6.317073170731708, - "grad_norm": 3.246812343597412, - "learning_rate": 1.498347303647953e-06, - "loss": 0.0833, - "step": 1295 - }, - { - "epoch": 6.321951219512195, - "grad_norm": 2.887834072113037, - "learning_rate": 1.4948382252059158e-06, - "loss": 0.0416, - "step": 1296 - }, - { - "epoch": 6.326829268292683, - "grad_norm": 2.5762557983398438, - "learning_rate": 1.4913315073937742e-06, - "loss": 0.0614, - "step": 1297 - }, - { - "epoch": 6.331707317073171, - "grad_norm": 3.3746497631073, - "learning_rate": 1.4878271584470805e-06, - "loss": 0.0601, - "step": 1298 - }, - { - "epoch": 6.336585365853659, - "grad_norm": 2.4984664916992188, - "learning_rate": 1.4843251865958242e-06, - "loss": 0.0189, - "step": 1299 - }, - { - "epoch": 6.341463414634147, - "grad_norm": 3.178300619125366, - "learning_rate": 1.4808256000644128e-06, - "loss": 0.038, - "step": 1300 - }, - { - "epoch": 6.3463414634146345, - "grad_norm": 2.6362273693084717, - "learning_rate": 1.4773284070716504e-06, - "loss": 0.041, - "step": 1301 - }, - { - "epoch": 6.351219512195122, - "grad_norm": 2.1512129306793213, - "learning_rate": 1.473833615830722e-06, - "loss": 0.0227, - "step": 1302 - }, - { - "epoch": 6.35609756097561, - "grad_norm": 2.2898178100585938, - "learning_rate": 1.4703412345491692e-06, - "loss": 0.039, - "step": 1303 - }, - { - "epoch": 6.360975609756098, - "grad_norm": 2.6641080379486084, - "learning_rate": 1.4668512714288763e-06, - "loss": 0.0431, - "step": 1304 - }, - { - "epoch": 6.365853658536586, - "grad_norm": 1.7466667890548706, - "learning_rate": 1.4633637346660478e-06, - "loss": 0.013, - "step": 1305 - }, - { - "epoch": 6.3707317073170735, - "grad_norm": 2.437889575958252, - "learning_rate": 1.4598786324511892e-06, - "loss": 0.0181, - "step": 1306 - }, - { - "epoch": 6.375609756097561, - "grad_norm": 2.5054142475128174, - "learning_rate": 1.456395972969089e-06, - "loss": 0.0248, - "step": 1307 - }, - { - "epoch": 6.380487804878049, - "grad_norm": 3.2294511795043945, - "learning_rate": 1.4529157643987995e-06, - "loss": 0.0561, - "step": 1308 - }, - { - "epoch": 6.385365853658537, - "grad_norm": 2.260188341140747, - "learning_rate": 1.4494380149136162e-06, - "loss": 0.0593, - "step": 1309 - }, - { - "epoch": 6.390243902439025, - "grad_norm": 2.4961163997650146, - "learning_rate": 1.4459627326810576e-06, - "loss": 0.0257, - "step": 1310 - }, - { - "epoch": 6.3951219512195125, - "grad_norm": 3.4153239727020264, - "learning_rate": 1.4424899258628533e-06, - "loss": 0.0223, - "step": 1311 - }, - { - "epoch": 6.4, - "grad_norm": 2.6308839321136475, - "learning_rate": 1.439019602614914e-06, - "loss": 0.0112, - "step": 1312 - }, - { - "epoch": 6.404878048780488, - "grad_norm": 2.754530191421509, - "learning_rate": 1.4355517710873184e-06, - "loss": 0.068, - "step": 1313 - }, - { - "epoch": 6.409756097560976, - "grad_norm": 4.473151683807373, - "learning_rate": 1.432086439424297e-06, - "loss": 0.0825, - "step": 1314 - }, - { - "epoch": 6.414634146341464, - "grad_norm": 4.85701322555542, - "learning_rate": 1.428623615764206e-06, - "loss": 0.1812, - "step": 1315 - }, - { - "epoch": 6.419512195121952, - "grad_norm": 1.6678224802017212, - "learning_rate": 1.4251633082395117e-06, - "loss": 0.0207, - "step": 1316 - }, - { - "epoch": 6.424390243902439, - "grad_norm": 2.9730937480926514, - "learning_rate": 1.4217055249767734e-06, - "loss": 0.0617, - "step": 1317 - }, - { - "epoch": 6.429268292682927, - "grad_norm": 2.503786563873291, - "learning_rate": 1.4182502740966203e-06, - "loss": 0.0137, - "step": 1318 - }, - { - "epoch": 6.434146341463415, - "grad_norm": 3.0798017978668213, - "learning_rate": 1.4147975637137334e-06, - "loss": 0.0329, - "step": 1319 - }, - { - "epoch": 6.439024390243903, - "grad_norm": 3.008155345916748, - "learning_rate": 1.411347401936831e-06, - "loss": 0.0487, - "step": 1320 - }, - { - "epoch": 6.443902439024391, - "grad_norm": 2.5451765060424805, - "learning_rate": 1.4078997968686425e-06, - "loss": 0.0582, - "step": 1321 - }, - { - "epoch": 6.4487804878048784, - "grad_norm": 2.042696475982666, - "learning_rate": 1.404454756605893e-06, - "loss": 0.0336, - "step": 1322 - }, - { - "epoch": 6.453658536585366, - "grad_norm": 3.0421411991119385, - "learning_rate": 1.4010122892392872e-06, - "loss": 0.1372, - "step": 1323 - }, - { - "epoch": 6.458536585365854, - "grad_norm": 2.0793251991271973, - "learning_rate": 1.3975724028534842e-06, - "loss": 0.0452, - "step": 1324 - }, - { - "epoch": 6.463414634146342, - "grad_norm": 2.6149914264678955, - "learning_rate": 1.394135105527083e-06, - "loss": 0.0431, - "step": 1325 - }, - { - "epoch": 6.46829268292683, - "grad_norm": 2.818507671356201, - "learning_rate": 1.3907004053326006e-06, - "loss": 0.0242, - "step": 1326 - }, - { - "epoch": 6.473170731707317, - "grad_norm": 2.328993558883667, - "learning_rate": 1.387268310336458e-06, - "loss": 0.0293, - "step": 1327 - }, - { - "epoch": 6.478048780487805, - "grad_norm": 2.2032642364501953, - "learning_rate": 1.3838388285989552e-06, - "loss": 0.0232, - "step": 1328 - }, - { - "epoch": 6.482926829268292, - "grad_norm": 2.039983034133911, - "learning_rate": 1.380411968174254e-06, - "loss": 0.0256, - "step": 1329 - }, - { - "epoch": 6.487804878048781, - "grad_norm": 3.7261271476745605, - "learning_rate": 1.3769877371103635e-06, - "loss": 0.1285, - "step": 1330 - }, - { - "epoch": 6.492682926829268, - "grad_norm": 3.7156264781951904, - "learning_rate": 1.373566143449115e-06, - "loss": 0.1621, - "step": 1331 - }, - { - "epoch": 6.4975609756097565, - "grad_norm": 1.5905455350875854, - "learning_rate": 1.3701471952261457e-06, - "loss": 0.0126, - "step": 1332 - }, - { - "epoch": 6.5024390243902435, - "grad_norm": 2.8808465003967285, - "learning_rate": 1.3667309004708832e-06, - "loss": 0.0211, - "step": 1333 - }, - { - "epoch": 6.507317073170732, - "grad_norm": 3.9190757274627686, - "learning_rate": 1.3633172672065195e-06, - "loss": 0.062, - "step": 1334 - }, - { - "epoch": 6.512195121951219, - "grad_norm": 1.6948635578155518, - "learning_rate": 1.359906303449997e-06, - "loss": 0.0126, - "step": 1335 - }, - { - "epoch": 6.517073170731708, - "grad_norm": 2.3967642784118652, - "learning_rate": 1.3564980172119913e-06, - "loss": 0.0111, - "step": 1336 - }, - { - "epoch": 6.521951219512195, - "grad_norm": 3.5275399684906006, - "learning_rate": 1.3530924164968873e-06, - "loss": 0.1024, - "step": 1337 - }, - { - "epoch": 6.526829268292683, - "grad_norm": 2.0768814086914062, - "learning_rate": 1.3496895093027617e-06, - "loss": 0.0254, - "step": 1338 - }, - { - "epoch": 6.53170731707317, - "grad_norm": 1.8964029550552368, - "learning_rate": 1.3462893036213706e-06, - "loss": 0.0188, - "step": 1339 - }, - { - "epoch": 6.536585365853659, - "grad_norm": 1.679545283317566, - "learning_rate": 1.3428918074381203e-06, - "loss": 0.0195, - "step": 1340 - }, - { - "epoch": 6.541463414634146, - "grad_norm": 2.204637050628662, - "learning_rate": 1.3394970287320553e-06, - "loss": 0.0317, - "step": 1341 - }, - { - "epoch": 6.546341463414635, - "grad_norm": 2.014052629470825, - "learning_rate": 1.3361049754758404e-06, - "loss": 0.0191, - "step": 1342 - }, - { - "epoch": 6.5512195121951216, - "grad_norm": 1.4630589485168457, - "learning_rate": 1.3327156556357369e-06, - "loss": 0.0079, - "step": 1343 - }, - { - "epoch": 6.55609756097561, - "grad_norm": 2.876132011413574, - "learning_rate": 1.3293290771715875e-06, - "loss": 0.0345, - "step": 1344 - }, - { - "epoch": 6.560975609756097, - "grad_norm": 1.793338656425476, - "learning_rate": 1.3259452480367963e-06, - "loss": 0.0409, - "step": 1345 - }, - { - "epoch": 6.565853658536585, - "grad_norm": 2.2791552543640137, - "learning_rate": 1.3225641761783126e-06, - "loss": 0.0494, - "step": 1346 - }, - { - "epoch": 6.570731707317073, - "grad_norm": 4.255206108093262, - "learning_rate": 1.3191858695366084e-06, - "loss": 0.0842, - "step": 1347 - }, - { - "epoch": 6.575609756097561, - "grad_norm": 2.449460506439209, - "learning_rate": 1.3158103360456603e-06, - "loss": 0.0399, - "step": 1348 - }, - { - "epoch": 6.580487804878048, - "grad_norm": 2.780730724334717, - "learning_rate": 1.3124375836329362e-06, - "loss": 0.0272, - "step": 1349 - }, - { - "epoch": 6.585365853658536, - "grad_norm": 1.925681233406067, - "learning_rate": 1.3090676202193692e-06, - "loss": 0.007, - "step": 1350 - }, - { - "epoch": 6.590243902439024, - "grad_norm": 2.069791555404663, - "learning_rate": 1.3057004537193424e-06, - "loss": 0.016, - "step": 1351 - }, - { - "epoch": 6.595121951219512, - "grad_norm": 1.863872766494751, - "learning_rate": 1.302336092040673e-06, - "loss": 0.016, - "step": 1352 - }, - { - "epoch": 6.6, - "grad_norm": 2.351259231567383, - "learning_rate": 1.298974543084589e-06, - "loss": 0.0172, - "step": 1353 - }, - { - "epoch": 6.6048780487804875, - "grad_norm": 1.848115086555481, - "learning_rate": 1.2956158147457116e-06, - "loss": 0.0412, - "step": 1354 - }, - { - "epoch": 6.609756097560975, - "grad_norm": 1.6395928859710693, - "learning_rate": 1.2922599149120412e-06, - "loss": 0.0181, - "step": 1355 - }, - { - "epoch": 6.614634146341463, - "grad_norm": 2.1267426013946533, - "learning_rate": 1.2889068514649328e-06, - "loss": 0.04, - "step": 1356 - }, - { - "epoch": 6.619512195121951, - "grad_norm": 1.6603496074676514, - "learning_rate": 1.2855566322790796e-06, - "loss": 0.0108, - "step": 1357 - }, - { - "epoch": 6.624390243902439, - "grad_norm": 2.2724838256835938, - "learning_rate": 1.2822092652224989e-06, - "loss": 0.0284, - "step": 1358 - }, - { - "epoch": 6.6292682926829265, - "grad_norm": 2.222623825073242, - "learning_rate": 1.2788647581565048e-06, - "loss": 0.0128, - "step": 1359 - }, - { - "epoch": 6.634146341463414, - "grad_norm": 2.710681676864624, - "learning_rate": 1.275523118935697e-06, - "loss": 0.0184, - "step": 1360 - }, - { - "epoch": 6.639024390243902, - "grad_norm": 2.354264736175537, - "learning_rate": 1.2721843554079418e-06, - "loss": 0.0313, - "step": 1361 - }, - { - "epoch": 6.64390243902439, - "grad_norm": 3.886909008026123, - "learning_rate": 1.2688484754143493e-06, - "loss": 0.1184, - "step": 1362 - }, - { - "epoch": 6.648780487804878, - "grad_norm": 3.088468313217163, - "learning_rate": 1.2655154867892577e-06, - "loss": 0.0353, - "step": 1363 - }, - { - "epoch": 6.6536585365853655, - "grad_norm": 2.987576484680176, - "learning_rate": 1.2621853973602158e-06, - "loss": 0.0349, - "step": 1364 - }, - { - "epoch": 6.658536585365853, - "grad_norm": 1.719212293624878, - "learning_rate": 1.2588582149479645e-06, - "loss": 0.0081, - "step": 1365 - }, - { - "epoch": 6.663414634146341, - "grad_norm": 2.1641178131103516, - "learning_rate": 1.2555339473664151e-06, - "loss": 0.0279, - "step": 1366 - }, - { - "epoch": 6.668292682926829, - "grad_norm": 2.9424984455108643, - "learning_rate": 1.2522126024226347e-06, - "loss": 0.0492, - "step": 1367 - }, - { - "epoch": 6.673170731707317, - "grad_norm": 1.961077332496643, - "learning_rate": 1.2488941879168278e-06, - "loss": 0.0084, - "step": 1368 - }, - { - "epoch": 6.678048780487805, - "grad_norm": 2.302565097808838, - "learning_rate": 1.2455787116423148e-06, - "loss": 0.0486, - "step": 1369 - }, - { - "epoch": 6.682926829268292, - "grad_norm": 2.187194347381592, - "learning_rate": 1.2422661813855158e-06, - "loss": 0.0319, - "step": 1370 - }, - { - "epoch": 6.68780487804878, - "grad_norm": 2.0076377391815186, - "learning_rate": 1.238956604925934e-06, - "loss": 0.016, - "step": 1371 - }, - { - "epoch": 6.692682926829268, - "grad_norm": 4.137681484222412, - "learning_rate": 1.2356499900361333e-06, - "loss": 0.0557, - "step": 1372 - }, - { - "epoch": 6.697560975609756, - "grad_norm": 2.0039637088775635, - "learning_rate": 1.2323463444817227e-06, - "loss": 0.0219, - "step": 1373 - }, - { - "epoch": 6.702439024390244, - "grad_norm": 2.943314552307129, - "learning_rate": 1.2290456760213405e-06, - "loss": 0.0849, - "step": 1374 - }, - { - "epoch": 6.7073170731707314, - "grad_norm": 2.715120553970337, - "learning_rate": 1.2257479924066296e-06, - "loss": 0.0857, - "step": 1375 - }, - { - "epoch": 6.712195121951219, - "grad_norm": 3.144104480743408, - "learning_rate": 1.2224533013822237e-06, - "loss": 0.0648, - "step": 1376 - }, - { - "epoch": 6.717073170731707, - "grad_norm": 2.830066680908203, - "learning_rate": 1.2191616106857312e-06, - "loss": 0.0426, - "step": 1377 - }, - { - "epoch": 6.721951219512195, - "grad_norm": 3.1005899906158447, - "learning_rate": 1.2158729280477112e-06, - "loss": 0.0478, - "step": 1378 - }, - { - "epoch": 6.726829268292683, - "grad_norm": 2.2102460861206055, - "learning_rate": 1.2125872611916578e-06, - "loss": 0.0273, - "step": 1379 - }, - { - "epoch": 6.7317073170731705, - "grad_norm": 2.860288619995117, - "learning_rate": 1.2093046178339869e-06, - "loss": 0.0201, - "step": 1380 - }, - { - "epoch": 6.736585365853658, - "grad_norm": 1.5914067029953003, - "learning_rate": 1.206025005684009e-06, - "loss": 0.0148, - "step": 1381 - }, - { - "epoch": 6.741463414634146, - "grad_norm": 1.8609223365783691, - "learning_rate": 1.202748432443918e-06, - "loss": 0.0073, - "step": 1382 - }, - { - "epoch": 6.746341463414634, - "grad_norm": 3.0532407760620117, - "learning_rate": 1.1994749058087695e-06, - "loss": 0.0344, - "step": 1383 - }, - { - "epoch": 6.751219512195122, - "grad_norm": 4.0601677894592285, - "learning_rate": 1.196204433466467e-06, - "loss": 0.0837, - "step": 1384 - }, - { - "epoch": 6.7560975609756095, - "grad_norm": 2.6982672214508057, - "learning_rate": 1.192937023097738e-06, - "loss": 0.0425, - "step": 1385 - }, - { - "epoch": 6.760975609756097, - "grad_norm": 1.431360125541687, - "learning_rate": 1.1896726823761195e-06, - "loss": 0.0065, - "step": 1386 - }, - { - "epoch": 6.765853658536585, - "grad_norm": 2.116907835006714, - "learning_rate": 1.1864114189679413e-06, - "loss": 0.0133, - "step": 1387 - }, - { - "epoch": 6.770731707317073, - "grad_norm": 2.6869874000549316, - "learning_rate": 1.183153240532304e-06, - "loss": 0.0188, - "step": 1388 - }, - { - "epoch": 6.775609756097561, - "grad_norm": 2.0294089317321777, - "learning_rate": 1.179898154721063e-06, - "loss": 0.0234, - "step": 1389 - }, - { - "epoch": 6.780487804878049, - "grad_norm": 2.3081958293914795, - "learning_rate": 1.1766461691788137e-06, - "loss": 0.0208, - "step": 1390 - }, - { - "epoch": 6.785365853658536, - "grad_norm": 3.4795000553131104, - "learning_rate": 1.1733972915428665e-06, - "loss": 0.0728, - "step": 1391 - }, - { - "epoch": 6.790243902439024, - "grad_norm": 2.5121219158172607, - "learning_rate": 1.1701515294432348e-06, - "loss": 0.0291, - "step": 1392 - }, - { - "epoch": 6.795121951219512, - "grad_norm": 5.1100172996521, - "learning_rate": 1.1669088905026156e-06, - "loss": 0.0988, - "step": 1393 - }, - { - "epoch": 6.8, - "grad_norm": 2.5434396266937256, - "learning_rate": 1.163669382336371e-06, - "loss": 0.0399, - "step": 1394 - }, - { - "epoch": 6.804878048780488, - "grad_norm": 2.7811660766601562, - "learning_rate": 1.160433012552508e-06, - "loss": 0.0134, - "step": 1395 - }, - { - "epoch": 6.809756097560975, - "grad_norm": 3.2409870624542236, - "learning_rate": 1.1571997887516672e-06, - "loss": 0.0795, - "step": 1396 - }, - { - "epoch": 6.814634146341463, - "grad_norm": 2.5300986766815186, - "learning_rate": 1.1539697185270982e-06, - "loss": 0.0329, - "step": 1397 - }, - { - "epoch": 6.819512195121951, - "grad_norm": 1.8510549068450928, - "learning_rate": 1.1507428094646448e-06, - "loss": 0.0213, - "step": 1398 - }, - { - "epoch": 6.824390243902439, - "grad_norm": 1.8820618391036987, - "learning_rate": 1.1475190691427255e-06, - "loss": 0.0172, - "step": 1399 - }, - { - "epoch": 6.829268292682927, - "grad_norm": 1.3415460586547852, - "learning_rate": 1.1442985051323205e-06, - "loss": 0.0029, - "step": 1400 - }, - { - "epoch": 6.8341463414634145, - "grad_norm": 6.033786296844482, - "learning_rate": 1.1410811249969475e-06, - "loss": 0.1638, - "step": 1401 - }, - { - "epoch": 6.839024390243902, - "grad_norm": 2.990328311920166, - "learning_rate": 1.1378669362926468e-06, - "loss": 0.0779, - "step": 1402 - }, - { - "epoch": 6.84390243902439, - "grad_norm": 3.2766308784484863, - "learning_rate": 1.1346559465679656e-06, - "loss": 0.0528, - "step": 1403 - }, - { - "epoch": 6.848780487804878, - "grad_norm": 1.266032338142395, - "learning_rate": 1.1314481633639374e-06, - "loss": 0.0057, - "step": 1404 - }, - { - "epoch": 6.853658536585366, - "grad_norm": 3.1048431396484375, - "learning_rate": 1.1282435942140632e-06, - "loss": 0.1772, - "step": 1405 - }, - { - "epoch": 6.8585365853658535, - "grad_norm": 2.264822483062744, - "learning_rate": 1.1250422466442992e-06, - "loss": 0.0176, - "step": 1406 - }, - { - "epoch": 6.863414634146341, - "grad_norm": 2.0890846252441406, - "learning_rate": 1.1218441281730334e-06, - "loss": 0.0184, - "step": 1407 - }, - { - "epoch": 6.868292682926829, - "grad_norm": 1.8351202011108398, - "learning_rate": 1.1186492463110696e-06, - "loss": 0.0127, - "step": 1408 - }, - { - "epoch": 6.873170731707317, - "grad_norm": 1.447196125984192, - "learning_rate": 1.1154576085616135e-06, - "loss": 0.0094, - "step": 1409 - }, - { - "epoch": 6.878048780487805, - "grad_norm": 1.6414039134979248, - "learning_rate": 1.1122692224202491e-06, - "loss": 0.0138, - "step": 1410 - }, - { - "epoch": 6.882926829268293, - "grad_norm": 2.87068772315979, - "learning_rate": 1.1090840953749253e-06, - "loss": 0.0821, - "step": 1411 - }, - { - "epoch": 6.88780487804878, - "grad_norm": 2.0476415157318115, - "learning_rate": 1.1059022349059362e-06, - "loss": 0.0222, - "step": 1412 - }, - { - "epoch": 6.892682926829268, - "grad_norm": 4.169386863708496, - "learning_rate": 1.102723648485905e-06, - "loss": 0.1183, - "step": 1413 - }, - { - "epoch": 6.897560975609756, - "grad_norm": 4.47883415222168, - "learning_rate": 1.0995483435797643e-06, - "loss": 0.0528, - "step": 1414 - }, - { - "epoch": 6.902439024390244, - "grad_norm": 2.0025508403778076, - "learning_rate": 1.0963763276447435e-06, - "loss": 0.0106, - "step": 1415 - }, - { - "epoch": 6.907317073170732, - "grad_norm": 2.4212136268615723, - "learning_rate": 1.0932076081303442e-06, - "loss": 0.0454, - "step": 1416 - }, - { - "epoch": 6.912195121951219, - "grad_norm": 1.7873961925506592, - "learning_rate": 1.0900421924783272e-06, - "loss": 0.022, - "step": 1417 - }, - { - "epoch": 6.917073170731707, - "grad_norm": 2.0345218181610107, - "learning_rate": 1.0868800881226962e-06, - "loss": 0.0261, - "step": 1418 - }, - { - "epoch": 6.921951219512195, - "grad_norm": 3.086538314819336, - "learning_rate": 1.0837213024896764e-06, - "loss": 0.0257, - "step": 1419 - }, - { - "epoch": 6.926829268292683, - "grad_norm": 2.9401397705078125, - "learning_rate": 1.080565842997698e-06, - "loss": 0.087, - "step": 1420 - }, - { - "epoch": 6.931707317073171, - "grad_norm": 1.305415153503418, - "learning_rate": 1.0774137170573826e-06, - "loss": 0.0147, - "step": 1421 - }, - { - "epoch": 6.9365853658536585, - "grad_norm": 3.0256683826446533, - "learning_rate": 1.074264932071521e-06, - "loss": 0.1183, - "step": 1422 - }, - { - "epoch": 6.941463414634146, - "grad_norm": 2.3618743419647217, - "learning_rate": 1.0711194954350568e-06, - "loss": 0.0186, - "step": 1423 - }, - { - "epoch": 6.946341463414634, - "grad_norm": 2.004451036453247, - "learning_rate": 1.0679774145350735e-06, - "loss": 0.0222, - "step": 1424 - }, - { - "epoch": 6.951219512195122, - "grad_norm": 3.089723587036133, - "learning_rate": 1.0648386967507703e-06, - "loss": 0.0824, - "step": 1425 - }, - { - "epoch": 6.95609756097561, - "grad_norm": 1.9310235977172852, - "learning_rate": 1.0617033494534486e-06, - "loss": 0.0247, - "step": 1426 - }, - { - "epoch": 6.9609756097560975, - "grad_norm": 1.973836898803711, - "learning_rate": 1.0585713800064964e-06, - "loss": 0.0142, - "step": 1427 - }, - { - "epoch": 6.965853658536585, - "grad_norm": 2.9914112091064453, - "learning_rate": 1.0554427957653663e-06, - "loss": 0.0681, - "step": 1428 - }, - { - "epoch": 6.970731707317073, - "grad_norm": 3.356689691543579, - "learning_rate": 1.0523176040775615e-06, - "loss": 0.0916, - "step": 1429 - }, - { - "epoch": 6.975609756097561, - "grad_norm": 2.3305246829986572, - "learning_rate": 1.0491958122826173e-06, - "loss": 0.0611, - "step": 1430 - }, - { - "epoch": 6.980487804878049, - "grad_norm": 1.7383835315704346, - "learning_rate": 1.0460774277120866e-06, - "loss": 0.0182, - "step": 1431 - }, - { - "epoch": 6.985365853658537, - "grad_norm": 2.585674524307251, - "learning_rate": 1.0429624576895177e-06, - "loss": 0.0084, - "step": 1432 - }, - { - "epoch": 6.990243902439024, - "grad_norm": 3.023864269256592, - "learning_rate": 1.03985090953044e-06, - "loss": 0.0411, - "step": 1433 - }, - { - "epoch": 6.995121951219512, - "grad_norm": 2.281674861907959, - "learning_rate": 1.0367427905423497e-06, - "loss": 0.0464, - "step": 1434 - }, - { - "epoch": 7.0, - "grad_norm": 1.4372339248657227, - "learning_rate": 1.0336381080246858e-06, - "loss": 0.0124, - "step": 1435 - }, - { - "epoch": 7.004878048780488, - "grad_norm": 1.9526969194412231, - "learning_rate": 1.0305368692688175e-06, - "loss": 0.0179, - "step": 1436 - }, - { - "epoch": 7.009756097560976, - "grad_norm": 1.7297903299331665, - "learning_rate": 1.027439081558029e-06, - "loss": 0.0119, - "step": 1437 - }, - { - "epoch": 7.014634146341463, - "grad_norm": 2.2754275798797607, - "learning_rate": 1.0243447521674967e-06, - "loss": 0.0278, - "step": 1438 - }, - { - "epoch": 7.019512195121951, - "grad_norm": 5.485769271850586, - "learning_rate": 1.021253888364276e-06, - "loss": 0.1259, - "step": 1439 - }, - { - "epoch": 7.024390243902439, - "grad_norm": 0.9085121750831604, - "learning_rate": 1.018166497407284e-06, - "loss": 0.0047, - "step": 1440 - }, - { - "epoch": 7.029268292682927, - "grad_norm": 1.0291047096252441, - "learning_rate": 1.0150825865472813e-06, - "loss": 0.0044, - "step": 1441 - }, - { - "epoch": 7.034146341463415, - "grad_norm": 0.8040009140968323, - "learning_rate": 1.0120021630268542e-06, - "loss": 0.0044, - "step": 1442 - }, - { - "epoch": 7.0390243902439025, - "grad_norm": 1.3701342344284058, - "learning_rate": 1.0089252340804025e-06, - "loss": 0.0081, - "step": 1443 - }, - { - "epoch": 7.04390243902439, - "grad_norm": 2.89591646194458, - "learning_rate": 1.0058518069341152e-06, - "loss": 0.0318, - "step": 1444 - }, - { - "epoch": 7.048780487804878, - "grad_norm": 1.3153692483901978, - "learning_rate": 1.002781888805958e-06, - "loss": 0.0067, - "step": 1445 - }, - { - "epoch": 7.053658536585366, - "grad_norm": 1.4490022659301758, - "learning_rate": 9.997154869056588e-07, - "loss": 0.0064, - "step": 1446 - }, - { - "epoch": 7.058536585365854, - "grad_norm": 1.7938638925552368, - "learning_rate": 9.966526084346837e-07, - "loss": 0.0057, - "step": 1447 - }, - { - "epoch": 7.0634146341463415, - "grad_norm": 3.7182836532592773, - "learning_rate": 9.935932605862258e-07, - "loss": 0.0365, - "step": 1448 - }, - { - "epoch": 7.068292682926829, - "grad_norm": 1.7843579053878784, - "learning_rate": 9.905374505451853e-07, - "loss": 0.0345, - "step": 1449 - }, - { - "epoch": 7.073170731707317, - "grad_norm": 2.9557483196258545, - "learning_rate": 9.874851854881565e-07, - "loss": 0.0384, - "step": 1450 - }, - { - "epoch": 7.078048780487805, - "grad_norm": 1.6237356662750244, - "learning_rate": 9.844364725834058e-07, - "loss": 0.0116, - "step": 1451 - }, - { - "epoch": 7.082926829268293, - "grad_norm": 3.7120912075042725, - "learning_rate": 9.813913189908571e-07, - "loss": 0.0267, - "step": 1452 - }, - { - "epoch": 7.087804878048781, - "grad_norm": 1.9991087913513184, - "learning_rate": 9.783497318620783e-07, - "loss": 0.0376, - "step": 1453 - }, - { - "epoch": 7.092682926829268, - "grad_norm": 1.5474026203155518, - "learning_rate": 9.75311718340258e-07, - "loss": 0.0057, - "step": 1454 - }, - { - "epoch": 7.097560975609756, - "grad_norm": 2.060807943344116, - "learning_rate": 9.722772855601927e-07, - "loss": 0.0386, - "step": 1455 - }, - { - "epoch": 7.102439024390244, - "grad_norm": 1.1991411447525024, - "learning_rate": 9.692464406482727e-07, - "loss": 0.006, - "step": 1456 - }, - { - "epoch": 7.107317073170732, - "grad_norm": 1.8907703161239624, - "learning_rate": 9.662191907224582e-07, - "loss": 0.0066, - "step": 1457 - }, - { - "epoch": 7.11219512195122, - "grad_norm": 2.0351309776306152, - "learning_rate": 9.63195542892268e-07, - "loss": 0.0201, - "step": 1458 - }, - { - "epoch": 7.117073170731707, - "grad_norm": 1.3973944187164307, - "learning_rate": 9.601755042587624e-07, - "loss": 0.0112, - "step": 1459 - }, - { - "epoch": 7.121951219512195, - "grad_norm": 1.3639394044876099, - "learning_rate": 9.571590819145244e-07, - "loss": 0.0066, - "step": 1460 - }, - { - "epoch": 7.126829268292683, - "grad_norm": 1.7362885475158691, - "learning_rate": 9.541462829436426e-07, - "loss": 0.0136, - "step": 1461 - }, - { - "epoch": 7.131707317073171, - "grad_norm": 2.9414384365081787, - "learning_rate": 9.511371144217005e-07, - "loss": 0.0228, - "step": 1462 - }, - { - "epoch": 7.136585365853659, - "grad_norm": 2.944575548171997, - "learning_rate": 9.481315834157512e-07, - "loss": 0.027, - "step": 1463 - }, - { - "epoch": 7.1414634146341465, - "grad_norm": 2.4692747592926025, - "learning_rate": 9.451296969843058e-07, - "loss": 0.0152, - "step": 1464 - }, - { - "epoch": 7.146341463414634, - "grad_norm": 1.804129719734192, - "learning_rate": 9.42131462177319e-07, - "loss": 0.0071, - "step": 1465 - }, - { - "epoch": 7.151219512195122, - "grad_norm": 1.8012168407440186, - "learning_rate": 9.39136886036166e-07, - "loss": 0.0054, - "step": 1466 - }, - { - "epoch": 7.15609756097561, - "grad_norm": 1.9471648931503296, - "learning_rate": 9.361459755936316e-07, - "loss": 0.0067, - "step": 1467 - }, - { - "epoch": 7.160975609756098, - "grad_norm": 1.8837870359420776, - "learning_rate": 9.331587378738902e-07, - "loss": 0.0105, - "step": 1468 - }, - { - "epoch": 7.1658536585365855, - "grad_norm": 2.358891487121582, - "learning_rate": 9.301751798924935e-07, - "loss": 0.0331, - "step": 1469 - }, - { - "epoch": 7.170731707317073, - "grad_norm": 1.1501671075820923, - "learning_rate": 9.27195308656349e-07, - "loss": 0.0076, - "step": 1470 - }, - { - "epoch": 7.175609756097561, - "grad_norm": 2.3329083919525146, - "learning_rate": 9.24219131163705e-07, - "loss": 0.0243, - "step": 1471 - }, - { - "epoch": 7.180487804878049, - "grad_norm": 1.6030691862106323, - "learning_rate": 9.212466544041385e-07, - "loss": 0.0051, - "step": 1472 - }, - { - "epoch": 7.185365853658537, - "grad_norm": 2.005582094192505, - "learning_rate": 9.182778853585325e-07, - "loss": 0.0146, - "step": 1473 - }, - { - "epoch": 7.190243902439025, - "grad_norm": 1.86012601852417, - "learning_rate": 9.153128309990622e-07, - "loss": 0.0273, - "step": 1474 - }, - { - "epoch": 7.195121951219512, - "grad_norm": 2.218923568725586, - "learning_rate": 9.123514982891813e-07, - "loss": 0.0225, - "step": 1475 - }, - { - "epoch": 7.2, - "grad_norm": 1.9950376749038696, - "learning_rate": 9.093938941836012e-07, - "loss": 0.0156, - "step": 1476 - }, - { - "epoch": 7.204878048780488, - "grad_norm": 1.6428661346435547, - "learning_rate": 9.064400256282757e-07, - "loss": 0.0158, - "step": 1477 - }, - { - "epoch": 7.209756097560976, - "grad_norm": 1.7983390092849731, - "learning_rate": 9.034898995603894e-07, - "loss": 0.0138, - "step": 1478 - }, - { - "epoch": 7.214634146341464, - "grad_norm": 2.2069218158721924, - "learning_rate": 9.00543522908334e-07, - "loss": 0.0308, - "step": 1479 - }, - { - "epoch": 7.219512195121951, - "grad_norm": 1.4668920040130615, - "learning_rate": 8.976009025916962e-07, - "loss": 0.006, - "step": 1480 - }, - { - "epoch": 7.224390243902439, - "grad_norm": 1.8956354856491089, - "learning_rate": 8.946620455212438e-07, - "loss": 0.0121, - "step": 1481 - }, - { - "epoch": 7.229268292682927, - "grad_norm": 2.5479676723480225, - "learning_rate": 8.917269585989027e-07, - "loss": 0.0424, - "step": 1482 - }, - { - "epoch": 7.234146341463415, - "grad_norm": 1.7482987642288208, - "learning_rate": 8.887956487177462e-07, - "loss": 0.0189, - "step": 1483 - }, - { - "epoch": 7.239024390243903, - "grad_norm": 1.5023657083511353, - "learning_rate": 8.858681227619789e-07, - "loss": 0.0118, - "step": 1484 - }, - { - "epoch": 7.2439024390243905, - "grad_norm": 1.2069121599197388, - "learning_rate": 8.829443876069163e-07, - "loss": 0.0043, - "step": 1485 - }, - { - "epoch": 7.248780487804878, - "grad_norm": 1.5843572616577148, - "learning_rate": 8.800244501189722e-07, - "loss": 0.0111, - "step": 1486 - }, - { - "epoch": 7.253658536585366, - "grad_norm": 2.541588544845581, - "learning_rate": 8.771083171556407e-07, - "loss": 0.0582, - "step": 1487 - }, - { - "epoch": 7.258536585365854, - "grad_norm": 0.9306992292404175, - "learning_rate": 8.741959955654833e-07, - "loss": 0.0051, - "step": 1488 - }, - { - "epoch": 7.263414634146342, - "grad_norm": 1.4105901718139648, - "learning_rate": 8.712874921881082e-07, - "loss": 0.0175, - "step": 1489 - }, - { - "epoch": 7.2682926829268295, - "grad_norm": 2.8943028450012207, - "learning_rate": 8.683828138541559e-07, - "loss": 0.0827, - "step": 1490 - }, - { - "epoch": 7.273170731707317, - "grad_norm": 2.512991428375244, - "learning_rate": 8.654819673852874e-07, - "loss": 0.0347, - "step": 1491 - }, - { - "epoch": 7.278048780487805, - "grad_norm": 1.6571681499481201, - "learning_rate": 8.625849595941608e-07, - "loss": 0.0055, - "step": 1492 - }, - { - "epoch": 7.282926829268293, - "grad_norm": 1.3162294626235962, - "learning_rate": 8.596917972844199e-07, - "loss": 0.0043, - "step": 1493 - }, - { - "epoch": 7.287804878048781, - "grad_norm": 1.761405110359192, - "learning_rate": 8.568024872506792e-07, - "loss": 0.0176, - "step": 1494 - }, - { - "epoch": 7.2926829268292686, - "grad_norm": 0.7546011805534363, - "learning_rate": 8.539170362785043e-07, - "loss": 0.0025, - "step": 1495 - }, - { - "epoch": 7.297560975609756, - "grad_norm": 1.6910885572433472, - "learning_rate": 8.510354511443975e-07, - "loss": 0.0093, - "step": 1496 - }, - { - "epoch": 7.302439024390244, - "grad_norm": 1.6627765893936157, - "learning_rate": 8.48157738615784e-07, - "loss": 0.0066, - "step": 1497 - }, - { - "epoch": 7.307317073170732, - "grad_norm": 0.8881242871284485, - "learning_rate": 8.452839054509926e-07, - "loss": 0.0055, - "step": 1498 - }, - { - "epoch": 7.31219512195122, - "grad_norm": 1.0791494846343994, - "learning_rate": 8.42413958399241e-07, - "loss": 0.0059, - "step": 1499 - }, - { - "epoch": 7.317073170731708, - "grad_norm": 1.5198945999145508, - "learning_rate": 8.39547904200623e-07, - "loss": 0.0049, - "step": 1500 - }, - { - "epoch": 7.321951219512195, - "grad_norm": 1.7168906927108765, - "learning_rate": 8.366857495860869e-07, - "loss": 0.0204, - "step": 1501 - }, - { - "epoch": 7.326829268292683, - "grad_norm": 1.70030677318573, - "learning_rate": 8.338275012774247e-07, - "loss": 0.0161, - "step": 1502 - }, - { - "epoch": 7.331707317073171, - "grad_norm": 2.1044130325317383, - "learning_rate": 8.309731659872522e-07, - "loss": 0.0088, - "step": 1503 - }, - { - "epoch": 7.336585365853659, - "grad_norm": 1.5040123462677002, - "learning_rate": 8.281227504189992e-07, - "loss": 0.0204, - "step": 1504 - }, - { - "epoch": 7.341463414634147, - "grad_norm": 1.6814212799072266, - "learning_rate": 8.252762612668869e-07, - "loss": 0.0238, - "step": 1505 - }, - { - "epoch": 7.3463414634146345, - "grad_norm": 2.2541606426239014, - "learning_rate": 8.224337052159154e-07, - "loss": 0.0063, - "step": 1506 - }, - { - "epoch": 7.351219512195122, - "grad_norm": 2.3999500274658203, - "learning_rate": 8.195950889418503e-07, - "loss": 0.0123, - "step": 1507 - }, - { - "epoch": 7.35609756097561, - "grad_norm": 2.8464221954345703, - "learning_rate": 8.167604191112021e-07, - "loss": 0.0296, - "step": 1508 - }, - { - "epoch": 7.360975609756098, - "grad_norm": 2.178104877471924, - "learning_rate": 8.139297023812131e-07, - "loss": 0.0148, - "step": 1509 - }, - { - "epoch": 7.365853658536586, - "grad_norm": 1.6489804983139038, - "learning_rate": 8.111029453998448e-07, - "loss": 0.0057, - "step": 1510 - }, - { - "epoch": 7.3707317073170735, - "grad_norm": 1.9705169200897217, - "learning_rate": 8.082801548057553e-07, - "loss": 0.0098, - "step": 1511 - }, - { - "epoch": 7.375609756097561, - "grad_norm": 1.2231075763702393, - "learning_rate": 8.05461337228289e-07, - "loss": 0.007, - "step": 1512 - }, - { - "epoch": 7.380487804878049, - "grad_norm": 1.5212552547454834, - "learning_rate": 8.026464992874617e-07, - "loss": 0.0058, - "step": 1513 - }, - { - "epoch": 7.385365853658537, - "grad_norm": 0.5752282738685608, - "learning_rate": 7.998356475939398e-07, - "loss": 0.0011, - "step": 1514 - }, - { - "epoch": 7.390243902439025, - "grad_norm": 1.3227447271347046, - "learning_rate": 7.970287887490289e-07, - "loss": 0.0041, - "step": 1515 - }, - { - "epoch": 7.3951219512195125, - "grad_norm": 1.2051570415496826, - "learning_rate": 7.942259293446594e-07, - "loss": 0.0027, - "step": 1516 - }, - { - "epoch": 7.4, - "grad_norm": 1.4740777015686035, - "learning_rate": 7.914270759633669e-07, - "loss": 0.006, - "step": 1517 - }, - { - "epoch": 7.404878048780488, - "grad_norm": 1.8853001594543457, - "learning_rate": 7.886322351782782e-07, - "loss": 0.0066, - "step": 1518 - }, - { - "epoch": 7.409756097560976, - "grad_norm": 1.907251238822937, - "learning_rate": 7.858414135530995e-07, - "loss": 0.0133, - "step": 1519 - }, - { - "epoch": 7.414634146341464, - "grad_norm": 1.3397895097732544, - "learning_rate": 7.83054617642095e-07, - "loss": 0.0092, - "step": 1520 - }, - { - "epoch": 7.419512195121952, - "grad_norm": 2.878927707672119, - "learning_rate": 7.802718539900761e-07, - "loss": 0.0113, - "step": 1521 - }, - { - "epoch": 7.424390243902439, - "grad_norm": 1.0312106609344482, - "learning_rate": 7.774931291323826e-07, - "loss": 0.0045, - "step": 1522 - }, - { - "epoch": 7.429268292682927, - "grad_norm": 2.2703888416290283, - "learning_rate": 7.747184495948723e-07, - "loss": 0.0692, - "step": 1523 - }, - { - "epoch": 7.434146341463415, - "grad_norm": 3.0323078632354736, - "learning_rate": 7.719478218939e-07, - "loss": 0.0462, - "step": 1524 - }, - { - "epoch": 7.439024390243903, - "grad_norm": 1.4211952686309814, - "learning_rate": 7.691812525363044e-07, - "loss": 0.008, - "step": 1525 - }, - { - "epoch": 7.443902439024391, - "grad_norm": 0.9588236808776855, - "learning_rate": 7.66418748019396e-07, - "loss": 0.0042, - "step": 1526 - }, - { - "epoch": 7.4487804878048784, - "grad_norm": 2.837219476699829, - "learning_rate": 7.636603148309363e-07, - "loss": 0.0033, - "step": 1527 - }, - { - "epoch": 7.453658536585366, - "grad_norm": 1.8552638292312622, - "learning_rate": 7.609059594491253e-07, - "loss": 0.0181, - "step": 1528 - }, - { - "epoch": 7.458536585365854, - "grad_norm": 4.836069583892822, - "learning_rate": 7.581556883425886e-07, - "loss": 0.1868, - "step": 1529 - }, - { - "epoch": 7.463414634146342, - "grad_norm": 2.180760622024536, - "learning_rate": 7.55409507970358e-07, - "loss": 0.0305, - "step": 1530 - }, - { - "epoch": 7.46829268292683, - "grad_norm": 1.0799378156661987, - "learning_rate": 7.526674247818569e-07, - "loss": 0.0027, - "step": 1531 - }, - { - "epoch": 7.473170731707317, - "grad_norm": 2.1196658611297607, - "learning_rate": 7.499294452168904e-07, - "loss": 0.019, - "step": 1532 - }, - { - "epoch": 7.478048780487805, - "grad_norm": 1.6932553052902222, - "learning_rate": 7.471955757056227e-07, - "loss": 0.0101, - "step": 1533 - }, - { - "epoch": 7.482926829268292, - "grad_norm": 1.3473751544952393, - "learning_rate": 7.444658226685656e-07, - "loss": 0.0066, - "step": 1534 - }, - { - "epoch": 7.487804878048781, - "grad_norm": 2.3404016494750977, - "learning_rate": 7.417401925165666e-07, - "loss": 0.0139, - "step": 1535 - }, - { - "epoch": 7.492682926829268, - "grad_norm": 1.2845433950424194, - "learning_rate": 7.390186916507869e-07, - "loss": 0.0053, - "step": 1536 - }, - { - "epoch": 7.4975609756097565, - "grad_norm": 1.0809649229049683, - "learning_rate": 7.363013264626914e-07, - "loss": 0.0031, - "step": 1537 - }, - { - "epoch": 7.5024390243902435, - "grad_norm": 2.2649292945861816, - "learning_rate": 7.335881033340334e-07, - "loss": 0.0257, - "step": 1538 - }, - { - "epoch": 7.507317073170732, - "grad_norm": 1.3488918542861938, - "learning_rate": 7.308790286368373e-07, - "loss": 0.0092, - "step": 1539 - }, - { - "epoch": 7.512195121951219, - "grad_norm": 2.239190101623535, - "learning_rate": 7.281741087333846e-07, - "loss": 0.024, - "step": 1540 - }, - { - "epoch": 7.517073170731708, - "grad_norm": 1.9454522132873535, - "learning_rate": 7.254733499761993e-07, - "loss": 0.0177, - "step": 1541 - }, - { - "epoch": 7.521951219512195, - "grad_norm": 1.9299415349960327, - "learning_rate": 7.22776758708035e-07, - "loss": 0.0439, - "step": 1542 - }, - { - "epoch": 7.526829268292683, - "grad_norm": 2.2676074504852295, - "learning_rate": 7.200843412618555e-07, - "loss": 0.0387, - "step": 1543 - }, - { - "epoch": 7.53170731707317, - "grad_norm": 1.2385426759719849, - "learning_rate": 7.173961039608227e-07, - "loss": 0.0082, - "step": 1544 - }, - { - "epoch": 7.536585365853659, - "grad_norm": 1.8637615442276, - "learning_rate": 7.147120531182828e-07, - "loss": 0.0194, - "step": 1545 - }, - { - "epoch": 7.541463414634146, - "grad_norm": 1.6695958375930786, - "learning_rate": 7.120321950377487e-07, - "loss": 0.006, - "step": 1546 - }, - { - "epoch": 7.546341463414635, - "grad_norm": 1.916746735572815, - "learning_rate": 7.093565360128863e-07, - "loss": 0.0104, - "step": 1547 - }, - { - "epoch": 7.5512195121951216, - "grad_norm": 1.6002378463745117, - "learning_rate": 7.066850823275024e-07, - "loss": 0.0173, - "step": 1548 - }, - { - "epoch": 7.55609756097561, - "grad_norm": 1.5249438285827637, - "learning_rate": 7.040178402555245e-07, - "loss": 0.0088, - "step": 1549 - }, - { - "epoch": 7.560975609756097, - "grad_norm": 2.1726534366607666, - "learning_rate": 7.013548160609901e-07, - "loss": 0.0098, - "step": 1550 - }, - { - "epoch": 7.565853658536585, - "grad_norm": 1.901904582977295, - "learning_rate": 6.986960159980327e-07, - "loss": 0.0196, - "step": 1551 - }, - { - "epoch": 7.570731707317073, - "grad_norm": 2.577242136001587, - "learning_rate": 6.960414463108631e-07, - "loss": 0.021, - "step": 1552 - }, - { - "epoch": 7.575609756097561, - "grad_norm": 1.4463082551956177, - "learning_rate": 6.933911132337575e-07, - "loss": 0.0076, - "step": 1553 - }, - { - "epoch": 7.580487804878048, - "grad_norm": 2.5811946392059326, - "learning_rate": 6.907450229910443e-07, - "loss": 0.0204, - "step": 1554 - }, - { - "epoch": 7.585365853658536, - "grad_norm": 1.0530297756195068, - "learning_rate": 6.881031817970848e-07, - "loss": 0.0046, - "step": 1555 - }, - { - "epoch": 7.590243902439024, - "grad_norm": 2.995915651321411, - "learning_rate": 6.854655958562625e-07, - "loss": 0.0566, - "step": 1556 - }, - { - "epoch": 7.595121951219512, - "grad_norm": 1.253089189529419, - "learning_rate": 6.82832271362969e-07, - "loss": 0.0048, - "step": 1557 - }, - { - "epoch": 7.6, - "grad_norm": 2.830667495727539, - "learning_rate": 6.802032145015855e-07, - "loss": 0.0351, - "step": 1558 - }, - { - "epoch": 7.6048780487804875, - "grad_norm": 2.8280539512634277, - "learning_rate": 6.775784314464717e-07, - "loss": 0.0171, - "step": 1559 - }, - { - "epoch": 7.609756097560975, - "grad_norm": 1.7876580953598022, - "learning_rate": 6.749579283619492e-07, - "loss": 0.01, - "step": 1560 - }, - { - "epoch": 7.614634146341463, - "grad_norm": 1.540212869644165, - "learning_rate": 6.723417114022907e-07, - "loss": 0.0162, - "step": 1561 - }, - { - "epoch": 7.619512195121951, - "grad_norm": 2.5126969814300537, - "learning_rate": 6.697297867117e-07, - "loss": 0.0237, - "step": 1562 - }, - { - "epoch": 7.624390243902439, - "grad_norm": 1.5419458150863647, - "learning_rate": 6.671221604243014e-07, - "loss": 0.0116, - "step": 1563 - }, - { - "epoch": 7.6292682926829265, - "grad_norm": 3.469961404800415, - "learning_rate": 6.645188386641257e-07, - "loss": 0.0506, - "step": 1564 - }, - { - "epoch": 7.634146341463414, - "grad_norm": 0.8771130442619324, - "learning_rate": 6.61919827545093e-07, - "loss": 0.002, - "step": 1565 - }, - { - "epoch": 7.639024390243902, - "grad_norm": 3.036559820175171, - "learning_rate": 6.593251331709993e-07, - "loss": 0.0673, - "step": 1566 - }, - { - "epoch": 7.64390243902439, - "grad_norm": 3.379220724105835, - "learning_rate": 6.567347616355049e-07, - "loss": 0.063, - "step": 1567 - }, - { - "epoch": 7.648780487804878, - "grad_norm": 0.7666990756988525, - "learning_rate": 6.541487190221163e-07, - "loss": 0.003, - "step": 1568 - }, - { - "epoch": 7.6536585365853655, - "grad_norm": 1.2181665897369385, - "learning_rate": 6.515670114041725e-07, - "loss": 0.0037, - "step": 1569 - }, - { - "epoch": 7.658536585365853, - "grad_norm": 1.0194541215896606, - "learning_rate": 6.489896448448349e-07, - "loss": 0.0043, - "step": 1570 - }, - { - "epoch": 7.663414634146341, - "grad_norm": 2.2625741958618164, - "learning_rate": 6.464166253970672e-07, - "loss": 0.0144, - "step": 1571 - }, - { - "epoch": 7.668292682926829, - "grad_norm": 1.0256692171096802, - "learning_rate": 6.43847959103624e-07, - "loss": 0.0029, - "step": 1572 - }, - { - "epoch": 7.673170731707317, - "grad_norm": 2.0418128967285156, - "learning_rate": 6.412836519970383e-07, - "loss": 0.0144, - "step": 1573 - }, - { - "epoch": 7.678048780487805, - "grad_norm": 0.8498746752738953, - "learning_rate": 6.387237100996041e-07, - "loss": 0.0026, - "step": 1574 - }, - { - "epoch": 7.682926829268292, - "grad_norm": 1.1043775081634521, - "learning_rate": 6.361681394233631e-07, - "loss": 0.0093, - "step": 1575 - }, - { - "epoch": 7.68780487804878, - "grad_norm": 1.064835786819458, - "learning_rate": 6.336169459700933e-07, - "loss": 0.0081, - "step": 1576 - }, - { - "epoch": 7.692682926829268, - "grad_norm": 1.2024056911468506, - "learning_rate": 6.310701357312909e-07, - "loss": 0.0054, - "step": 1577 - }, - { - "epoch": 7.697560975609756, - "grad_norm": 1.9509804248809814, - "learning_rate": 6.285277146881588e-07, - "loss": 0.0051, - "step": 1578 - }, - { - "epoch": 7.702439024390244, - "grad_norm": 1.8738386631011963, - "learning_rate": 6.259896888115904e-07, - "loss": 0.0118, - "step": 1579 - }, - { - "epoch": 7.7073170731707314, - "grad_norm": 1.356726884841919, - "learning_rate": 6.234560640621606e-07, - "loss": 0.009, - "step": 1580 - }, - { - "epoch": 7.712195121951219, - "grad_norm": 0.6530736684799194, - "learning_rate": 6.209268463901047e-07, - "loss": 0.0015, - "step": 1581 - }, - { - "epoch": 7.717073170731707, - "grad_norm": 1.3714262247085571, - "learning_rate": 6.184020417353084e-07, - "loss": 0.0051, - "step": 1582 - }, - { - "epoch": 7.721951219512195, - "grad_norm": 3.015583038330078, - "learning_rate": 6.158816560272962e-07, - "loss": 0.0383, - "step": 1583 - }, - { - "epoch": 7.726829268292683, - "grad_norm": 3.2355704307556152, - "learning_rate": 6.133656951852113e-07, - "loss": 0.0422, - "step": 1584 - }, - { - "epoch": 7.7317073170731705, - "grad_norm": 1.2933087348937988, - "learning_rate": 6.10854165117806e-07, - "loss": 0.0082, - "step": 1585 - }, - { - "epoch": 7.736585365853658, - "grad_norm": 1.6866157054901123, - "learning_rate": 6.083470717234285e-07, - "loss": 0.0052, - "step": 1586 - }, - { - "epoch": 7.741463414634146, - "grad_norm": 1.4597362279891968, - "learning_rate": 6.058444208900061e-07, - "loss": 0.0094, - "step": 1587 - }, - { - "epoch": 7.746341463414634, - "grad_norm": 0.9200596213340759, - "learning_rate": 6.033462184950317e-07, - "loss": 0.0034, - "step": 1588 - }, - { - "epoch": 7.751219512195122, - "grad_norm": 1.707422137260437, - "learning_rate": 6.008524704055535e-07, - "loss": 0.0141, - "step": 1589 - }, - { - "epoch": 7.7560975609756095, - "grad_norm": 1.8554565906524658, - "learning_rate": 5.983631824781572e-07, - "loss": 0.0108, - "step": 1590 - }, - { - "epoch": 7.760975609756097, - "grad_norm": 1.5421279668807983, - "learning_rate": 5.95878360558953e-07, - "loss": 0.0075, - "step": 1591 - }, - { - "epoch": 7.765853658536585, - "grad_norm": 1.5643326044082642, - "learning_rate": 5.933980104835652e-07, - "loss": 0.018, - "step": 1592 - }, - { - "epoch": 7.770731707317073, - "grad_norm": 1.7024025917053223, - "learning_rate": 5.909221380771132e-07, - "loss": 0.0207, - "step": 1593 - }, - { - "epoch": 7.775609756097561, - "grad_norm": 1.820544719696045, - "learning_rate": 5.884507491542024e-07, - "loss": 0.0217, - "step": 1594 - }, - { - "epoch": 7.780487804878049, - "grad_norm": 1.6761897802352905, - "learning_rate": 5.859838495189068e-07, - "loss": 0.0055, - "step": 1595 - }, - { - "epoch": 7.785365853658536, - "grad_norm": 2.3035616874694824, - "learning_rate": 5.835214449647602e-07, - "loss": 0.0147, - "step": 1596 - }, - { - "epoch": 7.790243902439024, - "grad_norm": 2.0507681369781494, - "learning_rate": 5.810635412747373e-07, - "loss": 0.0065, - "step": 1597 - }, - { - "epoch": 7.795121951219512, - "grad_norm": 1.3789564371109009, - "learning_rate": 5.786101442212422e-07, - "loss": 0.0077, - "step": 1598 - }, - { - "epoch": 7.8, - "grad_norm": 3.313107490539551, - "learning_rate": 5.761612595660979e-07, - "loss": 0.0699, - "step": 1599 - }, - { - "epoch": 7.804878048780488, - "grad_norm": 1.2391237020492554, - "learning_rate": 5.737168930605272e-07, - "loss": 0.0017, - "step": 1600 - }, - { - "epoch": 7.809756097560975, - "grad_norm": 1.1187714338302612, - "learning_rate": 5.712770504451426e-07, - "loss": 0.0101, - "step": 1601 - }, - { - "epoch": 7.814634146341463, - "grad_norm": 2.7611069679260254, - "learning_rate": 5.688417374499336e-07, - "loss": 0.0143, - "step": 1602 - }, - { - "epoch": 7.819512195121951, - "grad_norm": 1.627295732498169, - "learning_rate": 5.664109597942504e-07, - "loss": 0.0062, - "step": 1603 - }, - { - "epoch": 7.824390243902439, - "grad_norm": 4.538354396820068, - "learning_rate": 5.639847231867917e-07, - "loss": 0.1058, - "step": 1604 - }, - { - "epoch": 7.829268292682927, - "grad_norm": 1.783469319343567, - "learning_rate": 5.61563033325594e-07, - "loss": 0.0178, - "step": 1605 - }, - { - "epoch": 7.8341463414634145, - "grad_norm": 2.259584665298462, - "learning_rate": 5.591458958980123e-07, - "loss": 0.0204, - "step": 1606 - }, - { - "epoch": 7.839024390243902, - "grad_norm": 2.0741965770721436, - "learning_rate": 5.567333165807115e-07, - "loss": 0.0201, - "step": 1607 - }, - { - "epoch": 7.84390243902439, - "grad_norm": 0.8751707077026367, - "learning_rate": 5.543253010396538e-07, - "loss": 0.0077, - "step": 1608 - }, - { - "epoch": 7.848780487804878, - "grad_norm": 1.7383732795715332, - "learning_rate": 5.519218549300806e-07, - "loss": 0.0176, - "step": 1609 - }, - { - "epoch": 7.853658536585366, - "grad_norm": 2.0462191104888916, - "learning_rate": 5.495229838965021e-07, - "loss": 0.031, - "step": 1610 - }, - { - "epoch": 7.8585365853658535, - "grad_norm": 1.3201459646224976, - "learning_rate": 5.471286935726866e-07, - "loss": 0.0062, - "step": 1611 - }, - { - "epoch": 7.863414634146341, - "grad_norm": 2.9285616874694824, - "learning_rate": 5.447389895816416e-07, - "loss": 0.0615, - "step": 1612 - }, - { - "epoch": 7.868292682926829, - "grad_norm": 3.1918647289276123, - "learning_rate": 5.423538775356049e-07, - "loss": 0.0377, - "step": 1613 - }, - { - "epoch": 7.873170731707317, - "grad_norm": 1.406246542930603, - "learning_rate": 5.399733630360287e-07, - "loss": 0.0122, - "step": 1614 - }, - { - "epoch": 7.878048780487805, - "grad_norm": 1.7651537656784058, - "learning_rate": 5.375974516735713e-07, - "loss": 0.015, - "step": 1615 - }, - { - "epoch": 7.882926829268293, - "grad_norm": 1.9614673852920532, - "learning_rate": 5.352261490280767e-07, - "loss": 0.0058, - "step": 1616 - }, - { - "epoch": 7.88780487804878, - "grad_norm": 1.6031639575958252, - "learning_rate": 5.328594606685661e-07, - "loss": 0.0041, - "step": 1617 - }, - { - "epoch": 7.892682926829268, - "grad_norm": 0.9787303805351257, - "learning_rate": 5.304973921532264e-07, - "loss": 0.0067, - "step": 1618 - }, - { - "epoch": 7.897560975609756, - "grad_norm": 1.2693779468536377, - "learning_rate": 5.281399490293923e-07, - "loss": 0.0064, - "step": 1619 - }, - { - "epoch": 7.902439024390244, - "grad_norm": 1.8421361446380615, - "learning_rate": 5.257871368335357e-07, - "loss": 0.0182, - "step": 1620 - }, - { - "epoch": 7.907317073170732, - "grad_norm": 0.9667096138000488, - "learning_rate": 5.234389610912552e-07, - "loss": 0.0024, - "step": 1621 - }, - { - "epoch": 7.912195121951219, - "grad_norm": 3.2266018390655518, - "learning_rate": 5.210954273172578e-07, - "loss": 0.02, - "step": 1622 - }, - { - "epoch": 7.917073170731707, - "grad_norm": 1.5821634531021118, - "learning_rate": 5.187565410153497e-07, - "loss": 0.024, - "step": 1623 - }, - { - "epoch": 7.921951219512195, - "grad_norm": 1.9864275455474854, - "learning_rate": 5.164223076784239e-07, - "loss": 0.0103, - "step": 1624 - }, - { - "epoch": 7.926829268292683, - "grad_norm": 1.866466999053955, - "learning_rate": 5.14092732788444e-07, - "loss": 0.0268, - "step": 1625 - }, - { - "epoch": 7.931707317073171, - "grad_norm": 1.165686011314392, - "learning_rate": 5.117678218164337e-07, - "loss": 0.0085, - "step": 1626 - }, - { - "epoch": 7.9365853658536585, - "grad_norm": 1.1883208751678467, - "learning_rate": 5.094475802224644e-07, - "loss": 0.006, - "step": 1627 - }, - { - "epoch": 7.941463414634146, - "grad_norm": 1.5121057033538818, - "learning_rate": 5.071320134556404e-07, - "loss": 0.003, - "step": 1628 - }, - { - "epoch": 7.946341463414634, - "grad_norm": 1.1923614740371704, - "learning_rate": 5.048211269540868e-07, - "loss": 0.0064, - "step": 1629 - }, - { - "epoch": 7.951219512195122, - "grad_norm": 1.33751380443573, - "learning_rate": 5.025149261449391e-07, - "loss": 0.0082, - "step": 1630 - }, - { - "epoch": 7.95609756097561, - "grad_norm": 1.9143925905227661, - "learning_rate": 5.002134164443262e-07, - "loss": 0.0202, - "step": 1631 - }, - { - "epoch": 7.9609756097560975, - "grad_norm": 1.2547078132629395, - "learning_rate": 4.979166032573607e-07, - "loss": 0.0033, - "step": 1632 - }, - { - "epoch": 7.965853658536585, - "grad_norm": 2.3050332069396973, - "learning_rate": 4.956244919781247e-07, - "loss": 0.052, - "step": 1633 - }, - { - "epoch": 7.970731707317073, - "grad_norm": 1.4462478160858154, - "learning_rate": 4.933370879896604e-07, - "loss": 0.0049, - "step": 1634 - }, - { - "epoch": 7.975609756097561, - "grad_norm": 1.519913911819458, - "learning_rate": 4.91054396663952e-07, - "loss": 0.0102, - "step": 1635 - }, - { - "epoch": 7.980487804878049, - "grad_norm": 2.9544193744659424, - "learning_rate": 4.887764233619163e-07, - "loss": 0.0112, - "step": 1636 - }, - { - "epoch": 7.985365853658537, - "grad_norm": 0.9778392314910889, - "learning_rate": 4.865031734333919e-07, - "loss": 0.0032, - "step": 1637 - }, - { - "epoch": 7.990243902439024, - "grad_norm": 2.783501386642456, - "learning_rate": 4.842346522171226e-07, - "loss": 0.012, - "step": 1638 - }, - { - "epoch": 7.995121951219512, - "grad_norm": 1.5644093751907349, - "learning_rate": 4.819708650407467e-07, - "loss": 0.0184, - "step": 1639 - }, - { - "epoch": 8.0, - "grad_norm": 1.5741018056869507, - "learning_rate": 4.797118172207863e-07, - "loss": 0.0112, - "step": 1640 - } - ], - "logging_steps": 1, - "max_steps": 2050, - "num_input_tokens_seen": 0, - "num_train_epochs": 10, - "save_steps": 206, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 4.717111846895616e+17, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/metallama3_8b/limo/checkpoint-1845/chat_template.jinja b/metallama3_8b/limo/checkpoint-1845/chat_template.jinja deleted file mode 100644 index 39bd0c9f7fe30aea14eda194fee17703da4a4dbf..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1845/chat_template.jinja +++ /dev/null @@ -1,5 +0,0 @@ -{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> - -'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> - -' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo/checkpoint-1845/config.json b/metallama3_8b/limo/checkpoint-1845/config.json deleted file mode 100644 index ec5612543540085e09eed37e81b17ae51d1a6973..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1845/config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 128000, - "eos_token_id": 128009, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 8192, - "mlp_bias": false, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 500000.0, - "tie_word_embeddings": false, - "torch_dtype": "float32", - "transformers_version": "4.55.0", - "use_cache": false, - "vocab_size": 128256 -} diff --git a/metallama3_8b/limo/checkpoint-1845/generation_config.json b/metallama3_8b/limo/checkpoint-1845/generation_config.json deleted file mode 100644 index f53ccb516e57388491adda6b9950bcfa872e93ae..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1845/generation_config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 128000, - "eos_token_id": 128009, - "transformers_version": "4.55.0", - "use_cache": false -} diff --git a/metallama3_8b/limo/checkpoint-1845/model-00001-of-00007.safetensors b/metallama3_8b/limo/checkpoint-1845/model-00001-of-00007.safetensors deleted file mode 100644 index be5e7e3aeebcbb05955d5d2980028e95ee250a82..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1845/model-00001-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1b7ea2ed90350ff2c0f734428bd5928bb71687d1e2e496de788a20f7ba3597c4 -size 4886466168 diff --git a/metallama3_8b/limo/checkpoint-1845/model-00002-of-00007.safetensors b/metallama3_8b/limo/checkpoint-1845/model-00002-of-00007.safetensors deleted file mode 100644 index 8e73c6782869c667521af4b433d166b2e9665ec5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1845/model-00002-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:543b78f558d4f88429eb6997c93f7996c6f0950b8b90ec923d29f9aa864d8879 -size 4832007448 diff --git a/metallama3_8b/limo/checkpoint-1845/model-00003-of-00007.safetensors b/metallama3_8b/limo/checkpoint-1845/model-00003-of-00007.safetensors deleted file mode 100644 index d0efc6c8b8c18bdde5f58bba42c987cd8a81ea15..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1845/model-00003-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:66b014f79bd65dddb6def22d796c36f675e56d180fd905f772f3ca984343a06f -size 4999813112 diff --git a/metallama3_8b/limo/checkpoint-1845/model-00004-of-00007.safetensors b/metallama3_8b/limo/checkpoint-1845/model-00004-of-00007.safetensors deleted file mode 100644 index 5c2e69e64785c3e51ad3c45316d22577f1325b57..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1845/model-00004-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:22cdc3f45325f2e96ba6e10525e5937089057ce540d3bf7eaeff7af496eed7c4 -size 4999813128 diff --git a/metallama3_8b/limo/checkpoint-1845/model-00005-of-00007.safetensors b/metallama3_8b/limo/checkpoint-1845/model-00005-of-00007.safetensors deleted file mode 100644 index 180f2f6e39bee85840c82ec5fe8679ed1e647108..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1845/model-00005-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dfb3e31b8860f3c8cc3e6733fadea870d1af1d8f9916eda6641741f323fcaceb -size 4832007496 diff --git a/metallama3_8b/limo/checkpoint-1845/model-00006-of-00007.safetensors b/metallama3_8b/limo/checkpoint-1845/model-00006-of-00007.safetensors deleted file mode 100644 index 9fadd682d4be84fd1ae5234fcc973f921d41e91e..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1845/model-00006-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cb57356a2bf0edfa8eab76e0ffc16cdccec451cdfab5d7287682e894a260fab3 -size 4999813120 diff --git a/metallama3_8b/limo/checkpoint-1845/model-00007-of-00007.safetensors b/metallama3_8b/limo/checkpoint-1845/model-00007-of-00007.safetensors deleted file mode 100644 index d9fbd0c655bc981e0ca60d3a293c332a04139e63..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1845/model-00007-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8b21e2688ca25e2d710530e82d45e1f5eb508742e3d89a1894b43e6d8f7fbf2d -size 2571158184 diff --git a/metallama3_8b/limo/checkpoint-1845/model.safetensors.index.json b/metallama3_8b/limo/checkpoint-1845/model.safetensors.index.json deleted file mode 100644 index 30d31d54f352f0c71ad48745af612a088822fa48..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1845/model.safetensors.index.json +++ /dev/null @@ -1,299 +0,0 @@ -{ - "metadata": { - "total_parameters": 2007565312, - "total_size": 32121044992 - }, - "weight_map": { - "lm_head.weight": "model-00007-of-00007.safetensors", - "model.embed_tokens.weight": "model-00001-of-00007.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.norm.weight": "model-00007-of-00007.safetensors" - } -} diff --git a/metallama3_8b/limo/checkpoint-1845/rng_state_0.pth b/metallama3_8b/limo/checkpoint-1845/rng_state_0.pth deleted file mode 100644 index c52ec8f5d66c6a990609422386c047d0c3ed3970..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1845/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:543ef05f530d40ee20b8d626b07a69b86597aca643e48897571062f973efe84f -size 15024 diff --git a/metallama3_8b/limo/checkpoint-1845/rng_state_1.pth b/metallama3_8b/limo/checkpoint-1845/rng_state_1.pth deleted file mode 100644 index 7e4ae755d2c391c6486028b2ab09f40e1e5b6b3f..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1845/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7a23f732e43838ce0398d2636885ac16badbb9bcbc04d1406069ba3027bc5ae0 -size 15024 diff --git a/metallama3_8b/limo/checkpoint-1845/rng_state_2.pth b/metallama3_8b/limo/checkpoint-1845/rng_state_2.pth deleted file mode 100644 index 47425e0477082be97b4d8dda14c0159e7914ebb0..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1845/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e10cce960e7068b051c05e35ed6160656be9091c63f13796ac2ed7e9c84e5a72 -size 15024 diff --git a/metallama3_8b/limo/checkpoint-1845/rng_state_3.pth b/metallama3_8b/limo/checkpoint-1845/rng_state_3.pth deleted file mode 100644 index adaf9621fc3ca0a14f99862b58c3bebc5b7168e3..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1845/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b6f6049e212b1df5cefc5d834afcd8cc052c73f1457449e9fe8a38d514f54078 -size 15024 diff --git a/metallama3_8b/limo/checkpoint-1845/scheduler.pt b/metallama3_8b/limo/checkpoint-1845/scheduler.pt deleted file mode 100644 index d9a49b4affb17f84b3cf0ab0741cea54e68f1089..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1845/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cfc203ba281c2dae06329b6ba12c90aeb69177391b4b6dfd95fe39f735dd090e -size 1064 diff --git a/metallama3_8b/limo/checkpoint-1845/special_tokens_map.json b/metallama3_8b/limo/checkpoint-1845/special_tokens_map.json deleted file mode 100644 index 14daf4588e61b4e4983af0fccaba4d5500c0977c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1845/special_tokens_map.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "additional_special_tokens": [ - { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } - ], - "bos_token": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": "<|eot_id|>" -} diff --git a/metallama3_8b/limo/checkpoint-1845/tokenizer.json b/metallama3_8b/limo/checkpoint-1845/tokenizer.json deleted file mode 100644 index 172311123ab62378f1f6d90f3068a676b7d939ed..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1845/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c1dcab308e7cf5970ea38815e0a62887d705c5b436f869ca27a5dcdd40c36a6 -size 17210148 diff --git a/metallama3_8b/limo/checkpoint-1845/tokenizer_config.json b/metallama3_8b/limo/checkpoint-1845/tokenizer_config.json deleted file mode 100644 index 6739fcd129e717b71b64001dcb25a03c143d66f5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1845/tokenizer_config.json +++ /dev/null @@ -1,2076 +0,0 @@ -{ - "added_tokens_decoder": { - "128000": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128001": { - "content": "<|end_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128002": { - "content": "<|reserved_special_token_0|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128003": { - "content": "<|reserved_special_token_1|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128004": { - "content": "<|reserved_special_token_2|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128005": { - "content": "<|reserved_special_token_3|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128006": { - "content": "<|start_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128007": { - "content": "<|end_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128008": { - "content": "<|reserved_special_token_4|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128009": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128010": { - "content": "<|reserved_special_token_5|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128011": { - "content": "<|reserved_special_token_6|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128012": { - "content": "<|reserved_special_token_7|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128013": { - "content": "<|reserved_special_token_8|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128014": { - "content": "<|reserved_special_token_9|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128015": { - "content": "<|reserved_special_token_10|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128016": { - "content": "<|reserved_special_token_11|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128017": { - "content": "<|reserved_special_token_12|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128018": { - "content": "<|reserved_special_token_13|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128019": { - "content": "<|reserved_special_token_14|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128020": { - "content": "<|reserved_special_token_15|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128021": { - "content": "<|reserved_special_token_16|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128022": { - "content": "<|reserved_special_token_17|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128023": { - "content": "<|reserved_special_token_18|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128024": { - "content": "<|reserved_special_token_19|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128025": { - "content": "<|reserved_special_token_20|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128026": { - "content": "<|reserved_special_token_21|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128027": { - "content": "<|reserved_special_token_22|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128028": { - "content": "<|reserved_special_token_23|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128029": { - "content": "<|reserved_special_token_24|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128030": { - "content": "<|reserved_special_token_25|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128031": { - "content": "<|reserved_special_token_26|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128032": { - "content": "<|reserved_special_token_27|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128033": { - "content": "<|reserved_special_token_28|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128034": { - "content": "<|reserved_special_token_29|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128035": { - "content": "<|reserved_special_token_30|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128036": { - "content": "<|reserved_special_token_31|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128037": { - "content": "<|reserved_special_token_32|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128038": { - "content": "<|reserved_special_token_33|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128039": { - "content": "<|reserved_special_token_34|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128040": { - "content": "<|reserved_special_token_35|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128041": { - "content": "<|reserved_special_token_36|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128042": { - "content": "<|reserved_special_token_37|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128043": { - "content": "<|reserved_special_token_38|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128044": { - "content": "<|reserved_special_token_39|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128045": { - "content": "<|reserved_special_token_40|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128046": { - "content": "<|reserved_special_token_41|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128047": { - "content": "<|reserved_special_token_42|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128048": { - "content": "<|reserved_special_token_43|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128049": { - "content": "<|reserved_special_token_44|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128050": { - "content": "<|reserved_special_token_45|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128051": { - "content": "<|reserved_special_token_46|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128052": { - "content": "<|reserved_special_token_47|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128053": { - "content": "<|reserved_special_token_48|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128054": { - "content": "<|reserved_special_token_49|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128055": { - "content": "<|reserved_special_token_50|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128056": { - "content": "<|reserved_special_token_51|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128057": { - "content": "<|reserved_special_token_52|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128058": { - "content": "<|reserved_special_token_53|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128059": { - "content": "<|reserved_special_token_54|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128060": { - "content": "<|reserved_special_token_55|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128061": { - "content": "<|reserved_special_token_56|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128062": { - "content": "<|reserved_special_token_57|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128063": { - "content": "<|reserved_special_token_58|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128064": { - "content": "<|reserved_special_token_59|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128065": { - "content": "<|reserved_special_token_60|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128066": { - "content": "<|reserved_special_token_61|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128067": { - "content": "<|reserved_special_token_62|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128068": { - "content": "<|reserved_special_token_63|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128069": { - "content": "<|reserved_special_token_64|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128070": { - "content": "<|reserved_special_token_65|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128071": { - "content": "<|reserved_special_token_66|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128072": { - "content": "<|reserved_special_token_67|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128073": { - "content": "<|reserved_special_token_68|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128074": { - "content": "<|reserved_special_token_69|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128075": { - "content": "<|reserved_special_token_70|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128076": { - "content": "<|reserved_special_token_71|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128077": { - "content": "<|reserved_special_token_72|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128078": { - "content": "<|reserved_special_token_73|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128079": { - "content": "<|reserved_special_token_74|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128080": { - "content": "<|reserved_special_token_75|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128081": { - "content": "<|reserved_special_token_76|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128082": { - "content": "<|reserved_special_token_77|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128083": { - "content": "<|reserved_special_token_78|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128084": { - "content": "<|reserved_special_token_79|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128085": { - "content": "<|reserved_special_token_80|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128086": { - "content": "<|reserved_special_token_81|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128087": { - "content": "<|reserved_special_token_82|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128088": { - "content": "<|reserved_special_token_83|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128089": { - "content": "<|reserved_special_token_84|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128090": { - "content": "<|reserved_special_token_85|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128091": { - "content": "<|reserved_special_token_86|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128092": { - "content": "<|reserved_special_token_87|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128093": { - "content": "<|reserved_special_token_88|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128094": { - "content": "<|reserved_special_token_89|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128095": { - "content": "<|reserved_special_token_90|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128096": { - "content": "<|reserved_special_token_91|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128097": { - "content": "<|reserved_special_token_92|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128098": { - "content": "<|reserved_special_token_93|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128099": { - "content": "<|reserved_special_token_94|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128100": { - "content": "<|reserved_special_token_95|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128101": { - "content": "<|reserved_special_token_96|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128102": { - "content": "<|reserved_special_token_97|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128103": { - "content": "<|reserved_special_token_98|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128104": { - "content": "<|reserved_special_token_99|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128105": { - "content": "<|reserved_special_token_100|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128106": { - "content": "<|reserved_special_token_101|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128107": { - "content": "<|reserved_special_token_102|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128108": { - "content": "<|reserved_special_token_103|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128109": { - "content": "<|reserved_special_token_104|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128110": { - "content": "<|reserved_special_token_105|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128111": { - "content": "<|reserved_special_token_106|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128112": { - "content": "<|reserved_special_token_107|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128113": { - "content": "<|reserved_special_token_108|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128114": { - "content": "<|reserved_special_token_109|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128115": { - "content": "<|reserved_special_token_110|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128116": { - "content": "<|reserved_special_token_111|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128117": { - "content": "<|reserved_special_token_112|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128118": { - "content": "<|reserved_special_token_113|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128119": { - "content": "<|reserved_special_token_114|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128120": { - "content": "<|reserved_special_token_115|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128121": { - "content": "<|reserved_special_token_116|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128122": { - "content": "<|reserved_special_token_117|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128123": { - "content": "<|reserved_special_token_118|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128124": { - "content": "<|reserved_special_token_119|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128125": { - "content": "<|reserved_special_token_120|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128126": { - "content": "<|reserved_special_token_121|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128127": { - "content": "<|reserved_special_token_122|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128128": { - "content": "<|reserved_special_token_123|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128129": { - "content": "<|reserved_special_token_124|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128130": { - "content": "<|reserved_special_token_125|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128131": { - "content": "<|reserved_special_token_126|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128132": { - "content": "<|reserved_special_token_127|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128133": { - "content": "<|reserved_special_token_128|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128134": { - "content": "<|reserved_special_token_129|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128135": { - "content": "<|reserved_special_token_130|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128136": { - "content": "<|reserved_special_token_131|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128137": { - "content": "<|reserved_special_token_132|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128138": { - "content": "<|reserved_special_token_133|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128139": { - "content": "<|reserved_special_token_134|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128140": { - "content": "<|reserved_special_token_135|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128141": { - "content": "<|reserved_special_token_136|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128142": { - "content": "<|reserved_special_token_137|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128143": { - "content": "<|reserved_special_token_138|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128144": { - "content": "<|reserved_special_token_139|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128145": { - "content": "<|reserved_special_token_140|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128146": { - "content": "<|reserved_special_token_141|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128147": { - "content": "<|reserved_special_token_142|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128148": { - "content": "<|reserved_special_token_143|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128149": { - "content": "<|reserved_special_token_144|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128150": { - "content": "<|reserved_special_token_145|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128151": { - "content": "<|reserved_special_token_146|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128152": { - "content": "<|reserved_special_token_147|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128153": { - "content": "<|reserved_special_token_148|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128154": { - "content": "<|reserved_special_token_149|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128155": { - "content": "<|reserved_special_token_150|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128156": { - "content": "<|reserved_special_token_151|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128157": { - "content": "<|reserved_special_token_152|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128158": { - "content": "<|reserved_special_token_153|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128159": { - "content": "<|reserved_special_token_154|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128160": { - "content": "<|reserved_special_token_155|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128161": { - "content": "<|reserved_special_token_156|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128162": { - "content": "<|reserved_special_token_157|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128163": { - "content": "<|reserved_special_token_158|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128164": { - "content": "<|reserved_special_token_159|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128165": { - "content": "<|reserved_special_token_160|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128166": { - "content": "<|reserved_special_token_161|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128167": { - "content": "<|reserved_special_token_162|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128168": { - "content": "<|reserved_special_token_163|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128169": { - "content": "<|reserved_special_token_164|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128170": { - "content": "<|reserved_special_token_165|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128171": { - "content": "<|reserved_special_token_166|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128172": { - "content": "<|reserved_special_token_167|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128173": { - "content": "<|reserved_special_token_168|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128174": { - "content": "<|reserved_special_token_169|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128175": { - "content": "<|reserved_special_token_170|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128176": { - "content": "<|reserved_special_token_171|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128177": { - "content": "<|reserved_special_token_172|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128178": { - "content": "<|reserved_special_token_173|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128179": { - "content": "<|reserved_special_token_174|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128180": { - "content": "<|reserved_special_token_175|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128181": { - "content": "<|reserved_special_token_176|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128182": { - "content": "<|reserved_special_token_177|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128183": { - "content": "<|reserved_special_token_178|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128184": { - "content": "<|reserved_special_token_179|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128185": { - "content": "<|reserved_special_token_180|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128186": { - "content": "<|reserved_special_token_181|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128187": { - "content": "<|reserved_special_token_182|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128188": { - "content": "<|reserved_special_token_183|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128189": { - "content": "<|reserved_special_token_184|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128190": { - "content": "<|reserved_special_token_185|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128191": { - "content": "<|reserved_special_token_186|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128192": { - "content": "<|reserved_special_token_187|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128193": { - "content": "<|reserved_special_token_188|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128194": { - "content": "<|reserved_special_token_189|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128195": { - "content": "<|reserved_special_token_190|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128196": { - "content": "<|reserved_special_token_191|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128197": { - "content": "<|reserved_special_token_192|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128198": { - "content": "<|reserved_special_token_193|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128199": { - "content": "<|reserved_special_token_194|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128200": { - "content": "<|reserved_special_token_195|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128201": { - "content": "<|reserved_special_token_196|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128202": { - "content": "<|reserved_special_token_197|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128203": { - "content": "<|reserved_special_token_198|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128204": { - "content": "<|reserved_special_token_199|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128205": { - "content": "<|reserved_special_token_200|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128206": { - "content": "<|reserved_special_token_201|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128207": { - "content": "<|reserved_special_token_202|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128208": { - "content": "<|reserved_special_token_203|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128209": { - "content": "<|reserved_special_token_204|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128210": { - "content": "<|reserved_special_token_205|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128211": { - "content": "<|reserved_special_token_206|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128212": { - "content": "<|reserved_special_token_207|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128213": { - "content": "<|reserved_special_token_208|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128214": { - "content": "<|reserved_special_token_209|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128215": { - "content": "<|reserved_special_token_210|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128216": { - "content": "<|reserved_special_token_211|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128217": { - "content": "<|reserved_special_token_212|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128218": { - "content": "<|reserved_special_token_213|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128219": { - "content": "<|reserved_special_token_214|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128220": { - "content": "<|reserved_special_token_215|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128221": { - "content": "<|reserved_special_token_216|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128222": { - "content": "<|reserved_special_token_217|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128223": { - "content": "<|reserved_special_token_218|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128224": { - "content": "<|reserved_special_token_219|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128225": { - "content": "<|reserved_special_token_220|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128226": { - "content": "<|reserved_special_token_221|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128227": { - "content": "<|reserved_special_token_222|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128228": { - "content": "<|reserved_special_token_223|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128229": { - "content": "<|reserved_special_token_224|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128230": { - "content": "<|reserved_special_token_225|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128231": { - "content": "<|reserved_special_token_226|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128232": { - "content": "<|reserved_special_token_227|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128233": { - "content": "<|reserved_special_token_228|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128234": { - "content": "<|reserved_special_token_229|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128235": { - "content": "<|reserved_special_token_230|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128236": { - "content": "<|reserved_special_token_231|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128237": { - "content": "<|reserved_special_token_232|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128238": { - "content": "<|reserved_special_token_233|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128239": { - "content": "<|reserved_special_token_234|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128240": { - "content": "<|reserved_special_token_235|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128241": { - "content": "<|reserved_special_token_236|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128242": { - "content": "<|reserved_special_token_237|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128243": { - "content": "<|reserved_special_token_238|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128244": { - "content": "<|reserved_special_token_239|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128245": { - "content": "<|reserved_special_token_240|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128246": { - "content": "<|reserved_special_token_241|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128247": { - "content": "<|reserved_special_token_242|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128248": { - "content": "<|reserved_special_token_243|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128249": { - "content": "<|reserved_special_token_244|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128250": { - "content": "<|reserved_special_token_245|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128251": { - "content": "<|reserved_special_token_246|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128252": { - "content": "<|reserved_special_token_247|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128253": { - "content": "<|reserved_special_token_248|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128254": { - "content": "<|reserved_special_token_249|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128255": { - "content": "<|reserved_special_token_250|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128256": { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - } - }, - "additional_special_tokens": [ - "<|eom_id|>" - ], - "bos_token": "<|begin_of_text|>", - "clean_up_tokenization_spaces": true, - "eos_token": "<|eot_id|>", - "extra_special_tokens": {}, - "model_input_names": [ - "input_ids", - "attention_mask" - ], - "model_max_length": 1000000000000000019884624838656, - "pad_token": "<|eot_id|>", - "padding_side": "right", - "split_special_tokens": false, - "tokenizer_class": "PreTrainedTokenizerFast" -} diff --git a/metallama3_8b/limo/checkpoint-1845/trainer_state.json b/metallama3_8b/limo/checkpoint-1845/trainer_state.json deleted file mode 100644 index 0d26e01049c9bf0aa3214bbddc4f1492bf30e823..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-1845/trainer_state.json +++ /dev/null @@ -1,12949 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 9.0, - "eval_steps": 500, - "global_step": 1845, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.004878048780487805, - "grad_norm": 27.79998016357422, - "learning_rate": 5e-06, - "loss": 1.4179, - "step": 1 - }, - { - "epoch": 0.00975609756097561, - "grad_norm": 4.086409091949463, - "learning_rate": 4.999997064365715e-06, - "loss": 1.1405, - "step": 2 - }, - { - "epoch": 0.014634146341463415, - "grad_norm": 4.499151229858398, - "learning_rate": 4.999988257469751e-06, - "loss": 0.8682, - "step": 3 - }, - { - "epoch": 0.01951219512195122, - "grad_norm": 4.555822849273682, - "learning_rate": 4.999973579332793e-06, - "loss": 0.9961, - "step": 4 - }, - { - "epoch": 0.024390243902439025, - "grad_norm": 5.6235246658325195, - "learning_rate": 4.999953029989312e-06, - "loss": 1.0173, - "step": 5 - }, - { - "epoch": 0.02926829268292683, - "grad_norm": 3.9943182468414307, - "learning_rate": 4.999926609487568e-06, - "loss": 1.1083, - "step": 6 - }, - { - "epoch": 0.03414634146341464, - "grad_norm": 5.685941219329834, - "learning_rate": 4.9998943178896106e-06, - "loss": 1.1109, - "step": 7 - }, - { - "epoch": 0.03902439024390244, - "grad_norm": 15.914257049560547, - "learning_rate": 4.999856155271276e-06, - "loss": 1.821, - "step": 8 - }, - { - "epoch": 0.04390243902439024, - "grad_norm": 4.147185325622559, - "learning_rate": 4.999812121722191e-06, - "loss": 1.0417, - "step": 9 - }, - { - "epoch": 0.04878048780487805, - "grad_norm": 11.123332977294922, - "learning_rate": 4.999762217345766e-06, - "loss": 1.5672, - "step": 10 - }, - { - "epoch": 0.05365853658536585, - "grad_norm": 2.842331886291504, - "learning_rate": 4.999706442259205e-06, - "loss": 0.7297, - "step": 11 - }, - { - "epoch": 0.05853658536585366, - "grad_norm": 37.685062408447266, - "learning_rate": 4.999644796593492e-06, - "loss": 0.9112, - "step": 12 - }, - { - "epoch": 0.06341463414634146, - "grad_norm": 11.214252471923828, - "learning_rate": 4.999577280493407e-06, - "loss": 0.7854, - "step": 13 - }, - { - "epoch": 0.06829268292682927, - "grad_norm": 5.10387659072876, - "learning_rate": 4.99950389411751e-06, - "loss": 1.1317, - "step": 14 - }, - { - "epoch": 0.07317073170731707, - "grad_norm": 3.685403347015381, - "learning_rate": 4.999424637638148e-06, - "loss": 0.7864, - "step": 15 - }, - { - "epoch": 0.07804878048780488, - "grad_norm": 2.9567184448242188, - "learning_rate": 4.999339511241458e-06, - "loss": 0.8494, - "step": 16 - }, - { - "epoch": 0.08292682926829269, - "grad_norm": 11.396956443786621, - "learning_rate": 4.9992485151273584e-06, - "loss": 1.2189, - "step": 17 - }, - { - "epoch": 0.08780487804878048, - "grad_norm": 7.007385730743408, - "learning_rate": 4.999151649509554e-06, - "loss": 1.0532, - "step": 18 - }, - { - "epoch": 0.09268292682926829, - "grad_norm": 3.4347329139709473, - "learning_rate": 4.9990489146155356e-06, - "loss": 1.088, - "step": 19 - }, - { - "epoch": 0.0975609756097561, - "grad_norm": 3.1865031719207764, - "learning_rate": 4.9989403106865765e-06, - "loss": 1.0414, - "step": 20 - }, - { - "epoch": 0.1024390243902439, - "grad_norm": 3.4605791568756104, - "learning_rate": 4.9988258379777334e-06, - "loss": 0.8878, - "step": 21 - }, - { - "epoch": 0.1073170731707317, - "grad_norm": 2.860478639602661, - "learning_rate": 4.998705496757846e-06, - "loss": 0.9151, - "step": 22 - }, - { - "epoch": 0.11219512195121951, - "grad_norm": 9.101946830749512, - "learning_rate": 4.998579287309538e-06, - "loss": 1.4304, - "step": 23 - }, - { - "epoch": 0.11707317073170732, - "grad_norm": 24.21122169494629, - "learning_rate": 4.998447209929211e-06, - "loss": 1.0858, - "step": 24 - }, - { - "epoch": 0.12195121951219512, - "grad_norm": 3.286980152130127, - "learning_rate": 4.998309264927053e-06, - "loss": 0.6571, - "step": 25 - }, - { - "epoch": 0.12682926829268293, - "grad_norm": 4.0232062339782715, - "learning_rate": 4.998165452627025e-06, - "loss": 0.8493, - "step": 26 - }, - { - "epoch": 0.13170731707317074, - "grad_norm": 3.7688663005828857, - "learning_rate": 4.998015773366874e-06, - "loss": 0.9224, - "step": 27 - }, - { - "epoch": 0.13658536585365855, - "grad_norm": 2.9382026195526123, - "learning_rate": 4.997860227498122e-06, - "loss": 0.7588, - "step": 28 - }, - { - "epoch": 0.14146341463414633, - "grad_norm": 4.327457904815674, - "learning_rate": 4.99769881538607e-06, - "loss": 1.1817, - "step": 29 - }, - { - "epoch": 0.14634146341463414, - "grad_norm": 3.47487735748291, - "learning_rate": 4.997531537409794e-06, - "loss": 1.0737, - "step": 30 - }, - { - "epoch": 0.15121951219512195, - "grad_norm": 3.0616214275360107, - "learning_rate": 4.99735839396215e-06, - "loss": 0.7899, - "step": 31 - }, - { - "epoch": 0.15609756097560976, - "grad_norm": 3.065070152282715, - "learning_rate": 4.9971793854497655e-06, - "loss": 0.7745, - "step": 32 - }, - { - "epoch": 0.16097560975609757, - "grad_norm": 3.5202279090881348, - "learning_rate": 4.996994512293042e-06, - "loss": 0.984, - "step": 33 - }, - { - "epoch": 0.16585365853658537, - "grad_norm": 3.421769142150879, - "learning_rate": 4.996803774926157e-06, - "loss": 0.8235, - "step": 34 - }, - { - "epoch": 0.17073170731707318, - "grad_norm": 4.6582207679748535, - "learning_rate": 4.996607173797059e-06, - "loss": 1.3227, - "step": 35 - }, - { - "epoch": 0.17560975609756097, - "grad_norm": 2.9829282760620117, - "learning_rate": 4.996404709367466e-06, - "loss": 0.8854, - "step": 36 - }, - { - "epoch": 0.18048780487804877, - "grad_norm": 2.5982632637023926, - "learning_rate": 4.996196382112868e-06, - "loss": 0.6786, - "step": 37 - }, - { - "epoch": 0.18536585365853658, - "grad_norm": 2.9807393550872803, - "learning_rate": 4.9959821925225235e-06, - "loss": 0.9344, - "step": 38 - }, - { - "epoch": 0.1902439024390244, - "grad_norm": 2.7364351749420166, - "learning_rate": 4.995762141099456e-06, - "loss": 0.814, - "step": 39 - }, - { - "epoch": 0.1951219512195122, - "grad_norm": 3.4324638843536377, - "learning_rate": 4.995536228360461e-06, - "loss": 1.0276, - "step": 40 - }, - { - "epoch": 0.2, - "grad_norm": 2.911834716796875, - "learning_rate": 4.995304454836095e-06, - "loss": 0.9291, - "step": 41 - }, - { - "epoch": 0.2048780487804878, - "grad_norm": 3.0294723510742188, - "learning_rate": 4.9950668210706795e-06, - "loss": 0.8145, - "step": 42 - }, - { - "epoch": 0.2097560975609756, - "grad_norm": 4.681829452514648, - "learning_rate": 4.994823327622299e-06, - "loss": 0.8779, - "step": 43 - }, - { - "epoch": 0.2146341463414634, - "grad_norm": 3.643914222717285, - "learning_rate": 4.9945739750628e-06, - "loss": 0.8196, - "step": 44 - }, - { - "epoch": 0.21951219512195122, - "grad_norm": 2.7542076110839844, - "learning_rate": 4.994318763977789e-06, - "loss": 0.8443, - "step": 45 - }, - { - "epoch": 0.22439024390243903, - "grad_norm": 6.873605728149414, - "learning_rate": 4.994057694966632e-06, - "loss": 1.0328, - "step": 46 - }, - { - "epoch": 0.22926829268292684, - "grad_norm": 3.11810040473938, - "learning_rate": 4.993790768642449e-06, - "loss": 1.0673, - "step": 47 - }, - { - "epoch": 0.23414634146341465, - "grad_norm": 4.360548496246338, - "learning_rate": 4.99351798563212e-06, - "loss": 1.3198, - "step": 48 - }, - { - "epoch": 0.23902439024390243, - "grad_norm": 2.6894314289093018, - "learning_rate": 4.993239346576278e-06, - "loss": 0.8743, - "step": 49 - }, - { - "epoch": 0.24390243902439024, - "grad_norm": 3.2640421390533447, - "learning_rate": 4.99295485212931e-06, - "loss": 1.109, - "step": 50 - }, - { - "epoch": 0.24878048780487805, - "grad_norm": 3.1565866470336914, - "learning_rate": 4.992664502959351e-06, - "loss": 0.9291, - "step": 51 - }, - { - "epoch": 0.25365853658536586, - "grad_norm": 3.4829447269439697, - "learning_rate": 4.99236829974829e-06, - "loss": 0.8159, - "step": 52 - }, - { - "epoch": 0.25853658536585367, - "grad_norm": 2.7535626888275146, - "learning_rate": 4.992066243191762e-06, - "loss": 1.0359, - "step": 53 - }, - { - "epoch": 0.2634146341463415, - "grad_norm": 2.482935905456543, - "learning_rate": 4.991758333999148e-06, - "loss": 0.8091, - "step": 54 - }, - { - "epoch": 0.2682926829268293, - "grad_norm": 2.917445659637451, - "learning_rate": 4.991444572893575e-06, - "loss": 0.6925, - "step": 55 - }, - { - "epoch": 0.2731707317073171, - "grad_norm": 2.9802236557006836, - "learning_rate": 4.991124960611916e-06, - "loss": 0.6329, - "step": 56 - }, - { - "epoch": 0.2780487804878049, - "grad_norm": 2.9677224159240723, - "learning_rate": 4.99079949790478e-06, - "loss": 0.8069, - "step": 57 - }, - { - "epoch": 0.28292682926829266, - "grad_norm": 2.8304293155670166, - "learning_rate": 4.99046818553652e-06, - "loss": 0.8682, - "step": 58 - }, - { - "epoch": 0.28780487804878047, - "grad_norm": 5.253443717956543, - "learning_rate": 4.9901310242852246e-06, - "loss": 1.1069, - "step": 59 - }, - { - "epoch": 0.2926829268292683, - "grad_norm": 3.686016082763672, - "learning_rate": 4.9897880149427206e-06, - "loss": 0.9465, - "step": 60 - }, - { - "epoch": 0.2975609756097561, - "grad_norm": 3.6372263431549072, - "learning_rate": 4.989439158314566e-06, - "loss": 0.9738, - "step": 61 - }, - { - "epoch": 0.3024390243902439, - "grad_norm": 3.0756819248199463, - "learning_rate": 4.989084455220056e-06, - "loss": 0.6417, - "step": 62 - }, - { - "epoch": 0.3073170731707317, - "grad_norm": 3.379222869873047, - "learning_rate": 4.988723906492212e-06, - "loss": 1.0092, - "step": 63 - }, - { - "epoch": 0.3121951219512195, - "grad_norm": 3.4571032524108887, - "learning_rate": 4.988357512977785e-06, - "loss": 0.6691, - "step": 64 - }, - { - "epoch": 0.3170731707317073, - "grad_norm": 3.1982104778289795, - "learning_rate": 4.987985275537252e-06, - "loss": 0.6651, - "step": 65 - }, - { - "epoch": 0.32195121951219513, - "grad_norm": 2.9723124504089355, - "learning_rate": 4.9876071950448185e-06, - "loss": 0.9227, - "step": 66 - }, - { - "epoch": 0.32682926829268294, - "grad_norm": 2.5521399974823, - "learning_rate": 4.987223272388407e-06, - "loss": 0.6664, - "step": 67 - }, - { - "epoch": 0.33170731707317075, - "grad_norm": 2.8934121131896973, - "learning_rate": 4.986833508469663e-06, - "loss": 0.997, - "step": 68 - }, - { - "epoch": 0.33658536585365856, - "grad_norm": 4.7546586990356445, - "learning_rate": 4.98643790420395e-06, - "loss": 0.8551, - "step": 69 - }, - { - "epoch": 0.34146341463414637, - "grad_norm": 3.091616153717041, - "learning_rate": 4.986036460520348e-06, - "loss": 0.8874, - "step": 70 - }, - { - "epoch": 0.3463414634146341, - "grad_norm": 4.1724677085876465, - "learning_rate": 4.98562917836165e-06, - "loss": 1.1393, - "step": 71 - }, - { - "epoch": 0.35121951219512193, - "grad_norm": 2.6568572521209717, - "learning_rate": 4.985216058684362e-06, - "loss": 0.6379, - "step": 72 - }, - { - "epoch": 0.35609756097560974, - "grad_norm": 2.396416187286377, - "learning_rate": 4.984797102458697e-06, - "loss": 1.0292, - "step": 73 - }, - { - "epoch": 0.36097560975609755, - "grad_norm": 3.0667319297790527, - "learning_rate": 4.984372310668579e-06, - "loss": 0.7048, - "step": 74 - }, - { - "epoch": 0.36585365853658536, - "grad_norm": 2.4820518493652344, - "learning_rate": 4.983941684311633e-06, - "loss": 1.2353, - "step": 75 - }, - { - "epoch": 0.37073170731707317, - "grad_norm": 4.062836647033691, - "learning_rate": 4.983505224399188e-06, - "loss": 0.8933, - "step": 76 - }, - { - "epoch": 0.375609756097561, - "grad_norm": 2.4480767250061035, - "learning_rate": 4.983062931956275e-06, - "loss": 0.8221, - "step": 77 - }, - { - "epoch": 0.3804878048780488, - "grad_norm": 3.134138822555542, - "learning_rate": 4.9826148080216195e-06, - "loss": 0.8899, - "step": 78 - }, - { - "epoch": 0.3853658536585366, - "grad_norm": 2.8165836334228516, - "learning_rate": 4.9821608536476445e-06, - "loss": 1.2451, - "step": 79 - }, - { - "epoch": 0.3902439024390244, - "grad_norm": 3.734433650970459, - "learning_rate": 4.981701069900465e-06, - "loss": 0.8536, - "step": 80 - }, - { - "epoch": 0.3951219512195122, - "grad_norm": 2.853421449661255, - "learning_rate": 4.9812354578598876e-06, - "loss": 0.7857, - "step": 81 - }, - { - "epoch": 0.4, - "grad_norm": 2.541687250137329, - "learning_rate": 4.980764018619405e-06, - "loss": 0.8332, - "step": 82 - }, - { - "epoch": 0.40487804878048783, - "grad_norm": 4.405911445617676, - "learning_rate": 4.980286753286196e-06, - "loss": 0.9927, - "step": 83 - }, - { - "epoch": 0.4097560975609756, - "grad_norm": 3.3034985065460205, - "learning_rate": 4.97980366298112e-06, - "loss": 0.8161, - "step": 84 - }, - { - "epoch": 0.4146341463414634, - "grad_norm": 2.6678085327148438, - "learning_rate": 4.97931474883872e-06, - "loss": 0.8017, - "step": 85 - }, - { - "epoch": 0.4195121951219512, - "grad_norm": 2.58524227142334, - "learning_rate": 4.978820012007213e-06, - "loss": 0.8811, - "step": 86 - }, - { - "epoch": 0.424390243902439, - "grad_norm": 2.482597827911377, - "learning_rate": 4.978319453648495e-06, - "loss": 0.9461, - "step": 87 - }, - { - "epoch": 0.4292682926829268, - "grad_norm": 2.5731301307678223, - "learning_rate": 4.977813074938128e-06, - "loss": 0.8835, - "step": 88 - }, - { - "epoch": 0.43414634146341463, - "grad_norm": 2.7914488315582275, - "learning_rate": 4.977300877065347e-06, - "loss": 0.8466, - "step": 89 - }, - { - "epoch": 0.43902439024390244, - "grad_norm": 2.416043758392334, - "learning_rate": 4.976782861233053e-06, - "loss": 0.7132, - "step": 90 - }, - { - "epoch": 0.44390243902439025, - "grad_norm": 3.7616264820098877, - "learning_rate": 4.976259028657812e-06, - "loss": 0.7639, - "step": 91 - }, - { - "epoch": 0.44878048780487806, - "grad_norm": 2.6081621646881104, - "learning_rate": 4.975729380569845e-06, - "loss": 0.8055, - "step": 92 - }, - { - "epoch": 0.45365853658536587, - "grad_norm": 3.3343570232391357, - "learning_rate": 4.975193918213035e-06, - "loss": 0.6042, - "step": 93 - }, - { - "epoch": 0.4585365853658537, - "grad_norm": 2.517544746398926, - "learning_rate": 4.974652642844921e-06, - "loss": 0.7672, - "step": 94 - }, - { - "epoch": 0.4634146341463415, - "grad_norm": 4.173468589782715, - "learning_rate": 4.974105555736693e-06, - "loss": 1.0682, - "step": 95 - }, - { - "epoch": 0.4682926829268293, - "grad_norm": 2.8422317504882812, - "learning_rate": 4.973552658173186e-06, - "loss": 0.7841, - "step": 96 - }, - { - "epoch": 0.47317073170731705, - "grad_norm": 5.042182445526123, - "learning_rate": 4.972993951452887e-06, - "loss": 0.8851, - "step": 97 - }, - { - "epoch": 0.47804878048780486, - "grad_norm": 5.977590560913086, - "learning_rate": 4.9724294368879214e-06, - "loss": 0.9059, - "step": 98 - }, - { - "epoch": 0.48292682926829267, - "grad_norm": 4.227641582489014, - "learning_rate": 4.971859115804055e-06, - "loss": 1.0152, - "step": 99 - }, - { - "epoch": 0.4878048780487805, - "grad_norm": 3.180952548980713, - "learning_rate": 4.9712829895406935e-06, - "loss": 0.8092, - "step": 100 - }, - { - "epoch": 0.4926829268292683, - "grad_norm": 11.220394134521484, - "learning_rate": 4.970701059450872e-06, - "loss": 0.8239, - "step": 101 - }, - { - "epoch": 0.4975609756097561, - "grad_norm": 2.346975088119507, - "learning_rate": 4.970113326901258e-06, - "loss": 0.9283, - "step": 102 - }, - { - "epoch": 0.5024390243902439, - "grad_norm": 2.9470982551574707, - "learning_rate": 4.9695197932721455e-06, - "loss": 0.9429, - "step": 103 - }, - { - "epoch": 0.5073170731707317, - "grad_norm": 3.6048219203948975, - "learning_rate": 4.968920459957453e-06, - "loss": 0.9231, - "step": 104 - }, - { - "epoch": 0.5121951219512195, - "grad_norm": 2.8181886672973633, - "learning_rate": 4.968315328364719e-06, - "loss": 1.0005, - "step": 105 - }, - { - "epoch": 0.5170731707317073, - "grad_norm": 3.114147424697876, - "learning_rate": 4.9677043999151e-06, - "loss": 1.1326, - "step": 106 - }, - { - "epoch": 0.5219512195121951, - "grad_norm": 2.965885639190674, - "learning_rate": 4.967087676043366e-06, - "loss": 0.541, - "step": 107 - }, - { - "epoch": 0.526829268292683, - "grad_norm": 3.098677635192871, - "learning_rate": 4.966465158197897e-06, - "loss": 0.9473, - "step": 108 - }, - { - "epoch": 0.5317073170731708, - "grad_norm": 2.8640191555023193, - "learning_rate": 4.965836847840681e-06, - "loss": 0.6678, - "step": 109 - }, - { - "epoch": 0.5365853658536586, - "grad_norm": 3.0950934886932373, - "learning_rate": 4.96520274644731e-06, - "loss": 0.9251, - "step": 110 - }, - { - "epoch": 0.5414634146341464, - "grad_norm": 2.99444317817688, - "learning_rate": 4.964562855506976e-06, - "loss": 0.7807, - "step": 111 - }, - { - "epoch": 0.5463414634146342, - "grad_norm": 2.348639726638794, - "learning_rate": 4.963917176522466e-06, - "loss": 0.6395, - "step": 112 - }, - { - "epoch": 0.551219512195122, - "grad_norm": 3.5988354682922363, - "learning_rate": 4.963265711010164e-06, - "loss": 1.0658, - "step": 113 - }, - { - "epoch": 0.5560975609756098, - "grad_norm": 3.3423564434051514, - "learning_rate": 4.9626084605000395e-06, - "loss": 0.8974, - "step": 114 - }, - { - "epoch": 0.5609756097560976, - "grad_norm": 2.8353331089019775, - "learning_rate": 4.961945426535652e-06, - "loss": 0.6144, - "step": 115 - }, - { - "epoch": 0.5658536585365853, - "grad_norm": 2.752387046813965, - "learning_rate": 4.961276610674141e-06, - "loss": 0.9083, - "step": 116 - }, - { - "epoch": 0.5707317073170731, - "grad_norm": 2.2654404640197754, - "learning_rate": 4.960602014486225e-06, - "loss": 1.0101, - "step": 117 - }, - { - "epoch": 0.5756097560975609, - "grad_norm": 3.344377040863037, - "learning_rate": 4.959921639556199e-06, - "loss": 0.8391, - "step": 118 - }, - { - "epoch": 0.5804878048780487, - "grad_norm": 3.1620500087738037, - "learning_rate": 4.959235487481928e-06, - "loss": 1.0431, - "step": 119 - }, - { - "epoch": 0.5853658536585366, - "grad_norm": 2.857048273086548, - "learning_rate": 4.958543559874846e-06, - "loss": 0.5864, - "step": 120 - }, - { - "epoch": 0.5902439024390244, - "grad_norm": 3.1736063957214355, - "learning_rate": 4.9578458583599495e-06, - "loss": 0.7868, - "step": 121 - }, - { - "epoch": 0.5951219512195122, - "grad_norm": 3.5520827770233154, - "learning_rate": 4.957142384575795e-06, - "loss": 0.7901, - "step": 122 - }, - { - "epoch": 0.6, - "grad_norm": 3.265103578567505, - "learning_rate": 4.956433140174498e-06, - "loss": 0.9067, - "step": 123 - }, - { - "epoch": 0.6048780487804878, - "grad_norm": 3.1181187629699707, - "learning_rate": 4.9557181268217225e-06, - "loss": 0.8971, - "step": 124 - }, - { - "epoch": 0.6097560975609756, - "grad_norm": 2.4123694896698, - "learning_rate": 4.954997346196683e-06, - "loss": 1.2123, - "step": 125 - }, - { - "epoch": 0.6146341463414634, - "grad_norm": 2.9646875858306885, - "learning_rate": 4.954270799992138e-06, - "loss": 0.7696, - "step": 126 - }, - { - "epoch": 0.6195121951219512, - "grad_norm": 2.7457995414733887, - "learning_rate": 4.953538489914387e-06, - "loss": 0.7919, - "step": 127 - }, - { - "epoch": 0.624390243902439, - "grad_norm": 5.096850395202637, - "learning_rate": 4.9528004176832654e-06, - "loss": 0.6494, - "step": 128 - }, - { - "epoch": 0.6292682926829268, - "grad_norm": 3.124955177307129, - "learning_rate": 4.952056585032142e-06, - "loss": 1.0546, - "step": 129 - }, - { - "epoch": 0.6341463414634146, - "grad_norm": 2.4860167503356934, - "learning_rate": 4.951306993707913e-06, - "loss": 0.7907, - "step": 130 - }, - { - "epoch": 0.6390243902439025, - "grad_norm": 2.3380239009857178, - "learning_rate": 4.950551645470998e-06, - "loss": 0.7433, - "step": 131 - }, - { - "epoch": 0.6439024390243903, - "grad_norm": 2.8945236206054688, - "learning_rate": 4.9497905420953406e-06, - "loss": 0.7682, - "step": 132 - }, - { - "epoch": 0.6487804878048781, - "grad_norm": 3.429776430130005, - "learning_rate": 4.949023685368395e-06, - "loss": 0.8411, - "step": 133 - }, - { - "epoch": 0.6536585365853659, - "grad_norm": 2.8853516578674316, - "learning_rate": 4.948251077091131e-06, - "loss": 1.0792, - "step": 134 - }, - { - "epoch": 0.6585365853658537, - "grad_norm": 2.145598888397217, - "learning_rate": 4.947472719078025e-06, - "loss": 0.8033, - "step": 135 - }, - { - "epoch": 0.6634146341463415, - "grad_norm": 2.5064377784729004, - "learning_rate": 4.9466886131570565e-06, - "loss": 0.939, - "step": 136 - }, - { - "epoch": 0.6682926829268293, - "grad_norm": 2.5700225830078125, - "learning_rate": 4.945898761169704e-06, - "loss": 1.0418, - "step": 137 - }, - { - "epoch": 0.6731707317073171, - "grad_norm": 2.3390917778015137, - "learning_rate": 4.945103164970941e-06, - "loss": 0.6158, - "step": 138 - }, - { - "epoch": 0.6780487804878049, - "grad_norm": 2.1538751125335693, - "learning_rate": 4.9443018264292304e-06, - "loss": 0.6995, - "step": 139 - }, - { - "epoch": 0.6829268292682927, - "grad_norm": 5.255710601806641, - "learning_rate": 4.9434947474265225e-06, - "loss": 1.0382, - "step": 140 - }, - { - "epoch": 0.6878048780487804, - "grad_norm": 2.5547356605529785, - "learning_rate": 4.942681929858249e-06, - "loss": 1.037, - "step": 141 - }, - { - "epoch": 0.6926829268292682, - "grad_norm": 2.613280773162842, - "learning_rate": 4.941863375633315e-06, - "loss": 0.9071, - "step": 142 - }, - { - "epoch": 0.697560975609756, - "grad_norm": 2.9957327842712402, - "learning_rate": 4.9410390866741056e-06, - "loss": 0.7908, - "step": 143 - }, - { - "epoch": 0.7024390243902439, - "grad_norm": 2.410107374191284, - "learning_rate": 4.9402090649164655e-06, - "loss": 0.7739, - "step": 144 - }, - { - "epoch": 0.7073170731707317, - "grad_norm": 2.352013349533081, - "learning_rate": 4.9393733123097085e-06, - "loss": 0.939, - "step": 145 - }, - { - "epoch": 0.7121951219512195, - "grad_norm": 2.5164194107055664, - "learning_rate": 4.9385318308166065e-06, - "loss": 0.8729, - "step": 146 - }, - { - "epoch": 0.7170731707317073, - "grad_norm": 4.213881015777588, - "learning_rate": 4.937684622413385e-06, - "loss": 0.6124, - "step": 147 - }, - { - "epoch": 0.7219512195121951, - "grad_norm": 2.7950191497802734, - "learning_rate": 4.9368316890897185e-06, - "loss": 0.975, - "step": 148 - }, - { - "epoch": 0.7268292682926829, - "grad_norm": 2.8618874549865723, - "learning_rate": 4.9359730328487264e-06, - "loss": 0.5832, - "step": 149 - }, - { - "epoch": 0.7317073170731707, - "grad_norm": 2.6943812370300293, - "learning_rate": 4.935108655706972e-06, - "loss": 0.8124, - "step": 150 - }, - { - "epoch": 0.7365853658536585, - "grad_norm": 3.2164082527160645, - "learning_rate": 4.934238559694448e-06, - "loss": 1.1446, - "step": 151 - }, - { - "epoch": 0.7414634146341463, - "grad_norm": 3.05002498626709, - "learning_rate": 4.9333627468545845e-06, - "loss": 0.7884, - "step": 152 - }, - { - "epoch": 0.7463414634146341, - "grad_norm": 2.863351583480835, - "learning_rate": 4.932481219244231e-06, - "loss": 0.7918, - "step": 153 - }, - { - "epoch": 0.751219512195122, - "grad_norm": 2.4947102069854736, - "learning_rate": 4.931593978933666e-06, - "loss": 0.775, - "step": 154 - }, - { - "epoch": 0.7560975609756098, - "grad_norm": 2.918886184692383, - "learning_rate": 4.930701028006577e-06, - "loss": 0.993, - "step": 155 - }, - { - "epoch": 0.7609756097560976, - "grad_norm": 2.835956573486328, - "learning_rate": 4.929802368560066e-06, - "loss": 0.7911, - "step": 156 - }, - { - "epoch": 0.7658536585365854, - "grad_norm": 3.3073575496673584, - "learning_rate": 4.928898002704642e-06, - "loss": 0.9346, - "step": 157 - }, - { - "epoch": 0.7707317073170732, - "grad_norm": 3.086146354675293, - "learning_rate": 4.927987932564215e-06, - "loss": 0.817, - "step": 158 - }, - { - "epoch": 0.775609756097561, - "grad_norm": 2.5419743061065674, - "learning_rate": 4.927072160276092e-06, - "loss": 0.7918, - "step": 159 - }, - { - "epoch": 0.7804878048780488, - "grad_norm": 3.984297275543213, - "learning_rate": 4.926150687990969e-06, - "loss": 0.7153, - "step": 160 - }, - { - "epoch": 0.7853658536585366, - "grad_norm": 2.4703335762023926, - "learning_rate": 4.925223517872934e-06, - "loss": 0.8982, - "step": 161 - }, - { - "epoch": 0.7902439024390244, - "grad_norm": 2.81785249710083, - "learning_rate": 4.9242906520994484e-06, - "loss": 0.9839, - "step": 162 - }, - { - "epoch": 0.7951219512195122, - "grad_norm": 2.3304924964904785, - "learning_rate": 4.923352092861358e-06, - "loss": 0.8406, - "step": 163 - }, - { - "epoch": 0.8, - "grad_norm": 2.339498519897461, - "learning_rate": 4.922407842362875e-06, - "loss": 0.6602, - "step": 164 - }, - { - "epoch": 0.8048780487804879, - "grad_norm": 3.488255262374878, - "learning_rate": 4.921457902821578e-06, - "loss": 0.9779, - "step": 165 - }, - { - "epoch": 0.8097560975609757, - "grad_norm": 2.8528945446014404, - "learning_rate": 4.920502276468408e-06, - "loss": 0.8821, - "step": 166 - }, - { - "epoch": 0.8146341463414634, - "grad_norm": 3.4649784564971924, - "learning_rate": 4.9195409655476605e-06, - "loss": 0.7539, - "step": 167 - }, - { - "epoch": 0.8195121951219512, - "grad_norm": 2.3109042644500732, - "learning_rate": 4.918573972316982e-06, - "loss": 0.9807, - "step": 168 - }, - { - "epoch": 0.824390243902439, - "grad_norm": 2.678666353225708, - "learning_rate": 4.917601299047361e-06, - "loss": 0.8318, - "step": 169 - }, - { - "epoch": 0.8292682926829268, - "grad_norm": 2.730614185333252, - "learning_rate": 4.916622948023129e-06, - "loss": 0.7816, - "step": 170 - }, - { - "epoch": 0.8341463414634146, - "grad_norm": 2.9835665225982666, - "learning_rate": 4.915638921541952e-06, - "loss": 0.6633, - "step": 171 - }, - { - "epoch": 0.8390243902439024, - "grad_norm": 3.31217360496521, - "learning_rate": 4.914649221914822e-06, - "loss": 0.9296, - "step": 172 - }, - { - "epoch": 0.8439024390243902, - "grad_norm": 2.9021658897399902, - "learning_rate": 4.913653851466057e-06, - "loss": 0.6864, - "step": 173 - }, - { - "epoch": 0.848780487804878, - "grad_norm": 3.3672914505004883, - "learning_rate": 4.912652812533291e-06, - "loss": 0.8599, - "step": 174 - }, - { - "epoch": 0.8536585365853658, - "grad_norm": 2.4871644973754883, - "learning_rate": 4.911646107467472e-06, - "loss": 0.8949, - "step": 175 - }, - { - "epoch": 0.8585365853658536, - "grad_norm": 2.728022813796997, - "learning_rate": 4.9106337386328524e-06, - "loss": 0.9758, - "step": 176 - }, - { - "epoch": 0.8634146341463415, - "grad_norm": 2.704252243041992, - "learning_rate": 4.909615708406991e-06, - "loss": 0.8954, - "step": 177 - }, - { - "epoch": 0.8682926829268293, - "grad_norm": 2.4002223014831543, - "learning_rate": 4.908592019180738e-06, - "loss": 0.7157, - "step": 178 - }, - { - "epoch": 0.8731707317073171, - "grad_norm": 2.1927788257598877, - "learning_rate": 4.907562673358234e-06, - "loss": 0.6358, - "step": 179 - }, - { - "epoch": 0.8780487804878049, - "grad_norm": 2.458500623703003, - "learning_rate": 4.906527673356907e-06, - "loss": 0.6685, - "step": 180 - }, - { - "epoch": 0.8829268292682927, - "grad_norm": 2.5924787521362305, - "learning_rate": 4.905487021607462e-06, - "loss": 0.5686, - "step": 181 - }, - { - "epoch": 0.8878048780487805, - "grad_norm": 3.0923380851745605, - "learning_rate": 4.904440720553876e-06, - "loss": 0.8538, - "step": 182 - }, - { - "epoch": 0.8926829268292683, - "grad_norm": 2.8001527786254883, - "learning_rate": 4.903388772653396e-06, - "loss": 0.8292, - "step": 183 - }, - { - "epoch": 0.8975609756097561, - "grad_norm": 2.4344072341918945, - "learning_rate": 4.902331180376529e-06, - "loss": 0.7946, - "step": 184 - }, - { - "epoch": 0.9024390243902439, - "grad_norm": 2.6313226222991943, - "learning_rate": 4.901267946207038e-06, - "loss": 0.9269, - "step": 185 - }, - { - "epoch": 0.9073170731707317, - "grad_norm": 2.4776692390441895, - "learning_rate": 4.900199072641937e-06, - "loss": 0.7433, - "step": 186 - }, - { - "epoch": 0.9121951219512195, - "grad_norm": 2.339869260787964, - "learning_rate": 4.899124562191484e-06, - "loss": 0.6577, - "step": 187 - }, - { - "epoch": 0.9170731707317074, - "grad_norm": 3.076890468597412, - "learning_rate": 4.8980444173791735e-06, - "loss": 0.5989, - "step": 188 - }, - { - "epoch": 0.9219512195121952, - "grad_norm": 2.83957839012146, - "learning_rate": 4.896958640741735e-06, - "loss": 0.9364, - "step": 189 - }, - { - "epoch": 0.926829268292683, - "grad_norm": 2.770867347717285, - "learning_rate": 4.895867234829121e-06, - "loss": 1.0328, - "step": 190 - }, - { - "epoch": 0.9317073170731708, - "grad_norm": 2.7819619178771973, - "learning_rate": 4.894770202204509e-06, - "loss": 0.772, - "step": 191 - }, - { - "epoch": 0.9365853658536586, - "grad_norm": 3.925703763961792, - "learning_rate": 4.893667545444285e-06, - "loss": 0.8128, - "step": 192 - }, - { - "epoch": 0.9414634146341463, - "grad_norm": 3.034944534301758, - "learning_rate": 4.8925592671380495e-06, - "loss": 0.7418, - "step": 193 - }, - { - "epoch": 0.9463414634146341, - "grad_norm": 2.3350143432617188, - "learning_rate": 4.891445369888601e-06, - "loss": 0.5979, - "step": 194 - }, - { - "epoch": 0.9512195121951219, - "grad_norm": 2.6433160305023193, - "learning_rate": 4.890325856311936e-06, - "loss": 0.9664, - "step": 195 - }, - { - "epoch": 0.9560975609756097, - "grad_norm": 2.715142011642456, - "learning_rate": 4.889200729037241e-06, - "loss": 0.8482, - "step": 196 - }, - { - "epoch": 0.9609756097560975, - "grad_norm": 2.6157352924346924, - "learning_rate": 4.888069990706884e-06, - "loss": 0.7173, - "step": 197 - }, - { - "epoch": 0.9658536585365853, - "grad_norm": 3.7308952808380127, - "learning_rate": 4.886933643976414e-06, - "loss": 0.5433, - "step": 198 - }, - { - "epoch": 0.9707317073170731, - "grad_norm": 3.1134045124053955, - "learning_rate": 4.885791691514548e-06, - "loss": 0.5997, - "step": 199 - }, - { - "epoch": 0.975609756097561, - "grad_norm": 2.421365976333618, - "learning_rate": 4.884644136003172e-06, - "loss": 0.6477, - "step": 200 - }, - { - "epoch": 0.9804878048780488, - "grad_norm": 2.8676180839538574, - "learning_rate": 4.883490980137327e-06, - "loss": 1.3465, - "step": 201 - }, - { - "epoch": 0.9853658536585366, - "grad_norm": 2.236189603805542, - "learning_rate": 4.882332226625208e-06, - "loss": 0.7533, - "step": 202 - }, - { - "epoch": 0.9902439024390244, - "grad_norm": 2.2514970302581787, - "learning_rate": 4.881167878188158e-06, - "loss": 0.8555, - "step": 203 - }, - { - "epoch": 0.9951219512195122, - "grad_norm": 2.6856095790863037, - "learning_rate": 4.8799979375606565e-06, - "loss": 0.7634, - "step": 204 - }, - { - "epoch": 1.0, - "grad_norm": 2.5563852787017822, - "learning_rate": 4.878822407490319e-06, - "loss": 0.66, - "step": 205 - }, - { - "epoch": 1.0048780487804878, - "grad_norm": 4.7092814445495605, - "learning_rate": 4.8776412907378845e-06, - "loss": 0.7429, - "step": 206 - }, - { - "epoch": 1.0097560975609756, - "grad_norm": 2.9133448600769043, - "learning_rate": 4.876454590077216e-06, - "loss": 0.5735, - "step": 207 - }, - { - "epoch": 1.0146341463414634, - "grad_norm": 2.7012641429901123, - "learning_rate": 4.875262308295289e-06, - "loss": 0.8065, - "step": 208 - }, - { - "epoch": 1.0195121951219512, - "grad_norm": 3.703998327255249, - "learning_rate": 4.874064448192185e-06, - "loss": 0.7148, - "step": 209 - }, - { - "epoch": 1.024390243902439, - "grad_norm": 3.044930934906006, - "learning_rate": 4.872861012581088e-06, - "loss": 0.5606, - "step": 210 - }, - { - "epoch": 1.0292682926829269, - "grad_norm": 3.661381244659424, - "learning_rate": 4.871652004288275e-06, - "loss": 0.6492, - "step": 211 - }, - { - "epoch": 1.0341463414634147, - "grad_norm": 3.18344783782959, - "learning_rate": 4.870437426153113e-06, - "loss": 0.633, - "step": 212 - }, - { - "epoch": 1.0390243902439025, - "grad_norm": 4.596707820892334, - "learning_rate": 4.869217281028045e-06, - "loss": 0.842, - "step": 213 - }, - { - "epoch": 1.0439024390243903, - "grad_norm": 4.116331577301025, - "learning_rate": 4.867991571778592e-06, - "loss": 0.8371, - "step": 214 - }, - { - "epoch": 1.048780487804878, - "grad_norm": 3.152939558029175, - "learning_rate": 4.866760301283342e-06, - "loss": 0.4728, - "step": 215 - }, - { - "epoch": 1.053658536585366, - "grad_norm": 2.8732805252075195, - "learning_rate": 4.865523472433942e-06, - "loss": 0.651, - "step": 216 - }, - { - "epoch": 1.0585365853658537, - "grad_norm": 2.967480421066284, - "learning_rate": 4.8642810881350935e-06, - "loss": 0.6361, - "step": 217 - }, - { - "epoch": 1.0634146341463415, - "grad_norm": 2.816798210144043, - "learning_rate": 4.863033151304546e-06, - "loss": 0.6206, - "step": 218 - }, - { - "epoch": 1.0682926829268293, - "grad_norm": 3.168349027633667, - "learning_rate": 4.861779664873088e-06, - "loss": 0.7782, - "step": 219 - }, - { - "epoch": 1.0731707317073171, - "grad_norm": 3.7496471405029297, - "learning_rate": 4.8605206317845425e-06, - "loss": 0.8504, - "step": 220 - }, - { - "epoch": 1.078048780487805, - "grad_norm": 2.7087056636810303, - "learning_rate": 4.859256054995758e-06, - "loss": 0.7771, - "step": 221 - }, - { - "epoch": 1.0829268292682928, - "grad_norm": 2.803703546524048, - "learning_rate": 4.8579859374766e-06, - "loss": 0.4308, - "step": 222 - }, - { - "epoch": 1.0878048780487806, - "grad_norm": 2.4199142456054688, - "learning_rate": 4.856710282209952e-06, - "loss": 0.3739, - "step": 223 - }, - { - "epoch": 1.0926829268292684, - "grad_norm": 2.384037494659424, - "learning_rate": 4.855429092191698e-06, - "loss": 0.6548, - "step": 224 - }, - { - "epoch": 1.0975609756097562, - "grad_norm": 3.0230021476745605, - "learning_rate": 4.854142370430725e-06, - "loss": 0.6932, - "step": 225 - }, - { - "epoch": 1.102439024390244, - "grad_norm": 3.0248661041259766, - "learning_rate": 4.8528501199489045e-06, - "loss": 0.6491, - "step": 226 - }, - { - "epoch": 1.1073170731707318, - "grad_norm": 4.046666145324707, - "learning_rate": 4.851552343781099e-06, - "loss": 0.7946, - "step": 227 - }, - { - "epoch": 1.1121951219512196, - "grad_norm": 2.8751168251037598, - "learning_rate": 4.850249044975145e-06, - "loss": 0.7629, - "step": 228 - }, - { - "epoch": 1.1170731707317074, - "grad_norm": 2.8649816513061523, - "learning_rate": 4.848940226591849e-06, - "loss": 0.9114, - "step": 229 - }, - { - "epoch": 1.1219512195121952, - "grad_norm": 3.2590744495391846, - "learning_rate": 4.847625891704982e-06, - "loss": 0.535, - "step": 230 - }, - { - "epoch": 1.126829268292683, - "grad_norm": 3.230659008026123, - "learning_rate": 4.846306043401268e-06, - "loss": 0.7134, - "step": 231 - }, - { - "epoch": 1.1317073170731708, - "grad_norm": 3.5220088958740234, - "learning_rate": 4.844980684780381e-06, - "loss": 0.5375, - "step": 232 - }, - { - "epoch": 1.1365853658536587, - "grad_norm": 3.074052095413208, - "learning_rate": 4.8436498189549345e-06, - "loss": 0.5486, - "step": 233 - }, - { - "epoch": 1.1414634146341462, - "grad_norm": 2.511216163635254, - "learning_rate": 4.842313449050477e-06, - "loss": 0.5203, - "step": 234 - }, - { - "epoch": 1.146341463414634, - "grad_norm": 2.6082136631011963, - "learning_rate": 4.840971578205486e-06, - "loss": 0.4978, - "step": 235 - }, - { - "epoch": 1.1512195121951219, - "grad_norm": 2.4481778144836426, - "learning_rate": 4.839624209571352e-06, - "loss": 0.348, - "step": 236 - }, - { - "epoch": 1.1560975609756097, - "grad_norm": 2.7532148361206055, - "learning_rate": 4.838271346312381e-06, - "loss": 0.8068, - "step": 237 - }, - { - "epoch": 1.1609756097560975, - "grad_norm": 2.6562349796295166, - "learning_rate": 4.836912991605782e-06, - "loss": 0.8823, - "step": 238 - }, - { - "epoch": 1.1658536585365853, - "grad_norm": 3.032168388366699, - "learning_rate": 4.835549148641663e-06, - "loss": 0.501, - "step": 239 - }, - { - "epoch": 1.170731707317073, - "grad_norm": 3.4816956520080566, - "learning_rate": 4.834179820623018e-06, - "loss": 0.6406, - "step": 240 - }, - { - "epoch": 1.175609756097561, - "grad_norm": 2.480642318725586, - "learning_rate": 4.832805010765724e-06, - "loss": 0.537, - "step": 241 - }, - { - "epoch": 1.1804878048780487, - "grad_norm": 2.7662222385406494, - "learning_rate": 4.831424722298531e-06, - "loss": 0.6464, - "step": 242 - }, - { - "epoch": 1.1853658536585365, - "grad_norm": 3.2929866313934326, - "learning_rate": 4.830038958463061e-06, - "loss": 0.6888, - "step": 243 - }, - { - "epoch": 1.1902439024390243, - "grad_norm": 5.094089031219482, - "learning_rate": 4.828647722513785e-06, - "loss": 0.8342, - "step": 244 - }, - { - "epoch": 1.1951219512195121, - "grad_norm": 3.6679818630218506, - "learning_rate": 4.827251017718034e-06, - "loss": 0.7849, - "step": 245 - }, - { - "epoch": 1.2, - "grad_norm": 3.97290301322937, - "learning_rate": 4.8258488473559794e-06, - "loss": 0.7995, - "step": 246 - }, - { - "epoch": 1.2048780487804878, - "grad_norm": 3.3555023670196533, - "learning_rate": 4.824441214720629e-06, - "loss": 0.8718, - "step": 247 - }, - { - "epoch": 1.2097560975609756, - "grad_norm": 2.309361219406128, - "learning_rate": 4.823028123117818e-06, - "loss": 0.3731, - "step": 248 - }, - { - "epoch": 1.2146341463414634, - "grad_norm": 2.607269763946533, - "learning_rate": 4.8216095758662015e-06, - "loss": 0.7321, - "step": 249 - }, - { - "epoch": 1.2195121951219512, - "grad_norm": 2.5667428970336914, - "learning_rate": 4.82018557629725e-06, - "loss": 0.7561, - "step": 250 - }, - { - "epoch": 1.224390243902439, - "grad_norm": 2.7664871215820312, - "learning_rate": 4.8187561277552376e-06, - "loss": 0.638, - "step": 251 - }, - { - "epoch": 1.2292682926829268, - "grad_norm": 2.2880401611328125, - "learning_rate": 4.817321233597232e-06, - "loss": 0.6996, - "step": 252 - }, - { - "epoch": 1.2341463414634146, - "grad_norm": 2.7615559101104736, - "learning_rate": 4.815880897193095e-06, - "loss": 0.5432, - "step": 253 - }, - { - "epoch": 1.2390243902439024, - "grad_norm": 2.9052155017852783, - "learning_rate": 4.814435121925466e-06, - "loss": 0.781, - "step": 254 - }, - { - "epoch": 1.2439024390243902, - "grad_norm": 3.2035205364227295, - "learning_rate": 4.812983911189761e-06, - "loss": 0.6884, - "step": 255 - }, - { - "epoch": 1.248780487804878, - "grad_norm": 2.8139917850494385, - "learning_rate": 4.811527268394157e-06, - "loss": 0.4984, - "step": 256 - }, - { - "epoch": 1.2536585365853659, - "grad_norm": 2.849602699279785, - "learning_rate": 4.810065196959591e-06, - "loss": 0.553, - "step": 257 - }, - { - "epoch": 1.2585365853658537, - "grad_norm": 2.8745057582855225, - "learning_rate": 4.8085977003197496e-06, - "loss": 0.7955, - "step": 258 - }, - { - "epoch": 1.2634146341463415, - "grad_norm": 3.4053122997283936, - "learning_rate": 4.807124781921059e-06, - "loss": 0.9715, - "step": 259 - }, - { - "epoch": 1.2682926829268293, - "grad_norm": 3.1741702556610107, - "learning_rate": 4.805646445222679e-06, - "loss": 0.6306, - "step": 260 - }, - { - "epoch": 1.273170731707317, - "grad_norm": 2.5348331928253174, - "learning_rate": 4.804162693696494e-06, - "loss": 0.5192, - "step": 261 - }, - { - "epoch": 1.278048780487805, - "grad_norm": 3.2491304874420166, - "learning_rate": 4.802673530827105e-06, - "loss": 0.5369, - "step": 262 - }, - { - "epoch": 1.2829268292682927, - "grad_norm": 2.670273780822754, - "learning_rate": 4.801178960111823e-06, - "loss": 0.5864, - "step": 263 - }, - { - "epoch": 1.2878048780487805, - "grad_norm": 2.5655579566955566, - "learning_rate": 4.799678985060658e-06, - "loss": 0.7864, - "step": 264 - }, - { - "epoch": 1.2926829268292683, - "grad_norm": 2.6352531909942627, - "learning_rate": 4.798173609196314e-06, - "loss": 0.8198, - "step": 265 - }, - { - "epoch": 1.2975609756097561, - "grad_norm": 3.028343677520752, - "learning_rate": 4.796662836054176e-06, - "loss": 0.4621, - "step": 266 - }, - { - "epoch": 1.302439024390244, - "grad_norm": 2.757690191268921, - "learning_rate": 4.795146669182304e-06, - "loss": 0.6237, - "step": 267 - }, - { - "epoch": 1.3073170731707318, - "grad_norm": 2.564842462539673, - "learning_rate": 4.793625112141431e-06, - "loss": 0.4981, - "step": 268 - }, - { - "epoch": 1.3121951219512196, - "grad_norm": 2.69234299659729, - "learning_rate": 4.792098168504943e-06, - "loss": 0.5384, - "step": 269 - }, - { - "epoch": 1.3170731707317074, - "grad_norm": 2.794144868850708, - "learning_rate": 4.790565841858879e-06, - "loss": 0.5535, - "step": 270 - }, - { - "epoch": 1.3219512195121952, - "grad_norm": 2.850296974182129, - "learning_rate": 4.789028135801919e-06, - "loss": 0.7492, - "step": 271 - }, - { - "epoch": 1.326829268292683, - "grad_norm": 3.287806987762451, - "learning_rate": 4.787485053945377e-06, - "loss": 0.8367, - "step": 272 - }, - { - "epoch": 1.3317073170731708, - "grad_norm": 2.479343891143799, - "learning_rate": 4.785936599913193e-06, - "loss": 0.6875, - "step": 273 - }, - { - "epoch": 1.3365853658536586, - "grad_norm": 3.171198844909668, - "learning_rate": 4.784382777341922e-06, - "loss": 0.733, - "step": 274 - }, - { - "epoch": 1.3414634146341464, - "grad_norm": 2.866610050201416, - "learning_rate": 4.782823589880729e-06, - "loss": 0.9719, - "step": 275 - }, - { - "epoch": 1.346341463414634, - "grad_norm": 2.3714404106140137, - "learning_rate": 4.7812590411913755e-06, - "loss": 0.6979, - "step": 276 - }, - { - "epoch": 1.3512195121951218, - "grad_norm": 2.3838706016540527, - "learning_rate": 4.779689134948217e-06, - "loss": 0.9697, - "step": 277 - }, - { - "epoch": 1.3560975609756096, - "grad_norm": 3.2992005348205566, - "learning_rate": 4.77811387483819e-06, - "loss": 0.4799, - "step": 278 - }, - { - "epoch": 1.3609756097560974, - "grad_norm": 3.403024435043335, - "learning_rate": 4.776533264560804e-06, - "loss": 0.7478, - "step": 279 - }, - { - "epoch": 1.3658536585365852, - "grad_norm": 2.669820785522461, - "learning_rate": 4.774947307828134e-06, - "loss": 0.8622, - "step": 280 - }, - { - "epoch": 1.370731707317073, - "grad_norm": 2.4695041179656982, - "learning_rate": 4.773356008364812e-06, - "loss": 0.5792, - "step": 281 - }, - { - "epoch": 1.3756097560975609, - "grad_norm": 3.1744325160980225, - "learning_rate": 4.771759369908017e-06, - "loss": 0.4368, - "step": 282 - }, - { - "epoch": 1.3804878048780487, - "grad_norm": 2.8564929962158203, - "learning_rate": 4.7701573962074635e-06, - "loss": 0.6337, - "step": 283 - }, - { - "epoch": 1.3853658536585365, - "grad_norm": 2.4109890460968018, - "learning_rate": 4.7685500910254015e-06, - "loss": 0.5042, - "step": 284 - }, - { - "epoch": 1.3902439024390243, - "grad_norm": 2.389765977859497, - "learning_rate": 4.766937458136598e-06, - "loss": 0.7427, - "step": 285 - }, - { - "epoch": 1.395121951219512, - "grad_norm": 2.412153720855713, - "learning_rate": 4.765319501328332e-06, - "loss": 0.6956, - "step": 286 - }, - { - "epoch": 1.4, - "grad_norm": 2.6756227016448975, - "learning_rate": 4.763696224400391e-06, - "loss": 0.5152, - "step": 287 - }, - { - "epoch": 1.4048780487804877, - "grad_norm": 2.4644389152526855, - "learning_rate": 4.762067631165049e-06, - "loss": 0.5583, - "step": 288 - }, - { - "epoch": 1.4097560975609755, - "grad_norm": 2.6496896743774414, - "learning_rate": 4.760433725447071e-06, - "loss": 0.6824, - "step": 289 - }, - { - "epoch": 1.4146341463414633, - "grad_norm": 2.9843268394470215, - "learning_rate": 4.758794511083697e-06, - "loss": 0.7914, - "step": 290 - }, - { - "epoch": 1.4195121951219511, - "grad_norm": 3.639101266860962, - "learning_rate": 4.757149991924633e-06, - "loss": 0.6827, - "step": 291 - }, - { - "epoch": 1.424390243902439, - "grad_norm": 3.2047319412231445, - "learning_rate": 4.755500171832045e-06, - "loss": 0.5908, - "step": 292 - }, - { - "epoch": 1.4292682926829268, - "grad_norm": 2.463202953338623, - "learning_rate": 4.753845054680548e-06, - "loss": 0.6469, - "step": 293 - }, - { - "epoch": 1.4341463414634146, - "grad_norm": 2.711195945739746, - "learning_rate": 4.752184644357197e-06, - "loss": 0.5412, - "step": 294 - }, - { - "epoch": 1.4390243902439024, - "grad_norm": 2.239082098007202, - "learning_rate": 4.750518944761477e-06, - "loss": 0.5324, - "step": 295 - }, - { - "epoch": 1.4439024390243902, - "grad_norm": 2.711050271987915, - "learning_rate": 4.748847959805297e-06, - "loss": 0.5317, - "step": 296 - }, - { - "epoch": 1.448780487804878, - "grad_norm": 2.4389946460723877, - "learning_rate": 4.7471716934129774e-06, - "loss": 0.5199, - "step": 297 - }, - { - "epoch": 1.4536585365853658, - "grad_norm": 2.6532390117645264, - "learning_rate": 4.745490149521242e-06, - "loss": 0.4874, - "step": 298 - }, - { - "epoch": 1.4585365853658536, - "grad_norm": 2.2970616817474365, - "learning_rate": 4.743803332079209e-06, - "loss": 0.5416, - "step": 299 - }, - { - "epoch": 1.4634146341463414, - "grad_norm": 2.4206762313842773, - "learning_rate": 4.742111245048382e-06, - "loss": 0.5628, - "step": 300 - }, - { - "epoch": 1.4682926829268292, - "grad_norm": 2.7086844444274902, - "learning_rate": 4.740413892402639e-06, - "loss": 0.5847, - "step": 301 - }, - { - "epoch": 1.473170731707317, - "grad_norm": 2.848602771759033, - "learning_rate": 4.738711278128228e-06, - "loss": 0.5889, - "step": 302 - }, - { - "epoch": 1.4780487804878049, - "grad_norm": 3.5257909297943115, - "learning_rate": 4.7370034062237476e-06, - "loss": 0.3917, - "step": 303 - }, - { - "epoch": 1.4829268292682927, - "grad_norm": 6.47664213180542, - "learning_rate": 4.73529028070015e-06, - "loss": 0.5592, - "step": 304 - }, - { - "epoch": 1.4878048780487805, - "grad_norm": 2.8833930492401123, - "learning_rate": 4.733571905580723e-06, - "loss": 0.843, - "step": 305 - }, - { - "epoch": 1.4926829268292683, - "grad_norm": 2.9924156665802, - "learning_rate": 4.731848284901082e-06, - "loss": 0.7041, - "step": 306 - }, - { - "epoch": 1.497560975609756, - "grad_norm": 2.9858405590057373, - "learning_rate": 4.730119422709165e-06, - "loss": 0.4914, - "step": 307 - }, - { - "epoch": 1.502439024390244, - "grad_norm": 3.4032366275787354, - "learning_rate": 4.728385323065215e-06, - "loss": 0.644, - "step": 308 - }, - { - "epoch": 1.5073170731707317, - "grad_norm": 2.86360502243042, - "learning_rate": 4.7266459900417815e-06, - "loss": 0.5335, - "step": 309 - }, - { - "epoch": 1.5121951219512195, - "grad_norm": 3.183012008666992, - "learning_rate": 4.724901427723698e-06, - "loss": 0.8275, - "step": 310 - }, - { - "epoch": 1.5170731707317073, - "grad_norm": 3.4128706455230713, - "learning_rate": 4.723151640208084e-06, - "loss": 0.4091, - "step": 311 - }, - { - "epoch": 1.5219512195121951, - "grad_norm": 2.765897512435913, - "learning_rate": 4.721396631604327e-06, - "loss": 0.4414, - "step": 312 - }, - { - "epoch": 1.526829268292683, - "grad_norm": 3.2348268032073975, - "learning_rate": 4.7196364060340785e-06, - "loss": 0.5423, - "step": 313 - }, - { - "epoch": 1.5317073170731708, - "grad_norm": 2.7270045280456543, - "learning_rate": 4.7178709676312416e-06, - "loss": 0.8072, - "step": 314 - }, - { - "epoch": 1.5365853658536586, - "grad_norm": 2.525298833847046, - "learning_rate": 4.716100320541961e-06, - "loss": 1.0254, - "step": 315 - }, - { - "epoch": 1.5414634146341464, - "grad_norm": 2.371321678161621, - "learning_rate": 4.714324468924614e-06, - "loss": 0.6541, - "step": 316 - }, - { - "epoch": 1.5463414634146342, - "grad_norm": 3.0820438861846924, - "learning_rate": 4.712543416949803e-06, - "loss": 0.7519, - "step": 317 - }, - { - "epoch": 1.551219512195122, - "grad_norm": 2.710369348526001, - "learning_rate": 4.71075716880034e-06, - "loss": 0.7232, - "step": 318 - }, - { - "epoch": 1.5560975609756098, - "grad_norm": 2.4568352699279785, - "learning_rate": 4.708965728671243e-06, - "loss": 0.8059, - "step": 319 - }, - { - "epoch": 1.5609756097560976, - "grad_norm": 2.7511191368103027, - "learning_rate": 4.7071691007697214e-06, - "loss": 0.6579, - "step": 320 - }, - { - "epoch": 1.5658536585365854, - "grad_norm": 2.6519858837127686, - "learning_rate": 4.705367289315172e-06, - "loss": 0.6989, - "step": 321 - }, - { - "epoch": 1.5707317073170732, - "grad_norm": 2.763019323348999, - "learning_rate": 4.703560298539158e-06, - "loss": 0.4916, - "step": 322 - }, - { - "epoch": 1.575609756097561, - "grad_norm": 2.6480252742767334, - "learning_rate": 4.701748132685415e-06, - "loss": 0.5076, - "step": 323 - }, - { - "epoch": 1.5804878048780489, - "grad_norm": 2.4289543628692627, - "learning_rate": 4.699930796009825e-06, - "loss": 0.559, - "step": 324 - }, - { - "epoch": 1.5853658536585367, - "grad_norm": 4.0515899658203125, - "learning_rate": 4.698108292780418e-06, - "loss": 0.7388, - "step": 325 - }, - { - "epoch": 1.5902439024390245, - "grad_norm": 2.5959129333496094, - "learning_rate": 4.696280627277356e-06, - "loss": 0.5469, - "step": 326 - }, - { - "epoch": 1.5951219512195123, - "grad_norm": 2.3453526496887207, - "learning_rate": 4.6944478037929255e-06, - "loss": 0.5494, - "step": 327 - }, - { - "epoch": 1.6, - "grad_norm": 3.7527170181274414, - "learning_rate": 4.692609826631525e-06, - "loss": 0.7536, - "step": 328 - }, - { - "epoch": 1.604878048780488, - "grad_norm": 3.423588275909424, - "learning_rate": 4.690766700109659e-06, - "loss": 0.4586, - "step": 329 - }, - { - "epoch": 1.6097560975609757, - "grad_norm": 2.620429754257202, - "learning_rate": 4.6889184285559234e-06, - "loss": 0.4799, - "step": 330 - }, - { - "epoch": 1.6146341463414635, - "grad_norm": 6.416718006134033, - "learning_rate": 4.687065016310996e-06, - "loss": 0.7502, - "step": 331 - }, - { - "epoch": 1.6195121951219513, - "grad_norm": 2.7324717044830322, - "learning_rate": 4.685206467727631e-06, - "loss": 0.5923, - "step": 332 - }, - { - "epoch": 1.6243902439024391, - "grad_norm": 2.582935333251953, - "learning_rate": 4.683342787170644e-06, - "loss": 0.5619, - "step": 333 - }, - { - "epoch": 1.629268292682927, - "grad_norm": 2.8339877128601074, - "learning_rate": 4.6814739790169006e-06, - "loss": 0.55, - "step": 334 - }, - { - "epoch": 1.6341463414634148, - "grad_norm": 2.733982563018799, - "learning_rate": 4.679600047655313e-06, - "loss": 0.7243, - "step": 335 - }, - { - "epoch": 1.6390243902439026, - "grad_norm": 3.192747116088867, - "learning_rate": 4.6777209974868194e-06, - "loss": 1.132, - "step": 336 - }, - { - "epoch": 1.6439024390243904, - "grad_norm": 2.5185582637786865, - "learning_rate": 4.675836832924387e-06, - "loss": 0.55, - "step": 337 - }, - { - "epoch": 1.6487804878048782, - "grad_norm": 2.7306225299835205, - "learning_rate": 4.673947558392989e-06, - "loss": 0.4418, - "step": 338 - }, - { - "epoch": 1.653658536585366, - "grad_norm": 2.7026166915893555, - "learning_rate": 4.6720531783296e-06, - "loss": 0.5897, - "step": 339 - }, - { - "epoch": 1.6585365853658538, - "grad_norm": 2.5981674194335938, - "learning_rate": 4.670153697183185e-06, - "loss": 0.5889, - "step": 340 - }, - { - "epoch": 1.6634146341463416, - "grad_norm": 3.0985405445098877, - "learning_rate": 4.668249119414692e-06, - "loss": 0.5607, - "step": 341 - }, - { - "epoch": 1.6682926829268294, - "grad_norm": 2.7609124183654785, - "learning_rate": 4.666339449497033e-06, - "loss": 0.6284, - "step": 342 - }, - { - "epoch": 1.6731707317073172, - "grad_norm": 3.186077356338501, - "learning_rate": 4.664424691915084e-06, - "loss": 0.5751, - "step": 343 - }, - { - "epoch": 1.678048780487805, - "grad_norm": 3.644227981567383, - "learning_rate": 4.6625048511656675e-06, - "loss": 0.586, - "step": 344 - }, - { - "epoch": 1.6829268292682928, - "grad_norm": 3.196373462677002, - "learning_rate": 4.660579931757543e-06, - "loss": 0.5086, - "step": 345 - }, - { - "epoch": 1.6878048780487804, - "grad_norm": 2.7773900032043457, - "learning_rate": 4.6586499382113985e-06, - "loss": 0.5934, - "step": 346 - }, - { - "epoch": 1.6926829268292682, - "grad_norm": 2.3397631645202637, - "learning_rate": 4.6567148750598375e-06, - "loss": 0.7654, - "step": 347 - }, - { - "epoch": 1.697560975609756, - "grad_norm": 2.5567805767059326, - "learning_rate": 4.6547747468473705e-06, - "loss": 0.8908, - "step": 348 - }, - { - "epoch": 1.7024390243902439, - "grad_norm": 2.9218900203704834, - "learning_rate": 4.652829558130404e-06, - "loss": 0.4383, - "step": 349 - }, - { - "epoch": 1.7073170731707317, - "grad_norm": 2.962965250015259, - "learning_rate": 4.6508793134772265e-06, - "loss": 0.6031, - "step": 350 - }, - { - "epoch": 1.7121951219512195, - "grad_norm": 2.487739324569702, - "learning_rate": 4.648924017468003e-06, - "loss": 0.533, - "step": 351 - }, - { - "epoch": 1.7170731707317073, - "grad_norm": 2.769474506378174, - "learning_rate": 4.646963674694761e-06, - "loss": 0.8125, - "step": 352 - }, - { - "epoch": 1.721951219512195, - "grad_norm": 2.678243398666382, - "learning_rate": 4.64499828976138e-06, - "loss": 0.386, - "step": 353 - }, - { - "epoch": 1.726829268292683, - "grad_norm": 3.2764477729797363, - "learning_rate": 4.64302786728358e-06, - "loss": 0.4792, - "step": 354 - }, - { - "epoch": 1.7317073170731707, - "grad_norm": 2.6092708110809326, - "learning_rate": 4.641052411888913e-06, - "loss": 0.5031, - "step": 355 - }, - { - "epoch": 1.7365853658536585, - "grad_norm": 3.4002952575683594, - "learning_rate": 4.6390719282167515e-06, - "loss": 0.4726, - "step": 356 - }, - { - "epoch": 1.7414634146341463, - "grad_norm": 2.7558157444000244, - "learning_rate": 4.637086420918276e-06, - "loss": 0.7794, - "step": 357 - }, - { - "epoch": 1.7463414634146341, - "grad_norm": 2.239021062850952, - "learning_rate": 4.635095894656465e-06, - "loss": 0.6202, - "step": 358 - }, - { - "epoch": 1.751219512195122, - "grad_norm": 2.0502119064331055, - "learning_rate": 4.633100354106085e-06, - "loss": 0.3743, - "step": 359 - }, - { - "epoch": 1.7560975609756098, - "grad_norm": 2.842203140258789, - "learning_rate": 4.631099803953677e-06, - "loss": 0.8143, - "step": 360 - }, - { - "epoch": 1.7609756097560976, - "grad_norm": 2.8408772945404053, - "learning_rate": 4.629094248897546e-06, - "loss": 0.4986, - "step": 361 - }, - { - "epoch": 1.7658536585365854, - "grad_norm": 2.755530595779419, - "learning_rate": 4.627083693647757e-06, - "loss": 0.5833, - "step": 362 - }, - { - "epoch": 1.7707317073170732, - "grad_norm": 2.717116355895996, - "learning_rate": 4.625068142926111e-06, - "loss": 0.885, - "step": 363 - }, - { - "epoch": 1.775609756097561, - "grad_norm": 2.2784435749053955, - "learning_rate": 4.623047601466144e-06, - "loss": 0.7351, - "step": 364 - }, - { - "epoch": 1.7804878048780488, - "grad_norm": 2.3133914470672607, - "learning_rate": 4.621022074013114e-06, - "loss": 0.6426, - "step": 365 - }, - { - "epoch": 1.7853658536585366, - "grad_norm": 3.13562273979187, - "learning_rate": 4.618991565323987e-06, - "loss": 0.5588, - "step": 366 - }, - { - "epoch": 1.7902439024390244, - "grad_norm": 2.458186388015747, - "learning_rate": 4.616956080167426e-06, - "loss": 0.5424, - "step": 367 - }, - { - "epoch": 1.7951219512195122, - "grad_norm": 2.4780080318450928, - "learning_rate": 4.614915623323786e-06, - "loss": 0.8664, - "step": 368 - }, - { - "epoch": 1.8, - "grad_norm": 2.623966932296753, - "learning_rate": 4.612870199585092e-06, - "loss": 0.4495, - "step": 369 - }, - { - "epoch": 1.8048780487804879, - "grad_norm": 2.7326242923736572, - "learning_rate": 4.610819813755038e-06, - "loss": 0.5099, - "step": 370 - }, - { - "epoch": 1.8097560975609757, - "grad_norm": 2.951014757156372, - "learning_rate": 4.608764470648971e-06, - "loss": 0.4322, - "step": 371 - }, - { - "epoch": 1.8146341463414632, - "grad_norm": 2.869870185852051, - "learning_rate": 4.606704175093879e-06, - "loss": 0.4744, - "step": 372 - }, - { - "epoch": 1.819512195121951, - "grad_norm": 2.686054229736328, - "learning_rate": 4.604638931928383e-06, - "loss": 0.797, - "step": 373 - }, - { - "epoch": 1.8243902439024389, - "grad_norm": 2.6421749591827393, - "learning_rate": 4.602568746002718e-06, - "loss": 0.4904, - "step": 374 - }, - { - "epoch": 1.8292682926829267, - "grad_norm": 2.949144124984741, - "learning_rate": 4.600493622178734e-06, - "loss": 0.8682, - "step": 375 - }, - { - "epoch": 1.8341463414634145, - "grad_norm": 2.554733991622925, - "learning_rate": 4.598413565329876e-06, - "loss": 0.5426, - "step": 376 - }, - { - "epoch": 1.8390243902439023, - "grad_norm": 2.3334367275238037, - "learning_rate": 4.596328580341169e-06, - "loss": 0.5628, - "step": 377 - }, - { - "epoch": 1.84390243902439, - "grad_norm": 2.577664613723755, - "learning_rate": 4.5942386721092195e-06, - "loss": 0.7073, - "step": 378 - }, - { - "epoch": 1.848780487804878, - "grad_norm": 3.1247141361236572, - "learning_rate": 4.592143845542189e-06, - "loss": 0.6526, - "step": 379 - }, - { - "epoch": 1.8536585365853657, - "grad_norm": 2.7015256881713867, - "learning_rate": 4.590044105559797e-06, - "loss": 0.8377, - "step": 380 - }, - { - "epoch": 1.8585365853658535, - "grad_norm": 2.573819398880005, - "learning_rate": 4.587939457093296e-06, - "loss": 0.5485, - "step": 381 - }, - { - "epoch": 1.8634146341463413, - "grad_norm": 2.8607687950134277, - "learning_rate": 4.585829905085468e-06, - "loss": 0.6065, - "step": 382 - }, - { - "epoch": 1.8682926829268292, - "grad_norm": 2.526625394821167, - "learning_rate": 4.5837154544906135e-06, - "loss": 0.7812, - "step": 383 - }, - { - "epoch": 1.873170731707317, - "grad_norm": 2.4161314964294434, - "learning_rate": 4.581596110274535e-06, - "loss": 0.7061, - "step": 384 - }, - { - "epoch": 1.8780487804878048, - "grad_norm": 2.34195876121521, - "learning_rate": 4.579471877414527e-06, - "loss": 0.9446, - "step": 385 - }, - { - "epoch": 1.8829268292682926, - "grad_norm": 3.7710156440734863, - "learning_rate": 4.577342760899368e-06, - "loss": 0.78, - "step": 386 - }, - { - "epoch": 1.8878048780487804, - "grad_norm": 2.5192313194274902, - "learning_rate": 4.575208765729302e-06, - "loss": 0.5205, - "step": 387 - }, - { - "epoch": 1.8926829268292682, - "grad_norm": 2.467484951019287, - "learning_rate": 4.573069896916035e-06, - "loss": 0.7827, - "step": 388 - }, - { - "epoch": 1.897560975609756, - "grad_norm": 2.640676259994507, - "learning_rate": 4.5709261594827125e-06, - "loss": 0.6512, - "step": 389 - }, - { - "epoch": 1.9024390243902438, - "grad_norm": 2.976623296737671, - "learning_rate": 4.568777558463922e-06, - "loss": 0.5548, - "step": 390 - }, - { - "epoch": 1.9073170731707316, - "grad_norm": 2.289722442626953, - "learning_rate": 4.566624098905665e-06, - "loss": 0.7038, - "step": 391 - }, - { - "epoch": 1.9121951219512194, - "grad_norm": 2.9512040615081787, - "learning_rate": 4.564465785865359e-06, - "loss": 0.5416, - "step": 392 - }, - { - "epoch": 1.9170731707317072, - "grad_norm": 2.394874095916748, - "learning_rate": 4.56230262441182e-06, - "loss": 0.4068, - "step": 393 - }, - { - "epoch": 1.921951219512195, - "grad_norm": 6.885486602783203, - "learning_rate": 4.560134619625247e-06, - "loss": 0.6197, - "step": 394 - }, - { - "epoch": 1.9268292682926829, - "grad_norm": 2.311272144317627, - "learning_rate": 4.5579617765972155e-06, - "loss": 0.5692, - "step": 395 - }, - { - "epoch": 1.9317073170731707, - "grad_norm": 2.4662933349609375, - "learning_rate": 4.555784100430662e-06, - "loss": 0.4836, - "step": 396 - }, - { - "epoch": 1.9365853658536585, - "grad_norm": 2.602741241455078, - "learning_rate": 4.553601596239877e-06, - "loss": 0.4594, - "step": 397 - }, - { - "epoch": 1.9414634146341463, - "grad_norm": 3.443909168243408, - "learning_rate": 4.551414269150489e-06, - "loss": 0.6053, - "step": 398 - }, - { - "epoch": 1.946341463414634, - "grad_norm": 2.5391502380371094, - "learning_rate": 4.54922212429945e-06, - "loss": 0.5133, - "step": 399 - }, - { - "epoch": 1.951219512195122, - "grad_norm": 2.7105700969696045, - "learning_rate": 4.547025166835027e-06, - "loss": 0.6984, - "step": 400 - }, - { - "epoch": 1.9560975609756097, - "grad_norm": 2.6098098754882812, - "learning_rate": 4.544823401916794e-06, - "loss": 0.7944, - "step": 401 - }, - { - "epoch": 1.9609756097560975, - "grad_norm": 2.7527425289154053, - "learning_rate": 4.542616834715612e-06, - "loss": 0.639, - "step": 402 - }, - { - "epoch": 1.9658536585365853, - "grad_norm": 2.760303258895874, - "learning_rate": 4.540405470413618e-06, - "loss": 0.4229, - "step": 403 - }, - { - "epoch": 1.9707317073170731, - "grad_norm": 2.4989006519317627, - "learning_rate": 4.53818931420422e-06, - "loss": 0.7482, - "step": 404 - }, - { - "epoch": 1.975609756097561, - "grad_norm": 2.3687169551849365, - "learning_rate": 4.535968371292076e-06, - "loss": 0.6146, - "step": 405 - }, - { - "epoch": 1.9804878048780488, - "grad_norm": 2.4285244941711426, - "learning_rate": 4.533742646893086e-06, - "loss": 0.6964, - "step": 406 - }, - { - "epoch": 1.9853658536585366, - "grad_norm": 2.337266206741333, - "learning_rate": 4.531512146234383e-06, - "loss": 0.6248, - "step": 407 - }, - { - "epoch": 1.9902439024390244, - "grad_norm": 2.704972743988037, - "learning_rate": 4.529276874554312e-06, - "loss": 0.8715, - "step": 408 - }, - { - "epoch": 1.9951219512195122, - "grad_norm": 2.2151944637298584, - "learning_rate": 4.527036837102426e-06, - "loss": 0.4945, - "step": 409 - }, - { - "epoch": 2.0, - "grad_norm": 2.691330671310425, - "learning_rate": 4.524792039139471e-06, - "loss": 0.7085, - "step": 410 - }, - { - "epoch": 2.004878048780488, - "grad_norm": 2.9423086643218994, - "learning_rate": 4.522542485937369e-06, - "loss": 0.3178, - "step": 411 - }, - { - "epoch": 2.0097560975609756, - "grad_norm": 2.860677719116211, - "learning_rate": 4.520288182779214e-06, - "loss": 0.5092, - "step": 412 - }, - { - "epoch": 2.0146341463414634, - "grad_norm": 2.7503843307495117, - "learning_rate": 4.518029134959253e-06, - "loss": 0.314, - "step": 413 - }, - { - "epoch": 2.0195121951219512, - "grad_norm": 4.541809558868408, - "learning_rate": 4.515765347782878e-06, - "loss": 0.5287, - "step": 414 - }, - { - "epoch": 2.024390243902439, - "grad_norm": 9.126826286315918, - "learning_rate": 4.5134968265666085e-06, - "loss": 0.8221, - "step": 415 - }, - { - "epoch": 2.029268292682927, - "grad_norm": 4.4358229637146, - "learning_rate": 4.511223576638084e-06, - "loss": 0.5402, - "step": 416 - }, - { - "epoch": 2.0341463414634147, - "grad_norm": 3.1090731620788574, - "learning_rate": 4.508945603336049e-06, - "loss": 0.617, - "step": 417 - }, - { - "epoch": 2.0390243902439025, - "grad_norm": 2.6933369636535645, - "learning_rate": 4.50666291201034e-06, - "loss": 0.3541, - "step": 418 - }, - { - "epoch": 2.0439024390243903, - "grad_norm": 5.898099899291992, - "learning_rate": 4.504375508021876e-06, - "loss": 0.4842, - "step": 419 - }, - { - "epoch": 2.048780487804878, - "grad_norm": 2.950939178466797, - "learning_rate": 4.50208339674264e-06, - "loss": 0.6168, - "step": 420 - }, - { - "epoch": 2.053658536585366, - "grad_norm": 3.2513322830200195, - "learning_rate": 4.499786583555675e-06, - "loss": 0.6425, - "step": 421 - }, - { - "epoch": 2.0585365853658537, - "grad_norm": 2.911562442779541, - "learning_rate": 4.497485073855061e-06, - "loss": 0.364, - "step": 422 - }, - { - "epoch": 2.0634146341463415, - "grad_norm": 4.2179274559021, - "learning_rate": 4.495178873045913e-06, - "loss": 0.3687, - "step": 423 - }, - { - "epoch": 2.0682926829268293, - "grad_norm": 3.2010395526885986, - "learning_rate": 4.4928679865443605e-06, - "loss": 0.4068, - "step": 424 - }, - { - "epoch": 2.073170731707317, - "grad_norm": 3.2425589561462402, - "learning_rate": 4.4905524197775366e-06, - "loss": 0.4759, - "step": 425 - }, - { - "epoch": 2.078048780487805, - "grad_norm": 2.9252519607543945, - "learning_rate": 4.4882321781835666e-06, - "loss": 0.4197, - "step": 426 - }, - { - "epoch": 2.0829268292682928, - "grad_norm": 2.7859911918640137, - "learning_rate": 4.4859072672115565e-06, - "loss": 0.2294, - "step": 427 - }, - { - "epoch": 2.0878048780487806, - "grad_norm": 3.138796091079712, - "learning_rate": 4.483577692321577e-06, - "loss": 0.7572, - "step": 428 - }, - { - "epoch": 2.0926829268292684, - "grad_norm": 3.1447339057922363, - "learning_rate": 4.481243458984651e-06, - "loss": 0.4035, - "step": 429 - }, - { - "epoch": 2.097560975609756, - "grad_norm": 3.1876862049102783, - "learning_rate": 4.478904572682743e-06, - "loss": 0.5776, - "step": 430 - }, - { - "epoch": 2.102439024390244, - "grad_norm": 2.934257745742798, - "learning_rate": 4.476561038908745e-06, - "loss": 0.4005, - "step": 431 - }, - { - "epoch": 2.107317073170732, - "grad_norm": 2.904954433441162, - "learning_rate": 4.474212863166464e-06, - "loss": 0.5689, - "step": 432 - }, - { - "epoch": 2.1121951219512196, - "grad_norm": 3.6023731231689453, - "learning_rate": 4.471860050970608e-06, - "loss": 0.5068, - "step": 433 - }, - { - "epoch": 2.1170731707317074, - "grad_norm": 4.073422431945801, - "learning_rate": 4.469502607846774e-06, - "loss": 0.8349, - "step": 434 - }, - { - "epoch": 2.1219512195121952, - "grad_norm": 2.813789129257202, - "learning_rate": 4.467140539331434e-06, - "loss": 0.3641, - "step": 435 - }, - { - "epoch": 2.126829268292683, - "grad_norm": 3.874516248703003, - "learning_rate": 4.464773850971924e-06, - "loss": 0.222, - "step": 436 - }, - { - "epoch": 2.131707317073171, - "grad_norm": 3.1221084594726562, - "learning_rate": 4.46240254832643e-06, - "loss": 0.3799, - "step": 437 - }, - { - "epoch": 2.1365853658536587, - "grad_norm": 3.298933267593384, - "learning_rate": 4.460026636963971e-06, - "loss": 0.4759, - "step": 438 - }, - { - "epoch": 2.1414634146341465, - "grad_norm": 2.456233024597168, - "learning_rate": 4.4576461224643965e-06, - "loss": 0.384, - "step": 439 - }, - { - "epoch": 2.1463414634146343, - "grad_norm": 2.8427460193634033, - "learning_rate": 4.455261010418359e-06, - "loss": 0.391, - "step": 440 - }, - { - "epoch": 2.151219512195122, - "grad_norm": 3.0267624855041504, - "learning_rate": 4.452871306427314e-06, - "loss": 0.6177, - "step": 441 - }, - { - "epoch": 2.15609756097561, - "grad_norm": 3.437302827835083, - "learning_rate": 4.450477016103498e-06, - "loss": 0.5143, - "step": 442 - }, - { - "epoch": 2.1609756097560977, - "grad_norm": 3.152210235595703, - "learning_rate": 4.4480781450699205e-06, - "loss": 0.3783, - "step": 443 - }, - { - "epoch": 2.1658536585365855, - "grad_norm": 3.507753372192383, - "learning_rate": 4.4456746989603464e-06, - "loss": 0.3574, - "step": 444 - }, - { - "epoch": 2.1707317073170733, - "grad_norm": 2.8855366706848145, - "learning_rate": 4.443266683419289e-06, - "loss": 0.5088, - "step": 445 - }, - { - "epoch": 2.175609756097561, - "grad_norm": 2.7776072025299072, - "learning_rate": 4.440854104101988e-06, - "loss": 0.3773, - "step": 446 - }, - { - "epoch": 2.180487804878049, - "grad_norm": 3.019484281539917, - "learning_rate": 4.438436966674406e-06, - "loss": 0.5002, - "step": 447 - }, - { - "epoch": 2.1853658536585368, - "grad_norm": 3.6962451934814453, - "learning_rate": 4.436015276813208e-06, - "loss": 0.4601, - "step": 448 - }, - { - "epoch": 2.1902439024390246, - "grad_norm": 3.1288888454437256, - "learning_rate": 4.4335890402057505e-06, - "loss": 0.5422, - "step": 449 - }, - { - "epoch": 2.1951219512195124, - "grad_norm": 3.7083234786987305, - "learning_rate": 4.431158262550067e-06, - "loss": 0.4684, - "step": 450 - }, - { - "epoch": 2.2, - "grad_norm": 3.1714789867401123, - "learning_rate": 4.428722949554858e-06, - "loss": 0.2528, - "step": 451 - }, - { - "epoch": 2.204878048780488, - "grad_norm": 3.0773637294769287, - "learning_rate": 4.426283106939474e-06, - "loss": 0.4061, - "step": 452 - }, - { - "epoch": 2.209756097560976, - "grad_norm": 2.604093551635742, - "learning_rate": 4.423838740433903e-06, - "loss": 0.4779, - "step": 453 - }, - { - "epoch": 2.2146341463414636, - "grad_norm": 2.9293880462646484, - "learning_rate": 4.4213898557787586e-06, - "loss": 0.233, - "step": 454 - }, - { - "epoch": 2.2195121951219514, - "grad_norm": 2.9195125102996826, - "learning_rate": 4.4189364587252636e-06, - "loss": 0.7756, - "step": 455 - }, - { - "epoch": 2.2243902439024392, - "grad_norm": 3.2263920307159424, - "learning_rate": 4.416478555035241e-06, - "loss": 0.2806, - "step": 456 - }, - { - "epoch": 2.229268292682927, - "grad_norm": 2.8109211921691895, - "learning_rate": 4.4140161504810935e-06, - "loss": 0.3923, - "step": 457 - }, - { - "epoch": 2.234146341463415, - "grad_norm": 2.645853281021118, - "learning_rate": 4.4115492508457986e-06, - "loss": 0.289, - "step": 458 - }, - { - "epoch": 2.2390243902439027, - "grad_norm": 3.3712451457977295, - "learning_rate": 4.409077861922887e-06, - "loss": 0.5053, - "step": 459 - }, - { - "epoch": 2.2439024390243905, - "grad_norm": 2.6892387866973877, - "learning_rate": 4.406601989516435e-06, - "loss": 0.3363, - "step": 460 - }, - { - "epoch": 2.2487804878048783, - "grad_norm": 2.3195693492889404, - "learning_rate": 4.404121639441047e-06, - "loss": 0.2367, - "step": 461 - }, - { - "epoch": 2.253658536585366, - "grad_norm": 3.0115339756011963, - "learning_rate": 4.401636817521843e-06, - "loss": 0.4942, - "step": 462 - }, - { - "epoch": 2.258536585365854, - "grad_norm": 2.9528865814208984, - "learning_rate": 4.399147529594447e-06, - "loss": 0.3328, - "step": 463 - }, - { - "epoch": 2.2634146341463417, - "grad_norm": 3.110799551010132, - "learning_rate": 4.3966537815049686e-06, - "loss": 0.3917, - "step": 464 - }, - { - "epoch": 2.2682926829268295, - "grad_norm": 3.2973792552948, - "learning_rate": 4.394155579109994e-06, - "loss": 0.5203, - "step": 465 - }, - { - "epoch": 2.2731707317073173, - "grad_norm": 4.7184038162231445, - "learning_rate": 4.391652928276572e-06, - "loss": 0.729, - "step": 466 - }, - { - "epoch": 2.278048780487805, - "grad_norm": 3.1992053985595703, - "learning_rate": 4.389145834882195e-06, - "loss": 0.4822, - "step": 467 - }, - { - "epoch": 2.2829268292682925, - "grad_norm": 4.320055961608887, - "learning_rate": 4.386634304814789e-06, - "loss": 0.3962, - "step": 468 - }, - { - "epoch": 2.2878048780487803, - "grad_norm": 3.704524517059326, - "learning_rate": 4.384118343972704e-06, - "loss": 0.5996, - "step": 469 - }, - { - "epoch": 2.292682926829268, - "grad_norm": 2.8172974586486816, - "learning_rate": 4.381597958264692e-06, - "loss": 0.6328, - "step": 470 - }, - { - "epoch": 2.297560975609756, - "grad_norm": 2.7418763637542725, - "learning_rate": 4.379073153609896e-06, - "loss": 0.6254, - "step": 471 - }, - { - "epoch": 2.3024390243902437, - "grad_norm": 5.364504337310791, - "learning_rate": 4.37654393593784e-06, - "loss": 0.6793, - "step": 472 - }, - { - "epoch": 2.3073170731707315, - "grad_norm": 2.935291290283203, - "learning_rate": 4.3740103111884096e-06, - "loss": 0.4161, - "step": 473 - }, - { - "epoch": 2.3121951219512193, - "grad_norm": 3.085155963897705, - "learning_rate": 4.371472285311842e-06, - "loss": 0.3329, - "step": 474 - }, - { - "epoch": 2.317073170731707, - "grad_norm": 2.2218778133392334, - "learning_rate": 4.368929864268709e-06, - "loss": 0.2687, - "step": 475 - }, - { - "epoch": 2.321951219512195, - "grad_norm": 3.3985276222229004, - "learning_rate": 4.366383054029907e-06, - "loss": 0.5934, - "step": 476 - }, - { - "epoch": 2.3268292682926828, - "grad_norm": 3.0726048946380615, - "learning_rate": 4.363831860576638e-06, - "loss": 0.5033, - "step": 477 - }, - { - "epoch": 2.3317073170731706, - "grad_norm": 2.728628635406494, - "learning_rate": 4.361276289900396e-06, - "loss": 0.4492, - "step": 478 - }, - { - "epoch": 2.3365853658536584, - "grad_norm": 3.1294424533843994, - "learning_rate": 4.358716348002962e-06, - "loss": 0.619, - "step": 479 - }, - { - "epoch": 2.341463414634146, - "grad_norm": 3.5564961433410645, - "learning_rate": 4.356152040896376e-06, - "loss": 0.4018, - "step": 480 - }, - { - "epoch": 2.346341463414634, - "grad_norm": 2.9329910278320312, - "learning_rate": 4.3535833746029335e-06, - "loss": 0.3062, - "step": 481 - }, - { - "epoch": 2.351219512195122, - "grad_norm": 3.744480848312378, - "learning_rate": 4.351010355155165e-06, - "loss": 0.3387, - "step": 482 - }, - { - "epoch": 2.3560975609756096, - "grad_norm": 2.537912130355835, - "learning_rate": 4.348432988595828e-06, - "loss": 0.3103, - "step": 483 - }, - { - "epoch": 2.3609756097560974, - "grad_norm": 3.232128858566284, - "learning_rate": 4.345851280977885e-06, - "loss": 0.6782, - "step": 484 - }, - { - "epoch": 2.3658536585365852, - "grad_norm": 3.601463794708252, - "learning_rate": 4.343265238364496e-06, - "loss": 0.3195, - "step": 485 - }, - { - "epoch": 2.370731707317073, - "grad_norm": 4.05529260635376, - "learning_rate": 4.340674866829001e-06, - "loss": 0.4639, - "step": 486 - }, - { - "epoch": 2.375609756097561, - "grad_norm": 4.128161430358887, - "learning_rate": 4.338080172454908e-06, - "loss": 0.7229, - "step": 487 - }, - { - "epoch": 2.3804878048780487, - "grad_norm": 2.665430784225464, - "learning_rate": 4.335481161335875e-06, - "loss": 0.4334, - "step": 488 - }, - { - "epoch": 2.3853658536585365, - "grad_norm": 3.777899742126465, - "learning_rate": 4.332877839575699e-06, - "loss": 0.3409, - "step": 489 - }, - { - "epoch": 2.3902439024390243, - "grad_norm": 2.9942116737365723, - "learning_rate": 4.330270213288301e-06, - "loss": 0.5221, - "step": 490 - }, - { - "epoch": 2.395121951219512, - "grad_norm": 3.518601417541504, - "learning_rate": 4.32765828859771e-06, - "loss": 0.7078, - "step": 491 - }, - { - "epoch": 2.4, - "grad_norm": 3.452350378036499, - "learning_rate": 4.325042071638051e-06, - "loss": 0.5902, - "step": 492 - }, - { - "epoch": 2.4048780487804877, - "grad_norm": 3.072655200958252, - "learning_rate": 4.322421568553529e-06, - "loss": 0.3746, - "step": 493 - }, - { - "epoch": 2.4097560975609755, - "grad_norm": 2.8621394634246826, - "learning_rate": 4.319796785498416e-06, - "loss": 0.3474, - "step": 494 - }, - { - "epoch": 2.4146341463414633, - "grad_norm": 3.3891537189483643, - "learning_rate": 4.317167728637032e-06, - "loss": 0.5171, - "step": 495 - }, - { - "epoch": 2.419512195121951, - "grad_norm": 2.505720376968384, - "learning_rate": 4.314534404143738e-06, - "loss": 0.4263, - "step": 496 - }, - { - "epoch": 2.424390243902439, - "grad_norm": 2.6280455589294434, - "learning_rate": 4.3118968182029155e-06, - "loss": 0.5072, - "step": 497 - }, - { - "epoch": 2.4292682926829268, - "grad_norm": 2.703711748123169, - "learning_rate": 4.3092549770089566e-06, - "loss": 0.2742, - "step": 498 - }, - { - "epoch": 2.4341463414634146, - "grad_norm": 3.0358169078826904, - "learning_rate": 4.306608886766243e-06, - "loss": 0.4814, - "step": 499 - }, - { - "epoch": 2.4390243902439024, - "grad_norm": 3.263326406478882, - "learning_rate": 4.303958553689137e-06, - "loss": 0.4188, - "step": 500 - }, - { - "epoch": 2.44390243902439, - "grad_norm": 2.833951950073242, - "learning_rate": 4.3013039840019675e-06, - "loss": 0.6436, - "step": 501 - }, - { - "epoch": 2.448780487804878, - "grad_norm": 3.6790921688079834, - "learning_rate": 4.2986451839390105e-06, - "loss": 0.2862, - "step": 502 - }, - { - "epoch": 2.453658536585366, - "grad_norm": 2.7376418113708496, - "learning_rate": 4.295982159744476e-06, - "loss": 0.4926, - "step": 503 - }, - { - "epoch": 2.4585365853658536, - "grad_norm": 3.575244665145874, - "learning_rate": 4.293314917672498e-06, - "loss": 0.5717, - "step": 504 - }, - { - "epoch": 2.4634146341463414, - "grad_norm": 2.8722269535064697, - "learning_rate": 4.290643463987114e-06, - "loss": 0.2707, - "step": 505 - }, - { - "epoch": 2.4682926829268292, - "grad_norm": 2.8118090629577637, - "learning_rate": 4.287967804962252e-06, - "loss": 0.347, - "step": 506 - }, - { - "epoch": 2.473170731707317, - "grad_norm": 3.345698356628418, - "learning_rate": 4.285287946881718e-06, - "loss": 0.2103, - "step": 507 - }, - { - "epoch": 2.478048780487805, - "grad_norm": 3.0156590938568115, - "learning_rate": 4.282603896039178e-06, - "loss": 0.6405, - "step": 508 - }, - { - "epoch": 2.4829268292682927, - "grad_norm": 3.102205753326416, - "learning_rate": 4.279915658738145e-06, - "loss": 0.4027, - "step": 509 - }, - { - "epoch": 2.4878048780487805, - "grad_norm": 2.8665261268615723, - "learning_rate": 4.277223241291966e-06, - "loss": 0.6503, - "step": 510 - }, - { - "epoch": 2.4926829268292683, - "grad_norm": 2.5396728515625, - "learning_rate": 4.274526650023801e-06, - "loss": 0.5006, - "step": 511 - }, - { - "epoch": 2.497560975609756, - "grad_norm": 3.4846577644348145, - "learning_rate": 4.271825891266617e-06, - "loss": 0.479, - "step": 512 - }, - { - "epoch": 2.502439024390244, - "grad_norm": 4.5995612144470215, - "learning_rate": 4.269120971363164e-06, - "loss": 0.6667, - "step": 513 - }, - { - "epoch": 2.5073170731707317, - "grad_norm": 3.2117559909820557, - "learning_rate": 4.266411896665967e-06, - "loss": 0.2977, - "step": 514 - }, - { - "epoch": 2.5121951219512195, - "grad_norm": 2.798161268234253, - "learning_rate": 4.263698673537309e-06, - "loss": 0.3912, - "step": 515 - }, - { - "epoch": 2.5170731707317073, - "grad_norm": 3.593287944793701, - "learning_rate": 4.260981308349214e-06, - "loss": 0.615, - "step": 516 - }, - { - "epoch": 2.521951219512195, - "grad_norm": 3.06075119972229, - "learning_rate": 4.258259807483434e-06, - "loss": 0.4559, - "step": 517 - }, - { - "epoch": 2.526829268292683, - "grad_norm": 2.893202543258667, - "learning_rate": 4.255534177331435e-06, - "loss": 0.4993, - "step": 518 - }, - { - "epoch": 2.5317073170731708, - "grad_norm": 3.613308906555176, - "learning_rate": 4.252804424294378e-06, - "loss": 0.4581, - "step": 519 - }, - { - "epoch": 2.5365853658536586, - "grad_norm": 3.1191842555999756, - "learning_rate": 4.25007055478311e-06, - "loss": 0.5403, - "step": 520 - }, - { - "epoch": 2.5414634146341464, - "grad_norm": 3.653355836868286, - "learning_rate": 4.247332575218144e-06, - "loss": 0.3658, - "step": 521 - }, - { - "epoch": 2.546341463414634, - "grad_norm": 3.1386306285858154, - "learning_rate": 4.244590492029643e-06, - "loss": 0.6342, - "step": 522 - }, - { - "epoch": 2.551219512195122, - "grad_norm": 3.0894742012023926, - "learning_rate": 4.241844311657411e-06, - "loss": 0.3411, - "step": 523 - }, - { - "epoch": 2.55609756097561, - "grad_norm": 3.205916404724121, - "learning_rate": 4.239094040550875e-06, - "loss": 0.2829, - "step": 524 - }, - { - "epoch": 2.5609756097560976, - "grad_norm": 2.378857374191284, - "learning_rate": 4.236339685169065e-06, - "loss": 0.4749, - "step": 525 - }, - { - "epoch": 2.5658536585365854, - "grad_norm": 3.8657875061035156, - "learning_rate": 4.233581251980604e-06, - "loss": 0.2485, - "step": 526 - }, - { - "epoch": 2.5707317073170732, - "grad_norm": 3.565807580947876, - "learning_rate": 4.230818747463696e-06, - "loss": 0.4488, - "step": 527 - }, - { - "epoch": 2.575609756097561, - "grad_norm": 2.6909685134887695, - "learning_rate": 4.228052178106101e-06, - "loss": 0.4495, - "step": 528 - }, - { - "epoch": 2.580487804878049, - "grad_norm": 2.937680244445801, - "learning_rate": 4.2252815504051285e-06, - "loss": 0.2396, - "step": 529 - }, - { - "epoch": 2.5853658536585367, - "grad_norm": 5.55731201171875, - "learning_rate": 4.222506870867618e-06, - "loss": 0.6784, - "step": 530 - }, - { - "epoch": 2.5902439024390245, - "grad_norm": 2.7388782501220703, - "learning_rate": 4.2197281460099245e-06, - "loss": 0.5543, - "step": 531 - }, - { - "epoch": 2.5951219512195123, - "grad_norm": 3.311134099960327, - "learning_rate": 4.216945382357905e-06, - "loss": 0.5281, - "step": 532 - }, - { - "epoch": 2.6, - "grad_norm": 3.511232376098633, - "learning_rate": 4.214158586446901e-06, - "loss": 0.8019, - "step": 533 - }, - { - "epoch": 2.604878048780488, - "grad_norm": 4.416641712188721, - "learning_rate": 4.211367764821722e-06, - "loss": 0.7769, - "step": 534 - }, - { - "epoch": 2.6097560975609757, - "grad_norm": 2.9849908351898193, - "learning_rate": 4.208572924036634e-06, - "loss": 0.4077, - "step": 535 - }, - { - "epoch": 2.6146341463414635, - "grad_norm": 2.8512160778045654, - "learning_rate": 4.2057740706553415e-06, - "loss": 0.433, - "step": 536 - }, - { - "epoch": 2.6195121951219513, - "grad_norm": 2.6729629039764404, - "learning_rate": 4.202971211250971e-06, - "loss": 0.5957, - "step": 537 - }, - { - "epoch": 2.624390243902439, - "grad_norm": 2.4570281505584717, - "learning_rate": 4.200164352406061e-06, - "loss": 0.3013, - "step": 538 - }, - { - "epoch": 2.629268292682927, - "grad_norm": 3.3771679401397705, - "learning_rate": 4.197353500712539e-06, - "loss": 0.5646, - "step": 539 - }, - { - "epoch": 2.6341463414634148, - "grad_norm": 3.163496494293213, - "learning_rate": 4.1945386627717115e-06, - "loss": 0.4529, - "step": 540 - }, - { - "epoch": 2.6390243902439026, - "grad_norm": 8.32056713104248, - "learning_rate": 4.191719845194246e-06, - "loss": 0.6076, - "step": 541 - }, - { - "epoch": 2.6439024390243904, - "grad_norm": 2.7657363414764404, - "learning_rate": 4.188897054600156e-06, - "loss": 0.4855, - "step": 542 - }, - { - "epoch": 2.648780487804878, - "grad_norm": 3.299283504486084, - "learning_rate": 4.186070297618787e-06, - "loss": 0.5836, - "step": 543 - }, - { - "epoch": 2.653658536585366, - "grad_norm": 2.3928205966949463, - "learning_rate": 4.183239580888799e-06, - "loss": 0.6266, - "step": 544 - }, - { - "epoch": 2.658536585365854, - "grad_norm": 3.395251750946045, - "learning_rate": 4.18040491105815e-06, - "loss": 0.429, - "step": 545 - }, - { - "epoch": 2.6634146341463416, - "grad_norm": 2.690936803817749, - "learning_rate": 4.177566294784085e-06, - "loss": 0.391, - "step": 546 - }, - { - "epoch": 2.6682926829268294, - "grad_norm": 3.7687628269195557, - "learning_rate": 4.174723738733114e-06, - "loss": 0.6548, - "step": 547 - }, - { - "epoch": 2.6731707317073172, - "grad_norm": 2.7884976863861084, - "learning_rate": 4.171877249581001e-06, - "loss": 0.5188, - "step": 548 - }, - { - "epoch": 2.678048780487805, - "grad_norm": 3.0811641216278076, - "learning_rate": 4.169026834012748e-06, - "loss": 0.3494, - "step": 549 - }, - { - "epoch": 2.682926829268293, - "grad_norm": 3.090078592300415, - "learning_rate": 4.166172498722577e-06, - "loss": 0.3621, - "step": 550 - }, - { - "epoch": 2.68780487804878, - "grad_norm": 3.925424098968506, - "learning_rate": 4.163314250413913e-06, - "loss": 0.7187, - "step": 551 - }, - { - "epoch": 2.692682926829268, - "grad_norm": 3.3590312004089355, - "learning_rate": 4.160452095799378e-06, - "loss": 0.428, - "step": 552 - }, - { - "epoch": 2.697560975609756, - "grad_norm": 3.08093523979187, - "learning_rate": 4.157586041600759e-06, - "loss": 0.202, - "step": 553 - }, - { - "epoch": 2.7024390243902436, - "grad_norm": 2.9391448497772217, - "learning_rate": 4.154716094549008e-06, - "loss": 0.5238, - "step": 554 - }, - { - "epoch": 2.7073170731707314, - "grad_norm": 2.9869461059570312, - "learning_rate": 4.151842261384217e-06, - "loss": 0.3073, - "step": 555 - }, - { - "epoch": 2.7121951219512193, - "grad_norm": 3.8973608016967773, - "learning_rate": 4.148964548855603e-06, - "loss": 0.8435, - "step": 556 - }, - { - "epoch": 2.717073170731707, - "grad_norm": 2.3596479892730713, - "learning_rate": 4.146082963721496e-06, - "loss": 0.2562, - "step": 557 - }, - { - "epoch": 2.721951219512195, - "grad_norm": 3.4964873790740967, - "learning_rate": 4.143197512749322e-06, - "loss": 1.0144, - "step": 558 - }, - { - "epoch": 2.7268292682926827, - "grad_norm": 2.8925280570983887, - "learning_rate": 4.140308202715581e-06, - "loss": 0.7581, - "step": 559 - }, - { - "epoch": 2.7317073170731705, - "grad_norm": 2.622724771499634, - "learning_rate": 4.13741504040584e-06, - "loss": 0.3114, - "step": 560 - }, - { - "epoch": 2.7365853658536583, - "grad_norm": 3.775834321975708, - "learning_rate": 4.134518032614713e-06, - "loss": 0.4384, - "step": 561 - }, - { - "epoch": 2.741463414634146, - "grad_norm": 2.691236972808838, - "learning_rate": 4.1316171861458445e-06, - "loss": 0.3141, - "step": 562 - }, - { - "epoch": 2.746341463414634, - "grad_norm": 3.059152841567993, - "learning_rate": 4.128712507811893e-06, - "loss": 0.5777, - "step": 563 - }, - { - "epoch": 2.7512195121951217, - "grad_norm": 2.867432117462158, - "learning_rate": 4.125804004434517e-06, - "loss": 0.5542, - "step": 564 - }, - { - "epoch": 2.7560975609756095, - "grad_norm": 2.796438694000244, - "learning_rate": 4.12289168284436e-06, - "loss": 0.3442, - "step": 565 - }, - { - "epoch": 2.7609756097560973, - "grad_norm": 3.052199125289917, - "learning_rate": 4.119975549881029e-06, - "loss": 0.4754, - "step": 566 - }, - { - "epoch": 2.765853658536585, - "grad_norm": 2.5463602542877197, - "learning_rate": 4.1170556123930846e-06, - "loss": 0.2988, - "step": 567 - }, - { - "epoch": 2.770731707317073, - "grad_norm": 3.003124475479126, - "learning_rate": 4.114131877238021e-06, - "loss": 0.4642, - "step": 568 - }, - { - "epoch": 2.7756097560975608, - "grad_norm": 2.4988298416137695, - "learning_rate": 4.111204351282254e-06, - "loss": 0.3493, - "step": 569 - }, - { - "epoch": 2.7804878048780486, - "grad_norm": 2.7403693199157715, - "learning_rate": 4.108273041401098e-06, - "loss": 0.4007, - "step": 570 - }, - { - "epoch": 2.7853658536585364, - "grad_norm": 4.101940155029297, - "learning_rate": 4.105337954478756e-06, - "loss": 0.7815, - "step": 571 - }, - { - "epoch": 2.790243902439024, - "grad_norm": 3.229969024658203, - "learning_rate": 4.102399097408304e-06, - "loss": 0.6099, - "step": 572 - }, - { - "epoch": 2.795121951219512, - "grad_norm": 3.234693765640259, - "learning_rate": 4.099456477091667e-06, - "loss": 0.2478, - "step": 573 - }, - { - "epoch": 2.8, - "grad_norm": 2.9824702739715576, - "learning_rate": 4.096510100439611e-06, - "loss": 0.6403, - "step": 574 - }, - { - "epoch": 2.8048780487804876, - "grad_norm": 2.8012478351593018, - "learning_rate": 4.093559974371725e-06, - "loss": 0.2509, - "step": 575 - }, - { - "epoch": 2.8097560975609754, - "grad_norm": 2.915400743484497, - "learning_rate": 4.0906061058164e-06, - "loss": 0.7552, - "step": 576 - }, - { - "epoch": 2.8146341463414632, - "grad_norm": 3.467665672302246, - "learning_rate": 4.087648501710819e-06, - "loss": 0.3146, - "step": 577 - }, - { - "epoch": 2.819512195121951, - "grad_norm": 3.1628401279449463, - "learning_rate": 4.084687169000938e-06, - "loss": 0.507, - "step": 578 - }, - { - "epoch": 2.824390243902439, - "grad_norm": 2.4069066047668457, - "learning_rate": 4.081722114641469e-06, - "loss": 0.4116, - "step": 579 - }, - { - "epoch": 2.8292682926829267, - "grad_norm": 3.698174238204956, - "learning_rate": 4.0787533455958626e-06, - "loss": 0.2264, - "step": 580 - }, - { - "epoch": 2.8341463414634145, - "grad_norm": 3.0896191596984863, - "learning_rate": 4.075780868836296e-06, - "loss": 0.3197, - "step": 581 - }, - { - "epoch": 2.8390243902439023, - "grad_norm": 3.098562240600586, - "learning_rate": 4.072804691343653e-06, - "loss": 0.4045, - "step": 582 - }, - { - "epoch": 2.84390243902439, - "grad_norm": 3.9232118129730225, - "learning_rate": 4.069824820107507e-06, - "loss": 0.9564, - "step": 583 - }, - { - "epoch": 2.848780487804878, - "grad_norm": 2.7176268100738525, - "learning_rate": 4.06684126212611e-06, - "loss": 0.2703, - "step": 584 - }, - { - "epoch": 2.8536585365853657, - "grad_norm": 2.4905827045440674, - "learning_rate": 4.063854024406369e-06, - "loss": 0.4828, - "step": 585 - }, - { - "epoch": 2.8585365853658535, - "grad_norm": 2.848784923553467, - "learning_rate": 4.060863113963835e-06, - "loss": 0.4131, - "step": 586 - }, - { - "epoch": 2.8634146341463413, - "grad_norm": 2.599665403366089, - "learning_rate": 4.057868537822683e-06, - "loss": 0.4464, - "step": 587 - }, - { - "epoch": 2.868292682926829, - "grad_norm": 3.1770827770233154, - "learning_rate": 4.054870303015695e-06, - "loss": 0.2825, - "step": 588 - }, - { - "epoch": 2.873170731707317, - "grad_norm": 3.18332839012146, - "learning_rate": 4.05186841658425e-06, - "loss": 0.4438, - "step": 589 - }, - { - "epoch": 2.8780487804878048, - "grad_norm": 2.7485718727111816, - "learning_rate": 4.048862885578301e-06, - "loss": 0.4817, - "step": 590 - }, - { - "epoch": 2.8829268292682926, - "grad_norm": 2.9712934494018555, - "learning_rate": 4.045853717056358e-06, - "loss": 0.5157, - "step": 591 - }, - { - "epoch": 2.8878048780487804, - "grad_norm": 2.246858835220337, - "learning_rate": 4.0428409180854775e-06, - "loss": 0.4029, - "step": 592 - }, - { - "epoch": 2.892682926829268, - "grad_norm": 2.683434247970581, - "learning_rate": 4.039824495741238e-06, - "loss": 0.3796, - "step": 593 - }, - { - "epoch": 2.897560975609756, - "grad_norm": 2.6297569274902344, - "learning_rate": 4.036804457107733e-06, - "loss": 0.4467, - "step": 594 - }, - { - "epoch": 2.902439024390244, - "grad_norm": 5.318776607513428, - "learning_rate": 4.0337808092775435e-06, - "loss": 0.7007, - "step": 595 - }, - { - "epoch": 2.9073170731707316, - "grad_norm": 3.069889783859253, - "learning_rate": 4.030753559351728e-06, - "loss": 0.3219, - "step": 596 - }, - { - "epoch": 2.9121951219512194, - "grad_norm": 1.9730123281478882, - "learning_rate": 4.027722714439808e-06, - "loss": 0.3038, - "step": 597 - }, - { - "epoch": 2.9170731707317072, - "grad_norm": 3.7959916591644287, - "learning_rate": 4.024688281659743e-06, - "loss": 0.7768, - "step": 598 - }, - { - "epoch": 2.921951219512195, - "grad_norm": 3.900886297225952, - "learning_rate": 4.021650268137924e-06, - "loss": 0.4667, - "step": 599 - }, - { - "epoch": 2.926829268292683, - "grad_norm": 2.6155691146850586, - "learning_rate": 4.018608681009143e-06, - "loss": 0.3852, - "step": 600 - }, - { - "epoch": 2.9317073170731707, - "grad_norm": 3.2715704441070557, - "learning_rate": 4.015563527416596e-06, - "loss": 0.4804, - "step": 601 - }, - { - "epoch": 2.9365853658536585, - "grad_norm": 3.001425266265869, - "learning_rate": 4.012514814511844e-06, - "loss": 0.4152, - "step": 602 - }, - { - "epoch": 2.9414634146341463, - "grad_norm": 2.685360908508301, - "learning_rate": 4.009462549454816e-06, - "loss": 0.5029, - "step": 603 - }, - { - "epoch": 2.946341463414634, - "grad_norm": 3.4670183658599854, - "learning_rate": 4.006406739413775e-06, - "loss": 0.4857, - "step": 604 - }, - { - "epoch": 2.951219512195122, - "grad_norm": 3.0613298416137695, - "learning_rate": 4.003347391565317e-06, - "loss": 0.4449, - "step": 605 - }, - { - "epoch": 2.9560975609756097, - "grad_norm": 3.207186698913574, - "learning_rate": 4.000284513094342e-06, - "loss": 0.4808, - "step": 606 - }, - { - "epoch": 2.9609756097560975, - "grad_norm": 2.910578727722168, - "learning_rate": 3.997218111194042e-06, - "loss": 0.4395, - "step": 607 - }, - { - "epoch": 2.9658536585365853, - "grad_norm": 2.581918954849243, - "learning_rate": 3.994148193065886e-06, - "loss": 0.3264, - "step": 608 - }, - { - "epoch": 2.970731707317073, - "grad_norm": 2.6517748832702637, - "learning_rate": 3.991074765919598e-06, - "loss": 0.3285, - "step": 609 - }, - { - "epoch": 2.975609756097561, - "grad_norm": 3.509756088256836, - "learning_rate": 3.987997836973147e-06, - "loss": 0.3638, - "step": 610 - }, - { - "epoch": 2.9804878048780488, - "grad_norm": 2.7382352352142334, - "learning_rate": 3.984917413452721e-06, - "loss": 0.3853, - "step": 611 - }, - { - "epoch": 2.9853658536585366, - "grad_norm": 3.998974323272705, - "learning_rate": 3.981833502592717e-06, - "loss": 0.6411, - "step": 612 - }, - { - "epoch": 2.9902439024390244, - "grad_norm": 3.305126428604126, - "learning_rate": 3.978746111635725e-06, - "loss": 0.2759, - "step": 613 - }, - { - "epoch": 2.995121951219512, - "grad_norm": 3.137300968170166, - "learning_rate": 3.9756552478325045e-06, - "loss": 0.4566, - "step": 614 - }, - { - "epoch": 3.0, - "grad_norm": 2.617291212081909, - "learning_rate": 3.972560918441972e-06, - "loss": 0.2221, - "step": 615 - }, - { - "epoch": 3.004878048780488, - "grad_norm": 2.787429094314575, - "learning_rate": 3.969463130731183e-06, - "loss": 0.2403, - "step": 616 - }, - { - "epoch": 3.0097560975609756, - "grad_norm": 3.0412075519561768, - "learning_rate": 3.966361891975316e-06, - "loss": 0.2635, - "step": 617 - }, - { - "epoch": 3.0146341463414634, - "grad_norm": 2.9949851036071777, - "learning_rate": 3.963257209457652e-06, - "loss": 0.3294, - "step": 618 - }, - { - "epoch": 3.0195121951219512, - "grad_norm": 3.0510809421539307, - "learning_rate": 3.960149090469561e-06, - "loss": 0.1338, - "step": 619 - }, - { - "epoch": 3.024390243902439, - "grad_norm": 3.669482707977295, - "learning_rate": 3.957037542310484e-06, - "loss": 0.1469, - "step": 620 - }, - { - "epoch": 3.029268292682927, - "grad_norm": 4.677116870880127, - "learning_rate": 3.953922572287915e-06, - "loss": 0.2788, - "step": 621 - }, - { - "epoch": 3.0341463414634147, - "grad_norm": 4.33144474029541, - "learning_rate": 3.950804187717384e-06, - "loss": 0.4521, - "step": 622 - }, - { - "epoch": 3.0390243902439025, - "grad_norm": 3.466639757156372, - "learning_rate": 3.947682395922439e-06, - "loss": 0.5113, - "step": 623 - }, - { - "epoch": 3.0439024390243903, - "grad_norm": 3.2332122325897217, - "learning_rate": 3.9445572042346346e-06, - "loss": 0.0968, - "step": 624 - }, - { - "epoch": 3.048780487804878, - "grad_norm": 2.6108055114746094, - "learning_rate": 3.941428619993505e-06, - "loss": 0.2462, - "step": 625 - }, - { - "epoch": 3.053658536585366, - "grad_norm": 3.2512595653533936, - "learning_rate": 3.938296650546552e-06, - "loss": 0.1782, - "step": 626 - }, - { - "epoch": 3.0585365853658537, - "grad_norm": 3.4350366592407227, - "learning_rate": 3.935161303249231e-06, - "loss": 0.2955, - "step": 627 - }, - { - "epoch": 3.0634146341463415, - "grad_norm": 3.42012619972229, - "learning_rate": 3.932022585464928e-06, - "loss": 0.3259, - "step": 628 - }, - { - "epoch": 3.0682926829268293, - "grad_norm": 3.458043336868286, - "learning_rate": 3.928880504564943e-06, - "loss": 0.2306, - "step": 629 - }, - { - "epoch": 3.073170731707317, - "grad_norm": 2.646616220474243, - "learning_rate": 3.92573506792848e-06, - "loss": 0.2197, - "step": 630 - }, - { - "epoch": 3.078048780487805, - "grad_norm": 3.5558857917785645, - "learning_rate": 3.9225862829426184e-06, - "loss": 0.1607, - "step": 631 - }, - { - "epoch": 3.0829268292682928, - "grad_norm": 3.6011338233947754, - "learning_rate": 3.919434157002303e-06, - "loss": 0.3087, - "step": 632 - }, - { - "epoch": 3.0878048780487806, - "grad_norm": 2.339879035949707, - "learning_rate": 3.916278697510325e-06, - "loss": 0.2213, - "step": 633 - }, - { - "epoch": 3.0926829268292684, - "grad_norm": 3.268162488937378, - "learning_rate": 3.913119911877305e-06, - "loss": 0.318, - "step": 634 - }, - { - "epoch": 3.097560975609756, - "grad_norm": 4.062571048736572, - "learning_rate": 3.909957807521674e-06, - "loss": 0.1757, - "step": 635 - }, - { - "epoch": 3.102439024390244, - "grad_norm": 2.997659683227539, - "learning_rate": 3.906792391869657e-06, - "loss": 0.2391, - "step": 636 - }, - { - "epoch": 3.107317073170732, - "grad_norm": 3.7037394046783447, - "learning_rate": 3.903623672355258e-06, - "loss": 0.2548, - "step": 637 - }, - { - "epoch": 3.1121951219512196, - "grad_norm": 3.110579252243042, - "learning_rate": 3.900451656420237e-06, - "loss": 0.2389, - "step": 638 - }, - { - "epoch": 3.1170731707317074, - "grad_norm": 3.3332321643829346, - "learning_rate": 3.897276351514097e-06, - "loss": 0.1371, - "step": 639 - }, - { - "epoch": 3.1219512195121952, - "grad_norm": 3.8275935649871826, - "learning_rate": 3.894097765094065e-06, - "loss": 0.3363, - "step": 640 - }, - { - "epoch": 3.126829268292683, - "grad_norm": 2.3731374740600586, - "learning_rate": 3.890915904625075e-06, - "loss": 0.1314, - "step": 641 - }, - { - "epoch": 3.131707317073171, - "grad_norm": 3.1511282920837402, - "learning_rate": 3.887730777579751e-06, - "loss": 0.3563, - "step": 642 - }, - { - "epoch": 3.1365853658536587, - "grad_norm": 4.2254862785339355, - "learning_rate": 3.884542391438387e-06, - "loss": 0.5053, - "step": 643 - }, - { - "epoch": 3.1414634146341465, - "grad_norm": 4.579670429229736, - "learning_rate": 3.88135075368893e-06, - "loss": 0.6259, - "step": 644 - }, - { - "epoch": 3.1463414634146343, - "grad_norm": 3.2102746963500977, - "learning_rate": 3.878155871826968e-06, - "loss": 0.2599, - "step": 645 - }, - { - "epoch": 3.151219512195122, - "grad_norm": 2.5569686889648438, - "learning_rate": 3.874957753355701e-06, - "loss": 0.2075, - "step": 646 - }, - { - "epoch": 3.15609756097561, - "grad_norm": 3.588925838470459, - "learning_rate": 3.8717564057859365e-06, - "loss": 0.4577, - "step": 647 - }, - { - "epoch": 3.1609756097560977, - "grad_norm": 3.6163878440856934, - "learning_rate": 3.868551836636063e-06, - "loss": 0.4023, - "step": 648 - }, - { - "epoch": 3.1658536585365855, - "grad_norm": 3.8688390254974365, - "learning_rate": 3.865344053432035e-06, - "loss": 0.1669, - "step": 649 - }, - { - "epoch": 3.1707317073170733, - "grad_norm": 3.419734001159668, - "learning_rate": 3.862133063707353e-06, - "loss": 0.2766, - "step": 650 - }, - { - "epoch": 3.175609756097561, - "grad_norm": 2.9860243797302246, - "learning_rate": 3.858918875003053e-06, - "loss": 0.1788, - "step": 651 - }, - { - "epoch": 3.180487804878049, - "grad_norm": 3.0619022846221924, - "learning_rate": 3.855701494867679e-06, - "loss": 0.224, - "step": 652 - }, - { - "epoch": 3.1853658536585368, - "grad_norm": 3.3668978214263916, - "learning_rate": 3.852480930857275e-06, - "loss": 0.4029, - "step": 653 - }, - { - "epoch": 3.1902439024390246, - "grad_norm": 3.543147563934326, - "learning_rate": 3.849257190535356e-06, - "loss": 0.2096, - "step": 654 - }, - { - "epoch": 3.1951219512195124, - "grad_norm": 3.793619155883789, - "learning_rate": 3.846030281472902e-06, - "loss": 0.5574, - "step": 655 - }, - { - "epoch": 3.2, - "grad_norm": 3.021289110183716, - "learning_rate": 3.842800211248333e-06, - "loss": 0.2233, - "step": 656 - }, - { - "epoch": 3.204878048780488, - "grad_norm": 4.582934856414795, - "learning_rate": 3.839566987447492e-06, - "loss": 0.3871, - "step": 657 - }, - { - "epoch": 3.209756097560976, - "grad_norm": 2.996340274810791, - "learning_rate": 3.8363306176636296e-06, - "loss": 0.4325, - "step": 658 - }, - { - "epoch": 3.2146341463414636, - "grad_norm": 3.3190877437591553, - "learning_rate": 3.833091109497384e-06, - "loss": 0.5321, - "step": 659 - }, - { - "epoch": 3.2195121951219514, - "grad_norm": 3.2532856464385986, - "learning_rate": 3.829848470556765e-06, - "loss": 0.1359, - "step": 660 - }, - { - "epoch": 3.2243902439024392, - "grad_norm": 2.7875044345855713, - "learning_rate": 3.8266027084571335e-06, - "loss": 0.3145, - "step": 661 - }, - { - "epoch": 3.229268292682927, - "grad_norm": 3.748253583908081, - "learning_rate": 3.823353830821187e-06, - "loss": 0.1252, - "step": 662 - }, - { - "epoch": 3.234146341463415, - "grad_norm": 2.858293294906616, - "learning_rate": 3.820101845278937e-06, - "loss": 0.2589, - "step": 663 - }, - { - "epoch": 3.2390243902439027, - "grad_norm": 3.7470967769622803, - "learning_rate": 3.816846759467696e-06, - "loss": 0.2594, - "step": 664 - }, - { - "epoch": 3.2439024390243905, - "grad_norm": 3.676196813583374, - "learning_rate": 3.8135885810320587e-06, - "loss": 0.2998, - "step": 665 - }, - { - "epoch": 3.2487804878048783, - "grad_norm": 3.0943140983581543, - "learning_rate": 3.810327317623881e-06, - "loss": 0.2238, - "step": 666 - }, - { - "epoch": 3.253658536585366, - "grad_norm": 3.5907349586486816, - "learning_rate": 3.8070629769022628e-06, - "loss": 0.3381, - "step": 667 - }, - { - "epoch": 3.258536585365854, - "grad_norm": 3.1195285320281982, - "learning_rate": 3.8037955665335335e-06, - "loss": 0.2407, - "step": 668 - }, - { - "epoch": 3.2634146341463417, - "grad_norm": 3.422292947769165, - "learning_rate": 3.800525094191231e-06, - "loss": 0.2957, - "step": 669 - }, - { - "epoch": 3.2682926829268295, - "grad_norm": 2.5264663696289062, - "learning_rate": 3.797251567556083e-06, - "loss": 0.2493, - "step": 670 - }, - { - "epoch": 3.2731707317073173, - "grad_norm": 3.350219964981079, - "learning_rate": 3.793974994315991e-06, - "loss": 0.1186, - "step": 671 - }, - { - "epoch": 3.278048780487805, - "grad_norm": 4.175906181335449, - "learning_rate": 3.790695382166013e-06, - "loss": 0.3453, - "step": 672 - }, - { - "epoch": 3.2829268292682925, - "grad_norm": 3.006072521209717, - "learning_rate": 3.7874127388083415e-06, - "loss": 0.1981, - "step": 673 - }, - { - "epoch": 3.2878048780487803, - "grad_norm": 3.368561029434204, - "learning_rate": 3.7841270719522895e-06, - "loss": 0.2934, - "step": 674 - }, - { - "epoch": 3.292682926829268, - "grad_norm": 4.374331951141357, - "learning_rate": 3.7808383893142692e-06, - "loss": 0.1359, - "step": 675 - }, - { - "epoch": 3.297560975609756, - "grad_norm": 3.297102451324463, - "learning_rate": 3.7775466986177763e-06, - "loss": 0.2498, - "step": 676 - }, - { - "epoch": 3.3024390243902437, - "grad_norm": 2.8914761543273926, - "learning_rate": 3.774252007593371e-06, - "loss": 0.1308, - "step": 677 - }, - { - "epoch": 3.3073170731707315, - "grad_norm": 3.1550722122192383, - "learning_rate": 3.7709543239786593e-06, - "loss": 0.3915, - "step": 678 - }, - { - "epoch": 3.3121951219512193, - "grad_norm": 3.2302658557891846, - "learning_rate": 3.767653655518277e-06, - "loss": 0.2558, - "step": 679 - }, - { - "epoch": 3.317073170731707, - "grad_norm": 4.4321770668029785, - "learning_rate": 3.7643500099638673e-06, - "loss": 0.1988, - "step": 680 - }, - { - "epoch": 3.321951219512195, - "grad_norm": 2.970566749572754, - "learning_rate": 3.7610433950740667e-06, - "loss": 0.4908, - "step": 681 - }, - { - "epoch": 3.3268292682926828, - "grad_norm": 3.5516228675842285, - "learning_rate": 3.757733818614485e-06, - "loss": 0.304, - "step": 682 - }, - { - "epoch": 3.3317073170731706, - "grad_norm": 2.7555387020111084, - "learning_rate": 3.7544212883576856e-06, - "loss": 0.2533, - "step": 683 - }, - { - "epoch": 3.3365853658536584, - "grad_norm": 3.61226749420166, - "learning_rate": 3.751105812083172e-06, - "loss": 0.1771, - "step": 684 - }, - { - "epoch": 3.341463414634146, - "grad_norm": 3.0466206073760986, - "learning_rate": 3.7477873975773655e-06, - "loss": 0.4213, - "step": 685 - }, - { - "epoch": 3.346341463414634, - "grad_norm": 3.6091527938842773, - "learning_rate": 3.7444660526335853e-06, - "loss": 0.3808, - "step": 686 - }, - { - "epoch": 3.351219512195122, - "grad_norm": 3.8443002700805664, - "learning_rate": 3.741141785052036e-06, - "loss": 0.6438, - "step": 687 - }, - { - "epoch": 3.3560975609756096, - "grad_norm": 3.845909833908081, - "learning_rate": 3.737814602639784e-06, - "loss": 0.3686, - "step": 688 - }, - { - "epoch": 3.3609756097560974, - "grad_norm": 2.904892921447754, - "learning_rate": 3.7344845132107427e-06, - "loss": 0.2934, - "step": 689 - }, - { - "epoch": 3.3658536585365852, - "grad_norm": 3.4766387939453125, - "learning_rate": 3.731151524585651e-06, - "loss": 0.3299, - "step": 690 - }, - { - "epoch": 3.370731707317073, - "grad_norm": 4.236767768859863, - "learning_rate": 3.7278156445920584e-06, - "loss": 0.6303, - "step": 691 - }, - { - "epoch": 3.375609756097561, - "grad_norm": 3.1122591495513916, - "learning_rate": 3.724476881064303e-06, - "loss": 0.2432, - "step": 692 - }, - { - "epoch": 3.3804878048780487, - "grad_norm": 3.0971457958221436, - "learning_rate": 3.721135241843496e-06, - "loss": 0.3131, - "step": 693 - }, - { - "epoch": 3.3853658536585365, - "grad_norm": 3.9365804195404053, - "learning_rate": 3.7177907347775016e-06, - "loss": 0.3372, - "step": 694 - }, - { - "epoch": 3.3902439024390243, - "grad_norm": 3.760373115539551, - "learning_rate": 3.71444336772092e-06, - "loss": 0.5055, - "step": 695 - }, - { - "epoch": 3.395121951219512, - "grad_norm": 4.360848426818848, - "learning_rate": 3.711093148535068e-06, - "loss": 0.6183, - "step": 696 - }, - { - "epoch": 3.4, - "grad_norm": 3.7713537216186523, - "learning_rate": 3.707740085087959e-06, - "loss": 0.1568, - "step": 697 - }, - { - "epoch": 3.4048780487804877, - "grad_norm": 3.8532230854034424, - "learning_rate": 3.7043841852542884e-06, - "loss": 0.2826, - "step": 698 - }, - { - "epoch": 3.4097560975609755, - "grad_norm": 3.0548605918884277, - "learning_rate": 3.701025456915411e-06, - "loss": 0.1918, - "step": 699 - }, - { - "epoch": 3.4146341463414633, - "grad_norm": 3.2431821823120117, - "learning_rate": 3.697663907959327e-06, - "loss": 0.2493, - "step": 700 - }, - { - "epoch": 3.419512195121951, - "grad_norm": 3.7301864624023438, - "learning_rate": 3.6942995462806574e-06, - "loss": 0.4913, - "step": 701 - }, - { - "epoch": 3.424390243902439, - "grad_norm": 2.5468900203704834, - "learning_rate": 3.6909323797806314e-06, - "loss": 0.1788, - "step": 702 - }, - { - "epoch": 3.4292682926829268, - "grad_norm": 3.3719515800476074, - "learning_rate": 3.6875624163670635e-06, - "loss": 0.4162, - "step": 703 - }, - { - "epoch": 3.4341463414634146, - "grad_norm": 3.528010368347168, - "learning_rate": 3.6841896639543394e-06, - "loss": 0.1924, - "step": 704 - }, - { - "epoch": 3.4390243902439024, - "grad_norm": 3.3636631965637207, - "learning_rate": 3.6808141304633924e-06, - "loss": 0.3177, - "step": 705 - }, - { - "epoch": 3.44390243902439, - "grad_norm": 3.418705463409424, - "learning_rate": 3.6774358238216878e-06, - "loss": 0.2301, - "step": 706 - }, - { - "epoch": 3.448780487804878, - "grad_norm": 4.720373630523682, - "learning_rate": 3.6740547519632048e-06, - "loss": 0.1894, - "step": 707 - }, - { - "epoch": 3.453658536585366, - "grad_norm": 2.9635703563690186, - "learning_rate": 3.670670922828414e-06, - "loss": 0.2642, - "step": 708 - }, - { - "epoch": 3.4585365853658536, - "grad_norm": 4.934754371643066, - "learning_rate": 3.667284344364264e-06, - "loss": 0.2275, - "step": 709 - }, - { - "epoch": 3.4634146341463414, - "grad_norm": 3.090585231781006, - "learning_rate": 3.6638950245241604e-06, - "loss": 0.4447, - "step": 710 - }, - { - "epoch": 3.4682926829268292, - "grad_norm": 4.360495090484619, - "learning_rate": 3.660502971267945e-06, - "loss": 0.2415, - "step": 711 - }, - { - "epoch": 3.473170731707317, - "grad_norm": 3.4893476963043213, - "learning_rate": 3.65710819256188e-06, - "loss": 0.0921, - "step": 712 - }, - { - "epoch": 3.478048780487805, - "grad_norm": 3.2423770427703857, - "learning_rate": 3.65371069637863e-06, - "loss": 0.2371, - "step": 713 - }, - { - "epoch": 3.4829268292682927, - "grad_norm": 3.0775890350341797, - "learning_rate": 3.650310490697238e-06, - "loss": 0.4026, - "step": 714 - }, - { - "epoch": 3.4878048780487805, - "grad_norm": 3.906625270843506, - "learning_rate": 3.646907583503114e-06, - "loss": 0.4312, - "step": 715 - }, - { - "epoch": 3.4926829268292683, - "grad_norm": 3.2140414714813232, - "learning_rate": 3.6435019827880093e-06, - "loss": 0.2309, - "step": 716 - }, - { - "epoch": 3.497560975609756, - "grad_norm": 3.048523426055908, - "learning_rate": 3.640093696550003e-06, - "loss": 0.296, - "step": 717 - }, - { - "epoch": 3.502439024390244, - "grad_norm": 2.9669039249420166, - "learning_rate": 3.6366827327934817e-06, - "loss": 0.2723, - "step": 718 - }, - { - "epoch": 3.5073170731707317, - "grad_norm": 3.6941726207733154, - "learning_rate": 3.6332690995291176e-06, - "loss": 0.3797, - "step": 719 - }, - { - "epoch": 3.5121951219512195, - "grad_norm": 5.135766506195068, - "learning_rate": 3.6298528047738545e-06, - "loss": 0.9868, - "step": 720 - }, - { - "epoch": 3.5170731707317073, - "grad_norm": 3.2021052837371826, - "learning_rate": 3.626433856550886e-06, - "loss": 0.4069, - "step": 721 - }, - { - "epoch": 3.521951219512195, - "grad_norm": 3.094444513320923, - "learning_rate": 3.623012262889637e-06, - "loss": 0.3368, - "step": 722 - }, - { - "epoch": 3.526829268292683, - "grad_norm": 3.609285354614258, - "learning_rate": 3.6195880318257465e-06, - "loss": 0.3972, - "step": 723 - }, - { - "epoch": 3.5317073170731708, - "grad_norm": 4.236501216888428, - "learning_rate": 3.616161171401046e-06, - "loss": 0.52, - "step": 724 - }, - { - "epoch": 3.5365853658536586, - "grad_norm": 3.504526376724243, - "learning_rate": 3.612731689663542e-06, - "loss": 0.23, - "step": 725 - }, - { - "epoch": 3.5414634146341464, - "grad_norm": 3.233591079711914, - "learning_rate": 3.6092995946673996e-06, - "loss": 0.4151, - "step": 726 - }, - { - "epoch": 3.546341463414634, - "grad_norm": 3.6701886653900146, - "learning_rate": 3.605864894472918e-06, - "loss": 0.2798, - "step": 727 - }, - { - "epoch": 3.551219512195122, - "grad_norm": 3.8713181018829346, - "learning_rate": 3.602427597146516e-06, - "loss": 0.4336, - "step": 728 - }, - { - "epoch": 3.55609756097561, - "grad_norm": 5.49612283706665, - "learning_rate": 3.5989877107607134e-06, - "loss": 0.4803, - "step": 729 - }, - { - "epoch": 3.5609756097560976, - "grad_norm": 3.771005392074585, - "learning_rate": 3.5955452433941075e-06, - "loss": 0.3698, - "step": 730 - }, - { - "epoch": 3.5658536585365854, - "grad_norm": 2.970822334289551, - "learning_rate": 3.5921002031313586e-06, - "loss": 0.2373, - "step": 731 - }, - { - "epoch": 3.5707317073170732, - "grad_norm": 3.517249584197998, - "learning_rate": 3.58865259806317e-06, - "loss": 0.1908, - "step": 732 - }, - { - "epoch": 3.575609756097561, - "grad_norm": 3.6825428009033203, - "learning_rate": 3.585202436286267e-06, - "loss": 0.3993, - "step": 733 - }, - { - "epoch": 3.580487804878049, - "grad_norm": 3.387479066848755, - "learning_rate": 3.581749725903381e-06, - "loss": 0.4237, - "step": 734 - }, - { - "epoch": 3.5853658536585367, - "grad_norm": 3.5004806518554688, - "learning_rate": 3.5782944750232274e-06, - "loss": 0.3011, - "step": 735 - }, - { - "epoch": 3.5902439024390245, - "grad_norm": 3.461731433868408, - "learning_rate": 3.574836691760489e-06, - "loss": 0.0896, - "step": 736 - }, - { - "epoch": 3.5951219512195123, - "grad_norm": 3.9598381519317627, - "learning_rate": 3.571376384235795e-06, - "loss": 0.2751, - "step": 737 - }, - { - "epoch": 3.6, - "grad_norm": 4.053933143615723, - "learning_rate": 3.5679135605757035e-06, - "loss": 0.2086, - "step": 738 - }, - { - "epoch": 3.604878048780488, - "grad_norm": 2.9683544635772705, - "learning_rate": 3.564448228912682e-06, - "loss": 0.1659, - "step": 739 - }, - { - "epoch": 3.6097560975609757, - "grad_norm": 3.6598448753356934, - "learning_rate": 3.5609803973850877e-06, - "loss": 0.2469, - "step": 740 - }, - { - "epoch": 3.6146341463414635, - "grad_norm": 3.449335813522339, - "learning_rate": 3.557510074137147e-06, - "loss": 0.375, - "step": 741 - }, - { - "epoch": 3.6195121951219513, - "grad_norm": 2.7666923999786377, - "learning_rate": 3.554037267318942e-06, - "loss": 0.3133, - "step": 742 - }, - { - "epoch": 3.624390243902439, - "grad_norm": 2.8951869010925293, - "learning_rate": 3.5505619850863847e-06, - "loss": 0.2243, - "step": 743 - }, - { - "epoch": 3.629268292682927, - "grad_norm": 3.477747678756714, - "learning_rate": 3.5470842356012007e-06, - "loss": 0.1321, - "step": 744 - }, - { - "epoch": 3.6341463414634148, - "grad_norm": 3.810480833053589, - "learning_rate": 3.5436040270309113e-06, - "loss": 0.361, - "step": 745 - }, - { - "epoch": 3.6390243902439026, - "grad_norm": 3.0730793476104736, - "learning_rate": 3.540121367548811e-06, - "loss": 0.1523, - "step": 746 - }, - { - "epoch": 3.6439024390243904, - "grad_norm": 3.6878390312194824, - "learning_rate": 3.5366362653339524e-06, - "loss": 0.4898, - "step": 747 - }, - { - "epoch": 3.648780487804878, - "grad_norm": 3.6432242393493652, - "learning_rate": 3.533148728571124e-06, - "loss": 0.1397, - "step": 748 - }, - { - "epoch": 3.653658536585366, - "grad_norm": 3.7047760486602783, - "learning_rate": 3.5296587654508317e-06, - "loss": 0.323, - "step": 749 - }, - { - "epoch": 3.658536585365854, - "grad_norm": 3.777132749557495, - "learning_rate": 3.526166384169279e-06, - "loss": 0.5577, - "step": 750 - }, - { - "epoch": 3.6634146341463416, - "grad_norm": 3.7970924377441406, - "learning_rate": 3.5226715929283507e-06, - "loss": 0.245, - "step": 751 - }, - { - "epoch": 3.6682926829268294, - "grad_norm": 2.8203537464141846, - "learning_rate": 3.519174399935588e-06, - "loss": 0.1619, - "step": 752 - }, - { - "epoch": 3.6731707317073172, - "grad_norm": 3.4040987491607666, - "learning_rate": 3.5156748134041767e-06, - "loss": 0.1047, - "step": 753 - }, - { - "epoch": 3.678048780487805, - "grad_norm": 3.927960157394409, - "learning_rate": 3.5121728415529203e-06, - "loss": 0.5713, - "step": 754 - }, - { - "epoch": 3.682926829268293, - "grad_norm": 3.3833277225494385, - "learning_rate": 3.5086684926062266e-06, - "loss": 0.2174, - "step": 755 - }, - { - "epoch": 3.68780487804878, - "grad_norm": 3.989307403564453, - "learning_rate": 3.505161774794085e-06, - "loss": 0.285, - "step": 756 - }, - { - "epoch": 3.692682926829268, - "grad_norm": 2.742429494857788, - "learning_rate": 3.5016526963520474e-06, - "loss": 0.1602, - "step": 757 - }, - { - "epoch": 3.697560975609756, - "grad_norm": 3.7082698345184326, - "learning_rate": 3.498141265521212e-06, - "loss": 0.666, - "step": 758 - }, - { - "epoch": 3.7024390243902436, - "grad_norm": 3.033196210861206, - "learning_rate": 3.4946274905481997e-06, - "loss": 0.2024, - "step": 759 - }, - { - "epoch": 3.7073170731707314, - "grad_norm": 3.7145371437072754, - "learning_rate": 3.4911113796851364e-06, - "loss": 0.2719, - "step": 760 - }, - { - "epoch": 3.7121951219512193, - "grad_norm": 3.580298900604248, - "learning_rate": 3.487592941189636e-06, - "loss": 0.1537, - "step": 761 - }, - { - "epoch": 3.717073170731707, - "grad_norm": 4.753757953643799, - "learning_rate": 3.484072183324776e-06, - "loss": 0.6149, - "step": 762 - }, - { - "epoch": 3.721951219512195, - "grad_norm": 3.5575687885284424, - "learning_rate": 3.4805491143590823e-06, - "loss": 0.4241, - "step": 763 - }, - { - "epoch": 3.7268292682926827, - "grad_norm": 3.215224266052246, - "learning_rate": 3.4770237425665103e-06, - "loss": 0.3037, - "step": 764 - }, - { - "epoch": 3.7317073170731705, - "grad_norm": 2.9899685382843018, - "learning_rate": 3.4734960762264204e-06, - "loss": 0.4854, - "step": 765 - }, - { - "epoch": 3.7365853658536583, - "grad_norm": 3.5880227088928223, - "learning_rate": 3.469966123623563e-06, - "loss": 0.3849, - "step": 766 - }, - { - "epoch": 3.741463414634146, - "grad_norm": 3.472750186920166, - "learning_rate": 3.46643389304806e-06, - "loss": 0.3159, - "step": 767 - }, - { - "epoch": 3.746341463414634, - "grad_norm": 4.355650901794434, - "learning_rate": 3.4628993927953786e-06, - "loss": 0.7527, - "step": 768 - }, - { - "epoch": 3.7512195121951217, - "grad_norm": 2.94575834274292, - "learning_rate": 3.45936263116632e-06, - "loss": 0.1716, - "step": 769 - }, - { - "epoch": 3.7560975609756095, - "grad_norm": 2.991525173187256, - "learning_rate": 3.4558236164669957e-06, - "loss": 0.2061, - "step": 770 - }, - { - "epoch": 3.7609756097560973, - "grad_norm": 3.134000301361084, - "learning_rate": 3.4522823570088073e-06, - "loss": 0.1338, - "step": 771 - }, - { - "epoch": 3.765853658536585, - "grad_norm": 3.722140312194824, - "learning_rate": 3.4487388611084295e-06, - "loss": 0.2615, - "step": 772 - }, - { - "epoch": 3.770731707317073, - "grad_norm": 3.7941153049468994, - "learning_rate": 3.445193137087788e-06, - "loss": 0.1401, - "step": 773 - }, - { - "epoch": 3.7756097560975608, - "grad_norm": 2.872941732406616, - "learning_rate": 3.4416451932740424e-06, - "loss": 0.2934, - "step": 774 - }, - { - "epoch": 3.7804878048780486, - "grad_norm": 4.5019941329956055, - "learning_rate": 3.4380950379995652e-06, - "loss": 0.4579, - "step": 775 - }, - { - "epoch": 3.7853658536585364, - "grad_norm": 2.682884931564331, - "learning_rate": 3.434542679601922e-06, - "loss": 0.2979, - "step": 776 - }, - { - "epoch": 3.790243902439024, - "grad_norm": 3.3044273853302, - "learning_rate": 3.4309881264238538e-06, - "loss": 0.1196, - "step": 777 - }, - { - "epoch": 3.795121951219512, - "grad_norm": 3.102760076522827, - "learning_rate": 3.4274313868132547e-06, - "loss": 0.2026, - "step": 778 - }, - { - "epoch": 3.8, - "grad_norm": 3.3304500579833984, - "learning_rate": 3.4238724691231534e-06, - "loss": 0.2135, - "step": 779 - }, - { - "epoch": 3.8048780487804876, - "grad_norm": 3.295119047164917, - "learning_rate": 3.4203113817116955e-06, - "loss": 0.4418, - "step": 780 - }, - { - "epoch": 3.8097560975609754, - "grad_norm": 3.6655640602111816, - "learning_rate": 3.4167481329421204e-06, - "loss": 0.203, - "step": 781 - }, - { - "epoch": 3.8146341463414632, - "grad_norm": 3.387830972671509, - "learning_rate": 3.4131827311827447e-06, - "loss": 0.3225, - "step": 782 - }, - { - "epoch": 3.819512195121951, - "grad_norm": 2.621633529663086, - "learning_rate": 3.4096151848069416e-06, - "loss": 0.1704, - "step": 783 - }, - { - "epoch": 3.824390243902439, - "grad_norm": 2.974344491958618, - "learning_rate": 3.4060455021931195e-06, - "loss": 0.2785, - "step": 784 - }, - { - "epoch": 3.8292682926829267, - "grad_norm": 3.452131748199463, - "learning_rate": 3.402473691724704e-06, - "loss": 0.223, - "step": 785 - }, - { - "epoch": 3.8341463414634145, - "grad_norm": 2.6373705863952637, - "learning_rate": 3.39889976179012e-06, - "loss": 0.2368, - "step": 786 - }, - { - "epoch": 3.8390243902439023, - "grad_norm": 2.863184928894043, - "learning_rate": 3.3953237207827673e-06, - "loss": 0.3294, - "step": 787 - }, - { - "epoch": 3.84390243902439, - "grad_norm": 5.104704856872559, - "learning_rate": 3.391745577101005e-06, - "loss": 0.5431, - "step": 788 - }, - { - "epoch": 3.848780487804878, - "grad_norm": 3.951310634613037, - "learning_rate": 3.3881653391481306e-06, - "loss": 0.2546, - "step": 789 - }, - { - "epoch": 3.8536585365853657, - "grad_norm": 3.9903225898742676, - "learning_rate": 3.384583015332359e-06, - "loss": 0.3293, - "step": 790 - }, - { - "epoch": 3.8585365853658535, - "grad_norm": 3.3149220943450928, - "learning_rate": 3.380998614066805e-06, - "loss": 0.1861, - "step": 791 - }, - { - "epoch": 3.8634146341463413, - "grad_norm": 3.6755223274230957, - "learning_rate": 3.3774121437694606e-06, - "loss": 0.2498, - "step": 792 - }, - { - "epoch": 3.868292682926829, - "grad_norm": 3.192918300628662, - "learning_rate": 3.3738236128631786e-06, - "loss": 0.1525, - "step": 793 - }, - { - "epoch": 3.873170731707317, - "grad_norm": 3.5358777046203613, - "learning_rate": 3.3702330297756503e-06, - "loss": 0.3622, - "step": 794 - }, - { - "epoch": 3.8780487804878048, - "grad_norm": 3.619878053665161, - "learning_rate": 3.366640402939387e-06, - "loss": 0.1051, - "step": 795 - }, - { - "epoch": 3.8829268292682926, - "grad_norm": 7.085352420806885, - "learning_rate": 3.363045740791698e-06, - "loss": 0.4606, - "step": 796 - }, - { - "epoch": 3.8878048780487804, - "grad_norm": 2.523165464401245, - "learning_rate": 3.3594490517746774e-06, - "loss": 0.2267, - "step": 797 - }, - { - "epoch": 3.892682926829268, - "grad_norm": 2.7026922702789307, - "learning_rate": 3.3558503443351733e-06, - "loss": 0.2792, - "step": 798 - }, - { - "epoch": 3.897560975609756, - "grad_norm": 2.9232428073883057, - "learning_rate": 3.352249626924777e-06, - "loss": 0.2579, - "step": 799 - }, - { - "epoch": 3.902439024390244, - "grad_norm": 4.760788440704346, - "learning_rate": 3.348646907999801e-06, - "loss": 0.6983, - "step": 800 - }, - { - "epoch": 3.9073170731707316, - "grad_norm": 3.198249578475952, - "learning_rate": 3.345042196021257e-06, - "loss": 0.3265, - "step": 801 - }, - { - "epoch": 3.9121951219512194, - "grad_norm": 4.069286823272705, - "learning_rate": 3.3414354994548385e-06, - "loss": 0.497, - "step": 802 - }, - { - "epoch": 3.9170731707317072, - "grad_norm": 3.4435410499572754, - "learning_rate": 3.337826826770898e-06, - "loss": 0.2812, - "step": 803 - }, - { - "epoch": 3.921951219512195, - "grad_norm": 3.9805212020874023, - "learning_rate": 3.3342161864444312e-06, - "loss": 0.2277, - "step": 804 - }, - { - "epoch": 3.926829268292683, - "grad_norm": 3.348925828933716, - "learning_rate": 3.3306035869550534e-06, - "loss": 0.1614, - "step": 805 - }, - { - "epoch": 3.9317073170731707, - "grad_norm": 4.7613701820373535, - "learning_rate": 3.326989036786981e-06, - "loss": 0.3269, - "step": 806 - }, - { - "epoch": 3.9365853658536585, - "grad_norm": 3.807502508163452, - "learning_rate": 3.3233725444290126e-06, - "loss": 0.2619, - "step": 807 - }, - { - "epoch": 3.9414634146341463, - "grad_norm": 3.2690203189849854, - "learning_rate": 3.3197541183745065e-06, - "loss": 0.4334, - "step": 808 - }, - { - "epoch": 3.946341463414634, - "grad_norm": 3.396993398666382, - "learning_rate": 3.3161337671213634e-06, - "loss": 0.2738, - "step": 809 - }, - { - "epoch": 3.951219512195122, - "grad_norm": 3.086669921875, - "learning_rate": 3.312511499172006e-06, - "loss": 0.1597, - "step": 810 - }, - { - "epoch": 3.9560975609756097, - "grad_norm": 3.5688745975494385, - "learning_rate": 3.3088873230333562e-06, - "loss": 0.3195, - "step": 811 - }, - { - "epoch": 3.9609756097560975, - "grad_norm": 3.4843621253967285, - "learning_rate": 3.3052612472168193e-06, - "loss": 0.1865, - "step": 812 - }, - { - "epoch": 3.9658536585365853, - "grad_norm": 2.8479580879211426, - "learning_rate": 3.3016332802382618e-06, - "loss": 0.3108, - "step": 813 - }, - { - "epoch": 3.970731707317073, - "grad_norm": 3.3241543769836426, - "learning_rate": 3.2980034306179897e-06, - "loss": 0.2099, - "step": 814 - }, - { - "epoch": 3.975609756097561, - "grad_norm": 2.817675828933716, - "learning_rate": 3.294371706880733e-06, - "loss": 0.3073, - "step": 815 - }, - { - "epoch": 3.9804878048780488, - "grad_norm": 2.9535388946533203, - "learning_rate": 3.290738117555622e-06, - "loss": 0.2024, - "step": 816 - }, - { - "epoch": 3.9853658536585366, - "grad_norm": 5.021281719207764, - "learning_rate": 3.2871026711761666e-06, - "loss": 0.508, - "step": 817 - }, - { - "epoch": 3.9902439024390244, - "grad_norm": 3.3377649784088135, - "learning_rate": 3.2834653762802414e-06, - "loss": 0.2116, - "step": 818 - }, - { - "epoch": 3.995121951219512, - "grad_norm": 4.412073135375977, - "learning_rate": 3.2798262414100594e-06, - "loss": 0.2177, - "step": 819 - }, - { - "epoch": 4.0, - "grad_norm": 3.174323797225952, - "learning_rate": 3.2761852751121566e-06, - "loss": 0.1737, - "step": 820 - }, - { - "epoch": 4.004878048780488, - "grad_norm": 2.921494960784912, - "learning_rate": 3.272542485937369e-06, - "loss": 0.2569, - "step": 821 - }, - { - "epoch": 4.009756097560976, - "grad_norm": 2.693495512008667, - "learning_rate": 3.2688978824408136e-06, - "loss": 0.1621, - "step": 822 - }, - { - "epoch": 4.014634146341463, - "grad_norm": 2.705796718597412, - "learning_rate": 3.2652514731818698e-06, - "loss": 0.1121, - "step": 823 - }, - { - "epoch": 4.019512195121951, - "grad_norm": 3.2621448040008545, - "learning_rate": 3.2616032667241564e-06, - "loss": 0.0835, - "step": 824 - }, - { - "epoch": 4.024390243902439, - "grad_norm": 3.6205084323883057, - "learning_rate": 3.257953271635513e-06, - "loss": 0.3731, - "step": 825 - }, - { - "epoch": 4.029268292682927, - "grad_norm": 3.2600371837615967, - "learning_rate": 3.2543014964879814e-06, - "loss": 0.1051, - "step": 826 - }, - { - "epoch": 4.034146341463415, - "grad_norm": 3.865178346633911, - "learning_rate": 3.250647949857781e-06, - "loss": 0.0916, - "step": 827 - }, - { - "epoch": 4.0390243902439025, - "grad_norm": 6.9700927734375, - "learning_rate": 3.2469926403252932e-06, - "loss": 0.4037, - "step": 828 - }, - { - "epoch": 4.04390243902439, - "grad_norm": 3.658712148666382, - "learning_rate": 3.2433355764750417e-06, - "loss": 0.0523, - "step": 829 - }, - { - "epoch": 4.048780487804878, - "grad_norm": 4.911301612854004, - "learning_rate": 3.2396767668956656e-06, - "loss": 0.2616, - "step": 830 - }, - { - "epoch": 4.053658536585366, - "grad_norm": 5.019360542297363, - "learning_rate": 3.2360162201799085e-06, - "loss": 0.195, - "step": 831 - }, - { - "epoch": 4.058536585365854, - "grad_norm": 3.493767261505127, - "learning_rate": 3.2323539449245906e-06, - "loss": 0.1245, - "step": 832 - }, - { - "epoch": 4.0634146341463415, - "grad_norm": 4.246248722076416, - "learning_rate": 3.2286899497305917e-06, - "loss": 0.1147, - "step": 833 - }, - { - "epoch": 4.068292682926829, - "grad_norm": 2.993704319000244, - "learning_rate": 3.2250242432028335e-06, - "loss": 0.2189, - "step": 834 - }, - { - "epoch": 4.073170731707317, - "grad_norm": 4.695023059844971, - "learning_rate": 3.221356833950254e-06, - "loss": 0.4685, - "step": 835 - }, - { - "epoch": 4.078048780487805, - "grad_norm": 2.777644634246826, - "learning_rate": 3.21768773058579e-06, - "loss": 0.1245, - "step": 836 - }, - { - "epoch": 4.082926829268293, - "grad_norm": 3.3545901775360107, - "learning_rate": 3.21401694172636e-06, - "loss": 0.1342, - "step": 837 - }, - { - "epoch": 4.087804878048781, - "grad_norm": 2.2222652435302734, - "learning_rate": 3.2103444759928383e-06, - "loss": 0.0484, - "step": 838 - }, - { - "epoch": 4.092682926829268, - "grad_norm": 2.580345630645752, - "learning_rate": 3.2066703420100377e-06, - "loss": 0.0592, - "step": 839 - }, - { - "epoch": 4.097560975609756, - "grad_norm": 3.8652923107147217, - "learning_rate": 3.2029945484066883e-06, - "loss": 0.2536, - "step": 840 - }, - { - "epoch": 4.102439024390244, - "grad_norm": 3.0441582202911377, - "learning_rate": 3.1993171038154203e-06, - "loss": 0.1221, - "step": 841 - }, - { - "epoch": 4.107317073170732, - "grad_norm": 2.2795114517211914, - "learning_rate": 3.1956380168727385e-06, - "loss": 0.1231, - "step": 842 - }, - { - "epoch": 4.11219512195122, - "grad_norm": 3.701009750366211, - "learning_rate": 3.191957296219007e-06, - "loss": 0.2144, - "step": 843 - }, - { - "epoch": 4.117073170731707, - "grad_norm": 3.452637195587158, - "learning_rate": 3.1882749504984247e-06, - "loss": 0.1026, - "step": 844 - }, - { - "epoch": 4.121951219512195, - "grad_norm": 2.4208810329437256, - "learning_rate": 3.1845909883590076e-06, - "loss": 0.1124, - "step": 845 - }, - { - "epoch": 4.126829268292683, - "grad_norm": 4.353063583374023, - "learning_rate": 3.180905418452569e-06, - "loss": 0.2804, - "step": 846 - }, - { - "epoch": 4.131707317073171, - "grad_norm": 3.1151084899902344, - "learning_rate": 3.1772182494346963e-06, - "loss": 0.1748, - "step": 847 - }, - { - "epoch": 4.136585365853659, - "grad_norm": 3.457940101623535, - "learning_rate": 3.1735294899647344e-06, - "loss": 0.1984, - "step": 848 - }, - { - "epoch": 4.1414634146341465, - "grad_norm": 3.3556935787200928, - "learning_rate": 3.169839148705762e-06, - "loss": 0.1332, - "step": 849 - }, - { - "epoch": 4.146341463414634, - "grad_norm": 3.5510823726654053, - "learning_rate": 3.1661472343245725e-06, - "loss": 0.4788, - "step": 850 - }, - { - "epoch": 4.151219512195122, - "grad_norm": 4.036712646484375, - "learning_rate": 3.162453755491655e-06, - "loss": 0.2437, - "step": 851 - }, - { - "epoch": 4.15609756097561, - "grad_norm": 4.417062282562256, - "learning_rate": 3.158758720881171e-06, - "loss": 0.203, - "step": 852 - }, - { - "epoch": 4.160975609756098, - "grad_norm": 3.920558214187622, - "learning_rate": 3.155062139170937e-06, - "loss": 0.1462, - "step": 853 - }, - { - "epoch": 4.1658536585365855, - "grad_norm": 6.472081661224365, - "learning_rate": 3.1513640190424034e-06, - "loss": 0.0972, - "step": 854 - }, - { - "epoch": 4.170731707317073, - "grad_norm": 3.975947141647339, - "learning_rate": 3.147664369180632e-06, - "loss": 0.1092, - "step": 855 - }, - { - "epoch": 4.175609756097561, - "grad_norm": 4.977376937866211, - "learning_rate": 3.143963198274278e-06, - "loss": 0.2215, - "step": 856 - }, - { - "epoch": 4.180487804878049, - "grad_norm": 3.595460891723633, - "learning_rate": 3.140260515015569e-06, - "loss": 0.1771, - "step": 857 - }, - { - "epoch": 4.185365853658537, - "grad_norm": 3.1085658073425293, - "learning_rate": 3.136556328100284e-06, - "loss": 0.1995, - "step": 858 - }, - { - "epoch": 4.190243902439025, - "grad_norm": 4.355626583099365, - "learning_rate": 3.132850646227734e-06, - "loss": 0.4048, - "step": 859 - }, - { - "epoch": 4.195121951219512, - "grad_norm": 3.8079614639282227, - "learning_rate": 3.12914347810074e-06, - "loss": 0.1914, - "step": 860 - }, - { - "epoch": 4.2, - "grad_norm": 3.725804328918457, - "learning_rate": 3.125434832425613e-06, - "loss": 0.1579, - "step": 861 - }, - { - "epoch": 4.204878048780488, - "grad_norm": 2.974649667739868, - "learning_rate": 3.121724717912138e-06, - "loss": 0.1814, - "step": 862 - }, - { - "epoch": 4.209756097560976, - "grad_norm": 3.6391279697418213, - "learning_rate": 3.118013143273542e-06, - "loss": 0.1481, - "step": 863 - }, - { - "epoch": 4.214634146341464, - "grad_norm": 3.216643810272217, - "learning_rate": 3.1143001172264893e-06, - "loss": 0.113, - "step": 864 - }, - { - "epoch": 4.219512195121951, - "grad_norm": 3.605855941772461, - "learning_rate": 3.1105856484910474e-06, - "loss": 0.1405, - "step": 865 - }, - { - "epoch": 4.224390243902439, - "grad_norm": 2.7186765670776367, - "learning_rate": 3.1068697457906736e-06, - "loss": 0.097, - "step": 866 - }, - { - "epoch": 4.229268292682927, - "grad_norm": 3.980973243713379, - "learning_rate": 3.1031524178521938e-06, - "loss": 0.2207, - "step": 867 - }, - { - "epoch": 4.234146341463415, - "grad_norm": 3.4623806476593018, - "learning_rate": 3.0994336734057804e-06, - "loss": 0.0552, - "step": 868 - }, - { - "epoch": 4.239024390243903, - "grad_norm": 3.7556748390197754, - "learning_rate": 3.0957135211849315e-06, - "loss": 0.1743, - "step": 869 - }, - { - "epoch": 4.2439024390243905, - "grad_norm": 3.3547914028167725, - "learning_rate": 3.0919919699264535e-06, - "loss": 0.1195, - "step": 870 - }, - { - "epoch": 4.248780487804878, - "grad_norm": 4.392014503479004, - "learning_rate": 3.0882690283704355e-06, - "loss": 0.6174, - "step": 871 - }, - { - "epoch": 4.253658536585366, - "grad_norm": 2.7031409740448, - "learning_rate": 3.084544705260234e-06, - "loss": 0.1359, - "step": 872 - }, - { - "epoch": 4.258536585365854, - "grad_norm": 2.3518481254577637, - "learning_rate": 3.080819009342451e-06, - "loss": 0.0786, - "step": 873 - }, - { - "epoch": 4.263414634146342, - "grad_norm": 2.636204481124878, - "learning_rate": 3.077091949366908e-06, - "loss": 0.0677, - "step": 874 - }, - { - "epoch": 4.2682926829268295, - "grad_norm": 2.8670942783355713, - "learning_rate": 3.073363534086636e-06, - "loss": 0.1084, - "step": 875 - }, - { - "epoch": 4.273170731707317, - "grad_norm": 2.7044737339019775, - "learning_rate": 3.0696337722578444e-06, - "loss": 0.0681, - "step": 876 - }, - { - "epoch": 4.278048780487805, - "grad_norm": 3.481539487838745, - "learning_rate": 3.0659026726399072e-06, - "loss": 0.2262, - "step": 877 - }, - { - "epoch": 4.282926829268293, - "grad_norm": 3.7746224403381348, - "learning_rate": 3.0621702439953393e-06, - "loss": 0.2169, - "step": 878 - }, - { - "epoch": 4.287804878048781, - "grad_norm": 3.6386263370513916, - "learning_rate": 3.0584364950897768e-06, - "loss": 0.0581, - "step": 879 - }, - { - "epoch": 4.2926829268292686, - "grad_norm": 3.389408588409424, - "learning_rate": 3.0547014346919574e-06, - "loss": 0.1687, - "step": 880 - }, - { - "epoch": 4.297560975609756, - "grad_norm": 3.6510157585144043, - "learning_rate": 3.0509650715736977e-06, - "loss": 0.1362, - "step": 881 - }, - { - "epoch": 4.302439024390244, - "grad_norm": 3.334210157394409, - "learning_rate": 3.0472274145098744e-06, - "loss": 0.1865, - "step": 882 - }, - { - "epoch": 4.307317073170732, - "grad_norm": 4.747341632843018, - "learning_rate": 3.0434884722784026e-06, - "loss": 0.2385, - "step": 883 - }, - { - "epoch": 4.31219512195122, - "grad_norm": 3.9266858100891113, - "learning_rate": 3.0397482536602168e-06, - "loss": 0.1004, - "step": 884 - }, - { - "epoch": 4.317073170731708, - "grad_norm": 2.984821081161499, - "learning_rate": 3.0360067674392475e-06, - "loss": 0.1469, - "step": 885 - }, - { - "epoch": 4.321951219512195, - "grad_norm": 2.6379380226135254, - "learning_rate": 3.0322640224024024e-06, - "loss": 0.0829, - "step": 886 - }, - { - "epoch": 4.326829268292683, - "grad_norm": 3.885495185852051, - "learning_rate": 3.0285200273395478e-06, - "loss": 0.2256, - "step": 887 - }, - { - "epoch": 4.331707317073171, - "grad_norm": 3.950394868850708, - "learning_rate": 3.024774791043481e-06, - "loss": 0.2402, - "step": 888 - }, - { - "epoch": 4.336585365853659, - "grad_norm": 4.147830963134766, - "learning_rate": 3.021028322309921e-06, - "loss": 0.2198, - "step": 889 - }, - { - "epoch": 4.341463414634147, - "grad_norm": 4.0821638107299805, - "learning_rate": 3.0172806299374734e-06, - "loss": 0.2304, - "step": 890 - }, - { - "epoch": 4.3463414634146345, - "grad_norm": 4.142312049865723, - "learning_rate": 3.0135317227276247e-06, - "loss": 0.2864, - "step": 891 - }, - { - "epoch": 4.351219512195122, - "grad_norm": 3.008504867553711, - "learning_rate": 3.0097816094847104e-06, - "loss": 0.2045, - "step": 892 - }, - { - "epoch": 4.35609756097561, - "grad_norm": 3.1674623489379883, - "learning_rate": 3.0060302990158984e-06, - "loss": 0.0864, - "step": 893 - }, - { - "epoch": 4.360975609756098, - "grad_norm": 3.3412492275238037, - "learning_rate": 3.002277800131171e-06, - "loss": 0.076, - "step": 894 - }, - { - "epoch": 4.365853658536586, - "grad_norm": 3.067330837249756, - "learning_rate": 2.998524121643298e-06, - "loss": 0.1724, - "step": 895 - }, - { - "epoch": 4.3707317073170735, - "grad_norm": 3.9015982151031494, - "learning_rate": 2.994769272367822e-06, - "loss": 0.2, - "step": 896 - }, - { - "epoch": 4.375609756097561, - "grad_norm": 3.0136911869049072, - "learning_rate": 2.991013261123035e-06, - "loss": 0.0852, - "step": 897 - }, - { - "epoch": 4.380487804878049, - "grad_norm": 3.6834237575531006, - "learning_rate": 2.9872560967299554e-06, - "loss": 0.1449, - "step": 898 - }, - { - "epoch": 4.385365853658537, - "grad_norm": 3.3486039638519287, - "learning_rate": 2.9834977880123132e-06, - "loss": 0.0659, - "step": 899 - }, - { - "epoch": 4.390243902439025, - "grad_norm": 2.971315622329712, - "learning_rate": 2.9797383437965243e-06, - "loss": 0.1114, - "step": 900 - }, - { - "epoch": 4.3951219512195125, - "grad_norm": 2.683359146118164, - "learning_rate": 2.975977772911671e-06, - "loss": 0.0822, - "step": 901 - }, - { - "epoch": 4.4, - "grad_norm": 2.9941935539245605, - "learning_rate": 2.972216084189482e-06, - "loss": 0.0858, - "step": 902 - }, - { - "epoch": 4.404878048780488, - "grad_norm": 2.4938626289367676, - "learning_rate": 2.9684532864643123e-06, - "loss": 0.1162, - "step": 903 - }, - { - "epoch": 4.409756097560976, - "grad_norm": 2.9364712238311768, - "learning_rate": 2.964689388573118e-06, - "loss": 0.0821, - "step": 904 - }, - { - "epoch": 4.414634146341464, - "grad_norm": 3.3638134002685547, - "learning_rate": 2.9609243993554434e-06, - "loss": 0.25, - "step": 905 - }, - { - "epoch": 4.419512195121952, - "grad_norm": 3.657277822494507, - "learning_rate": 2.9571583276533923e-06, - "loss": 0.0852, - "step": 906 - }, - { - "epoch": 4.424390243902439, - "grad_norm": 5.486263275146484, - "learning_rate": 2.9533911823116124e-06, - "loss": 0.5123, - "step": 907 - }, - { - "epoch": 4.429268292682927, - "grad_norm": 5.194574356079102, - "learning_rate": 2.9496229721772734e-06, - "loss": 0.1854, - "step": 908 - }, - { - "epoch": 4.434146341463415, - "grad_norm": 3.520110845565796, - "learning_rate": 2.9458537061000435e-06, - "loss": 0.1785, - "step": 909 - }, - { - "epoch": 4.439024390243903, - "grad_norm": 3.417991876602173, - "learning_rate": 2.9420833929320726e-06, - "loss": 0.1603, - "step": 910 - }, - { - "epoch": 4.443902439024391, - "grad_norm": 5.225805282592773, - "learning_rate": 2.93831204152797e-06, - "loss": 0.3046, - "step": 911 - }, - { - "epoch": 4.4487804878048784, - "grad_norm": 3.541433572769165, - "learning_rate": 2.9345396607447807e-06, - "loss": 0.0631, - "step": 912 - }, - { - "epoch": 4.453658536585366, - "grad_norm": 3.909377098083496, - "learning_rate": 2.9307662594419704e-06, - "loss": 0.125, - "step": 913 - }, - { - "epoch": 4.458536585365854, - "grad_norm": 3.6604416370391846, - "learning_rate": 2.9269918464814e-06, - "loss": 0.156, - "step": 914 - }, - { - "epoch": 4.463414634146342, - "grad_norm": 3.7413833141326904, - "learning_rate": 2.923216430727306e-06, - "loss": 0.3334, - "step": 915 - }, - { - "epoch": 4.46829268292683, - "grad_norm": 3.531996011734009, - "learning_rate": 2.9194400210462808e-06, - "loss": 0.2534, - "step": 916 - }, - { - "epoch": 4.473170731707317, - "grad_norm": 4.163621425628662, - "learning_rate": 2.91566262630725e-06, - "loss": 0.352, - "step": 917 - }, - { - "epoch": 4.478048780487805, - "grad_norm": 3.923635482788086, - "learning_rate": 2.9118842553814526e-06, - "loss": 0.1132, - "step": 918 - }, - { - "epoch": 4.482926829268292, - "grad_norm": 2.833768844604492, - "learning_rate": 2.9081049171424223e-06, - "loss": 0.086, - "step": 919 - }, - { - "epoch": 4.487804878048781, - "grad_norm": 2.9006292819976807, - "learning_rate": 2.9043246204659624e-06, - "loss": 0.0693, - "step": 920 - }, - { - "epoch": 4.492682926829268, - "grad_norm": 3.699376344680786, - "learning_rate": 2.9005433742301274e-06, - "loss": 0.2463, - "step": 921 - }, - { - "epoch": 4.4975609756097565, - "grad_norm": 4.882141590118408, - "learning_rate": 2.8967611873152037e-06, - "loss": 0.2275, - "step": 922 - }, - { - "epoch": 4.5024390243902435, - "grad_norm": 3.0554678440093994, - "learning_rate": 2.892978068603683e-06, - "loss": 0.0752, - "step": 923 - }, - { - "epoch": 4.507317073170732, - "grad_norm": 3.1225268840789795, - "learning_rate": 2.889194026980249e-06, - "loss": 0.1649, - "step": 924 - }, - { - "epoch": 4.512195121951219, - "grad_norm": 17.75234031677246, - "learning_rate": 2.8854090713317514e-06, - "loss": 0.0437, - "step": 925 - }, - { - "epoch": 4.517073170731708, - "grad_norm": 3.011223554611206, - "learning_rate": 2.8816232105471864e-06, - "loss": 0.0747, - "step": 926 - }, - { - "epoch": 4.521951219512195, - "grad_norm": 4.327573299407959, - "learning_rate": 2.877836453517677e-06, - "loss": 0.3884, - "step": 927 - }, - { - "epoch": 4.526829268292683, - "grad_norm": 3.8694965839385986, - "learning_rate": 2.8740488091364492e-06, - "loss": 0.2741, - "step": 928 - }, - { - "epoch": 4.53170731707317, - "grad_norm": 5.375877380371094, - "learning_rate": 2.870260286298814e-06, - "loss": 0.364, - "step": 929 - }, - { - "epoch": 4.536585365853659, - "grad_norm": 3.380891799926758, - "learning_rate": 2.866470893902147e-06, - "loss": 0.1495, - "step": 930 - }, - { - "epoch": 4.541463414634146, - "grad_norm": 3.723992109298706, - "learning_rate": 2.8626806408458626e-06, - "loss": 0.1403, - "step": 931 - }, - { - "epoch": 4.546341463414635, - "grad_norm": 3.0534417629241943, - "learning_rate": 2.8588895360313983e-06, - "loss": 0.0946, - "step": 932 - }, - { - "epoch": 4.5512195121951216, - "grad_norm": 2.8875234127044678, - "learning_rate": 2.8550975883621935e-06, - "loss": 0.1851, - "step": 933 - }, - { - "epoch": 4.55609756097561, - "grad_norm": 3.532166004180908, - "learning_rate": 2.8513048067436644e-06, - "loss": 0.178, - "step": 934 - }, - { - "epoch": 4.560975609756097, - "grad_norm": 2.942798376083374, - "learning_rate": 2.847511200083187e-06, - "loss": 0.1131, - "step": 935 - }, - { - "epoch": 4.565853658536585, - "grad_norm": 2.926874876022339, - "learning_rate": 2.843716777290074e-06, - "loss": 0.1251, - "step": 936 - }, - { - "epoch": 4.570731707317073, - "grad_norm": 3.525895357131958, - "learning_rate": 2.839921547275556e-06, - "loss": 0.0946, - "step": 937 - }, - { - "epoch": 4.575609756097561, - "grad_norm": 3.7033681869506836, - "learning_rate": 2.836125518952759e-06, - "loss": 0.1529, - "step": 938 - }, - { - "epoch": 4.580487804878048, - "grad_norm": 3.235154867172241, - "learning_rate": 2.8323287012366845e-06, - "loss": 0.2511, - "step": 939 - }, - { - "epoch": 4.585365853658536, - "grad_norm": 3.5275583267211914, - "learning_rate": 2.828531103044186e-06, - "loss": 0.1474, - "step": 940 - }, - { - "epoch": 4.590243902439024, - "grad_norm": 3.1356353759765625, - "learning_rate": 2.8247327332939512e-06, - "loss": 0.2249, - "step": 941 - }, - { - "epoch": 4.595121951219512, - "grad_norm": 3.789210081100464, - "learning_rate": 2.82093360090648e-06, - "loss": 0.2258, - "step": 942 - }, - { - "epoch": 4.6, - "grad_norm": 4.841623306274414, - "learning_rate": 2.8171337148040636e-06, - "loss": 0.2235, - "step": 943 - }, - { - "epoch": 4.6048780487804875, - "grad_norm": 3.161630630493164, - "learning_rate": 2.813333083910761e-06, - "loss": 0.1562, - "step": 944 - }, - { - "epoch": 4.609756097560975, - "grad_norm": 2.8718132972717285, - "learning_rate": 2.8095317171523835e-06, - "loss": 0.0625, - "step": 945 - }, - { - "epoch": 4.614634146341463, - "grad_norm": 3.6432454586029053, - "learning_rate": 2.805729623456469e-06, - "loss": 0.2205, - "step": 946 - }, - { - "epoch": 4.619512195121951, - "grad_norm": 4.382034778594971, - "learning_rate": 2.8019268117522624e-06, - "loss": 0.3241, - "step": 947 - }, - { - "epoch": 4.624390243902439, - "grad_norm": 3.2998175621032715, - "learning_rate": 2.798123290970695e-06, - "loss": 0.1983, - "step": 948 - }, - { - "epoch": 4.6292682926829265, - "grad_norm": 3.8665990829467773, - "learning_rate": 2.794319070044365e-06, - "loss": 0.3391, - "step": 949 - }, - { - "epoch": 4.634146341463414, - "grad_norm": 3.628403425216675, - "learning_rate": 2.790514157907512e-06, - "loss": 0.1329, - "step": 950 - }, - { - "epoch": 4.639024390243902, - "grad_norm": 2.8889615535736084, - "learning_rate": 2.786708563496002e-06, - "loss": 0.141, - "step": 951 - }, - { - "epoch": 4.64390243902439, - "grad_norm": 4.07351541519165, - "learning_rate": 2.782902295747299e-06, - "loss": 0.2935, - "step": 952 - }, - { - "epoch": 4.648780487804878, - "grad_norm": 4.220067024230957, - "learning_rate": 2.7790953636004536e-06, - "loss": 0.318, - "step": 953 - }, - { - "epoch": 4.6536585365853655, - "grad_norm": 3.8444325923919678, - "learning_rate": 2.775287775996074e-06, - "loss": 0.3388, - "step": 954 - }, - { - "epoch": 4.658536585365853, - "grad_norm": 3.197313070297241, - "learning_rate": 2.7714795418763067e-06, - "loss": 0.0925, - "step": 955 - }, - { - "epoch": 4.663414634146341, - "grad_norm": 4.0050811767578125, - "learning_rate": 2.7676706701848187e-06, - "loss": 0.2811, - "step": 956 - }, - { - "epoch": 4.668292682926829, - "grad_norm": 3.217160224914551, - "learning_rate": 2.763861169866774e-06, - "loss": 0.311, - "step": 957 - }, - { - "epoch": 4.673170731707317, - "grad_norm": 2.9892494678497314, - "learning_rate": 2.7600510498688104e-06, - "loss": 0.0582, - "step": 958 - }, - { - "epoch": 4.678048780487805, - "grad_norm": 3.954805374145508, - "learning_rate": 2.7562403191390246e-06, - "loss": 0.1238, - "step": 959 - }, - { - "epoch": 4.682926829268292, - "grad_norm": 2.9582695960998535, - "learning_rate": 2.7524289866269467e-06, - "loss": 0.1243, - "step": 960 - }, - { - "epoch": 4.68780487804878, - "grad_norm": 2.807002544403076, - "learning_rate": 2.748617061283518e-06, - "loss": 0.1388, - "step": 961 - }, - { - "epoch": 4.692682926829268, - "grad_norm": 3.980499505996704, - "learning_rate": 2.744804552061074e-06, - "loss": 0.1144, - "step": 962 - }, - { - "epoch": 4.697560975609756, - "grad_norm": 3.6389007568359375, - "learning_rate": 2.740991467913321e-06, - "loss": 0.2155, - "step": 963 - }, - { - "epoch": 4.702439024390244, - "grad_norm": 3.0950801372528076, - "learning_rate": 2.737177817795315e-06, - "loss": 0.0983, - "step": 964 - }, - { - "epoch": 4.7073170731707314, - "grad_norm": 3.1723053455352783, - "learning_rate": 2.7333636106634414e-06, - "loss": 0.1365, - "step": 965 - }, - { - "epoch": 4.712195121951219, - "grad_norm": 3.83921217918396, - "learning_rate": 2.7295488554753957e-06, - "loss": 0.1977, - "step": 966 - }, - { - "epoch": 4.717073170731707, - "grad_norm": 3.348057746887207, - "learning_rate": 2.725733561190157e-06, - "loss": 0.1311, - "step": 967 - }, - { - "epoch": 4.721951219512195, - "grad_norm": 3.828483819961548, - "learning_rate": 2.721917736767973e-06, - "loss": 0.2464, - "step": 968 - }, - { - "epoch": 4.726829268292683, - "grad_norm": 2.6004624366760254, - "learning_rate": 2.7181013911703357e-06, - "loss": 0.1088, - "step": 969 - }, - { - "epoch": 4.7317073170731705, - "grad_norm": 3.316990852355957, - "learning_rate": 2.714284533359961e-06, - "loss": 0.1492, - "step": 970 - }, - { - "epoch": 4.736585365853658, - "grad_norm": 3.8770010471343994, - "learning_rate": 2.710467172300768e-06, - "loss": 0.218, - "step": 971 - }, - { - "epoch": 4.741463414634146, - "grad_norm": 4.456376552581787, - "learning_rate": 2.706649316957857e-06, - "loss": 0.2199, - "step": 972 - }, - { - "epoch": 4.746341463414634, - "grad_norm": 3.3376309871673584, - "learning_rate": 2.7028309762974897e-06, - "loss": 0.0595, - "step": 973 - }, - { - "epoch": 4.751219512195122, - "grad_norm": 3.6755495071411133, - "learning_rate": 2.699012159287069e-06, - "loss": 0.1653, - "step": 974 - }, - { - "epoch": 4.7560975609756095, - "grad_norm": 2.939887046813965, - "learning_rate": 2.6951928748951125e-06, - "loss": 0.0681, - "step": 975 - }, - { - "epoch": 4.760975609756097, - "grad_norm": 3.4101195335388184, - "learning_rate": 2.69137313209124e-06, - "loss": 0.2046, - "step": 976 - }, - { - "epoch": 4.765853658536585, - "grad_norm": 3.9811208248138428, - "learning_rate": 2.687552939846145e-06, - "loss": 0.2255, - "step": 977 - }, - { - "epoch": 4.770731707317073, - "grad_norm": 3.484255313873291, - "learning_rate": 2.6837323071315766e-06, - "loss": 0.0512, - "step": 978 - }, - { - "epoch": 4.775609756097561, - "grad_norm": 3.9005143642425537, - "learning_rate": 2.679911242920321e-06, - "loss": 0.162, - "step": 979 - }, - { - "epoch": 4.780487804878049, - "grad_norm": 4.933374881744385, - "learning_rate": 2.6760897561861742e-06, - "loss": 0.398, - "step": 980 - }, - { - "epoch": 4.785365853658536, - "grad_norm": 3.0741539001464844, - "learning_rate": 2.672267855903927e-06, - "loss": 0.0507, - "step": 981 - }, - { - "epoch": 4.790243902439024, - "grad_norm": 3.023772716522217, - "learning_rate": 2.6684455510493413e-06, - "loss": 0.2066, - "step": 982 - }, - { - "epoch": 4.795121951219512, - "grad_norm": 3.0102407932281494, - "learning_rate": 2.6646228505991267e-06, - "loss": 0.2296, - "step": 983 - }, - { - "epoch": 4.8, - "grad_norm": 3.902200222015381, - "learning_rate": 2.6607997635309246e-06, - "loss": 0.14, - "step": 984 - }, - { - "epoch": 4.804878048780488, - "grad_norm": 3.836185932159424, - "learning_rate": 2.6569762988232838e-06, - "loss": 0.1583, - "step": 985 - }, - { - "epoch": 4.809756097560975, - "grad_norm": 3.539628744125366, - "learning_rate": 2.653152465455639e-06, - "loss": 0.2619, - "step": 986 - }, - { - "epoch": 4.814634146341463, - "grad_norm": 4.716914653778076, - "learning_rate": 2.6493282724082913e-06, - "loss": 0.3029, - "step": 987 - }, - { - "epoch": 4.819512195121951, - "grad_norm": 3.466914176940918, - "learning_rate": 2.6455037286623864e-06, - "loss": 0.095, - "step": 988 - }, - { - "epoch": 4.824390243902439, - "grad_norm": 2.1798667907714844, - "learning_rate": 2.6416788431998935e-06, - "loss": 0.1232, - "step": 989 - }, - { - "epoch": 4.829268292682927, - "grad_norm": 3.309039354324341, - "learning_rate": 2.637853625003585e-06, - "loss": 0.3671, - "step": 990 - }, - { - "epoch": 4.8341463414634145, - "grad_norm": 3.2619435787200928, - "learning_rate": 2.6340280830570142e-06, - "loss": 0.194, - "step": 991 - }, - { - "epoch": 4.839024390243902, - "grad_norm": 3.601161003112793, - "learning_rate": 2.6302022263444947e-06, - "loss": 0.1214, - "step": 992 - }, - { - "epoch": 4.84390243902439, - "grad_norm": 4.13787841796875, - "learning_rate": 2.6263760638510793e-06, - "loss": 0.311, - "step": 993 - }, - { - "epoch": 4.848780487804878, - "grad_norm": 3.0474166870117188, - "learning_rate": 2.6225496045625394e-06, - "loss": 0.1853, - "step": 994 - }, - { - "epoch": 4.853658536585366, - "grad_norm": 4.481237411499023, - "learning_rate": 2.6187228574653428e-06, - "loss": 0.2088, - "step": 995 - }, - { - "epoch": 4.8585365853658535, - "grad_norm": 3.235966444015503, - "learning_rate": 2.614895831546633e-06, - "loss": 0.1439, - "step": 996 - }, - { - "epoch": 4.863414634146341, - "grad_norm": 4.103270053863525, - "learning_rate": 2.6110685357942096e-06, - "loss": 0.2823, - "step": 997 - }, - { - "epoch": 4.868292682926829, - "grad_norm": 4.134536266326904, - "learning_rate": 2.6072409791965048e-06, - "loss": 0.2963, - "step": 998 - }, - { - "epoch": 4.873170731707317, - "grad_norm": 4.124892711639404, - "learning_rate": 2.6034131707425638e-06, - "loss": 0.4127, - "step": 999 - }, - { - "epoch": 4.878048780487805, - "grad_norm": 3.565139055252075, - "learning_rate": 2.5995851194220223e-06, - "loss": 0.1601, - "step": 1000 - }, - { - "epoch": 4.882926829268293, - "grad_norm": 2.7548017501831055, - "learning_rate": 2.595756834225089e-06, - "loss": 0.161, - "step": 1001 - }, - { - "epoch": 4.88780487804878, - "grad_norm": 3.9297611713409424, - "learning_rate": 2.5919283241425188e-06, - "loss": 0.1013, - "step": 1002 - }, - { - "epoch": 4.892682926829268, - "grad_norm": 2.4904236793518066, - "learning_rate": 2.5880995981655965e-06, - "loss": 0.1177, - "step": 1003 - }, - { - "epoch": 4.897560975609756, - "grad_norm": 3.513308048248291, - "learning_rate": 2.584270665286113e-06, - "loss": 0.0682, - "step": 1004 - }, - { - "epoch": 4.902439024390244, - "grad_norm": 4.221067428588867, - "learning_rate": 2.580441534496346e-06, - "loss": 0.1502, - "step": 1005 - }, - { - "epoch": 4.907317073170732, - "grad_norm": 3.4298903942108154, - "learning_rate": 2.576612214789039e-06, - "loss": 0.1772, - "step": 1006 - }, - { - "epoch": 4.912195121951219, - "grad_norm": 4.402887344360352, - "learning_rate": 2.5727827151573747e-06, - "loss": 0.2029, - "step": 1007 - }, - { - "epoch": 4.917073170731707, - "grad_norm": 4.194999694824219, - "learning_rate": 2.568953044594964e-06, - "loss": 0.1269, - "step": 1008 - }, - { - "epoch": 4.921951219512195, - "grad_norm": 3.657607078552246, - "learning_rate": 2.5651232120958157e-06, - "loss": 0.1311, - "step": 1009 - }, - { - "epoch": 4.926829268292683, - "grad_norm": 4.092184543609619, - "learning_rate": 2.56129322665432e-06, - "loss": 0.1085, - "step": 1010 - }, - { - "epoch": 4.931707317073171, - "grad_norm": 3.3648242950439453, - "learning_rate": 2.5574630972652263e-06, - "loss": 0.0782, - "step": 1011 - }, - { - "epoch": 4.9365853658536585, - "grad_norm": 3.7215166091918945, - "learning_rate": 2.553632832923622e-06, - "loss": 0.1391, - "step": 1012 - }, - { - "epoch": 4.941463414634146, - "grad_norm": 4.045740127563477, - "learning_rate": 2.5498024426249107e-06, - "loss": 0.3141, - "step": 1013 - }, - { - "epoch": 4.946341463414634, - "grad_norm": 3.2363107204437256, - "learning_rate": 2.545971935364794e-06, - "loss": 0.0679, - "step": 1014 - }, - { - "epoch": 4.951219512195122, - "grad_norm": 3.057283639907837, - "learning_rate": 2.5421413201392443e-06, - "loss": 0.1382, - "step": 1015 - }, - { - "epoch": 4.95609756097561, - "grad_norm": 3.591535806655884, - "learning_rate": 2.538310605944491e-06, - "loss": 0.112, - "step": 1016 - }, - { - "epoch": 4.9609756097560975, - "grad_norm": 3.1629281044006348, - "learning_rate": 2.534479801776996e-06, - "loss": 0.1261, - "step": 1017 - }, - { - "epoch": 4.965853658536585, - "grad_norm": 2.691740036010742, - "learning_rate": 2.53064891663343e-06, - "loss": 0.2328, - "step": 1018 - }, - { - "epoch": 4.970731707317073, - "grad_norm": 3.2620503902435303, - "learning_rate": 2.526817959510655e-06, - "loss": 0.193, - "step": 1019 - }, - { - "epoch": 4.975609756097561, - "grad_norm": 3.0721535682678223, - "learning_rate": 2.5229869394057038e-06, - "loss": 0.2444, - "step": 1020 - }, - { - "epoch": 4.980487804878049, - "grad_norm": 2.6279208660125732, - "learning_rate": 2.5191558653157542e-06, - "loss": 0.1103, - "step": 1021 - }, - { - "epoch": 4.985365853658537, - "grad_norm": 2.9295670986175537, - "learning_rate": 2.515324746238113e-06, - "loss": 0.0553, - "step": 1022 - }, - { - "epoch": 4.990243902439024, - "grad_norm": 3.3960084915161133, - "learning_rate": 2.511493591170191e-06, - "loss": 0.1686, - "step": 1023 - }, - { - "epoch": 4.995121951219512, - "grad_norm": 4.138705253601074, - "learning_rate": 2.5076624091094846e-06, - "loss": 0.1208, - "step": 1024 - }, - { - "epoch": 5.0, - "grad_norm": 2.603870391845703, - "learning_rate": 2.503831209053554e-06, - "loss": 0.1216, - "step": 1025 - }, - { - "epoch": 5.004878048780488, - "grad_norm": 2.525205612182617, - "learning_rate": 2.5e-06, - "loss": 0.0984, - "step": 1026 - }, - { - "epoch": 5.009756097560976, - "grad_norm": 3.2502501010894775, - "learning_rate": 2.4961687909464462e-06, - "loss": 0.1323, - "step": 1027 - }, - { - "epoch": 5.014634146341463, - "grad_norm": 5.363409519195557, - "learning_rate": 2.492337590890516e-06, - "loss": 0.3516, - "step": 1028 - }, - { - "epoch": 5.019512195121951, - "grad_norm": 2.887723445892334, - "learning_rate": 2.4885064088298097e-06, - "loss": 0.1931, - "step": 1029 - }, - { - "epoch": 5.024390243902439, - "grad_norm": 3.4529435634613037, - "learning_rate": 2.4846752537618875e-06, - "loss": 0.0675, - "step": 1030 - }, - { - "epoch": 5.029268292682927, - "grad_norm": 4.202361106872559, - "learning_rate": 2.480844134684246e-06, - "loss": 0.1643, - "step": 1031 - }, - { - "epoch": 5.034146341463415, - "grad_norm": 2.910275459289551, - "learning_rate": 2.4770130605942966e-06, - "loss": 0.11, - "step": 1032 - }, - { - "epoch": 5.0390243902439025, - "grad_norm": 3.5430362224578857, - "learning_rate": 2.4731820404893457e-06, - "loss": 0.0614, - "step": 1033 - }, - { - "epoch": 5.04390243902439, - "grad_norm": 4.501879692077637, - "learning_rate": 2.469351083366571e-06, - "loss": 0.0954, - "step": 1034 - }, - { - "epoch": 5.048780487804878, - "grad_norm": 2.732261896133423, - "learning_rate": 2.4655201982230044e-06, - "loss": 0.0275, - "step": 1035 - }, - { - "epoch": 5.053658536585366, - "grad_norm": 3.5926437377929688, - "learning_rate": 2.4616893940555094e-06, - "loss": 0.0661, - "step": 1036 - }, - { - "epoch": 5.058536585365854, - "grad_norm": 4.790312767028809, - "learning_rate": 2.457858679860757e-06, - "loss": 0.2976, - "step": 1037 - }, - { - "epoch": 5.0634146341463415, - "grad_norm": 4.453246116638184, - "learning_rate": 2.4540280646352072e-06, - "loss": 0.1216, - "step": 1038 - }, - { - "epoch": 5.068292682926829, - "grad_norm": 3.288011074066162, - "learning_rate": 2.45019755737509e-06, - "loss": 0.0877, - "step": 1039 - }, - { - "epoch": 5.073170731707317, - "grad_norm": 3.566927671432495, - "learning_rate": 2.4463671670763787e-06, - "loss": 0.1661, - "step": 1040 - }, - { - "epoch": 5.078048780487805, - "grad_norm": 3.250047206878662, - "learning_rate": 2.4425369027347746e-06, - "loss": 0.211, - "step": 1041 - }, - { - "epoch": 5.082926829268293, - "grad_norm": 3.0214977264404297, - "learning_rate": 2.4387067733456804e-06, - "loss": 0.093, - "step": 1042 - }, - { - "epoch": 5.087804878048781, - "grad_norm": 3.8162097930908203, - "learning_rate": 2.4348767879041847e-06, - "loss": 0.0777, - "step": 1043 - }, - { - "epoch": 5.092682926829268, - "grad_norm": 3.8071560859680176, - "learning_rate": 2.4310469554050366e-06, - "loss": 0.087, - "step": 1044 - }, - { - "epoch": 5.097560975609756, - "grad_norm": 3.1032073497772217, - "learning_rate": 2.4272172848426257e-06, - "loss": 0.1105, - "step": 1045 - }, - { - "epoch": 5.102439024390244, - "grad_norm": 2.8980185985565186, - "learning_rate": 2.423387785210962e-06, - "loss": 0.0704, - "step": 1046 - }, - { - "epoch": 5.107317073170732, - "grad_norm": 3.9110755920410156, - "learning_rate": 2.4195584655036544e-06, - "loss": 0.2118, - "step": 1047 - }, - { - "epoch": 5.11219512195122, - "grad_norm": 2.678884506225586, - "learning_rate": 2.4157293347138877e-06, - "loss": 0.0664, - "step": 1048 - }, - { - "epoch": 5.117073170731707, - "grad_norm": 3.183046340942383, - "learning_rate": 2.4119004018344043e-06, - "loss": 0.1767, - "step": 1049 - }, - { - "epoch": 5.121951219512195, - "grad_norm": 3.9198925495147705, - "learning_rate": 2.408071675857482e-06, - "loss": 0.1288, - "step": 1050 - }, - { - "epoch": 5.126829268292683, - "grad_norm": 4.378621578216553, - "learning_rate": 2.404243165774912e-06, - "loss": 0.1724, - "step": 1051 - }, - { - "epoch": 5.131707317073171, - "grad_norm": 2.5509133338928223, - "learning_rate": 2.4004148805779785e-06, - "loss": 0.0382, - "step": 1052 - }, - { - "epoch": 5.136585365853659, - "grad_norm": 3.692396402359009, - "learning_rate": 2.3965868292574375e-06, - "loss": 0.0942, - "step": 1053 - }, - { - "epoch": 5.1414634146341465, - "grad_norm": 3.8537800312042236, - "learning_rate": 2.392759020803496e-06, - "loss": 0.0819, - "step": 1054 - }, - { - "epoch": 5.146341463414634, - "grad_norm": 4.02876091003418, - "learning_rate": 2.3889314642057916e-06, - "loss": 0.0866, - "step": 1055 - }, - { - "epoch": 5.151219512195122, - "grad_norm": 3.531857490539551, - "learning_rate": 2.3851041684533677e-06, - "loss": 0.1557, - "step": 1056 - }, - { - "epoch": 5.15609756097561, - "grad_norm": 2.231265068054199, - "learning_rate": 2.381277142534658e-06, - "loss": 0.0421, - "step": 1057 - }, - { - "epoch": 5.160975609756098, - "grad_norm": 3.159226894378662, - "learning_rate": 2.3774503954374614e-06, - "loss": 0.0395, - "step": 1058 - }, - { - "epoch": 5.1658536585365855, - "grad_norm": 3.0375123023986816, - "learning_rate": 2.373623936148921e-06, - "loss": 0.1869, - "step": 1059 - }, - { - "epoch": 5.170731707317073, - "grad_norm": 5.4905900955200195, - "learning_rate": 2.369797773655506e-06, - "loss": 0.1426, - "step": 1060 - }, - { - "epoch": 5.175609756097561, - "grad_norm": 2.8739638328552246, - "learning_rate": 2.3659719169429866e-06, - "loss": 0.0788, - "step": 1061 - }, - { - "epoch": 5.180487804878049, - "grad_norm": 2.612183094024658, - "learning_rate": 2.3621463749964153e-06, - "loss": 0.0449, - "step": 1062 - }, - { - "epoch": 5.185365853658537, - "grad_norm": 2.0573198795318604, - "learning_rate": 2.3583211568001073e-06, - "loss": 0.0264, - "step": 1063 - }, - { - "epoch": 5.190243902439025, - "grad_norm": 2.3667244911193848, - "learning_rate": 2.3544962713376144e-06, - "loss": 0.0507, - "step": 1064 - }, - { - "epoch": 5.195121951219512, - "grad_norm": 2.1223740577697754, - "learning_rate": 2.3506717275917095e-06, - "loss": 0.0576, - "step": 1065 - }, - { - "epoch": 5.2, - "grad_norm": 2.2630319595336914, - "learning_rate": 2.346847534544362e-06, - "loss": 0.0523, - "step": 1066 - }, - { - "epoch": 5.204878048780488, - "grad_norm": 3.201913595199585, - "learning_rate": 2.3430237011767166e-06, - "loss": 0.0847, - "step": 1067 - }, - { - "epoch": 5.209756097560976, - "grad_norm": 2.2149481773376465, - "learning_rate": 2.3392002364690762e-06, - "loss": 0.0215, - "step": 1068 - }, - { - "epoch": 5.214634146341464, - "grad_norm": 4.425244331359863, - "learning_rate": 2.335377149400874e-06, - "loss": 0.1018, - "step": 1069 - }, - { - "epoch": 5.219512195121951, - "grad_norm": 4.548358917236328, - "learning_rate": 2.3315544489506596e-06, - "loss": 0.1485, - "step": 1070 - }, - { - "epoch": 5.224390243902439, - "grad_norm": 3.635796546936035, - "learning_rate": 2.3277321440960733e-06, - "loss": 0.111, - "step": 1071 - }, - { - "epoch": 5.229268292682927, - "grad_norm": 2.3180043697357178, - "learning_rate": 2.323910243813826e-06, - "loss": 0.0267, - "step": 1072 - }, - { - "epoch": 5.234146341463415, - "grad_norm": 3.675490379333496, - "learning_rate": 2.3200887570796798e-06, - "loss": 0.153, - "step": 1073 - }, - { - "epoch": 5.239024390243903, - "grad_norm": 2.883225202560425, - "learning_rate": 2.316267692868424e-06, - "loss": 0.0968, - "step": 1074 - }, - { - "epoch": 5.2439024390243905, - "grad_norm": 3.0320188999176025, - "learning_rate": 2.312447060153856e-06, - "loss": 0.0786, - "step": 1075 - }, - { - "epoch": 5.248780487804878, - "grad_norm": 2.682695150375366, - "learning_rate": 2.308626867908761e-06, - "loss": 0.0677, - "step": 1076 - }, - { - "epoch": 5.253658536585366, - "grad_norm": 3.941967010498047, - "learning_rate": 2.3048071251048884e-06, - "loss": 0.1059, - "step": 1077 - }, - { - "epoch": 5.258536585365854, - "grad_norm": 6.485599517822266, - "learning_rate": 2.300987840712932e-06, - "loss": 0.1331, - "step": 1078 - }, - { - "epoch": 5.263414634146342, - "grad_norm": 3.809269905090332, - "learning_rate": 2.297169023702511e-06, - "loss": 0.169, - "step": 1079 - }, - { - "epoch": 5.2682926829268295, - "grad_norm": 3.115626573562622, - "learning_rate": 2.2933506830421436e-06, - "loss": 0.1349, - "step": 1080 - }, - { - "epoch": 5.273170731707317, - "grad_norm": 2.2234909534454346, - "learning_rate": 2.2895328276992325e-06, - "loss": 0.0191, - "step": 1081 - }, - { - "epoch": 5.278048780487805, - "grad_norm": 3.896925926208496, - "learning_rate": 2.28571546664004e-06, - "loss": 0.1961, - "step": 1082 - }, - { - "epoch": 5.282926829268293, - "grad_norm": 2.4134509563446045, - "learning_rate": 2.281898608829665e-06, - "loss": 0.02, - "step": 1083 - }, - { - "epoch": 5.287804878048781, - "grad_norm": 2.7599191665649414, - "learning_rate": 2.2780822632320273e-06, - "loss": 0.0763, - "step": 1084 - }, - { - "epoch": 5.2926829268292686, - "grad_norm": 2.465637683868408, - "learning_rate": 2.2742664388098435e-06, - "loss": 0.0403, - "step": 1085 - }, - { - "epoch": 5.297560975609756, - "grad_norm": 2.4026618003845215, - "learning_rate": 2.270451144524605e-06, - "loss": 0.0982, - "step": 1086 - }, - { - "epoch": 5.302439024390244, - "grad_norm": 3.3339459896087646, - "learning_rate": 2.266636389336559e-06, - "loss": 0.09, - "step": 1087 - }, - { - "epoch": 5.307317073170732, - "grad_norm": 2.113255023956299, - "learning_rate": 2.262822182204686e-06, - "loss": 0.0267, - "step": 1088 - }, - { - "epoch": 5.31219512195122, - "grad_norm": 3.1760852336883545, - "learning_rate": 2.2590085320866798e-06, - "loss": 0.0295, - "step": 1089 - }, - { - "epoch": 5.317073170731708, - "grad_norm": 2.9674434661865234, - "learning_rate": 2.255195447938927e-06, - "loss": 0.0261, - "step": 1090 - }, - { - "epoch": 5.321951219512195, - "grad_norm": 3.4384074211120605, - "learning_rate": 2.251382938716482e-06, - "loss": 0.0936, - "step": 1091 - }, - { - "epoch": 5.326829268292683, - "grad_norm": 3.3814568519592285, - "learning_rate": 2.2475710133730533e-06, - "loss": 0.0426, - "step": 1092 - }, - { - "epoch": 5.331707317073171, - "grad_norm": 3.081317663192749, - "learning_rate": 2.243759680860975e-06, - "loss": 0.0799, - "step": 1093 - }, - { - "epoch": 5.336585365853659, - "grad_norm": 3.5608482360839844, - "learning_rate": 2.2399489501311896e-06, - "loss": 0.0906, - "step": 1094 - }, - { - "epoch": 5.341463414634147, - "grad_norm": 3.7886314392089844, - "learning_rate": 2.2361388301332265e-06, - "loss": 0.2152, - "step": 1095 - }, - { - "epoch": 5.3463414634146345, - "grad_norm": 1.9531102180480957, - "learning_rate": 2.2323293298151817e-06, - "loss": 0.0359, - "step": 1096 - }, - { - "epoch": 5.351219512195122, - "grad_norm": 2.2828023433685303, - "learning_rate": 2.2285204581236937e-06, - "loss": 0.0368, - "step": 1097 - }, - { - "epoch": 5.35609756097561, - "grad_norm": 3.110262870788574, - "learning_rate": 2.2247122240039268e-06, - "loss": 0.0426, - "step": 1098 - }, - { - "epoch": 5.360975609756098, - "grad_norm": 2.3293566703796387, - "learning_rate": 2.2209046363995464e-06, - "loss": 0.0223, - "step": 1099 - }, - { - "epoch": 5.365853658536586, - "grad_norm": 2.990884780883789, - "learning_rate": 2.217097704252701e-06, - "loss": 0.1276, - "step": 1100 - }, - { - "epoch": 5.3707317073170735, - "grad_norm": 2.568014144897461, - "learning_rate": 2.2132914365039993e-06, - "loss": 0.0639, - "step": 1101 - }, - { - "epoch": 5.375609756097561, - "grad_norm": 2.618478536605835, - "learning_rate": 2.2094858420924882e-06, - "loss": 0.0166, - "step": 1102 - }, - { - "epoch": 5.380487804878049, - "grad_norm": 4.526919364929199, - "learning_rate": 2.205680929955635e-06, - "loss": 0.144, - "step": 1103 - }, - { - "epoch": 5.385365853658537, - "grad_norm": 2.7236886024475098, - "learning_rate": 2.201876709029305e-06, - "loss": 0.1004, - "step": 1104 - }, - { - "epoch": 5.390243902439025, - "grad_norm": 2.1577632427215576, - "learning_rate": 2.198073188247738e-06, - "loss": 0.0453, - "step": 1105 - }, - { - "epoch": 5.3951219512195125, - "grad_norm": 2.5170321464538574, - "learning_rate": 2.1942703765435317e-06, - "loss": 0.0195, - "step": 1106 - }, - { - "epoch": 5.4, - "grad_norm": 3.962658643722534, - "learning_rate": 2.190468282847617e-06, - "loss": 0.1512, - "step": 1107 - }, - { - "epoch": 5.404878048780488, - "grad_norm": 4.297860622406006, - "learning_rate": 2.186666916089239e-06, - "loss": 0.2572, - "step": 1108 - }, - { - "epoch": 5.409756097560976, - "grad_norm": 2.8933565616607666, - "learning_rate": 2.1828662851959377e-06, - "loss": 0.0536, - "step": 1109 - }, - { - "epoch": 5.414634146341464, - "grad_norm": 2.9397451877593994, - "learning_rate": 2.1790663990935203e-06, - "loss": 0.0778, - "step": 1110 - }, - { - "epoch": 5.419512195121952, - "grad_norm": 3.5210094451904297, - "learning_rate": 2.1752672667060488e-06, - "loss": 0.0558, - "step": 1111 - }, - { - "epoch": 5.424390243902439, - "grad_norm": 2.9027626514434814, - "learning_rate": 2.1714688969558146e-06, - "loss": 0.041, - "step": 1112 - }, - { - "epoch": 5.429268292682927, - "grad_norm": 3.7691168785095215, - "learning_rate": 2.167671298763316e-06, - "loss": 0.1644, - "step": 1113 - }, - { - "epoch": 5.434146341463415, - "grad_norm": 3.493008852005005, - "learning_rate": 2.1638744810472414e-06, - "loss": 0.1587, - "step": 1114 - }, - { - "epoch": 5.439024390243903, - "grad_norm": 2.711196184158325, - "learning_rate": 2.1600784527244445e-06, - "loss": 0.0605, - "step": 1115 - }, - { - "epoch": 5.443902439024391, - "grad_norm": 4.365038871765137, - "learning_rate": 2.1562832227099266e-06, - "loss": 0.1897, - "step": 1116 - }, - { - "epoch": 5.4487804878048784, - "grad_norm": 4.621466159820557, - "learning_rate": 2.152488799916814e-06, - "loss": 0.1525, - "step": 1117 - }, - { - "epoch": 5.453658536585366, - "grad_norm": 4.8721089363098145, - "learning_rate": 2.148695193256336e-06, - "loss": 0.189, - "step": 1118 - }, - { - "epoch": 5.458536585365854, - "grad_norm": 2.8999173641204834, - "learning_rate": 2.1449024116378064e-06, - "loss": 0.095, - "step": 1119 - }, - { - "epoch": 5.463414634146342, - "grad_norm": 2.4865314960479736, - "learning_rate": 2.1411104639686013e-06, - "loss": 0.0432, - "step": 1120 - }, - { - "epoch": 5.46829268292683, - "grad_norm": 3.8497228622436523, - "learning_rate": 2.137319359154138e-06, - "loss": 0.0954, - "step": 1121 - }, - { - "epoch": 5.473170731707317, - "grad_norm": 2.3643507957458496, - "learning_rate": 2.133529106097853e-06, - "loss": 0.0362, - "step": 1122 - }, - { - "epoch": 5.478048780487805, - "grad_norm": 3.017826795578003, - "learning_rate": 2.1297397137011862e-06, - "loss": 0.0875, - "step": 1123 - }, - { - "epoch": 5.482926829268292, - "grad_norm": 3.239320755004883, - "learning_rate": 2.125951190863551e-06, - "loss": 0.0758, - "step": 1124 - }, - { - "epoch": 5.487804878048781, - "grad_norm": 2.566241979598999, - "learning_rate": 2.1221635464823237e-06, - "loss": 0.0605, - "step": 1125 - }, - { - "epoch": 5.492682926829268, - "grad_norm": 4.810088157653809, - "learning_rate": 2.1183767894528135e-06, - "loss": 0.2403, - "step": 1126 - }, - { - "epoch": 5.4975609756097565, - "grad_norm": 2.083263397216797, - "learning_rate": 2.114590928668249e-06, - "loss": 0.0223, - "step": 1127 - }, - { - "epoch": 5.5024390243902435, - "grad_norm": 2.6812374591827393, - "learning_rate": 2.1108059730197517e-06, - "loss": 0.0617, - "step": 1128 - }, - { - "epoch": 5.507317073170732, - "grad_norm": 3.196735143661499, - "learning_rate": 2.1070219313963173e-06, - "loss": 0.043, - "step": 1129 - }, - { - "epoch": 5.512195121951219, - "grad_norm": 2.775470495223999, - "learning_rate": 2.1032388126847967e-06, - "loss": 0.0595, - "step": 1130 - }, - { - "epoch": 5.517073170731708, - "grad_norm": 2.8632407188415527, - "learning_rate": 2.099456625769872e-06, - "loss": 0.0186, - "step": 1131 - }, - { - "epoch": 5.521951219512195, - "grad_norm": 4.075018405914307, - "learning_rate": 2.0956753795340376e-06, - "loss": 0.0616, - "step": 1132 - }, - { - "epoch": 5.526829268292683, - "grad_norm": 3.206327199935913, - "learning_rate": 2.091895082857578e-06, - "loss": 0.1895, - "step": 1133 - }, - { - "epoch": 5.53170731707317, - "grad_norm": 2.967588186264038, - "learning_rate": 2.0881157446185474e-06, - "loss": 0.0484, - "step": 1134 - }, - { - "epoch": 5.536585365853659, - "grad_norm": 2.850929021835327, - "learning_rate": 2.0843373736927506e-06, - "loss": 0.037, - "step": 1135 - }, - { - "epoch": 5.541463414634146, - "grad_norm": 2.2505147457122803, - "learning_rate": 2.08055997895372e-06, - "loss": 0.0227, - "step": 1136 - }, - { - "epoch": 5.546341463414635, - "grad_norm": 2.5258476734161377, - "learning_rate": 2.0767835692726944e-06, - "loss": 0.0296, - "step": 1137 - }, - { - "epoch": 5.5512195121951216, - "grad_norm": 3.498741388320923, - "learning_rate": 2.0730081535186e-06, - "loss": 0.16, - "step": 1138 - }, - { - "epoch": 5.55609756097561, - "grad_norm": 2.8635222911834717, - "learning_rate": 2.06923374055803e-06, - "loss": 0.0725, - "step": 1139 - }, - { - "epoch": 5.560975609756097, - "grad_norm": 2.2779290676116943, - "learning_rate": 2.0654603392552193e-06, - "loss": 0.0198, - "step": 1140 - }, - { - "epoch": 5.565853658536585, - "grad_norm": 3.1651058197021484, - "learning_rate": 2.0616879584720305e-06, - "loss": 0.1144, - "step": 1141 - }, - { - "epoch": 5.570731707317073, - "grad_norm": 2.4238595962524414, - "learning_rate": 2.057916607067928e-06, - "loss": 0.0491, - "step": 1142 - }, - { - "epoch": 5.575609756097561, - "grad_norm": 2.3248515129089355, - "learning_rate": 2.054146293899957e-06, - "loss": 0.035, - "step": 1143 - }, - { - "epoch": 5.580487804878048, - "grad_norm": 2.9506516456604004, - "learning_rate": 2.0503770278227274e-06, - "loss": 0.0639, - "step": 1144 - }, - { - "epoch": 5.585365853658536, - "grad_norm": 2.6403958797454834, - "learning_rate": 2.0466088176883876e-06, - "loss": 0.0258, - "step": 1145 - }, - { - "epoch": 5.590243902439024, - "grad_norm": 3.150115728378296, - "learning_rate": 2.042841672346608e-06, - "loss": 0.0634, - "step": 1146 - }, - { - "epoch": 5.595121951219512, - "grad_norm": 2.742691993713379, - "learning_rate": 2.039075600644557e-06, - "loss": 0.0464, - "step": 1147 - }, - { - "epoch": 5.6, - "grad_norm": 2.733694076538086, - "learning_rate": 2.0353106114268824e-06, - "loss": 0.0829, - "step": 1148 - }, - { - "epoch": 5.6048780487804875, - "grad_norm": 2.511229991912842, - "learning_rate": 2.031546713535688e-06, - "loss": 0.0321, - "step": 1149 - }, - { - "epoch": 5.609756097560975, - "grad_norm": 3.019669532775879, - "learning_rate": 2.027783915810518e-06, - "loss": 0.05, - "step": 1150 - }, - { - "epoch": 5.614634146341463, - "grad_norm": 3.497159242630005, - "learning_rate": 2.024022227088329e-06, - "loss": 0.1984, - "step": 1151 - }, - { - "epoch": 5.619512195121951, - "grad_norm": 3.4637508392333984, - "learning_rate": 2.020261656203476e-06, - "loss": 0.1673, - "step": 1152 - }, - { - "epoch": 5.624390243902439, - "grad_norm": 2.4312477111816406, - "learning_rate": 2.016502211987687e-06, - "loss": 0.1106, - "step": 1153 - }, - { - "epoch": 5.6292682926829265, - "grad_norm": 2.7801673412323, - "learning_rate": 2.0127439032700446e-06, - "loss": 0.0374, - "step": 1154 - }, - { - "epoch": 5.634146341463414, - "grad_norm": 2.9346680641174316, - "learning_rate": 2.0089867388769664e-06, - "loss": 0.0674, - "step": 1155 - }, - { - "epoch": 5.639024390243902, - "grad_norm": 2.274888277053833, - "learning_rate": 2.0052307276321793e-06, - "loss": 0.0365, - "step": 1156 - }, - { - "epoch": 5.64390243902439, - "grad_norm": 3.069890022277832, - "learning_rate": 2.001475878356703e-06, - "loss": 0.0758, - "step": 1157 - }, - { - "epoch": 5.648780487804878, - "grad_norm": 3.8594915866851807, - "learning_rate": 1.99772219986883e-06, - "loss": 0.176, - "step": 1158 - }, - { - "epoch": 5.6536585365853655, - "grad_norm": 3.4886410236358643, - "learning_rate": 1.9939697009841024e-06, - "loss": 0.0491, - "step": 1159 - }, - { - "epoch": 5.658536585365853, - "grad_norm": 2.697946786880493, - "learning_rate": 1.990218390515291e-06, - "loss": 0.0741, - "step": 1160 - }, - { - "epoch": 5.663414634146341, - "grad_norm": 3.5290887355804443, - "learning_rate": 1.9864682772723757e-06, - "loss": 0.0826, - "step": 1161 - }, - { - "epoch": 5.668292682926829, - "grad_norm": 2.0601298809051514, - "learning_rate": 1.9827193700625274e-06, - "loss": 0.0378, - "step": 1162 - }, - { - "epoch": 5.673170731707317, - "grad_norm": 3.8458635807037354, - "learning_rate": 1.978971677690081e-06, - "loss": 0.2466, - "step": 1163 - }, - { - "epoch": 5.678048780487805, - "grad_norm": 2.788210153579712, - "learning_rate": 1.97522520895652e-06, - "loss": 0.0205, - "step": 1164 - }, - { - "epoch": 5.682926829268292, - "grad_norm": 3.1904587745666504, - "learning_rate": 1.971479972660454e-06, - "loss": 0.0998, - "step": 1165 - }, - { - "epoch": 5.68780487804878, - "grad_norm": 2.4664318561553955, - "learning_rate": 1.967735977597598e-06, - "loss": 0.0217, - "step": 1166 - }, - { - "epoch": 5.692682926829268, - "grad_norm": 2.1392667293548584, - "learning_rate": 1.9639932325607538e-06, - "loss": 0.048, - "step": 1167 - }, - { - "epoch": 5.697560975609756, - "grad_norm": 3.7127058506011963, - "learning_rate": 1.9602517463397845e-06, - "loss": 0.0302, - "step": 1168 - }, - { - "epoch": 5.702439024390244, - "grad_norm": 2.916168689727783, - "learning_rate": 1.9565115277215978e-06, - "loss": 0.0724, - "step": 1169 - }, - { - "epoch": 5.7073170731707314, - "grad_norm": 2.4352428913116455, - "learning_rate": 1.952772585490127e-06, - "loss": 0.0464, - "step": 1170 - }, - { - "epoch": 5.712195121951219, - "grad_norm": 2.8311455249786377, - "learning_rate": 1.9490349284263036e-06, - "loss": 0.0239, - "step": 1171 - }, - { - "epoch": 5.717073170731707, - "grad_norm": 3.3592801094055176, - "learning_rate": 1.9452985653080443e-06, - "loss": 0.0719, - "step": 1172 - }, - { - "epoch": 5.721951219512195, - "grad_norm": 2.450922966003418, - "learning_rate": 1.9415635049102245e-06, - "loss": 0.0408, - "step": 1173 - }, - { - "epoch": 5.726829268292683, - "grad_norm": 4.750118255615234, - "learning_rate": 1.937829756004662e-06, - "loss": 0.2049, - "step": 1174 - }, - { - "epoch": 5.7317073170731705, - "grad_norm": 3.0643811225891113, - "learning_rate": 1.9340973273600944e-06, - "loss": 0.0636, - "step": 1175 - }, - { - "epoch": 5.736585365853658, - "grad_norm": 3.313904047012329, - "learning_rate": 1.930366227742157e-06, - "loss": 0.1252, - "step": 1176 - }, - { - "epoch": 5.741463414634146, - "grad_norm": 3.8996808528900146, - "learning_rate": 1.9266364659133653e-06, - "loss": 0.0687, - "step": 1177 - }, - { - "epoch": 5.746341463414634, - "grad_norm": 2.727555274963379, - "learning_rate": 1.922908050633093e-06, - "loss": 0.0333, - "step": 1178 - }, - { - "epoch": 5.751219512195122, - "grad_norm": 3.270087718963623, - "learning_rate": 1.919180990657551e-06, - "loss": 0.0792, - "step": 1179 - }, - { - "epoch": 5.7560975609756095, - "grad_norm": 2.6631274223327637, - "learning_rate": 1.9154552947397668e-06, - "loss": 0.069, - "step": 1180 - }, - { - "epoch": 5.760975609756097, - "grad_norm": 4.4460554122924805, - "learning_rate": 1.9117309716295658e-06, - "loss": 0.115, - "step": 1181 - }, - { - "epoch": 5.765853658536585, - "grad_norm": 2.5652341842651367, - "learning_rate": 1.9080080300735478e-06, - "loss": 0.0537, - "step": 1182 - }, - { - "epoch": 5.770731707317073, - "grad_norm": 3.046436071395874, - "learning_rate": 1.9042864788150695e-06, - "loss": 0.0817, - "step": 1183 - }, - { - "epoch": 5.775609756097561, - "grad_norm": 2.121629238128662, - "learning_rate": 1.9005663265942206e-06, - "loss": 0.0289, - "step": 1184 - }, - { - "epoch": 5.780487804878049, - "grad_norm": 2.271918535232544, - "learning_rate": 1.8968475821478066e-06, - "loss": 0.0357, - "step": 1185 - }, - { - "epoch": 5.785365853658536, - "grad_norm": 2.582473039627075, - "learning_rate": 1.8931302542093274e-06, - "loss": 0.0584, - "step": 1186 - }, - { - "epoch": 5.790243902439024, - "grad_norm": 2.502952814102173, - "learning_rate": 1.8894143515089539e-06, - "loss": 0.0324, - "step": 1187 - }, - { - "epoch": 5.795121951219512, - "grad_norm": 1.9735453128814697, - "learning_rate": 1.8856998827735118e-06, - "loss": 0.0338, - "step": 1188 - }, - { - "epoch": 5.8, - "grad_norm": 4.441845893859863, - "learning_rate": 1.8819868567264588e-06, - "loss": 0.1706, - "step": 1189 - }, - { - "epoch": 5.804878048780488, - "grad_norm": 2.5450692176818848, - "learning_rate": 1.8782752820878636e-06, - "loss": 0.0463, - "step": 1190 - }, - { - "epoch": 5.809756097560975, - "grad_norm": 3.718183755874634, - "learning_rate": 1.8745651675743876e-06, - "loss": 0.1188, - "step": 1191 - }, - { - "epoch": 5.814634146341463, - "grad_norm": 3.246532678604126, - "learning_rate": 1.870856521899261e-06, - "loss": 0.0984, - "step": 1192 - }, - { - "epoch": 5.819512195121951, - "grad_norm": 2.9522783756256104, - "learning_rate": 1.867149353772267e-06, - "loss": 0.0195, - "step": 1193 - }, - { - "epoch": 5.824390243902439, - "grad_norm": 2.3266429901123047, - "learning_rate": 1.863443671899717e-06, - "loss": 0.0236, - "step": 1194 - }, - { - "epoch": 5.829268292682927, - "grad_norm": 3.696749448776245, - "learning_rate": 1.8597394849844319e-06, - "loss": 0.1108, - "step": 1195 - }, - { - "epoch": 5.8341463414634145, - "grad_norm": 2.375624179840088, - "learning_rate": 1.8560368017257229e-06, - "loss": 0.0388, - "step": 1196 - }, - { - "epoch": 5.839024390243902, - "grad_norm": 4.0437092781066895, - "learning_rate": 1.8523356308193696e-06, - "loss": 0.3098, - "step": 1197 - }, - { - "epoch": 5.84390243902439, - "grad_norm": 3.165165424346924, - "learning_rate": 1.8486359809575977e-06, - "loss": 0.0775, - "step": 1198 - }, - { - "epoch": 5.848780487804878, - "grad_norm": 4.1991190910339355, - "learning_rate": 1.8449378608290638e-06, - "loss": 0.1222, - "step": 1199 - }, - { - "epoch": 5.853658536585366, - "grad_norm": 4.6657819747924805, - "learning_rate": 1.8412412791188306e-06, - "loss": 0.1146, - "step": 1200 - }, - { - "epoch": 5.8585365853658535, - "grad_norm": 4.569516181945801, - "learning_rate": 1.8375462445083464e-06, - "loss": 0.1113, - "step": 1201 - }, - { - "epoch": 5.863414634146341, - "grad_norm": 3.1565654277801514, - "learning_rate": 1.8338527656754285e-06, - "loss": 0.0416, - "step": 1202 - }, - { - "epoch": 5.868292682926829, - "grad_norm": 3.3474619388580322, - "learning_rate": 1.830160851294239e-06, - "loss": 0.0613, - "step": 1203 - }, - { - "epoch": 5.873170731707317, - "grad_norm": 4.30797004699707, - "learning_rate": 1.8264705100352662e-06, - "loss": 0.197, - "step": 1204 - }, - { - "epoch": 5.878048780487805, - "grad_norm": 2.7259573936462402, - "learning_rate": 1.8227817505653045e-06, - "loss": 0.0821, - "step": 1205 - }, - { - "epoch": 5.882926829268293, - "grad_norm": 3.515812873840332, - "learning_rate": 1.8190945815474323e-06, - "loss": 0.1246, - "step": 1206 - }, - { - "epoch": 5.88780487804878, - "grad_norm": 2.9223313331604004, - "learning_rate": 1.8154090116409934e-06, - "loss": 0.0703, - "step": 1207 - }, - { - "epoch": 5.892682926829268, - "grad_norm": 3.9529640674591064, - "learning_rate": 1.811725049501577e-06, - "loss": 0.1078, - "step": 1208 - }, - { - "epoch": 5.897560975609756, - "grad_norm": 4.1674580574035645, - "learning_rate": 1.8080427037809941e-06, - "loss": 0.1648, - "step": 1209 - }, - { - "epoch": 5.902439024390244, - "grad_norm": 3.1308021545410156, - "learning_rate": 1.8043619831272623e-06, - "loss": 0.061, - "step": 1210 - }, - { - "epoch": 5.907317073170732, - "grad_norm": 3.9667179584503174, - "learning_rate": 1.8006828961845807e-06, - "loss": 0.1863, - "step": 1211 - }, - { - "epoch": 5.912195121951219, - "grad_norm": 5.438168048858643, - "learning_rate": 1.7970054515933124e-06, - "loss": 0.2387, - "step": 1212 - }, - { - "epoch": 5.917073170731707, - "grad_norm": 5.505797863006592, - "learning_rate": 1.793329657989964e-06, - "loss": 0.2053, - "step": 1213 - }, - { - "epoch": 5.921951219512195, - "grad_norm": 2.8043150901794434, - "learning_rate": 1.7896555240071627e-06, - "loss": 0.026, - "step": 1214 - }, - { - "epoch": 5.926829268292683, - "grad_norm": 2.836164712905884, - "learning_rate": 1.7859830582736406e-06, - "loss": 0.0735, - "step": 1215 - }, - { - "epoch": 5.931707317073171, - "grad_norm": 2.8286306858062744, - "learning_rate": 1.782312269414211e-06, - "loss": 0.0586, - "step": 1216 - }, - { - "epoch": 5.9365853658536585, - "grad_norm": 4.4354329109191895, - "learning_rate": 1.7786431660497474e-06, - "loss": 0.3086, - "step": 1217 - }, - { - "epoch": 5.941463414634146, - "grad_norm": 4.0963640213012695, - "learning_rate": 1.7749757567971678e-06, - "loss": 0.0978, - "step": 1218 - }, - { - "epoch": 5.946341463414634, - "grad_norm": 2.726062536239624, - "learning_rate": 1.7713100502694091e-06, - "loss": 0.0976, - "step": 1219 - }, - { - "epoch": 5.951219512195122, - "grad_norm": 2.6566951274871826, - "learning_rate": 1.7676460550754104e-06, - "loss": 0.02, - "step": 1220 - }, - { - "epoch": 5.95609756097561, - "grad_norm": 2.7710952758789062, - "learning_rate": 1.7639837798200923e-06, - "loss": 0.0741, - "step": 1221 - }, - { - "epoch": 5.9609756097560975, - "grad_norm": 2.3678600788116455, - "learning_rate": 1.7603232331043346e-06, - "loss": 0.0542, - "step": 1222 - }, - { - "epoch": 5.965853658536585, - "grad_norm": 6.45259428024292, - "learning_rate": 1.7566644235249591e-06, - "loss": 0.3552, - "step": 1223 - }, - { - "epoch": 5.970731707317073, - "grad_norm": 1.8916475772857666, - "learning_rate": 1.7530073596747072e-06, - "loss": 0.0405, - "step": 1224 - }, - { - "epoch": 5.975609756097561, - "grad_norm": 2.1637566089630127, - "learning_rate": 1.74935205014222e-06, - "loss": 0.0178, - "step": 1225 - }, - { - "epoch": 5.980487804878049, - "grad_norm": 2.5959200859069824, - "learning_rate": 1.7456985035120194e-06, - "loss": 0.0264, - "step": 1226 - }, - { - "epoch": 5.985365853658537, - "grad_norm": 2.50264573097229, - "learning_rate": 1.7420467283644877e-06, - "loss": 0.0555, - "step": 1227 - }, - { - "epoch": 5.990243902439024, - "grad_norm": 2.4692020416259766, - "learning_rate": 1.738396733275844e-06, - "loss": 0.0546, - "step": 1228 - }, - { - "epoch": 5.995121951219512, - "grad_norm": 5.540846824645996, - "learning_rate": 1.7347485268181309e-06, - "loss": 0.1967, - "step": 1229 - }, - { - "epoch": 6.0, - "grad_norm": 1.8322839736938477, - "learning_rate": 1.7311021175591868e-06, - "loss": 0.0491, - "step": 1230 - }, - { - "epoch": 6.004878048780488, - "grad_norm": 2.719622850418091, - "learning_rate": 1.7274575140626318e-06, - "loss": 0.0359, - "step": 1231 - }, - { - "epoch": 6.009756097560976, - "grad_norm": 2.859675884246826, - "learning_rate": 1.7238147248878444e-06, - "loss": 0.0585, - "step": 1232 - }, - { - "epoch": 6.014634146341463, - "grad_norm": 1.6761114597320557, - "learning_rate": 1.7201737585899415e-06, - "loss": 0.0188, - "step": 1233 - }, - { - "epoch": 6.019512195121951, - "grad_norm": 2.1588776111602783, - "learning_rate": 1.7165346237197594e-06, - "loss": 0.0484, - "step": 1234 - }, - { - "epoch": 6.024390243902439, - "grad_norm": 4.209983825683594, - "learning_rate": 1.7128973288238344e-06, - "loss": 0.0776, - "step": 1235 - }, - { - "epoch": 6.029268292682927, - "grad_norm": 2.3979365825653076, - "learning_rate": 1.709261882444379e-06, - "loss": 0.0338, - "step": 1236 - }, - { - "epoch": 6.034146341463415, - "grad_norm": 3.0030531883239746, - "learning_rate": 1.705628293119268e-06, - "loss": 0.0385, - "step": 1237 - }, - { - "epoch": 6.0390243902439025, - "grad_norm": 9.65616512298584, - "learning_rate": 1.701996569382011e-06, - "loss": 0.2601, - "step": 1238 - }, - { - "epoch": 6.04390243902439, - "grad_norm": 3.0590052604675293, - "learning_rate": 1.6983667197617386e-06, - "loss": 0.034, - "step": 1239 - }, - { - "epoch": 6.048780487804878, - "grad_norm": 3.6949822902679443, - "learning_rate": 1.6947387527831813e-06, - "loss": 0.0155, - "step": 1240 - }, - { - "epoch": 6.053658536585366, - "grad_norm": 1.2870460748672485, - "learning_rate": 1.6911126769666442e-06, - "loss": 0.0078, - "step": 1241 - }, - { - "epoch": 6.058536585365854, - "grad_norm": 4.307460784912109, - "learning_rate": 1.6874885008279945e-06, - "loss": 0.1429, - "step": 1242 - }, - { - "epoch": 6.0634146341463415, - "grad_norm": 2.334972858428955, - "learning_rate": 1.683866232878637e-06, - "loss": 0.0123, - "step": 1243 - }, - { - "epoch": 6.068292682926829, - "grad_norm": 2.4121835231781006, - "learning_rate": 1.6802458816254941e-06, - "loss": 0.0139, - "step": 1244 - }, - { - "epoch": 6.073170731707317, - "grad_norm": 1.9224514961242676, - "learning_rate": 1.676627455570988e-06, - "loss": 0.0312, - "step": 1245 - }, - { - "epoch": 6.078048780487805, - "grad_norm": 2.8293309211730957, - "learning_rate": 1.6730109632130199e-06, - "loss": 0.0464, - "step": 1246 - }, - { - "epoch": 6.082926829268293, - "grad_norm": 1.6368179321289062, - "learning_rate": 1.6693964130449472e-06, - "loss": 0.0085, - "step": 1247 - }, - { - "epoch": 6.087804878048781, - "grad_norm": 2.5535073280334473, - "learning_rate": 1.6657838135555696e-06, - "loss": 0.0482, - "step": 1248 - }, - { - "epoch": 6.092682926829268, - "grad_norm": 3.7743096351623535, - "learning_rate": 1.6621731732291024e-06, - "loss": 0.0235, - "step": 1249 - }, - { - "epoch": 6.097560975609756, - "grad_norm": 2.9921820163726807, - "learning_rate": 1.6585645005451623e-06, - "loss": 0.0455, - "step": 1250 - }, - { - "epoch": 6.102439024390244, - "grad_norm": 2.369581937789917, - "learning_rate": 1.6549578039787436e-06, - "loss": 0.0499, - "step": 1251 - }, - { - "epoch": 6.107317073170732, - "grad_norm": 2.163815498352051, - "learning_rate": 1.6513530920001998e-06, - "loss": 0.0118, - "step": 1252 - }, - { - "epoch": 6.11219512195122, - "grad_norm": 2.034928560256958, - "learning_rate": 1.6477503730752237e-06, - "loss": 0.0189, - "step": 1253 - }, - { - "epoch": 6.117073170731707, - "grad_norm": 2.7306160926818848, - "learning_rate": 1.6441496556648278e-06, - "loss": 0.0492, - "step": 1254 - }, - { - "epoch": 6.121951219512195, - "grad_norm": 3.7521040439605713, - "learning_rate": 1.6405509482253234e-06, - "loss": 0.1717, - "step": 1255 - }, - { - "epoch": 6.126829268292683, - "grad_norm": 1.8965831995010376, - "learning_rate": 1.636954259208302e-06, - "loss": 0.0194, - "step": 1256 - }, - { - "epoch": 6.131707317073171, - "grad_norm": 3.010024070739746, - "learning_rate": 1.6333595970606143e-06, - "loss": 0.0334, - "step": 1257 - }, - { - "epoch": 6.136585365853659, - "grad_norm": 3.7091450691223145, - "learning_rate": 1.62976697022435e-06, - "loss": 0.0705, - "step": 1258 - }, - { - "epoch": 6.1414634146341465, - "grad_norm": 3.5719785690307617, - "learning_rate": 1.6261763871368225e-06, - "loss": 0.0322, - "step": 1259 - }, - { - "epoch": 6.146341463414634, - "grad_norm": 3.3224213123321533, - "learning_rate": 1.6225878562305403e-06, - "loss": 0.0653, - "step": 1260 - }, - { - "epoch": 6.151219512195122, - "grad_norm": 3.78924822807312, - "learning_rate": 1.6190013859331958e-06, - "loss": 0.0557, - "step": 1261 - }, - { - "epoch": 6.15609756097561, - "grad_norm": 2.429412841796875, - "learning_rate": 1.6154169846676415e-06, - "loss": 0.0277, - "step": 1262 - }, - { - "epoch": 6.160975609756098, - "grad_norm": 2.626167058944702, - "learning_rate": 1.6118346608518698e-06, - "loss": 0.0305, - "step": 1263 - }, - { - "epoch": 6.1658536585365855, - "grad_norm": 2.44846248626709, - "learning_rate": 1.6082544228989958e-06, - "loss": 0.0093, - "step": 1264 - }, - { - "epoch": 6.170731707317073, - "grad_norm": 2.9345643520355225, - "learning_rate": 1.6046762792172336e-06, - "loss": 0.0198, - "step": 1265 - }, - { - "epoch": 6.175609756097561, - "grad_norm": 3.224313497543335, - "learning_rate": 1.6011002382098806e-06, - "loss": 0.0673, - "step": 1266 - }, - { - "epoch": 6.180487804878049, - "grad_norm": 1.9066869020462036, - "learning_rate": 1.5975263082752968e-06, - "loss": 0.0115, - "step": 1267 - }, - { - "epoch": 6.185365853658537, - "grad_norm": 2.7153308391571045, - "learning_rate": 1.5939544978068816e-06, - "loss": 0.0529, - "step": 1268 - }, - { - "epoch": 6.190243902439025, - "grad_norm": 2.2173709869384766, - "learning_rate": 1.590384815193059e-06, - "loss": 0.0643, - "step": 1269 - }, - { - "epoch": 6.195121951219512, - "grad_norm": 3.1238555908203125, - "learning_rate": 1.5868172688172559e-06, - "loss": 0.064, - "step": 1270 - }, - { - "epoch": 6.2, - "grad_norm": 2.7765870094299316, - "learning_rate": 1.5832518670578802e-06, - "loss": 0.0676, - "step": 1271 - }, - { - "epoch": 6.204878048780488, - "grad_norm": 2.9892525672912598, - "learning_rate": 1.5796886182883053e-06, - "loss": 0.074, - "step": 1272 - }, - { - "epoch": 6.209756097560976, - "grad_norm": 2.0955512523651123, - "learning_rate": 1.5761275308768476e-06, - "loss": 0.0311, - "step": 1273 - }, - { - "epoch": 6.214634146341464, - "grad_norm": 1.8085861206054688, - "learning_rate": 1.5725686131867462e-06, - "loss": 0.0108, - "step": 1274 - }, - { - "epoch": 6.219512195121951, - "grad_norm": 3.026421308517456, - "learning_rate": 1.569011873576147e-06, - "loss": 0.0464, - "step": 1275 - }, - { - "epoch": 6.224390243902439, - "grad_norm": 2.3395111560821533, - "learning_rate": 1.5654573203980782e-06, - "loss": 0.0221, - "step": 1276 - }, - { - "epoch": 6.229268292682927, - "grad_norm": 3.6158692836761475, - "learning_rate": 1.5619049620004354e-06, - "loss": 0.0693, - "step": 1277 - }, - { - "epoch": 6.234146341463415, - "grad_norm": 1.6186567544937134, - "learning_rate": 1.5583548067259584e-06, - "loss": 0.0198, - "step": 1278 - }, - { - "epoch": 6.239024390243903, - "grad_norm": 2.7193195819854736, - "learning_rate": 1.5548068629122126e-06, - "loss": 0.0687, - "step": 1279 - }, - { - "epoch": 6.2439024390243905, - "grad_norm": 2.7472658157348633, - "learning_rate": 1.5512611388915711e-06, - "loss": 0.053, - "step": 1280 - }, - { - "epoch": 6.248780487804878, - "grad_norm": 4.694706439971924, - "learning_rate": 1.5477176429911934e-06, - "loss": 0.2076, - "step": 1281 - }, - { - "epoch": 6.253658536585366, - "grad_norm": 1.609309434890747, - "learning_rate": 1.5441763835330048e-06, - "loss": 0.0108, - "step": 1282 - }, - { - "epoch": 6.258536585365854, - "grad_norm": 1.7064504623413086, - "learning_rate": 1.5406373688336807e-06, - "loss": 0.0114, - "step": 1283 - }, - { - "epoch": 6.263414634146342, - "grad_norm": 1.967726469039917, - "learning_rate": 1.5371006072046225e-06, - "loss": 0.0209, - "step": 1284 - }, - { - "epoch": 6.2682926829268295, - "grad_norm": 2.4065544605255127, - "learning_rate": 1.5335661069519408e-06, - "loss": 0.0741, - "step": 1285 - }, - { - "epoch": 6.273170731707317, - "grad_norm": 2.2167603969573975, - "learning_rate": 1.5300338763764371e-06, - "loss": 0.0121, - "step": 1286 - }, - { - "epoch": 6.278048780487805, - "grad_norm": 3.229228973388672, - "learning_rate": 1.5265039237735804e-06, - "loss": 0.0226, - "step": 1287 - }, - { - "epoch": 6.282926829268293, - "grad_norm": 1.889419674873352, - "learning_rate": 1.5229762574334903e-06, - "loss": 0.0116, - "step": 1288 - }, - { - "epoch": 6.287804878048781, - "grad_norm": 3.7595815658569336, - "learning_rate": 1.5194508856409181e-06, - "loss": 0.0775, - "step": 1289 - }, - { - "epoch": 6.2926829268292686, - "grad_norm": 2.527560234069824, - "learning_rate": 1.515927816675225e-06, - "loss": 0.0355, - "step": 1290 - }, - { - "epoch": 6.297560975609756, - "grad_norm": 1.9718955755233765, - "learning_rate": 1.5124070588103648e-06, - "loss": 0.0127, - "step": 1291 - }, - { - "epoch": 6.302439024390244, - "grad_norm": 1.9010120630264282, - "learning_rate": 1.5088886203148643e-06, - "loss": 0.0188, - "step": 1292 - }, - { - "epoch": 6.307317073170732, - "grad_norm": 3.2093472480773926, - "learning_rate": 1.505372509451801e-06, - "loss": 0.0845, - "step": 1293 - }, - { - "epoch": 6.31219512195122, - "grad_norm": 1.6723257303237915, - "learning_rate": 1.5018587344787888e-06, - "loss": 0.0265, - "step": 1294 - }, - { - "epoch": 6.317073170731708, - "grad_norm": 3.246812343597412, - "learning_rate": 1.498347303647953e-06, - "loss": 0.0833, - "step": 1295 - }, - { - "epoch": 6.321951219512195, - "grad_norm": 2.887834072113037, - "learning_rate": 1.4948382252059158e-06, - "loss": 0.0416, - "step": 1296 - }, - { - "epoch": 6.326829268292683, - "grad_norm": 2.5762557983398438, - "learning_rate": 1.4913315073937742e-06, - "loss": 0.0614, - "step": 1297 - }, - { - "epoch": 6.331707317073171, - "grad_norm": 3.3746497631073, - "learning_rate": 1.4878271584470805e-06, - "loss": 0.0601, - "step": 1298 - }, - { - "epoch": 6.336585365853659, - "grad_norm": 2.4984664916992188, - "learning_rate": 1.4843251865958242e-06, - "loss": 0.0189, - "step": 1299 - }, - { - "epoch": 6.341463414634147, - "grad_norm": 3.178300619125366, - "learning_rate": 1.4808256000644128e-06, - "loss": 0.038, - "step": 1300 - }, - { - "epoch": 6.3463414634146345, - "grad_norm": 2.6362273693084717, - "learning_rate": 1.4773284070716504e-06, - "loss": 0.041, - "step": 1301 - }, - { - "epoch": 6.351219512195122, - "grad_norm": 2.1512129306793213, - "learning_rate": 1.473833615830722e-06, - "loss": 0.0227, - "step": 1302 - }, - { - "epoch": 6.35609756097561, - "grad_norm": 2.2898178100585938, - "learning_rate": 1.4703412345491692e-06, - "loss": 0.039, - "step": 1303 - }, - { - "epoch": 6.360975609756098, - "grad_norm": 2.6641080379486084, - "learning_rate": 1.4668512714288763e-06, - "loss": 0.0431, - "step": 1304 - }, - { - "epoch": 6.365853658536586, - "grad_norm": 1.7466667890548706, - "learning_rate": 1.4633637346660478e-06, - "loss": 0.013, - "step": 1305 - }, - { - "epoch": 6.3707317073170735, - "grad_norm": 2.437889575958252, - "learning_rate": 1.4598786324511892e-06, - "loss": 0.0181, - "step": 1306 - }, - { - "epoch": 6.375609756097561, - "grad_norm": 2.5054142475128174, - "learning_rate": 1.456395972969089e-06, - "loss": 0.0248, - "step": 1307 - }, - { - "epoch": 6.380487804878049, - "grad_norm": 3.2294511795043945, - "learning_rate": 1.4529157643987995e-06, - "loss": 0.0561, - "step": 1308 - }, - { - "epoch": 6.385365853658537, - "grad_norm": 2.260188341140747, - "learning_rate": 1.4494380149136162e-06, - "loss": 0.0593, - "step": 1309 - }, - { - "epoch": 6.390243902439025, - "grad_norm": 2.4961163997650146, - "learning_rate": 1.4459627326810576e-06, - "loss": 0.0257, - "step": 1310 - }, - { - "epoch": 6.3951219512195125, - "grad_norm": 3.4153239727020264, - "learning_rate": 1.4424899258628533e-06, - "loss": 0.0223, - "step": 1311 - }, - { - "epoch": 6.4, - "grad_norm": 2.6308839321136475, - "learning_rate": 1.439019602614914e-06, - "loss": 0.0112, - "step": 1312 - }, - { - "epoch": 6.404878048780488, - "grad_norm": 2.754530191421509, - "learning_rate": 1.4355517710873184e-06, - "loss": 0.068, - "step": 1313 - }, - { - "epoch": 6.409756097560976, - "grad_norm": 4.473151683807373, - "learning_rate": 1.432086439424297e-06, - "loss": 0.0825, - "step": 1314 - }, - { - "epoch": 6.414634146341464, - "grad_norm": 4.85701322555542, - "learning_rate": 1.428623615764206e-06, - "loss": 0.1812, - "step": 1315 - }, - { - "epoch": 6.419512195121952, - "grad_norm": 1.6678224802017212, - "learning_rate": 1.4251633082395117e-06, - "loss": 0.0207, - "step": 1316 - }, - { - "epoch": 6.424390243902439, - "grad_norm": 2.9730937480926514, - "learning_rate": 1.4217055249767734e-06, - "loss": 0.0617, - "step": 1317 - }, - { - "epoch": 6.429268292682927, - "grad_norm": 2.503786563873291, - "learning_rate": 1.4182502740966203e-06, - "loss": 0.0137, - "step": 1318 - }, - { - "epoch": 6.434146341463415, - "grad_norm": 3.0798017978668213, - "learning_rate": 1.4147975637137334e-06, - "loss": 0.0329, - "step": 1319 - }, - { - "epoch": 6.439024390243903, - "grad_norm": 3.008155345916748, - "learning_rate": 1.411347401936831e-06, - "loss": 0.0487, - "step": 1320 - }, - { - "epoch": 6.443902439024391, - "grad_norm": 2.5451765060424805, - "learning_rate": 1.4078997968686425e-06, - "loss": 0.0582, - "step": 1321 - }, - { - "epoch": 6.4487804878048784, - "grad_norm": 2.042696475982666, - "learning_rate": 1.404454756605893e-06, - "loss": 0.0336, - "step": 1322 - }, - { - "epoch": 6.453658536585366, - "grad_norm": 3.0421411991119385, - "learning_rate": 1.4010122892392872e-06, - "loss": 0.1372, - "step": 1323 - }, - { - "epoch": 6.458536585365854, - "grad_norm": 2.0793251991271973, - "learning_rate": 1.3975724028534842e-06, - "loss": 0.0452, - "step": 1324 - }, - { - "epoch": 6.463414634146342, - "grad_norm": 2.6149914264678955, - "learning_rate": 1.394135105527083e-06, - "loss": 0.0431, - "step": 1325 - }, - { - "epoch": 6.46829268292683, - "grad_norm": 2.818507671356201, - "learning_rate": 1.3907004053326006e-06, - "loss": 0.0242, - "step": 1326 - }, - { - "epoch": 6.473170731707317, - "grad_norm": 2.328993558883667, - "learning_rate": 1.387268310336458e-06, - "loss": 0.0293, - "step": 1327 - }, - { - "epoch": 6.478048780487805, - "grad_norm": 2.2032642364501953, - "learning_rate": 1.3838388285989552e-06, - "loss": 0.0232, - "step": 1328 - }, - { - "epoch": 6.482926829268292, - "grad_norm": 2.039983034133911, - "learning_rate": 1.380411968174254e-06, - "loss": 0.0256, - "step": 1329 - }, - { - "epoch": 6.487804878048781, - "grad_norm": 3.7261271476745605, - "learning_rate": 1.3769877371103635e-06, - "loss": 0.1285, - "step": 1330 - }, - { - "epoch": 6.492682926829268, - "grad_norm": 3.7156264781951904, - "learning_rate": 1.373566143449115e-06, - "loss": 0.1621, - "step": 1331 - }, - { - "epoch": 6.4975609756097565, - "grad_norm": 1.5905455350875854, - "learning_rate": 1.3701471952261457e-06, - "loss": 0.0126, - "step": 1332 - }, - { - "epoch": 6.5024390243902435, - "grad_norm": 2.8808465003967285, - "learning_rate": 1.3667309004708832e-06, - "loss": 0.0211, - "step": 1333 - }, - { - "epoch": 6.507317073170732, - "grad_norm": 3.9190757274627686, - "learning_rate": 1.3633172672065195e-06, - "loss": 0.062, - "step": 1334 - }, - { - "epoch": 6.512195121951219, - "grad_norm": 1.6948635578155518, - "learning_rate": 1.359906303449997e-06, - "loss": 0.0126, - "step": 1335 - }, - { - "epoch": 6.517073170731708, - "grad_norm": 2.3967642784118652, - "learning_rate": 1.3564980172119913e-06, - "loss": 0.0111, - "step": 1336 - }, - { - "epoch": 6.521951219512195, - "grad_norm": 3.5275399684906006, - "learning_rate": 1.3530924164968873e-06, - "loss": 0.1024, - "step": 1337 - }, - { - "epoch": 6.526829268292683, - "grad_norm": 2.0768814086914062, - "learning_rate": 1.3496895093027617e-06, - "loss": 0.0254, - "step": 1338 - }, - { - "epoch": 6.53170731707317, - "grad_norm": 1.8964029550552368, - "learning_rate": 1.3462893036213706e-06, - "loss": 0.0188, - "step": 1339 - }, - { - "epoch": 6.536585365853659, - "grad_norm": 1.679545283317566, - "learning_rate": 1.3428918074381203e-06, - "loss": 0.0195, - "step": 1340 - }, - { - "epoch": 6.541463414634146, - "grad_norm": 2.204637050628662, - "learning_rate": 1.3394970287320553e-06, - "loss": 0.0317, - "step": 1341 - }, - { - "epoch": 6.546341463414635, - "grad_norm": 2.014052629470825, - "learning_rate": 1.3361049754758404e-06, - "loss": 0.0191, - "step": 1342 - }, - { - "epoch": 6.5512195121951216, - "grad_norm": 1.4630589485168457, - "learning_rate": 1.3327156556357369e-06, - "loss": 0.0079, - "step": 1343 - }, - { - "epoch": 6.55609756097561, - "grad_norm": 2.876132011413574, - "learning_rate": 1.3293290771715875e-06, - "loss": 0.0345, - "step": 1344 - }, - { - "epoch": 6.560975609756097, - "grad_norm": 1.793338656425476, - "learning_rate": 1.3259452480367963e-06, - "loss": 0.0409, - "step": 1345 - }, - { - "epoch": 6.565853658536585, - "grad_norm": 2.2791552543640137, - "learning_rate": 1.3225641761783126e-06, - "loss": 0.0494, - "step": 1346 - }, - { - "epoch": 6.570731707317073, - "grad_norm": 4.255206108093262, - "learning_rate": 1.3191858695366084e-06, - "loss": 0.0842, - "step": 1347 - }, - { - "epoch": 6.575609756097561, - "grad_norm": 2.449460506439209, - "learning_rate": 1.3158103360456603e-06, - "loss": 0.0399, - "step": 1348 - }, - { - "epoch": 6.580487804878048, - "grad_norm": 2.780730724334717, - "learning_rate": 1.3124375836329362e-06, - "loss": 0.0272, - "step": 1349 - }, - { - "epoch": 6.585365853658536, - "grad_norm": 1.925681233406067, - "learning_rate": 1.3090676202193692e-06, - "loss": 0.007, - "step": 1350 - }, - { - "epoch": 6.590243902439024, - "grad_norm": 2.069791555404663, - "learning_rate": 1.3057004537193424e-06, - "loss": 0.016, - "step": 1351 - }, - { - "epoch": 6.595121951219512, - "grad_norm": 1.863872766494751, - "learning_rate": 1.302336092040673e-06, - "loss": 0.016, - "step": 1352 - }, - { - "epoch": 6.6, - "grad_norm": 2.351259231567383, - "learning_rate": 1.298974543084589e-06, - "loss": 0.0172, - "step": 1353 - }, - { - "epoch": 6.6048780487804875, - "grad_norm": 1.848115086555481, - "learning_rate": 1.2956158147457116e-06, - "loss": 0.0412, - "step": 1354 - }, - { - "epoch": 6.609756097560975, - "grad_norm": 1.6395928859710693, - "learning_rate": 1.2922599149120412e-06, - "loss": 0.0181, - "step": 1355 - }, - { - "epoch": 6.614634146341463, - "grad_norm": 2.1267426013946533, - "learning_rate": 1.2889068514649328e-06, - "loss": 0.04, - "step": 1356 - }, - { - "epoch": 6.619512195121951, - "grad_norm": 1.6603496074676514, - "learning_rate": 1.2855566322790796e-06, - "loss": 0.0108, - "step": 1357 - }, - { - "epoch": 6.624390243902439, - "grad_norm": 2.2724838256835938, - "learning_rate": 1.2822092652224989e-06, - "loss": 0.0284, - "step": 1358 - }, - { - "epoch": 6.6292682926829265, - "grad_norm": 2.222623825073242, - "learning_rate": 1.2788647581565048e-06, - "loss": 0.0128, - "step": 1359 - }, - { - "epoch": 6.634146341463414, - "grad_norm": 2.710681676864624, - "learning_rate": 1.275523118935697e-06, - "loss": 0.0184, - "step": 1360 - }, - { - "epoch": 6.639024390243902, - "grad_norm": 2.354264736175537, - "learning_rate": 1.2721843554079418e-06, - "loss": 0.0313, - "step": 1361 - }, - { - "epoch": 6.64390243902439, - "grad_norm": 3.886909008026123, - "learning_rate": 1.2688484754143493e-06, - "loss": 0.1184, - "step": 1362 - }, - { - "epoch": 6.648780487804878, - "grad_norm": 3.088468313217163, - "learning_rate": 1.2655154867892577e-06, - "loss": 0.0353, - "step": 1363 - }, - { - "epoch": 6.6536585365853655, - "grad_norm": 2.987576484680176, - "learning_rate": 1.2621853973602158e-06, - "loss": 0.0349, - "step": 1364 - }, - { - "epoch": 6.658536585365853, - "grad_norm": 1.719212293624878, - "learning_rate": 1.2588582149479645e-06, - "loss": 0.0081, - "step": 1365 - }, - { - "epoch": 6.663414634146341, - "grad_norm": 2.1641178131103516, - "learning_rate": 1.2555339473664151e-06, - "loss": 0.0279, - "step": 1366 - }, - { - "epoch": 6.668292682926829, - "grad_norm": 2.9424984455108643, - "learning_rate": 1.2522126024226347e-06, - "loss": 0.0492, - "step": 1367 - }, - { - "epoch": 6.673170731707317, - "grad_norm": 1.961077332496643, - "learning_rate": 1.2488941879168278e-06, - "loss": 0.0084, - "step": 1368 - }, - { - "epoch": 6.678048780487805, - "grad_norm": 2.302565097808838, - "learning_rate": 1.2455787116423148e-06, - "loss": 0.0486, - "step": 1369 - }, - { - "epoch": 6.682926829268292, - "grad_norm": 2.187194347381592, - "learning_rate": 1.2422661813855158e-06, - "loss": 0.0319, - "step": 1370 - }, - { - "epoch": 6.68780487804878, - "grad_norm": 2.0076377391815186, - "learning_rate": 1.238956604925934e-06, - "loss": 0.016, - "step": 1371 - }, - { - "epoch": 6.692682926829268, - "grad_norm": 4.137681484222412, - "learning_rate": 1.2356499900361333e-06, - "loss": 0.0557, - "step": 1372 - }, - { - "epoch": 6.697560975609756, - "grad_norm": 2.0039637088775635, - "learning_rate": 1.2323463444817227e-06, - "loss": 0.0219, - "step": 1373 - }, - { - "epoch": 6.702439024390244, - "grad_norm": 2.943314552307129, - "learning_rate": 1.2290456760213405e-06, - "loss": 0.0849, - "step": 1374 - }, - { - "epoch": 6.7073170731707314, - "grad_norm": 2.715120553970337, - "learning_rate": 1.2257479924066296e-06, - "loss": 0.0857, - "step": 1375 - }, - { - "epoch": 6.712195121951219, - "grad_norm": 3.144104480743408, - "learning_rate": 1.2224533013822237e-06, - "loss": 0.0648, - "step": 1376 - }, - { - "epoch": 6.717073170731707, - "grad_norm": 2.830066680908203, - "learning_rate": 1.2191616106857312e-06, - "loss": 0.0426, - "step": 1377 - }, - { - "epoch": 6.721951219512195, - "grad_norm": 3.1005899906158447, - "learning_rate": 1.2158729280477112e-06, - "loss": 0.0478, - "step": 1378 - }, - { - "epoch": 6.726829268292683, - "grad_norm": 2.2102460861206055, - "learning_rate": 1.2125872611916578e-06, - "loss": 0.0273, - "step": 1379 - }, - { - "epoch": 6.7317073170731705, - "grad_norm": 2.860288619995117, - "learning_rate": 1.2093046178339869e-06, - "loss": 0.0201, - "step": 1380 - }, - { - "epoch": 6.736585365853658, - "grad_norm": 1.5914067029953003, - "learning_rate": 1.206025005684009e-06, - "loss": 0.0148, - "step": 1381 - }, - { - "epoch": 6.741463414634146, - "grad_norm": 1.8609223365783691, - "learning_rate": 1.202748432443918e-06, - "loss": 0.0073, - "step": 1382 - }, - { - "epoch": 6.746341463414634, - "grad_norm": 3.0532407760620117, - "learning_rate": 1.1994749058087695e-06, - "loss": 0.0344, - "step": 1383 - }, - { - "epoch": 6.751219512195122, - "grad_norm": 4.0601677894592285, - "learning_rate": 1.196204433466467e-06, - "loss": 0.0837, - "step": 1384 - }, - { - "epoch": 6.7560975609756095, - "grad_norm": 2.6982672214508057, - "learning_rate": 1.192937023097738e-06, - "loss": 0.0425, - "step": 1385 - }, - { - "epoch": 6.760975609756097, - "grad_norm": 1.431360125541687, - "learning_rate": 1.1896726823761195e-06, - "loss": 0.0065, - "step": 1386 - }, - { - "epoch": 6.765853658536585, - "grad_norm": 2.116907835006714, - "learning_rate": 1.1864114189679413e-06, - "loss": 0.0133, - "step": 1387 - }, - { - "epoch": 6.770731707317073, - "grad_norm": 2.6869874000549316, - "learning_rate": 1.183153240532304e-06, - "loss": 0.0188, - "step": 1388 - }, - { - "epoch": 6.775609756097561, - "grad_norm": 2.0294089317321777, - "learning_rate": 1.179898154721063e-06, - "loss": 0.0234, - "step": 1389 - }, - { - "epoch": 6.780487804878049, - "grad_norm": 2.3081958293914795, - "learning_rate": 1.1766461691788137e-06, - "loss": 0.0208, - "step": 1390 - }, - { - "epoch": 6.785365853658536, - "grad_norm": 3.4795000553131104, - "learning_rate": 1.1733972915428665e-06, - "loss": 0.0728, - "step": 1391 - }, - { - "epoch": 6.790243902439024, - "grad_norm": 2.5121219158172607, - "learning_rate": 1.1701515294432348e-06, - "loss": 0.0291, - "step": 1392 - }, - { - "epoch": 6.795121951219512, - "grad_norm": 5.1100172996521, - "learning_rate": 1.1669088905026156e-06, - "loss": 0.0988, - "step": 1393 - }, - { - "epoch": 6.8, - "grad_norm": 2.5434396266937256, - "learning_rate": 1.163669382336371e-06, - "loss": 0.0399, - "step": 1394 - }, - { - "epoch": 6.804878048780488, - "grad_norm": 2.7811660766601562, - "learning_rate": 1.160433012552508e-06, - "loss": 0.0134, - "step": 1395 - }, - { - "epoch": 6.809756097560975, - "grad_norm": 3.2409870624542236, - "learning_rate": 1.1571997887516672e-06, - "loss": 0.0795, - "step": 1396 - }, - { - "epoch": 6.814634146341463, - "grad_norm": 2.5300986766815186, - "learning_rate": 1.1539697185270982e-06, - "loss": 0.0329, - "step": 1397 - }, - { - "epoch": 6.819512195121951, - "grad_norm": 1.8510549068450928, - "learning_rate": 1.1507428094646448e-06, - "loss": 0.0213, - "step": 1398 - }, - { - "epoch": 6.824390243902439, - "grad_norm": 1.8820618391036987, - "learning_rate": 1.1475190691427255e-06, - "loss": 0.0172, - "step": 1399 - }, - { - "epoch": 6.829268292682927, - "grad_norm": 1.3415460586547852, - "learning_rate": 1.1442985051323205e-06, - "loss": 0.0029, - "step": 1400 - }, - { - "epoch": 6.8341463414634145, - "grad_norm": 6.033786296844482, - "learning_rate": 1.1410811249969475e-06, - "loss": 0.1638, - "step": 1401 - }, - { - "epoch": 6.839024390243902, - "grad_norm": 2.990328311920166, - "learning_rate": 1.1378669362926468e-06, - "loss": 0.0779, - "step": 1402 - }, - { - "epoch": 6.84390243902439, - "grad_norm": 3.2766308784484863, - "learning_rate": 1.1346559465679656e-06, - "loss": 0.0528, - "step": 1403 - }, - { - "epoch": 6.848780487804878, - "grad_norm": 1.266032338142395, - "learning_rate": 1.1314481633639374e-06, - "loss": 0.0057, - "step": 1404 - }, - { - "epoch": 6.853658536585366, - "grad_norm": 3.1048431396484375, - "learning_rate": 1.1282435942140632e-06, - "loss": 0.1772, - "step": 1405 - }, - { - "epoch": 6.8585365853658535, - "grad_norm": 2.264822483062744, - "learning_rate": 1.1250422466442992e-06, - "loss": 0.0176, - "step": 1406 - }, - { - "epoch": 6.863414634146341, - "grad_norm": 2.0890846252441406, - "learning_rate": 1.1218441281730334e-06, - "loss": 0.0184, - "step": 1407 - }, - { - "epoch": 6.868292682926829, - "grad_norm": 1.8351202011108398, - "learning_rate": 1.1186492463110696e-06, - "loss": 0.0127, - "step": 1408 - }, - { - "epoch": 6.873170731707317, - "grad_norm": 1.447196125984192, - "learning_rate": 1.1154576085616135e-06, - "loss": 0.0094, - "step": 1409 - }, - { - "epoch": 6.878048780487805, - "grad_norm": 1.6414039134979248, - "learning_rate": 1.1122692224202491e-06, - "loss": 0.0138, - "step": 1410 - }, - { - "epoch": 6.882926829268293, - "grad_norm": 2.87068772315979, - "learning_rate": 1.1090840953749253e-06, - "loss": 0.0821, - "step": 1411 - }, - { - "epoch": 6.88780487804878, - "grad_norm": 2.0476415157318115, - "learning_rate": 1.1059022349059362e-06, - "loss": 0.0222, - "step": 1412 - }, - { - "epoch": 6.892682926829268, - "grad_norm": 4.169386863708496, - "learning_rate": 1.102723648485905e-06, - "loss": 0.1183, - "step": 1413 - }, - { - "epoch": 6.897560975609756, - "grad_norm": 4.47883415222168, - "learning_rate": 1.0995483435797643e-06, - "loss": 0.0528, - "step": 1414 - }, - { - "epoch": 6.902439024390244, - "grad_norm": 2.0025508403778076, - "learning_rate": 1.0963763276447435e-06, - "loss": 0.0106, - "step": 1415 - }, - { - "epoch": 6.907317073170732, - "grad_norm": 2.4212136268615723, - "learning_rate": 1.0932076081303442e-06, - "loss": 0.0454, - "step": 1416 - }, - { - "epoch": 6.912195121951219, - "grad_norm": 1.7873961925506592, - "learning_rate": 1.0900421924783272e-06, - "loss": 0.022, - "step": 1417 - }, - { - "epoch": 6.917073170731707, - "grad_norm": 2.0345218181610107, - "learning_rate": 1.0868800881226962e-06, - "loss": 0.0261, - "step": 1418 - }, - { - "epoch": 6.921951219512195, - "grad_norm": 3.086538314819336, - "learning_rate": 1.0837213024896764e-06, - "loss": 0.0257, - "step": 1419 - }, - { - "epoch": 6.926829268292683, - "grad_norm": 2.9401397705078125, - "learning_rate": 1.080565842997698e-06, - "loss": 0.087, - "step": 1420 - }, - { - "epoch": 6.931707317073171, - "grad_norm": 1.305415153503418, - "learning_rate": 1.0774137170573826e-06, - "loss": 0.0147, - "step": 1421 - }, - { - "epoch": 6.9365853658536585, - "grad_norm": 3.0256683826446533, - "learning_rate": 1.074264932071521e-06, - "loss": 0.1183, - "step": 1422 - }, - { - "epoch": 6.941463414634146, - "grad_norm": 2.3618743419647217, - "learning_rate": 1.0711194954350568e-06, - "loss": 0.0186, - "step": 1423 - }, - { - "epoch": 6.946341463414634, - "grad_norm": 2.004451036453247, - "learning_rate": 1.0679774145350735e-06, - "loss": 0.0222, - "step": 1424 - }, - { - "epoch": 6.951219512195122, - "grad_norm": 3.089723587036133, - "learning_rate": 1.0648386967507703e-06, - "loss": 0.0824, - "step": 1425 - }, - { - "epoch": 6.95609756097561, - "grad_norm": 1.9310235977172852, - "learning_rate": 1.0617033494534486e-06, - "loss": 0.0247, - "step": 1426 - }, - { - "epoch": 6.9609756097560975, - "grad_norm": 1.973836898803711, - "learning_rate": 1.0585713800064964e-06, - "loss": 0.0142, - "step": 1427 - }, - { - "epoch": 6.965853658536585, - "grad_norm": 2.9914112091064453, - "learning_rate": 1.0554427957653663e-06, - "loss": 0.0681, - "step": 1428 - }, - { - "epoch": 6.970731707317073, - "grad_norm": 3.356689691543579, - "learning_rate": 1.0523176040775615e-06, - "loss": 0.0916, - "step": 1429 - }, - { - "epoch": 6.975609756097561, - "grad_norm": 2.3305246829986572, - "learning_rate": 1.0491958122826173e-06, - "loss": 0.0611, - "step": 1430 - }, - { - "epoch": 6.980487804878049, - "grad_norm": 1.7383835315704346, - "learning_rate": 1.0460774277120866e-06, - "loss": 0.0182, - "step": 1431 - }, - { - "epoch": 6.985365853658537, - "grad_norm": 2.585674524307251, - "learning_rate": 1.0429624576895177e-06, - "loss": 0.0084, - "step": 1432 - }, - { - "epoch": 6.990243902439024, - "grad_norm": 3.023864269256592, - "learning_rate": 1.03985090953044e-06, - "loss": 0.0411, - "step": 1433 - }, - { - "epoch": 6.995121951219512, - "grad_norm": 2.281674861907959, - "learning_rate": 1.0367427905423497e-06, - "loss": 0.0464, - "step": 1434 - }, - { - "epoch": 7.0, - "grad_norm": 1.4372339248657227, - "learning_rate": 1.0336381080246858e-06, - "loss": 0.0124, - "step": 1435 - }, - { - "epoch": 7.004878048780488, - "grad_norm": 1.9526969194412231, - "learning_rate": 1.0305368692688175e-06, - "loss": 0.0179, - "step": 1436 - }, - { - "epoch": 7.009756097560976, - "grad_norm": 1.7297903299331665, - "learning_rate": 1.027439081558029e-06, - "loss": 0.0119, - "step": 1437 - }, - { - "epoch": 7.014634146341463, - "grad_norm": 2.2754275798797607, - "learning_rate": 1.0243447521674967e-06, - "loss": 0.0278, - "step": 1438 - }, - { - "epoch": 7.019512195121951, - "grad_norm": 5.485769271850586, - "learning_rate": 1.021253888364276e-06, - "loss": 0.1259, - "step": 1439 - }, - { - "epoch": 7.024390243902439, - "grad_norm": 0.9085121750831604, - "learning_rate": 1.018166497407284e-06, - "loss": 0.0047, - "step": 1440 - }, - { - "epoch": 7.029268292682927, - "grad_norm": 1.0291047096252441, - "learning_rate": 1.0150825865472813e-06, - "loss": 0.0044, - "step": 1441 - }, - { - "epoch": 7.034146341463415, - "grad_norm": 0.8040009140968323, - "learning_rate": 1.0120021630268542e-06, - "loss": 0.0044, - "step": 1442 - }, - { - "epoch": 7.0390243902439025, - "grad_norm": 1.3701342344284058, - "learning_rate": 1.0089252340804025e-06, - "loss": 0.0081, - "step": 1443 - }, - { - "epoch": 7.04390243902439, - "grad_norm": 2.89591646194458, - "learning_rate": 1.0058518069341152e-06, - "loss": 0.0318, - "step": 1444 - }, - { - "epoch": 7.048780487804878, - "grad_norm": 1.3153692483901978, - "learning_rate": 1.002781888805958e-06, - "loss": 0.0067, - "step": 1445 - }, - { - "epoch": 7.053658536585366, - "grad_norm": 1.4490022659301758, - "learning_rate": 9.997154869056588e-07, - "loss": 0.0064, - "step": 1446 - }, - { - "epoch": 7.058536585365854, - "grad_norm": 1.7938638925552368, - "learning_rate": 9.966526084346837e-07, - "loss": 0.0057, - "step": 1447 - }, - { - "epoch": 7.0634146341463415, - "grad_norm": 3.7182836532592773, - "learning_rate": 9.935932605862258e-07, - "loss": 0.0365, - "step": 1448 - }, - { - "epoch": 7.068292682926829, - "grad_norm": 1.7843579053878784, - "learning_rate": 9.905374505451853e-07, - "loss": 0.0345, - "step": 1449 - }, - { - "epoch": 7.073170731707317, - "grad_norm": 2.9557483196258545, - "learning_rate": 9.874851854881565e-07, - "loss": 0.0384, - "step": 1450 - }, - { - "epoch": 7.078048780487805, - "grad_norm": 1.6237356662750244, - "learning_rate": 9.844364725834058e-07, - "loss": 0.0116, - "step": 1451 - }, - { - "epoch": 7.082926829268293, - "grad_norm": 3.7120912075042725, - "learning_rate": 9.813913189908571e-07, - "loss": 0.0267, - "step": 1452 - }, - { - "epoch": 7.087804878048781, - "grad_norm": 1.9991087913513184, - "learning_rate": 9.783497318620783e-07, - "loss": 0.0376, - "step": 1453 - }, - { - "epoch": 7.092682926829268, - "grad_norm": 1.5474026203155518, - "learning_rate": 9.75311718340258e-07, - "loss": 0.0057, - "step": 1454 - }, - { - "epoch": 7.097560975609756, - "grad_norm": 2.060807943344116, - "learning_rate": 9.722772855601927e-07, - "loss": 0.0386, - "step": 1455 - }, - { - "epoch": 7.102439024390244, - "grad_norm": 1.1991411447525024, - "learning_rate": 9.692464406482727e-07, - "loss": 0.006, - "step": 1456 - }, - { - "epoch": 7.107317073170732, - "grad_norm": 1.8907703161239624, - "learning_rate": 9.662191907224582e-07, - "loss": 0.0066, - "step": 1457 - }, - { - "epoch": 7.11219512195122, - "grad_norm": 2.0351309776306152, - "learning_rate": 9.63195542892268e-07, - "loss": 0.0201, - "step": 1458 - }, - { - "epoch": 7.117073170731707, - "grad_norm": 1.3973944187164307, - "learning_rate": 9.601755042587624e-07, - "loss": 0.0112, - "step": 1459 - }, - { - "epoch": 7.121951219512195, - "grad_norm": 1.3639394044876099, - "learning_rate": 9.571590819145244e-07, - "loss": 0.0066, - "step": 1460 - }, - { - "epoch": 7.126829268292683, - "grad_norm": 1.7362885475158691, - "learning_rate": 9.541462829436426e-07, - "loss": 0.0136, - "step": 1461 - }, - { - "epoch": 7.131707317073171, - "grad_norm": 2.9414384365081787, - "learning_rate": 9.511371144217005e-07, - "loss": 0.0228, - "step": 1462 - }, - { - "epoch": 7.136585365853659, - "grad_norm": 2.944575548171997, - "learning_rate": 9.481315834157512e-07, - "loss": 0.027, - "step": 1463 - }, - { - "epoch": 7.1414634146341465, - "grad_norm": 2.4692747592926025, - "learning_rate": 9.451296969843058e-07, - "loss": 0.0152, - "step": 1464 - }, - { - "epoch": 7.146341463414634, - "grad_norm": 1.804129719734192, - "learning_rate": 9.42131462177319e-07, - "loss": 0.0071, - "step": 1465 - }, - { - "epoch": 7.151219512195122, - "grad_norm": 1.8012168407440186, - "learning_rate": 9.39136886036166e-07, - "loss": 0.0054, - "step": 1466 - }, - { - "epoch": 7.15609756097561, - "grad_norm": 1.9471648931503296, - "learning_rate": 9.361459755936316e-07, - "loss": 0.0067, - "step": 1467 - }, - { - "epoch": 7.160975609756098, - "grad_norm": 1.8837870359420776, - "learning_rate": 9.331587378738902e-07, - "loss": 0.0105, - "step": 1468 - }, - { - "epoch": 7.1658536585365855, - "grad_norm": 2.358891487121582, - "learning_rate": 9.301751798924935e-07, - "loss": 0.0331, - "step": 1469 - }, - { - "epoch": 7.170731707317073, - "grad_norm": 1.1501671075820923, - "learning_rate": 9.27195308656349e-07, - "loss": 0.0076, - "step": 1470 - }, - { - "epoch": 7.175609756097561, - "grad_norm": 2.3329083919525146, - "learning_rate": 9.24219131163705e-07, - "loss": 0.0243, - "step": 1471 - }, - { - "epoch": 7.180487804878049, - "grad_norm": 1.6030691862106323, - "learning_rate": 9.212466544041385e-07, - "loss": 0.0051, - "step": 1472 - }, - { - "epoch": 7.185365853658537, - "grad_norm": 2.005582094192505, - "learning_rate": 9.182778853585325e-07, - "loss": 0.0146, - "step": 1473 - }, - { - "epoch": 7.190243902439025, - "grad_norm": 1.86012601852417, - "learning_rate": 9.153128309990622e-07, - "loss": 0.0273, - "step": 1474 - }, - { - "epoch": 7.195121951219512, - "grad_norm": 2.218923568725586, - "learning_rate": 9.123514982891813e-07, - "loss": 0.0225, - "step": 1475 - }, - { - "epoch": 7.2, - "grad_norm": 1.9950376749038696, - "learning_rate": 9.093938941836012e-07, - "loss": 0.0156, - "step": 1476 - }, - { - "epoch": 7.204878048780488, - "grad_norm": 1.6428661346435547, - "learning_rate": 9.064400256282757e-07, - "loss": 0.0158, - "step": 1477 - }, - { - "epoch": 7.209756097560976, - "grad_norm": 1.7983390092849731, - "learning_rate": 9.034898995603894e-07, - "loss": 0.0138, - "step": 1478 - }, - { - "epoch": 7.214634146341464, - "grad_norm": 2.2069218158721924, - "learning_rate": 9.00543522908334e-07, - "loss": 0.0308, - "step": 1479 - }, - { - "epoch": 7.219512195121951, - "grad_norm": 1.4668920040130615, - "learning_rate": 8.976009025916962e-07, - "loss": 0.006, - "step": 1480 - }, - { - "epoch": 7.224390243902439, - "grad_norm": 1.8956354856491089, - "learning_rate": 8.946620455212438e-07, - "loss": 0.0121, - "step": 1481 - }, - { - "epoch": 7.229268292682927, - "grad_norm": 2.5479676723480225, - "learning_rate": 8.917269585989027e-07, - "loss": 0.0424, - "step": 1482 - }, - { - "epoch": 7.234146341463415, - "grad_norm": 1.7482987642288208, - "learning_rate": 8.887956487177462e-07, - "loss": 0.0189, - "step": 1483 - }, - { - "epoch": 7.239024390243903, - "grad_norm": 1.5023657083511353, - "learning_rate": 8.858681227619789e-07, - "loss": 0.0118, - "step": 1484 - }, - { - "epoch": 7.2439024390243905, - "grad_norm": 1.2069121599197388, - "learning_rate": 8.829443876069163e-07, - "loss": 0.0043, - "step": 1485 - }, - { - "epoch": 7.248780487804878, - "grad_norm": 1.5843572616577148, - "learning_rate": 8.800244501189722e-07, - "loss": 0.0111, - "step": 1486 - }, - { - "epoch": 7.253658536585366, - "grad_norm": 2.541588544845581, - "learning_rate": 8.771083171556407e-07, - "loss": 0.0582, - "step": 1487 - }, - { - "epoch": 7.258536585365854, - "grad_norm": 0.9306992292404175, - "learning_rate": 8.741959955654833e-07, - "loss": 0.0051, - "step": 1488 - }, - { - "epoch": 7.263414634146342, - "grad_norm": 1.4105901718139648, - "learning_rate": 8.712874921881082e-07, - "loss": 0.0175, - "step": 1489 - }, - { - "epoch": 7.2682926829268295, - "grad_norm": 2.8943028450012207, - "learning_rate": 8.683828138541559e-07, - "loss": 0.0827, - "step": 1490 - }, - { - "epoch": 7.273170731707317, - "grad_norm": 2.512991428375244, - "learning_rate": 8.654819673852874e-07, - "loss": 0.0347, - "step": 1491 - }, - { - "epoch": 7.278048780487805, - "grad_norm": 1.6571681499481201, - "learning_rate": 8.625849595941608e-07, - "loss": 0.0055, - "step": 1492 - }, - { - "epoch": 7.282926829268293, - "grad_norm": 1.3162294626235962, - "learning_rate": 8.596917972844199e-07, - "loss": 0.0043, - "step": 1493 - }, - { - "epoch": 7.287804878048781, - "grad_norm": 1.761405110359192, - "learning_rate": 8.568024872506792e-07, - "loss": 0.0176, - "step": 1494 - }, - { - "epoch": 7.2926829268292686, - "grad_norm": 0.7546011805534363, - "learning_rate": 8.539170362785043e-07, - "loss": 0.0025, - "step": 1495 - }, - { - "epoch": 7.297560975609756, - "grad_norm": 1.6910885572433472, - "learning_rate": 8.510354511443975e-07, - "loss": 0.0093, - "step": 1496 - }, - { - "epoch": 7.302439024390244, - "grad_norm": 1.6627765893936157, - "learning_rate": 8.48157738615784e-07, - "loss": 0.0066, - "step": 1497 - }, - { - "epoch": 7.307317073170732, - "grad_norm": 0.8881242871284485, - "learning_rate": 8.452839054509926e-07, - "loss": 0.0055, - "step": 1498 - }, - { - "epoch": 7.31219512195122, - "grad_norm": 1.0791494846343994, - "learning_rate": 8.42413958399241e-07, - "loss": 0.0059, - "step": 1499 - }, - { - "epoch": 7.317073170731708, - "grad_norm": 1.5198945999145508, - "learning_rate": 8.39547904200623e-07, - "loss": 0.0049, - "step": 1500 - }, - { - "epoch": 7.321951219512195, - "grad_norm": 1.7168906927108765, - "learning_rate": 8.366857495860869e-07, - "loss": 0.0204, - "step": 1501 - }, - { - "epoch": 7.326829268292683, - "grad_norm": 1.70030677318573, - "learning_rate": 8.338275012774247e-07, - "loss": 0.0161, - "step": 1502 - }, - { - "epoch": 7.331707317073171, - "grad_norm": 2.1044130325317383, - "learning_rate": 8.309731659872522e-07, - "loss": 0.0088, - "step": 1503 - }, - { - "epoch": 7.336585365853659, - "grad_norm": 1.5040123462677002, - "learning_rate": 8.281227504189992e-07, - "loss": 0.0204, - "step": 1504 - }, - { - "epoch": 7.341463414634147, - "grad_norm": 1.6814212799072266, - "learning_rate": 8.252762612668869e-07, - "loss": 0.0238, - "step": 1505 - }, - { - "epoch": 7.3463414634146345, - "grad_norm": 2.2541606426239014, - "learning_rate": 8.224337052159154e-07, - "loss": 0.0063, - "step": 1506 - }, - { - "epoch": 7.351219512195122, - "grad_norm": 2.3999500274658203, - "learning_rate": 8.195950889418503e-07, - "loss": 0.0123, - "step": 1507 - }, - { - "epoch": 7.35609756097561, - "grad_norm": 2.8464221954345703, - "learning_rate": 8.167604191112021e-07, - "loss": 0.0296, - "step": 1508 - }, - { - "epoch": 7.360975609756098, - "grad_norm": 2.178104877471924, - "learning_rate": 8.139297023812131e-07, - "loss": 0.0148, - "step": 1509 - }, - { - "epoch": 7.365853658536586, - "grad_norm": 1.6489804983139038, - "learning_rate": 8.111029453998448e-07, - "loss": 0.0057, - "step": 1510 - }, - { - "epoch": 7.3707317073170735, - "grad_norm": 1.9705169200897217, - "learning_rate": 8.082801548057553e-07, - "loss": 0.0098, - "step": 1511 - }, - { - "epoch": 7.375609756097561, - "grad_norm": 1.2231075763702393, - "learning_rate": 8.05461337228289e-07, - "loss": 0.007, - "step": 1512 - }, - { - "epoch": 7.380487804878049, - "grad_norm": 1.5212552547454834, - "learning_rate": 8.026464992874617e-07, - "loss": 0.0058, - "step": 1513 - }, - { - "epoch": 7.385365853658537, - "grad_norm": 0.5752282738685608, - "learning_rate": 7.998356475939398e-07, - "loss": 0.0011, - "step": 1514 - }, - { - "epoch": 7.390243902439025, - "grad_norm": 1.3227447271347046, - "learning_rate": 7.970287887490289e-07, - "loss": 0.0041, - "step": 1515 - }, - { - "epoch": 7.3951219512195125, - "grad_norm": 1.2051570415496826, - "learning_rate": 7.942259293446594e-07, - "loss": 0.0027, - "step": 1516 - }, - { - "epoch": 7.4, - "grad_norm": 1.4740777015686035, - "learning_rate": 7.914270759633669e-07, - "loss": 0.006, - "step": 1517 - }, - { - "epoch": 7.404878048780488, - "grad_norm": 1.8853001594543457, - "learning_rate": 7.886322351782782e-07, - "loss": 0.0066, - "step": 1518 - }, - { - "epoch": 7.409756097560976, - "grad_norm": 1.907251238822937, - "learning_rate": 7.858414135530995e-07, - "loss": 0.0133, - "step": 1519 - }, - { - "epoch": 7.414634146341464, - "grad_norm": 1.3397895097732544, - "learning_rate": 7.83054617642095e-07, - "loss": 0.0092, - "step": 1520 - }, - { - "epoch": 7.419512195121952, - "grad_norm": 2.878927707672119, - "learning_rate": 7.802718539900761e-07, - "loss": 0.0113, - "step": 1521 - }, - { - "epoch": 7.424390243902439, - "grad_norm": 1.0312106609344482, - "learning_rate": 7.774931291323826e-07, - "loss": 0.0045, - "step": 1522 - }, - { - "epoch": 7.429268292682927, - "grad_norm": 2.2703888416290283, - "learning_rate": 7.747184495948723e-07, - "loss": 0.0692, - "step": 1523 - }, - { - "epoch": 7.434146341463415, - "grad_norm": 3.0323078632354736, - "learning_rate": 7.719478218939e-07, - "loss": 0.0462, - "step": 1524 - }, - { - "epoch": 7.439024390243903, - "grad_norm": 1.4211952686309814, - "learning_rate": 7.691812525363044e-07, - "loss": 0.008, - "step": 1525 - }, - { - "epoch": 7.443902439024391, - "grad_norm": 0.9588236808776855, - "learning_rate": 7.66418748019396e-07, - "loss": 0.0042, - "step": 1526 - }, - { - "epoch": 7.4487804878048784, - "grad_norm": 2.837219476699829, - "learning_rate": 7.636603148309363e-07, - "loss": 0.0033, - "step": 1527 - }, - { - "epoch": 7.453658536585366, - "grad_norm": 1.8552638292312622, - "learning_rate": 7.609059594491253e-07, - "loss": 0.0181, - "step": 1528 - }, - { - "epoch": 7.458536585365854, - "grad_norm": 4.836069583892822, - "learning_rate": 7.581556883425886e-07, - "loss": 0.1868, - "step": 1529 - }, - { - "epoch": 7.463414634146342, - "grad_norm": 2.180760622024536, - "learning_rate": 7.55409507970358e-07, - "loss": 0.0305, - "step": 1530 - }, - { - "epoch": 7.46829268292683, - "grad_norm": 1.0799378156661987, - "learning_rate": 7.526674247818569e-07, - "loss": 0.0027, - "step": 1531 - }, - { - "epoch": 7.473170731707317, - "grad_norm": 2.1196658611297607, - "learning_rate": 7.499294452168904e-07, - "loss": 0.019, - "step": 1532 - }, - { - "epoch": 7.478048780487805, - "grad_norm": 1.6932553052902222, - "learning_rate": 7.471955757056227e-07, - "loss": 0.0101, - "step": 1533 - }, - { - "epoch": 7.482926829268292, - "grad_norm": 1.3473751544952393, - "learning_rate": 7.444658226685656e-07, - "loss": 0.0066, - "step": 1534 - }, - { - "epoch": 7.487804878048781, - "grad_norm": 2.3404016494750977, - "learning_rate": 7.417401925165666e-07, - "loss": 0.0139, - "step": 1535 - }, - { - "epoch": 7.492682926829268, - "grad_norm": 1.2845433950424194, - "learning_rate": 7.390186916507869e-07, - "loss": 0.0053, - "step": 1536 - }, - { - "epoch": 7.4975609756097565, - "grad_norm": 1.0809649229049683, - "learning_rate": 7.363013264626914e-07, - "loss": 0.0031, - "step": 1537 - }, - { - "epoch": 7.5024390243902435, - "grad_norm": 2.2649292945861816, - "learning_rate": 7.335881033340334e-07, - "loss": 0.0257, - "step": 1538 - }, - { - "epoch": 7.507317073170732, - "grad_norm": 1.3488918542861938, - "learning_rate": 7.308790286368373e-07, - "loss": 0.0092, - "step": 1539 - }, - { - "epoch": 7.512195121951219, - "grad_norm": 2.239190101623535, - "learning_rate": 7.281741087333846e-07, - "loss": 0.024, - "step": 1540 - }, - { - "epoch": 7.517073170731708, - "grad_norm": 1.9454522132873535, - "learning_rate": 7.254733499761993e-07, - "loss": 0.0177, - "step": 1541 - }, - { - "epoch": 7.521951219512195, - "grad_norm": 1.9299415349960327, - "learning_rate": 7.22776758708035e-07, - "loss": 0.0439, - "step": 1542 - }, - { - "epoch": 7.526829268292683, - "grad_norm": 2.2676074504852295, - "learning_rate": 7.200843412618555e-07, - "loss": 0.0387, - "step": 1543 - }, - { - "epoch": 7.53170731707317, - "grad_norm": 1.2385426759719849, - "learning_rate": 7.173961039608227e-07, - "loss": 0.0082, - "step": 1544 - }, - { - "epoch": 7.536585365853659, - "grad_norm": 1.8637615442276, - "learning_rate": 7.147120531182828e-07, - "loss": 0.0194, - "step": 1545 - }, - { - "epoch": 7.541463414634146, - "grad_norm": 1.6695958375930786, - "learning_rate": 7.120321950377487e-07, - "loss": 0.006, - "step": 1546 - }, - { - "epoch": 7.546341463414635, - "grad_norm": 1.916746735572815, - "learning_rate": 7.093565360128863e-07, - "loss": 0.0104, - "step": 1547 - }, - { - "epoch": 7.5512195121951216, - "grad_norm": 1.6002378463745117, - "learning_rate": 7.066850823275024e-07, - "loss": 0.0173, - "step": 1548 - }, - { - "epoch": 7.55609756097561, - "grad_norm": 1.5249438285827637, - "learning_rate": 7.040178402555245e-07, - "loss": 0.0088, - "step": 1549 - }, - { - "epoch": 7.560975609756097, - "grad_norm": 2.1726534366607666, - "learning_rate": 7.013548160609901e-07, - "loss": 0.0098, - "step": 1550 - }, - { - "epoch": 7.565853658536585, - "grad_norm": 1.901904582977295, - "learning_rate": 6.986960159980327e-07, - "loss": 0.0196, - "step": 1551 - }, - { - "epoch": 7.570731707317073, - "grad_norm": 2.577242136001587, - "learning_rate": 6.960414463108631e-07, - "loss": 0.021, - "step": 1552 - }, - { - "epoch": 7.575609756097561, - "grad_norm": 1.4463082551956177, - "learning_rate": 6.933911132337575e-07, - "loss": 0.0076, - "step": 1553 - }, - { - "epoch": 7.580487804878048, - "grad_norm": 2.5811946392059326, - "learning_rate": 6.907450229910443e-07, - "loss": 0.0204, - "step": 1554 - }, - { - "epoch": 7.585365853658536, - "grad_norm": 1.0530297756195068, - "learning_rate": 6.881031817970848e-07, - "loss": 0.0046, - "step": 1555 - }, - { - "epoch": 7.590243902439024, - "grad_norm": 2.995915651321411, - "learning_rate": 6.854655958562625e-07, - "loss": 0.0566, - "step": 1556 - }, - { - "epoch": 7.595121951219512, - "grad_norm": 1.253089189529419, - "learning_rate": 6.82832271362969e-07, - "loss": 0.0048, - "step": 1557 - }, - { - "epoch": 7.6, - "grad_norm": 2.830667495727539, - "learning_rate": 6.802032145015855e-07, - "loss": 0.0351, - "step": 1558 - }, - { - "epoch": 7.6048780487804875, - "grad_norm": 2.8280539512634277, - "learning_rate": 6.775784314464717e-07, - "loss": 0.0171, - "step": 1559 - }, - { - "epoch": 7.609756097560975, - "grad_norm": 1.7876580953598022, - "learning_rate": 6.749579283619492e-07, - "loss": 0.01, - "step": 1560 - }, - { - "epoch": 7.614634146341463, - "grad_norm": 1.540212869644165, - "learning_rate": 6.723417114022907e-07, - "loss": 0.0162, - "step": 1561 - }, - { - "epoch": 7.619512195121951, - "grad_norm": 2.5126969814300537, - "learning_rate": 6.697297867117e-07, - "loss": 0.0237, - "step": 1562 - }, - { - "epoch": 7.624390243902439, - "grad_norm": 1.5419458150863647, - "learning_rate": 6.671221604243014e-07, - "loss": 0.0116, - "step": 1563 - }, - { - "epoch": 7.6292682926829265, - "grad_norm": 3.469961404800415, - "learning_rate": 6.645188386641257e-07, - "loss": 0.0506, - "step": 1564 - }, - { - "epoch": 7.634146341463414, - "grad_norm": 0.8771130442619324, - "learning_rate": 6.61919827545093e-07, - "loss": 0.002, - "step": 1565 - }, - { - "epoch": 7.639024390243902, - "grad_norm": 3.036559820175171, - "learning_rate": 6.593251331709993e-07, - "loss": 0.0673, - "step": 1566 - }, - { - "epoch": 7.64390243902439, - "grad_norm": 3.379220724105835, - "learning_rate": 6.567347616355049e-07, - "loss": 0.063, - "step": 1567 - }, - { - "epoch": 7.648780487804878, - "grad_norm": 0.7666990756988525, - "learning_rate": 6.541487190221163e-07, - "loss": 0.003, - "step": 1568 - }, - { - "epoch": 7.6536585365853655, - "grad_norm": 1.2181665897369385, - "learning_rate": 6.515670114041725e-07, - "loss": 0.0037, - "step": 1569 - }, - { - "epoch": 7.658536585365853, - "grad_norm": 1.0194541215896606, - "learning_rate": 6.489896448448349e-07, - "loss": 0.0043, - "step": 1570 - }, - { - "epoch": 7.663414634146341, - "grad_norm": 2.2625741958618164, - "learning_rate": 6.464166253970672e-07, - "loss": 0.0144, - "step": 1571 - }, - { - "epoch": 7.668292682926829, - "grad_norm": 1.0256692171096802, - "learning_rate": 6.43847959103624e-07, - "loss": 0.0029, - "step": 1572 - }, - { - "epoch": 7.673170731707317, - "grad_norm": 2.0418128967285156, - "learning_rate": 6.412836519970383e-07, - "loss": 0.0144, - "step": 1573 - }, - { - "epoch": 7.678048780487805, - "grad_norm": 0.8498746752738953, - "learning_rate": 6.387237100996041e-07, - "loss": 0.0026, - "step": 1574 - }, - { - "epoch": 7.682926829268292, - "grad_norm": 1.1043775081634521, - "learning_rate": 6.361681394233631e-07, - "loss": 0.0093, - "step": 1575 - }, - { - "epoch": 7.68780487804878, - "grad_norm": 1.064835786819458, - "learning_rate": 6.336169459700933e-07, - "loss": 0.0081, - "step": 1576 - }, - { - "epoch": 7.692682926829268, - "grad_norm": 1.2024056911468506, - "learning_rate": 6.310701357312909e-07, - "loss": 0.0054, - "step": 1577 - }, - { - "epoch": 7.697560975609756, - "grad_norm": 1.9509804248809814, - "learning_rate": 6.285277146881588e-07, - "loss": 0.0051, - "step": 1578 - }, - { - "epoch": 7.702439024390244, - "grad_norm": 1.8738386631011963, - "learning_rate": 6.259896888115904e-07, - "loss": 0.0118, - "step": 1579 - }, - { - "epoch": 7.7073170731707314, - "grad_norm": 1.356726884841919, - "learning_rate": 6.234560640621606e-07, - "loss": 0.009, - "step": 1580 - }, - { - "epoch": 7.712195121951219, - "grad_norm": 0.6530736684799194, - "learning_rate": 6.209268463901047e-07, - "loss": 0.0015, - "step": 1581 - }, - { - "epoch": 7.717073170731707, - "grad_norm": 1.3714262247085571, - "learning_rate": 6.184020417353084e-07, - "loss": 0.0051, - "step": 1582 - }, - { - "epoch": 7.721951219512195, - "grad_norm": 3.015583038330078, - "learning_rate": 6.158816560272962e-07, - "loss": 0.0383, - "step": 1583 - }, - { - "epoch": 7.726829268292683, - "grad_norm": 3.2355704307556152, - "learning_rate": 6.133656951852113e-07, - "loss": 0.0422, - "step": 1584 - }, - { - "epoch": 7.7317073170731705, - "grad_norm": 1.2933087348937988, - "learning_rate": 6.10854165117806e-07, - "loss": 0.0082, - "step": 1585 - }, - { - "epoch": 7.736585365853658, - "grad_norm": 1.6866157054901123, - "learning_rate": 6.083470717234285e-07, - "loss": 0.0052, - "step": 1586 - }, - { - "epoch": 7.741463414634146, - "grad_norm": 1.4597362279891968, - "learning_rate": 6.058444208900061e-07, - "loss": 0.0094, - "step": 1587 - }, - { - "epoch": 7.746341463414634, - "grad_norm": 0.9200596213340759, - "learning_rate": 6.033462184950317e-07, - "loss": 0.0034, - "step": 1588 - }, - { - "epoch": 7.751219512195122, - "grad_norm": 1.707422137260437, - "learning_rate": 6.008524704055535e-07, - "loss": 0.0141, - "step": 1589 - }, - { - "epoch": 7.7560975609756095, - "grad_norm": 1.8554565906524658, - "learning_rate": 5.983631824781572e-07, - "loss": 0.0108, - "step": 1590 - }, - { - "epoch": 7.760975609756097, - "grad_norm": 1.5421279668807983, - "learning_rate": 5.95878360558953e-07, - "loss": 0.0075, - "step": 1591 - }, - { - "epoch": 7.765853658536585, - "grad_norm": 1.5643326044082642, - "learning_rate": 5.933980104835652e-07, - "loss": 0.018, - "step": 1592 - }, - { - "epoch": 7.770731707317073, - "grad_norm": 1.7024025917053223, - "learning_rate": 5.909221380771132e-07, - "loss": 0.0207, - "step": 1593 - }, - { - "epoch": 7.775609756097561, - "grad_norm": 1.820544719696045, - "learning_rate": 5.884507491542024e-07, - "loss": 0.0217, - "step": 1594 - }, - { - "epoch": 7.780487804878049, - "grad_norm": 1.6761897802352905, - "learning_rate": 5.859838495189068e-07, - "loss": 0.0055, - "step": 1595 - }, - { - "epoch": 7.785365853658536, - "grad_norm": 2.3035616874694824, - "learning_rate": 5.835214449647602e-07, - "loss": 0.0147, - "step": 1596 - }, - { - "epoch": 7.790243902439024, - "grad_norm": 2.0507681369781494, - "learning_rate": 5.810635412747373e-07, - "loss": 0.0065, - "step": 1597 - }, - { - "epoch": 7.795121951219512, - "grad_norm": 1.3789564371109009, - "learning_rate": 5.786101442212422e-07, - "loss": 0.0077, - "step": 1598 - }, - { - "epoch": 7.8, - "grad_norm": 3.313107490539551, - "learning_rate": 5.761612595660979e-07, - "loss": 0.0699, - "step": 1599 - }, - { - "epoch": 7.804878048780488, - "grad_norm": 1.2391237020492554, - "learning_rate": 5.737168930605272e-07, - "loss": 0.0017, - "step": 1600 - }, - { - "epoch": 7.809756097560975, - "grad_norm": 1.1187714338302612, - "learning_rate": 5.712770504451426e-07, - "loss": 0.0101, - "step": 1601 - }, - { - "epoch": 7.814634146341463, - "grad_norm": 2.7611069679260254, - "learning_rate": 5.688417374499336e-07, - "loss": 0.0143, - "step": 1602 - }, - { - "epoch": 7.819512195121951, - "grad_norm": 1.627295732498169, - "learning_rate": 5.664109597942504e-07, - "loss": 0.0062, - "step": 1603 - }, - { - "epoch": 7.824390243902439, - "grad_norm": 4.538354396820068, - "learning_rate": 5.639847231867917e-07, - "loss": 0.1058, - "step": 1604 - }, - { - "epoch": 7.829268292682927, - "grad_norm": 1.783469319343567, - "learning_rate": 5.61563033325594e-07, - "loss": 0.0178, - "step": 1605 - }, - { - "epoch": 7.8341463414634145, - "grad_norm": 2.259584665298462, - "learning_rate": 5.591458958980123e-07, - "loss": 0.0204, - "step": 1606 - }, - { - "epoch": 7.839024390243902, - "grad_norm": 2.0741965770721436, - "learning_rate": 5.567333165807115e-07, - "loss": 0.0201, - "step": 1607 - }, - { - "epoch": 7.84390243902439, - "grad_norm": 0.8751707077026367, - "learning_rate": 5.543253010396538e-07, - "loss": 0.0077, - "step": 1608 - }, - { - "epoch": 7.848780487804878, - "grad_norm": 1.7383732795715332, - "learning_rate": 5.519218549300806e-07, - "loss": 0.0176, - "step": 1609 - }, - { - "epoch": 7.853658536585366, - "grad_norm": 2.0462191104888916, - "learning_rate": 5.495229838965021e-07, - "loss": 0.031, - "step": 1610 - }, - { - "epoch": 7.8585365853658535, - "grad_norm": 1.3201459646224976, - "learning_rate": 5.471286935726866e-07, - "loss": 0.0062, - "step": 1611 - }, - { - "epoch": 7.863414634146341, - "grad_norm": 2.9285616874694824, - "learning_rate": 5.447389895816416e-07, - "loss": 0.0615, - "step": 1612 - }, - { - "epoch": 7.868292682926829, - "grad_norm": 3.1918647289276123, - "learning_rate": 5.423538775356049e-07, - "loss": 0.0377, - "step": 1613 - }, - { - "epoch": 7.873170731707317, - "grad_norm": 1.406246542930603, - "learning_rate": 5.399733630360287e-07, - "loss": 0.0122, - "step": 1614 - }, - { - "epoch": 7.878048780487805, - "grad_norm": 1.7651537656784058, - "learning_rate": 5.375974516735713e-07, - "loss": 0.015, - "step": 1615 - }, - { - "epoch": 7.882926829268293, - "grad_norm": 1.9614673852920532, - "learning_rate": 5.352261490280767e-07, - "loss": 0.0058, - "step": 1616 - }, - { - "epoch": 7.88780487804878, - "grad_norm": 1.6031639575958252, - "learning_rate": 5.328594606685661e-07, - "loss": 0.0041, - "step": 1617 - }, - { - "epoch": 7.892682926829268, - "grad_norm": 0.9787303805351257, - "learning_rate": 5.304973921532264e-07, - "loss": 0.0067, - "step": 1618 - }, - { - "epoch": 7.897560975609756, - "grad_norm": 1.2693779468536377, - "learning_rate": 5.281399490293923e-07, - "loss": 0.0064, - "step": 1619 - }, - { - "epoch": 7.902439024390244, - "grad_norm": 1.8421361446380615, - "learning_rate": 5.257871368335357e-07, - "loss": 0.0182, - "step": 1620 - }, - { - "epoch": 7.907317073170732, - "grad_norm": 0.9667096138000488, - "learning_rate": 5.234389610912552e-07, - "loss": 0.0024, - "step": 1621 - }, - { - "epoch": 7.912195121951219, - "grad_norm": 3.2266018390655518, - "learning_rate": 5.210954273172578e-07, - "loss": 0.02, - "step": 1622 - }, - { - "epoch": 7.917073170731707, - "grad_norm": 1.5821634531021118, - "learning_rate": 5.187565410153497e-07, - "loss": 0.024, - "step": 1623 - }, - { - "epoch": 7.921951219512195, - "grad_norm": 1.9864275455474854, - "learning_rate": 5.164223076784239e-07, - "loss": 0.0103, - "step": 1624 - }, - { - "epoch": 7.926829268292683, - "grad_norm": 1.866466999053955, - "learning_rate": 5.14092732788444e-07, - "loss": 0.0268, - "step": 1625 - }, - { - "epoch": 7.931707317073171, - "grad_norm": 1.165686011314392, - "learning_rate": 5.117678218164337e-07, - "loss": 0.0085, - "step": 1626 - }, - { - "epoch": 7.9365853658536585, - "grad_norm": 1.1883208751678467, - "learning_rate": 5.094475802224644e-07, - "loss": 0.006, - "step": 1627 - }, - { - "epoch": 7.941463414634146, - "grad_norm": 1.5121057033538818, - "learning_rate": 5.071320134556404e-07, - "loss": 0.003, - "step": 1628 - }, - { - "epoch": 7.946341463414634, - "grad_norm": 1.1923614740371704, - "learning_rate": 5.048211269540868e-07, - "loss": 0.0064, - "step": 1629 - }, - { - "epoch": 7.951219512195122, - "grad_norm": 1.33751380443573, - "learning_rate": 5.025149261449391e-07, - "loss": 0.0082, - "step": 1630 - }, - { - "epoch": 7.95609756097561, - "grad_norm": 1.9143925905227661, - "learning_rate": 5.002134164443262e-07, - "loss": 0.0202, - "step": 1631 - }, - { - "epoch": 7.9609756097560975, - "grad_norm": 1.2547078132629395, - "learning_rate": 4.979166032573607e-07, - "loss": 0.0033, - "step": 1632 - }, - { - "epoch": 7.965853658536585, - "grad_norm": 2.3050332069396973, - "learning_rate": 4.956244919781247e-07, - "loss": 0.052, - "step": 1633 - }, - { - "epoch": 7.970731707317073, - "grad_norm": 1.4462478160858154, - "learning_rate": 4.933370879896604e-07, - "loss": 0.0049, - "step": 1634 - }, - { - "epoch": 7.975609756097561, - "grad_norm": 1.519913911819458, - "learning_rate": 4.91054396663952e-07, - "loss": 0.0102, - "step": 1635 - }, - { - "epoch": 7.980487804878049, - "grad_norm": 2.9544193744659424, - "learning_rate": 4.887764233619163e-07, - "loss": 0.0112, - "step": 1636 - }, - { - "epoch": 7.985365853658537, - "grad_norm": 0.9778392314910889, - "learning_rate": 4.865031734333919e-07, - "loss": 0.0032, - "step": 1637 - }, - { - "epoch": 7.990243902439024, - "grad_norm": 2.783501386642456, - "learning_rate": 4.842346522171226e-07, - "loss": 0.012, - "step": 1638 - }, - { - "epoch": 7.995121951219512, - "grad_norm": 1.5644093751907349, - "learning_rate": 4.819708650407467e-07, - "loss": 0.0184, - "step": 1639 - }, - { - "epoch": 8.0, - "grad_norm": 1.5741018056869507, - "learning_rate": 4.797118172207863e-07, - "loss": 0.0112, - "step": 1640 - }, - { - "epoch": 8.004878048780487, - "grad_norm": 0.9010241031646729, - "learning_rate": 4.774575140626317e-07, - "loss": 0.0064, - "step": 1641 - }, - { - "epoch": 8.009756097560976, - "grad_norm": 0.8204272985458374, - "learning_rate": 4.752079608605295e-07, - "loss": 0.003, - "step": 1642 - }, - { - "epoch": 8.014634146341463, - "grad_norm": 1.8131763935089111, - "learning_rate": 4.7296316289757366e-07, - "loss": 0.0063, - "step": 1643 - }, - { - "epoch": 8.019512195121951, - "grad_norm": 0.9918075799942017, - "learning_rate": 4.7072312544568844e-07, - "loss": 0.0039, - "step": 1644 - }, - { - "epoch": 8.024390243902438, - "grad_norm": 0.5097177028656006, - "learning_rate": 4.6848785376561733e-07, - "loss": 0.0028, - "step": 1645 - }, - { - "epoch": 8.029268292682927, - "grad_norm": 0.3497299253940582, - "learning_rate": 4.6625735310691396e-07, - "loss": 0.0021, - "step": 1646 - }, - { - "epoch": 8.034146341463414, - "grad_norm": 0.9271900057792664, - "learning_rate": 4.6403162870792524e-07, - "loss": 0.005, - "step": 1647 - }, - { - "epoch": 8.039024390243902, - "grad_norm": 0.951755940914154, - "learning_rate": 4.618106857957805e-07, - "loss": 0.0042, - "step": 1648 - }, - { - "epoch": 8.04390243902439, - "grad_norm": 0.6863508820533752, - "learning_rate": 4.5959452958638213e-07, - "loss": 0.0014, - "step": 1649 - }, - { - "epoch": 8.048780487804878, - "grad_norm": 0.45382270216941833, - "learning_rate": 4.573831652843888e-07, - "loss": 0.0012, - "step": 1650 - }, - { - "epoch": 8.053658536585365, - "grad_norm": 1.8319289684295654, - "learning_rate": 4.55176598083206e-07, - "loss": 0.0234, - "step": 1651 - }, - { - "epoch": 8.058536585365854, - "grad_norm": 1.2312507629394531, - "learning_rate": 4.5297483316497276e-07, - "loss": 0.0042, - "step": 1652 - }, - { - "epoch": 8.06341463414634, - "grad_norm": 1.4057971239089966, - "learning_rate": 4.5077787570055097e-07, - "loss": 0.0085, - "step": 1653 - }, - { - "epoch": 8.06829268292683, - "grad_norm": 3.3510940074920654, - "learning_rate": 4.4858573084951173e-07, - "loss": 0.0628, - "step": 1654 - }, - { - "epoch": 8.073170731707316, - "grad_norm": 0.6469231247901917, - "learning_rate": 4.463984037601224e-07, - "loss": 0.0026, - "step": 1655 - }, - { - "epoch": 8.078048780487805, - "grad_norm": 0.9491491317749023, - "learning_rate": 4.4421589956933827e-07, - "loss": 0.0021, - "step": 1656 - }, - { - "epoch": 8.082926829268292, - "grad_norm": 1.0847301483154297, - "learning_rate": 4.420382234027859e-07, - "loss": 0.0042, - "step": 1657 - }, - { - "epoch": 8.08780487804878, - "grad_norm": 0.5364987254142761, - "learning_rate": 4.398653803747532e-07, - "loss": 0.0045, - "step": 1658 - }, - { - "epoch": 8.092682926829267, - "grad_norm": 1.057804822921753, - "learning_rate": 4.3769737558817996e-07, - "loss": 0.0015, - "step": 1659 - }, - { - "epoch": 8.097560975609756, - "grad_norm": 1.2050957679748535, - "learning_rate": 4.355342141346405e-07, - "loss": 0.0124, - "step": 1660 - }, - { - "epoch": 8.102439024390243, - "grad_norm": 0.2821386754512787, - "learning_rate": 4.3337590109433505e-07, - "loss": 0.002, - "step": 1661 - }, - { - "epoch": 8.107317073170732, - "grad_norm": 0.7883970141410828, - "learning_rate": 4.3122244153607914e-07, - "loss": 0.0013, - "step": 1662 - }, - { - "epoch": 8.112195121951219, - "grad_norm": 1.1907166242599487, - "learning_rate": 4.2907384051728754e-07, - "loss": 0.0201, - "step": 1663 - }, - { - "epoch": 8.117073170731707, - "grad_norm": 1.3646314144134521, - "learning_rate": 4.2693010308396566e-07, - "loss": 0.0039, - "step": 1664 - }, - { - "epoch": 8.121951219512194, - "grad_norm": 2.0689423084259033, - "learning_rate": 4.247912342706975e-07, - "loss": 0.0035, - "step": 1665 - }, - { - "epoch": 8.126829268292683, - "grad_norm": 0.4086499810218811, - "learning_rate": 4.22657239100632e-07, - "loss": 0.0009, - "step": 1666 - }, - { - "epoch": 8.13170731707317, - "grad_norm": 0.9431869387626648, - "learning_rate": 4.2052812258547265e-07, - "loss": 0.0018, - "step": 1667 - }, - { - "epoch": 8.136585365853659, - "grad_norm": 0.9063575863838196, - "learning_rate": 4.184038897254655e-07, - "loss": 0.0021, - "step": 1668 - }, - { - "epoch": 8.141463414634146, - "grad_norm": 2.707298517227173, - "learning_rate": 4.1628454550938697e-07, - "loss": 0.019, - "step": 1669 - }, - { - "epoch": 8.146341463414634, - "grad_norm": 1.687988042831421, - "learning_rate": 4.141700949145322e-07, - "loss": 0.0144, - "step": 1670 - }, - { - "epoch": 8.151219512195121, - "grad_norm": 0.8905831575393677, - "learning_rate": 4.1206054290670537e-07, - "loss": 0.0088, - "step": 1671 - }, - { - "epoch": 8.15609756097561, - "grad_norm": 1.418512225151062, - "learning_rate": 4.0995589444020433e-07, - "loss": 0.0083, - "step": 1672 - }, - { - "epoch": 8.160975609756097, - "grad_norm": 1.1676236391067505, - "learning_rate": 4.0785615445781106e-07, - "loss": 0.0027, - "step": 1673 - }, - { - "epoch": 8.165853658536586, - "grad_norm": 1.5615407228469849, - "learning_rate": 4.057613278907818e-07, - "loss": 0.0089, - "step": 1674 - }, - { - "epoch": 8.170731707317072, - "grad_norm": 1.0604172945022583, - "learning_rate": 4.036714196588318e-07, - "loss": 0.0034, - "step": 1675 - }, - { - "epoch": 8.175609756097561, - "grad_norm": 1.3175733089447021, - "learning_rate": 4.015864346701251e-07, - "loss": 0.0021, - "step": 1676 - }, - { - "epoch": 8.180487804878048, - "grad_norm": 0.2539370059967041, - "learning_rate": 3.99506377821266e-07, - "loss": 0.0005, - "step": 1677 - }, - { - "epoch": 8.185365853658537, - "grad_norm": 0.8106228113174438, - "learning_rate": 3.97431253997283e-07, - "loss": 0.003, - "step": 1678 - }, - { - "epoch": 8.190243902439024, - "grad_norm": 0.6703351140022278, - "learning_rate": 3.9536106807161857e-07, - "loss": 0.0028, - "step": 1679 - }, - { - "epoch": 8.195121951219512, - "grad_norm": 1.2921632528305054, - "learning_rate": 3.932958249061214e-07, - "loss": 0.0097, - "step": 1680 - }, - { - "epoch": 8.2, - "grad_norm": 0.7795253992080688, - "learning_rate": 3.9123552935102976e-07, - "loss": 0.004, - "step": 1681 - }, - { - "epoch": 8.204878048780488, - "grad_norm": 1.3402642011642456, - "learning_rate": 3.891801862449629e-07, - "loss": 0.0189, - "step": 1682 - }, - { - "epoch": 8.209756097560975, - "grad_norm": 0.6951391696929932, - "learning_rate": 3.8712980041490905e-07, - "loss": 0.0038, - "step": 1683 - }, - { - "epoch": 8.214634146341464, - "grad_norm": 0.8145114183425903, - "learning_rate": 3.850843766762155e-07, - "loss": 0.0038, - "step": 1684 - }, - { - "epoch": 8.21951219512195, - "grad_norm": 0.30702775716781616, - "learning_rate": 3.830439198325747e-07, - "loss": 0.0008, - "step": 1685 - }, - { - "epoch": 8.22439024390244, - "grad_norm": 0.45050138235092163, - "learning_rate": 3.81008434676014e-07, - "loss": 0.0013, - "step": 1686 - }, - { - "epoch": 8.229268292682926, - "grad_norm": 0.7875486612319946, - "learning_rate": 3.789779259868864e-07, - "loss": 0.0016, - "step": 1687 - }, - { - "epoch": 8.234146341463415, - "grad_norm": 0.9437265396118164, - "learning_rate": 3.769523985338566e-07, - "loss": 0.0045, - "step": 1688 - }, - { - "epoch": 8.239024390243902, - "grad_norm": 1.2928845882415771, - "learning_rate": 3.749318570738897e-07, - "loss": 0.0057, - "step": 1689 - }, - { - "epoch": 8.24390243902439, - "grad_norm": 0.9615103006362915, - "learning_rate": 3.7291630635224397e-07, - "loss": 0.0026, - "step": 1690 - }, - { - "epoch": 8.248780487804877, - "grad_norm": 0.8654932975769043, - "learning_rate": 3.709057511024541e-07, - "loss": 0.0056, - "step": 1691 - }, - { - "epoch": 8.253658536585366, - "grad_norm": 1.1101908683776855, - "learning_rate": 3.689001960463243e-07, - "loss": 0.0019, - "step": 1692 - }, - { - "epoch": 8.258536585365853, - "grad_norm": 0.9586653709411621, - "learning_rate": 3.668996458939156e-07, - "loss": 0.003, - "step": 1693 - }, - { - "epoch": 8.263414634146342, - "grad_norm": 1.1638360023498535, - "learning_rate": 3.649041053435354e-07, - "loss": 0.0031, - "step": 1694 - }, - { - "epoch": 8.268292682926829, - "grad_norm": 0.41364336013793945, - "learning_rate": 3.62913579081724e-07, - "loss": 0.0012, - "step": 1695 - }, - { - "epoch": 8.273170731707317, - "grad_norm": 1.1794198751449585, - "learning_rate": 3.609280717832489e-07, - "loss": 0.0067, - "step": 1696 - }, - { - "epoch": 8.278048780487804, - "grad_norm": 0.7281041741371155, - "learning_rate": 3.5894758811108795e-07, - "loss": 0.002, - "step": 1697 - }, - { - "epoch": 8.282926829268293, - "grad_norm": 0.42419376969337463, - "learning_rate": 3.5697213271642164e-07, - "loss": 0.0008, - "step": 1698 - }, - { - "epoch": 8.28780487804878, - "grad_norm": 0.6596572995185852, - "learning_rate": 3.5500171023862136e-07, - "loss": 0.0028, - "step": 1699 - }, - { - "epoch": 8.292682926829269, - "grad_norm": 1.236666202545166, - "learning_rate": 3.530363253052399e-07, - "loss": 0.0032, - "step": 1700 - }, - { - "epoch": 8.297560975609755, - "grad_norm": 0.977694571018219, - "learning_rate": 3.510759825319976e-07, - "loss": 0.0068, - "step": 1701 - }, - { - "epoch": 8.302439024390244, - "grad_norm": 1.0168365240097046, - "learning_rate": 3.491206865227739e-07, - "loss": 0.0017, - "step": 1702 - }, - { - "epoch": 8.307317073170731, - "grad_norm": 2.269639253616333, - "learning_rate": 3.4717044186959676e-07, - "loss": 0.0398, - "step": 1703 - }, - { - "epoch": 8.31219512195122, - "grad_norm": 1.0657192468643188, - "learning_rate": 3.452252531526301e-07, - "loss": 0.0049, - "step": 1704 - }, - { - "epoch": 8.317073170731707, - "grad_norm": 1.50715970993042, - "learning_rate": 3.432851249401628e-07, - "loss": 0.0164, - "step": 1705 - }, - { - "epoch": 8.321951219512195, - "grad_norm": 0.701214611530304, - "learning_rate": 3.413500617886023e-07, - "loss": 0.0038, - "step": 1706 - }, - { - "epoch": 8.326829268292682, - "grad_norm": 1.6810981035232544, - "learning_rate": 3.394200682424578e-07, - "loss": 0.0118, - "step": 1707 - }, - { - "epoch": 8.331707317073171, - "grad_norm": 1.4712997674942017, - "learning_rate": 3.374951488343328e-07, - "loss": 0.006, - "step": 1708 - }, - { - "epoch": 8.336585365853658, - "grad_norm": 0.6115317940711975, - "learning_rate": 3.355753080849164e-07, - "loss": 0.0011, - "step": 1709 - }, - { - "epoch": 8.341463414634147, - "grad_norm": 0.8171163201332092, - "learning_rate": 3.3366055050296776e-07, - "loss": 0.0024, - "step": 1710 - }, - { - "epoch": 8.346341463414634, - "grad_norm": 0.7722201943397522, - "learning_rate": 3.3175088058530925e-07, - "loss": 0.0028, - "step": 1711 - }, - { - "epoch": 8.351219512195122, - "grad_norm": 3.0709617137908936, - "learning_rate": 3.2984630281681556e-07, - "loss": 0.0109, - "step": 1712 - }, - { - "epoch": 8.35609756097561, - "grad_norm": 1.7634369134902954, - "learning_rate": 3.2794682167040125e-07, - "loss": 0.0031, - "step": 1713 - }, - { - "epoch": 8.360975609756098, - "grad_norm": 1.7657215595245361, - "learning_rate": 3.2605244160701155e-07, - "loss": 0.01, - "step": 1714 - }, - { - "epoch": 8.365853658536585, - "grad_norm": 1.432230830192566, - "learning_rate": 3.2416316707561316e-07, - "loss": 0.0042, - "step": 1715 - }, - { - "epoch": 8.370731707317074, - "grad_norm": 0.465900719165802, - "learning_rate": 3.2227900251318055e-07, - "loss": 0.0021, - "step": 1716 - }, - { - "epoch": 8.37560975609756, - "grad_norm": 1.3770387172698975, - "learning_rate": 3.2039995234468854e-07, - "loss": 0.0031, - "step": 1717 - }, - { - "epoch": 8.38048780487805, - "grad_norm": 0.4842236638069153, - "learning_rate": 3.1852602098309984e-07, - "loss": 0.0009, - "step": 1718 - }, - { - "epoch": 8.385365853658536, - "grad_norm": 0.6840565204620361, - "learning_rate": 3.1665721282935683e-07, - "loss": 0.0047, - "step": 1719 - }, - { - "epoch": 8.390243902439025, - "grad_norm": 0.5206313729286194, - "learning_rate": 3.147935322723694e-07, - "loss": 0.0026, - "step": 1720 - }, - { - "epoch": 8.395121951219512, - "grad_norm": 1.131412386894226, - "learning_rate": 3.1293498368900414e-07, - "loss": 0.0019, - "step": 1721 - }, - { - "epoch": 8.4, - "grad_norm": 0.5872076153755188, - "learning_rate": 3.1108157144407765e-07, - "loss": 0.0009, - "step": 1722 - }, - { - "epoch": 8.404878048780487, - "grad_norm": 1.1455132961273193, - "learning_rate": 3.092332998903416e-07, - "loss": 0.0047, - "step": 1723 - }, - { - "epoch": 8.409756097560976, - "grad_norm": 1.4331532716751099, - "learning_rate": 3.073901733684748e-07, - "loss": 0.0162, - "step": 1724 - }, - { - "epoch": 8.414634146341463, - "grad_norm": 0.8186633586883545, - "learning_rate": 3.055521962070751e-07, - "loss": 0.0078, - "step": 1725 - }, - { - "epoch": 8.419512195121952, - "grad_norm": 0.9004407525062561, - "learning_rate": 3.0371937272264454e-07, - "loss": 0.0035, - "step": 1726 - }, - { - "epoch": 8.424390243902439, - "grad_norm": 0.8009728789329529, - "learning_rate": 3.0189170721958234e-07, - "loss": 0.0011, - "step": 1727 - }, - { - "epoch": 8.429268292682927, - "grad_norm": 0.7846589088439941, - "learning_rate": 3.000692039901756e-07, - "loss": 0.0042, - "step": 1728 - }, - { - "epoch": 8.434146341463414, - "grad_norm": 1.2301117181777954, - "learning_rate": 2.982518673145862e-07, - "loss": 0.0159, - "step": 1729 - }, - { - "epoch": 8.439024390243903, - "grad_norm": 0.8503583073616028, - "learning_rate": 2.9643970146084193e-07, - "loss": 0.0021, - "step": 1730 - }, - { - "epoch": 8.44390243902439, - "grad_norm": 1.661842942237854, - "learning_rate": 2.9463271068482955e-07, - "loss": 0.0124, - "step": 1731 - }, - { - "epoch": 8.448780487804878, - "grad_norm": 0.7799263000488281, - "learning_rate": 2.928308992302792e-07, - "loss": 0.0038, - "step": 1732 - }, - { - "epoch": 8.453658536585365, - "grad_norm": 0.6021434664726257, - "learning_rate": 2.9103427132875785e-07, - "loss": 0.0013, - "step": 1733 - }, - { - "epoch": 8.458536585365854, - "grad_norm": 1.430431604385376, - "learning_rate": 2.892428311996609e-07, - "loss": 0.0151, - "step": 1734 - }, - { - "epoch": 8.463414634146341, - "grad_norm": 1.1589592695236206, - "learning_rate": 2.8745658305019824e-07, - "loss": 0.0037, - "step": 1735 - }, - { - "epoch": 8.46829268292683, - "grad_norm": 0.7232568860054016, - "learning_rate": 2.856755310753867e-07, - "loss": 0.0046, - "step": 1736 - }, - { - "epoch": 8.473170731707317, - "grad_norm": 0.6265125274658203, - "learning_rate": 2.8389967945803984e-07, - "loss": 0.0014, - "step": 1737 - }, - { - "epoch": 8.478048780487805, - "grad_norm": 0.7115193009376526, - "learning_rate": 2.821290323687592e-07, - "loss": 0.0036, - "step": 1738 - }, - { - "epoch": 8.482926829268292, - "grad_norm": 0.5157519578933716, - "learning_rate": 2.803635939659222e-07, - "loss": 0.0016, - "step": 1739 - }, - { - "epoch": 8.487804878048781, - "grad_norm": 0.9217156767845154, - "learning_rate": 2.786033683956732e-07, - "loss": 0.0052, - "step": 1740 - }, - { - "epoch": 8.492682926829268, - "grad_norm": 4.063957691192627, - "learning_rate": 2.7684835979191664e-07, - "loss": 0.0999, - "step": 1741 - }, - { - "epoch": 8.497560975609757, - "grad_norm": 0.38870275020599365, - "learning_rate": 2.7509857227630223e-07, - "loss": 0.0009, - "step": 1742 - }, - { - "epoch": 8.502439024390243, - "grad_norm": 0.8282430768013, - "learning_rate": 2.733540099582188e-07, - "loss": 0.0026, - "step": 1743 - }, - { - "epoch": 8.507317073170732, - "grad_norm": 1.7269257307052612, - "learning_rate": 2.7161467693478493e-07, - "loss": 0.0094, - "step": 1744 - }, - { - "epoch": 8.512195121951219, - "grad_norm": 1.4464598894119263, - "learning_rate": 2.6988057729083613e-07, - "loss": 0.006, - "step": 1745 - }, - { - "epoch": 8.517073170731708, - "grad_norm": 0.9648481011390686, - "learning_rate": 2.681517150989185e-07, - "loss": 0.0043, - "step": 1746 - }, - { - "epoch": 8.521951219512195, - "grad_norm": 0.7762707471847534, - "learning_rate": 2.664280944192782e-07, - "loss": 0.0026, - "step": 1747 - }, - { - "epoch": 8.526829268292683, - "grad_norm": 0.9751222133636475, - "learning_rate": 2.64709719299851e-07, - "loss": 0.0044, - "step": 1748 - }, - { - "epoch": 8.53170731707317, - "grad_norm": 0.5906254053115845, - "learning_rate": 2.6299659377625296e-07, - "loss": 0.0008, - "step": 1749 - }, - { - "epoch": 8.536585365853659, - "grad_norm": 1.9417753219604492, - "learning_rate": 2.612887218717733e-07, - "loss": 0.0324, - "step": 1750 - }, - { - "epoch": 8.541463414634146, - "grad_norm": 0.6434907913208008, - "learning_rate": 2.5958610759736133e-07, - "loss": 0.0028, - "step": 1751 - }, - { - "epoch": 8.546341463414635, - "grad_norm": 0.8546578884124756, - "learning_rate": 2.5788875495161846e-07, - "loss": 0.0019, - "step": 1752 - }, - { - "epoch": 8.551219512195122, - "grad_norm": 0.8363909721374512, - "learning_rate": 2.561966679207917e-07, - "loss": 0.0028, - "step": 1753 - }, - { - "epoch": 8.55609756097561, - "grad_norm": 1.4901739358901978, - "learning_rate": 2.545098504787588e-07, - "loss": 0.0266, - "step": 1754 - }, - { - "epoch": 8.560975609756097, - "grad_norm": 0.6730532646179199, - "learning_rate": 2.5282830658702323e-07, - "loss": 0.0009, - "step": 1755 - }, - { - "epoch": 8.565853658536586, - "grad_norm": 0.7190845608711243, - "learning_rate": 2.511520401947032e-07, - "loss": 0.0056, - "step": 1756 - }, - { - "epoch": 8.570731707317073, - "grad_norm": 0.441381573677063, - "learning_rate": 2.494810552385232e-07, - "loss": 0.0009, - "step": 1757 - }, - { - "epoch": 8.575609756097561, - "grad_norm": 1.103507399559021, - "learning_rate": 2.47815355642804e-07, - "loss": 0.0023, - "step": 1758 - }, - { - "epoch": 8.580487804878048, - "grad_norm": 1.994994878768921, - "learning_rate": 2.461549453194523e-07, - "loss": 0.0454, - "step": 1759 - }, - { - "epoch": 8.585365853658537, - "grad_norm": 2.3645970821380615, - "learning_rate": 2.444998281679553e-07, - "loss": 0.0204, - "step": 1760 - }, - { - "epoch": 8.590243902439024, - "grad_norm": 1.7933200597763062, - "learning_rate": 2.428500080753676e-07, - "loss": 0.0387, - "step": 1761 - }, - { - "epoch": 8.595121951219513, - "grad_norm": 1.6070597171783447, - "learning_rate": 2.412054889163035e-07, - "loss": 0.0014, - "step": 1762 - }, - { - "epoch": 8.6, - "grad_norm": 0.2842216193675995, - "learning_rate": 2.3956627455292924e-07, - "loss": 0.0011, - "step": 1763 - }, - { - "epoch": 8.604878048780488, - "grad_norm": 0.8213078379631042, - "learning_rate": 2.3793236883495164e-07, - "loss": 0.003, - "step": 1764 - }, - { - "epoch": 8.609756097560975, - "grad_norm": 0.9147091507911682, - "learning_rate": 2.363037755996095e-07, - "loss": 0.0032, - "step": 1765 - }, - { - "epoch": 8.614634146341464, - "grad_norm": 1.4246805906295776, - "learning_rate": 2.3468049867166747e-07, - "loss": 0.0037, - "step": 1766 - }, - { - "epoch": 8.61951219512195, - "grad_norm": 0.5553964376449585, - "learning_rate": 2.3306254186340305e-07, - "loss": 0.0014, - "step": 1767 - }, - { - "epoch": 8.62439024390244, - "grad_norm": 1.6941331624984741, - "learning_rate": 2.314499089745989e-07, - "loss": 0.0125, - "step": 1768 - }, - { - "epoch": 8.629268292682926, - "grad_norm": 2.965517520904541, - "learning_rate": 2.2984260379253693e-07, - "loss": 0.0855, - "step": 1769 - }, - { - "epoch": 8.634146341463415, - "grad_norm": 0.9295977354049683, - "learning_rate": 2.2824063009198428e-07, - "loss": 0.0031, - "step": 1770 - }, - { - "epoch": 8.639024390243902, - "grad_norm": 0.990189254283905, - "learning_rate": 2.2664399163518786e-07, - "loss": 0.0056, - "step": 1771 - }, - { - "epoch": 8.64390243902439, - "grad_norm": 1.7282871007919312, - "learning_rate": 2.25052692171866e-07, - "loss": 0.022, - "step": 1772 - }, - { - "epoch": 8.648780487804878, - "grad_norm": 1.2093932628631592, - "learning_rate": 2.2346673543919645e-07, - "loss": 0.0025, - "step": 1773 - }, - { - "epoch": 8.653658536585366, - "grad_norm": 0.9555385112762451, - "learning_rate": 2.2188612516181067e-07, - "loss": 0.0081, - "step": 1774 - }, - { - "epoch": 8.658536585365853, - "grad_norm": 0.7467104196548462, - "learning_rate": 2.203108650517835e-07, - "loss": 0.0015, - "step": 1775 - }, - { - "epoch": 8.663414634146342, - "grad_norm": 0.893450140953064, - "learning_rate": 2.1874095880862505e-07, - "loss": 0.0023, - "step": 1776 - }, - { - "epoch": 8.668292682926829, - "grad_norm": 1.0488923788070679, - "learning_rate": 2.171764101192722e-07, - "loss": 0.002, - "step": 1777 - }, - { - "epoch": 8.673170731707318, - "grad_norm": 1.1046003103256226, - "learning_rate": 2.1561722265807827e-07, - "loss": 0.002, - "step": 1778 - }, - { - "epoch": 8.678048780487805, - "grad_norm": 0.38860198855400085, - "learning_rate": 2.1406340008680748e-07, - "loss": 0.0015, - "step": 1779 - }, - { - "epoch": 8.682926829268293, - "grad_norm": 0.9970881938934326, - "learning_rate": 2.1251494605462358e-07, - "loss": 0.0028, - "step": 1780 - }, - { - "epoch": 8.68780487804878, - "grad_norm": 0.32808956503868103, - "learning_rate": 2.1097186419808151e-07, - "loss": 0.0008, - "step": 1781 - }, - { - "epoch": 8.692682926829269, - "grad_norm": 0.25458696484565735, - "learning_rate": 2.094341581411216e-07, - "loss": 0.0012, - "step": 1782 - }, - { - "epoch": 8.697560975609756, - "grad_norm": 0.3530316948890686, - "learning_rate": 2.0790183149505733e-07, - "loss": 0.0021, - "step": 1783 - }, - { - "epoch": 8.702439024390245, - "grad_norm": 0.6706930994987488, - "learning_rate": 2.063748878585689e-07, - "loss": 0.0028, - "step": 1784 - }, - { - "epoch": 8.707317073170731, - "grad_norm": 0.9568914175033569, - "learning_rate": 2.0485333081769588e-07, - "loss": 0.0018, - "step": 1785 - }, - { - "epoch": 8.71219512195122, - "grad_norm": 1.2713409662246704, - "learning_rate": 2.0333716394582536e-07, - "loss": 0.0142, - "step": 1786 - }, - { - "epoch": 8.717073170731707, - "grad_norm": 1.7427871227264404, - "learning_rate": 2.0182639080368634e-07, - "loss": 0.0135, - "step": 1787 - }, - { - "epoch": 8.721951219512196, - "grad_norm": 0.8939143419265747, - "learning_rate": 2.003210149393417e-07, - "loss": 0.0078, - "step": 1788 - }, - { - "epoch": 8.726829268292683, - "grad_norm": 1.1459598541259766, - "learning_rate": 1.9882103988817735e-07, - "loss": 0.0066, - "step": 1789 - }, - { - "epoch": 8.731707317073171, - "grad_norm": 0.875706672668457, - "learning_rate": 1.9732646917289545e-07, - "loss": 0.0051, - "step": 1790 - }, - { - "epoch": 8.736585365853658, - "grad_norm": 0.2884235084056854, - "learning_rate": 1.958373063035071e-07, - "loss": 0.001, - "step": 1791 - }, - { - "epoch": 8.741463414634147, - "grad_norm": 1.3679368495941162, - "learning_rate": 1.9435355477732205e-07, - "loss": 0.0057, - "step": 1792 - }, - { - "epoch": 8.746341463414634, - "grad_norm": 0.5913633108139038, - "learning_rate": 1.928752180789417e-07, - "loss": 0.0023, - "step": 1793 - }, - { - "epoch": 8.751219512195123, - "grad_norm": 1.565428376197815, - "learning_rate": 1.9140229968025058e-07, - "loss": 0.0191, - "step": 1794 - }, - { - "epoch": 8.75609756097561, - "grad_norm": 1.4710811376571655, - "learning_rate": 1.8993480304040912e-07, - "loss": 0.0114, - "step": 1795 - }, - { - "epoch": 8.760975609756098, - "grad_norm": 1.803842306137085, - "learning_rate": 1.8847273160584378e-07, - "loss": 0.0046, - "step": 1796 - }, - { - "epoch": 8.765853658536585, - "grad_norm": 0.694587230682373, - "learning_rate": 1.8701608881023957e-07, - "loss": 0.0014, - "step": 1797 - }, - { - "epoch": 8.770731707317074, - "grad_norm": 0.7563489675521851, - "learning_rate": 1.855648780745342e-07, - "loss": 0.0085, - "step": 1798 - }, - { - "epoch": 8.77560975609756, - "grad_norm": 1.1587045192718506, - "learning_rate": 1.8411910280690588e-07, - "loss": 0.0034, - "step": 1799 - }, - { - "epoch": 8.78048780487805, - "grad_norm": 1.7251181602478027, - "learning_rate": 1.826787664027685e-07, - "loss": 0.0119, - "step": 1800 - }, - { - "epoch": 8.785365853658536, - "grad_norm": 1.3170053958892822, - "learning_rate": 1.8124387224476347e-07, - "loss": 0.0059, - "step": 1801 - }, - { - "epoch": 8.790243902439025, - "grad_norm": 0.927018940448761, - "learning_rate": 1.7981442370274993e-07, - "loss": 0.0021, - "step": 1802 - }, - { - "epoch": 8.795121951219512, - "grad_norm": 2.3129045963287354, - "learning_rate": 1.783904241337983e-07, - "loss": 0.0085, - "step": 1803 - }, - { - "epoch": 8.8, - "grad_norm": 1.1010651588439941, - "learning_rate": 1.7697187688218291e-07, - "loss": 0.0037, - "step": 1804 - }, - { - "epoch": 8.804878048780488, - "grad_norm": 0.3990725576877594, - "learning_rate": 1.7555878527937164e-07, - "loss": 0.0008, - "step": 1805 - }, - { - "epoch": 8.809756097560976, - "grad_norm": 1.022905707359314, - "learning_rate": 1.7415115264402065e-07, - "loss": 0.0092, - "step": 1806 - }, - { - "epoch": 8.814634146341463, - "grad_norm": 0.7391730546951294, - "learning_rate": 1.727489822819664e-07, - "loss": 0.0016, - "step": 1807 - }, - { - "epoch": 8.819512195121952, - "grad_norm": 0.5859627723693848, - "learning_rate": 1.7135227748621585e-07, - "loss": 0.0012, - "step": 1808 - }, - { - "epoch": 8.824390243902439, - "grad_norm": 1.5222235918045044, - "learning_rate": 1.699610415369407e-07, - "loss": 0.0126, - "step": 1809 - }, - { - "epoch": 8.829268292682928, - "grad_norm": 0.8635048270225525, - "learning_rate": 1.6857527770146876e-07, - "loss": 0.0086, - "step": 1810 - }, - { - "epoch": 8.834146341463414, - "grad_norm": 0.8385710120201111, - "learning_rate": 1.6719498923427697e-07, - "loss": 0.0031, - "step": 1811 - }, - { - "epoch": 8.839024390243903, - "grad_norm": 1.0619077682495117, - "learning_rate": 1.6582017937698287e-07, - "loss": 0.0083, - "step": 1812 - }, - { - "epoch": 8.84390243902439, - "grad_norm": 0.6677606701850891, - "learning_rate": 1.6445085135833732e-07, - "loss": 0.002, - "step": 1813 - }, - { - "epoch": 8.848780487804879, - "grad_norm": 0.703705370426178, - "learning_rate": 1.6308700839421793e-07, - "loss": 0.0027, - "step": 1814 - }, - { - "epoch": 8.853658536585366, - "grad_norm": 0.7628077864646912, - "learning_rate": 1.6172865368762004e-07, - "loss": 0.0028, - "step": 1815 - }, - { - "epoch": 8.858536585365854, - "grad_norm": 0.7577258348464966, - "learning_rate": 1.6037579042864876e-07, - "loss": 0.0011, - "step": 1816 - }, - { - "epoch": 8.863414634146341, - "grad_norm": 1.2882269620895386, - "learning_rate": 1.5902842179451482e-07, - "loss": 0.0082, - "step": 1817 - }, - { - "epoch": 8.86829268292683, - "grad_norm": 1.030044436454773, - "learning_rate": 1.576865509495229e-07, - "loss": 0.0068, - "step": 1818 - }, - { - "epoch": 8.873170731707317, - "grad_norm": 1.9678841829299927, - "learning_rate": 1.5635018104506627e-07, - "loss": 0.0085, - "step": 1819 - }, - { - "epoch": 8.878048780487806, - "grad_norm": 0.756213366985321, - "learning_rate": 1.5501931521962055e-07, - "loss": 0.0062, - "step": 1820 - }, - { - "epoch": 8.882926829268293, - "grad_norm": 1.1753418445587158, - "learning_rate": 1.5369395659873305e-07, - "loss": 0.0043, - "step": 1821 - }, - { - "epoch": 8.887804878048781, - "grad_norm": 0.8144367933273315, - "learning_rate": 1.5237410829501864e-07, - "loss": 0.0042, - "step": 1822 - }, - { - "epoch": 8.892682926829268, - "grad_norm": 1.0879873037338257, - "learning_rate": 1.510597734081512e-07, - "loss": 0.0077, - "step": 1823 - }, - { - "epoch": 8.897560975609757, - "grad_norm": 1.7992119789123535, - "learning_rate": 1.497509550248555e-07, - "loss": 0.0013, - "step": 1824 - }, - { - "epoch": 8.902439024390244, - "grad_norm": 1.0460071563720703, - "learning_rate": 1.4844765621890135e-07, - "loss": 0.0091, - "step": 1825 - }, - { - "epoch": 8.907317073170733, - "grad_norm": 1.5372941493988037, - "learning_rate": 1.471498800510962e-07, - "loss": 0.005, - "step": 1826 - }, - { - "epoch": 8.91219512195122, - "grad_norm": 0.3672512173652649, - "learning_rate": 1.4585762956927624e-07, - "loss": 0.0014, - "step": 1827 - }, - { - "epoch": 8.917073170731708, - "grad_norm": 1.0456454753875732, - "learning_rate": 1.4457090780830185e-07, - "loss": 0.0063, - "step": 1828 - }, - { - "epoch": 8.921951219512195, - "grad_norm": 0.9190329909324646, - "learning_rate": 1.432897177900483e-07, - "loss": 0.0065, - "step": 1829 - }, - { - "epoch": 8.926829268292684, - "grad_norm": 1.8261685371398926, - "learning_rate": 1.4201406252340038e-07, - "loss": 0.0099, - "step": 1830 - }, - { - "epoch": 8.93170731707317, - "grad_norm": 1.1341190338134766, - "learning_rate": 1.407439450042433e-07, - "loss": 0.0042, - "step": 1831 - }, - { - "epoch": 8.93658536585366, - "grad_norm": 11.465933799743652, - "learning_rate": 1.3947936821545772e-07, - "loss": 0.004, - "step": 1832 - }, - { - "epoch": 8.941463414634146, - "grad_norm": 0.5747786164283752, - "learning_rate": 1.3822033512691209e-07, - "loss": 0.0009, - "step": 1833 - }, - { - "epoch": 8.946341463414633, - "grad_norm": 1.1908810138702393, - "learning_rate": 1.369668486954545e-07, - "loss": 0.0028, - "step": 1834 - }, - { - "epoch": 8.951219512195122, - "grad_norm": 0.2560107111930847, - "learning_rate": 1.3571891186490687e-07, - "loss": 0.001, - "step": 1835 - }, - { - "epoch": 8.95609756097561, - "grad_norm": 0.5070216059684753, - "learning_rate": 1.3447652756605894e-07, - "loss": 0.0024, - "step": 1836 - }, - { - "epoch": 8.960975609756098, - "grad_norm": 0.507199227809906, - "learning_rate": 1.3323969871665897e-07, - "loss": 0.0015, - "step": 1837 - }, - { - "epoch": 8.965853658536584, - "grad_norm": 0.29779553413391113, - "learning_rate": 1.3200842822140818e-07, - "loss": 0.0007, - "step": 1838 - }, - { - "epoch": 8.970731707317073, - "grad_norm": 0.4603523015975952, - "learning_rate": 1.3078271897195572e-07, - "loss": 0.0018, - "step": 1839 - }, - { - "epoch": 8.975609756097562, - "grad_norm": 1.0771223306655884, - "learning_rate": 1.2956257384688807e-07, - "loss": 0.0063, - "step": 1840 - }, - { - "epoch": 8.980487804878049, - "grad_norm": 0.798372745513916, - "learning_rate": 1.283479957117248e-07, - "loss": 0.002, - "step": 1841 - }, - { - "epoch": 8.985365853658536, - "grad_norm": 2.3283369541168213, - "learning_rate": 1.2713898741891244e-07, - "loss": 0.0398, - "step": 1842 - }, - { - "epoch": 8.990243902439024, - "grad_norm": 0.18683794140815735, - "learning_rate": 1.2593555180781591e-07, - "loss": 0.0004, - "step": 1843 - }, - { - "epoch": 8.995121951219513, - "grad_norm": 2.2289419174194336, - "learning_rate": 1.2473769170471188e-07, - "loss": 0.0713, - "step": 1844 - }, - { - "epoch": 9.0, - "grad_norm": 0.9360214471817017, - "learning_rate": 1.2354540992278452e-07, - "loss": 0.002, - "step": 1845 - } - ], - "logging_steps": 1, - "max_steps": 2050, - "num_input_tokens_seen": 0, - "num_train_epochs": 10, - "save_steps": 206, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 5.3052217506136064e+17, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/metallama3_8b/limo/checkpoint-205/chat_template.jinja b/metallama3_8b/limo/checkpoint-205/chat_template.jinja deleted file mode 100644 index 39bd0c9f7fe30aea14eda194fee17703da4a4dbf..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-205/chat_template.jinja +++ /dev/null @@ -1,5 +0,0 @@ -{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> - -'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> - -' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo/checkpoint-205/config.json b/metallama3_8b/limo/checkpoint-205/config.json deleted file mode 100644 index ec5612543540085e09eed37e81b17ae51d1a6973..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-205/config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 128000, - "eos_token_id": 128009, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 8192, - "mlp_bias": false, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 500000.0, - "tie_word_embeddings": false, - "torch_dtype": "float32", - "transformers_version": "4.55.0", - "use_cache": false, - "vocab_size": 128256 -} diff --git a/metallama3_8b/limo/checkpoint-205/generation_config.json b/metallama3_8b/limo/checkpoint-205/generation_config.json deleted file mode 100644 index f53ccb516e57388491adda6b9950bcfa872e93ae..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-205/generation_config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 128000, - "eos_token_id": 128009, - "transformers_version": "4.55.0", - "use_cache": false -} diff --git a/metallama3_8b/limo/checkpoint-205/model-00001-of-00007.safetensors b/metallama3_8b/limo/checkpoint-205/model-00001-of-00007.safetensors deleted file mode 100644 index 4f3f228fe15bdc90e0075d09267e062d84e8e583..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-205/model-00001-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a51899dce985f0833455a381816de968663c1537b623debefbba58d752e481c7 -size 4886466168 diff --git a/metallama3_8b/limo/checkpoint-205/model-00002-of-00007.safetensors b/metallama3_8b/limo/checkpoint-205/model-00002-of-00007.safetensors deleted file mode 100644 index 60f78db38c2076d729ceda8a3edb40457298df40..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-205/model-00002-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:577a43cff2cef03038106467cc7dbd636d8ac0e0e11a91ec9b4fec4c2eb18d95 -size 4832007448 diff --git a/metallama3_8b/limo/checkpoint-205/model-00003-of-00007.safetensors b/metallama3_8b/limo/checkpoint-205/model-00003-of-00007.safetensors deleted file mode 100644 index b844aa0d36fffbac810a6f42f70536eee879c510..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-205/model-00003-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:aeac92aec8834865f60f3e75a6b0d6e4e2b4be25a94cdc50fc6d13d96aeb9c34 -size 4999813112 diff --git a/metallama3_8b/limo/checkpoint-205/model-00004-of-00007.safetensors b/metallama3_8b/limo/checkpoint-205/model-00004-of-00007.safetensors deleted file mode 100644 index 90995616be47adbca77e64df292073aa040784d5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-205/model-00004-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e8ecd622e7b547d9ab41faf51dc740483b3012f632d4bf947a970e60ceb48b40 -size 4999813128 diff --git a/metallama3_8b/limo/checkpoint-205/model-00005-of-00007.safetensors b/metallama3_8b/limo/checkpoint-205/model-00005-of-00007.safetensors deleted file mode 100644 index 00f49ff739e890b27ccedf4f70cabca3f1eb92ff..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-205/model-00005-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f4ce2af4338ac03eca8ec88f9fa48312c931cb9a712a30ae1fdba2f25230f3e2 -size 4832007496 diff --git a/metallama3_8b/limo/checkpoint-205/model-00006-of-00007.safetensors b/metallama3_8b/limo/checkpoint-205/model-00006-of-00007.safetensors deleted file mode 100644 index 2cc9c7be92db2736fbfcb09a01ba5b2b02cfaa07..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-205/model-00006-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e7eac303527440c5e798588d9dcf806249850f726e9a504da11dadb3e82e5a4f -size 4999813120 diff --git a/metallama3_8b/limo/checkpoint-205/model-00007-of-00007.safetensors b/metallama3_8b/limo/checkpoint-205/model-00007-of-00007.safetensors deleted file mode 100644 index 7597bd763d0ac2a438faf65753b96895772b6dc6..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-205/model-00007-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2630bdfdd05a362976074e4d1ffda9c318165d464c91a2930e3cbdfff3f6d1ba -size 2571158184 diff --git a/metallama3_8b/limo/checkpoint-205/model.safetensors.index.json b/metallama3_8b/limo/checkpoint-205/model.safetensors.index.json deleted file mode 100644 index 30d31d54f352f0c71ad48745af612a088822fa48..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-205/model.safetensors.index.json +++ /dev/null @@ -1,299 +0,0 @@ -{ - "metadata": { - "total_parameters": 2007565312, - "total_size": 32121044992 - }, - "weight_map": { - "lm_head.weight": "model-00007-of-00007.safetensors", - "model.embed_tokens.weight": "model-00001-of-00007.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.norm.weight": "model-00007-of-00007.safetensors" - } -} diff --git a/metallama3_8b/limo/checkpoint-205/rng_state_0.pth b/metallama3_8b/limo/checkpoint-205/rng_state_0.pth deleted file mode 100644 index 9c287de26f76b389db025ad109f0595b0b77fd22..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-205/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:92cc13315f24c28015d695b6cde08bb1cd6fea4cbc435998485ed6fbe4c91285 -size 15024 diff --git a/metallama3_8b/limo/checkpoint-205/rng_state_1.pth b/metallama3_8b/limo/checkpoint-205/rng_state_1.pth deleted file mode 100644 index 132db267a0f5617620f48bc8eab9cc37a9aea13a..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-205/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f4c154b6a63e0b1f98f7d2847944398f99f1657d35e8eddf7fdf0ae2c24b0552 -size 15024 diff --git a/metallama3_8b/limo/checkpoint-205/rng_state_2.pth b/metallama3_8b/limo/checkpoint-205/rng_state_2.pth deleted file mode 100644 index e85bf2eceab47cefd59df592648941c61c84eab1..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-205/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f784c6a9507b51189f2caffbd178ea9882103b75852e31c15f47fdae6a43af1d -size 15024 diff --git a/metallama3_8b/limo/checkpoint-205/rng_state_3.pth b/metallama3_8b/limo/checkpoint-205/rng_state_3.pth deleted file mode 100644 index 423bb6c008eeb6875c659dd108c5f003758dbcb9..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-205/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:34b023e05bc2d12b91dc436d4922b990d50ec8dc56d40dc3e36b3bb34fc81341 -size 15024 diff --git a/metallama3_8b/limo/checkpoint-205/scheduler.pt b/metallama3_8b/limo/checkpoint-205/scheduler.pt deleted file mode 100644 index 50295f2d5e326b1873fa122f57c3f3448feaa575..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-205/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a16163f34db040e2b0bd4461bd027c667560718df79714f98787d5971da9120b -size 1064 diff --git a/metallama3_8b/limo/checkpoint-205/special_tokens_map.json b/metallama3_8b/limo/checkpoint-205/special_tokens_map.json deleted file mode 100644 index 14daf4588e61b4e4983af0fccaba4d5500c0977c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-205/special_tokens_map.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "additional_special_tokens": [ - { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } - ], - "bos_token": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": "<|eot_id|>" -} diff --git a/metallama3_8b/limo/checkpoint-205/tokenizer.json b/metallama3_8b/limo/checkpoint-205/tokenizer.json deleted file mode 100644 index 172311123ab62378f1f6d90f3068a676b7d939ed..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-205/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c1dcab308e7cf5970ea38815e0a62887d705c5b436f869ca27a5dcdd40c36a6 -size 17210148 diff --git a/metallama3_8b/limo/checkpoint-205/tokenizer_config.json b/metallama3_8b/limo/checkpoint-205/tokenizer_config.json deleted file mode 100644 index 6739fcd129e717b71b64001dcb25a03c143d66f5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-205/tokenizer_config.json +++ /dev/null @@ -1,2076 +0,0 @@ -{ - "added_tokens_decoder": { - "128000": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128001": { - "content": "<|end_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128002": { - "content": "<|reserved_special_token_0|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128003": { - "content": "<|reserved_special_token_1|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128004": { - "content": "<|reserved_special_token_2|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128005": { - "content": "<|reserved_special_token_3|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128006": { - "content": "<|start_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128007": { - "content": "<|end_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128008": { - "content": "<|reserved_special_token_4|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128009": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128010": { - "content": "<|reserved_special_token_5|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128011": { - "content": "<|reserved_special_token_6|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128012": { - "content": "<|reserved_special_token_7|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128013": { - "content": "<|reserved_special_token_8|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128014": { - "content": "<|reserved_special_token_9|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128015": { - "content": "<|reserved_special_token_10|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128016": { - "content": "<|reserved_special_token_11|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128017": { - "content": "<|reserved_special_token_12|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128018": { - "content": "<|reserved_special_token_13|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128019": { - "content": "<|reserved_special_token_14|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128020": { - "content": "<|reserved_special_token_15|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128021": { - "content": "<|reserved_special_token_16|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128022": { - "content": "<|reserved_special_token_17|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128023": { - "content": "<|reserved_special_token_18|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128024": { - "content": "<|reserved_special_token_19|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128025": { - "content": "<|reserved_special_token_20|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128026": { - "content": "<|reserved_special_token_21|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128027": { - "content": "<|reserved_special_token_22|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128028": { - "content": "<|reserved_special_token_23|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128029": { - "content": "<|reserved_special_token_24|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128030": { - "content": "<|reserved_special_token_25|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128031": { - "content": "<|reserved_special_token_26|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128032": { - "content": "<|reserved_special_token_27|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128033": { - "content": "<|reserved_special_token_28|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128034": { - "content": "<|reserved_special_token_29|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128035": { - "content": "<|reserved_special_token_30|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128036": { - "content": "<|reserved_special_token_31|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128037": { - "content": "<|reserved_special_token_32|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128038": { - "content": "<|reserved_special_token_33|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128039": { - "content": "<|reserved_special_token_34|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128040": { - "content": "<|reserved_special_token_35|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128041": { - "content": "<|reserved_special_token_36|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128042": { - "content": "<|reserved_special_token_37|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128043": { - "content": "<|reserved_special_token_38|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128044": { - "content": "<|reserved_special_token_39|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128045": { - "content": "<|reserved_special_token_40|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128046": { - "content": "<|reserved_special_token_41|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128047": { - "content": "<|reserved_special_token_42|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128048": { - "content": "<|reserved_special_token_43|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128049": { - "content": "<|reserved_special_token_44|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128050": { - "content": "<|reserved_special_token_45|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128051": { - "content": "<|reserved_special_token_46|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128052": { - "content": "<|reserved_special_token_47|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128053": { - "content": "<|reserved_special_token_48|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128054": { - "content": "<|reserved_special_token_49|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128055": { - "content": "<|reserved_special_token_50|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128056": { - "content": "<|reserved_special_token_51|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128057": { - "content": "<|reserved_special_token_52|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128058": { - "content": "<|reserved_special_token_53|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128059": { - "content": "<|reserved_special_token_54|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128060": { - "content": "<|reserved_special_token_55|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128061": { - "content": "<|reserved_special_token_56|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128062": { - "content": "<|reserved_special_token_57|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128063": { - "content": "<|reserved_special_token_58|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128064": { - "content": "<|reserved_special_token_59|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128065": { - "content": "<|reserved_special_token_60|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128066": { - "content": "<|reserved_special_token_61|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128067": { - "content": "<|reserved_special_token_62|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128068": { - "content": "<|reserved_special_token_63|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128069": { - "content": "<|reserved_special_token_64|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128070": { - "content": "<|reserved_special_token_65|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128071": { - "content": "<|reserved_special_token_66|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128072": { - "content": "<|reserved_special_token_67|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128073": { - "content": "<|reserved_special_token_68|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128074": { - "content": "<|reserved_special_token_69|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128075": { - "content": "<|reserved_special_token_70|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128076": { - "content": "<|reserved_special_token_71|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128077": { - "content": "<|reserved_special_token_72|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128078": { - "content": "<|reserved_special_token_73|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128079": { - "content": "<|reserved_special_token_74|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128080": { - "content": "<|reserved_special_token_75|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128081": { - "content": "<|reserved_special_token_76|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128082": { - "content": "<|reserved_special_token_77|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128083": { - "content": "<|reserved_special_token_78|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128084": { - "content": "<|reserved_special_token_79|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128085": { - "content": "<|reserved_special_token_80|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128086": { - "content": "<|reserved_special_token_81|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128087": { - "content": "<|reserved_special_token_82|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128088": { - "content": "<|reserved_special_token_83|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128089": { - "content": "<|reserved_special_token_84|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128090": { - "content": "<|reserved_special_token_85|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128091": { - "content": "<|reserved_special_token_86|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128092": { - "content": "<|reserved_special_token_87|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128093": { - "content": "<|reserved_special_token_88|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128094": { - "content": "<|reserved_special_token_89|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128095": { - "content": "<|reserved_special_token_90|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128096": { - "content": "<|reserved_special_token_91|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128097": { - "content": "<|reserved_special_token_92|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128098": { - "content": "<|reserved_special_token_93|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128099": { - "content": "<|reserved_special_token_94|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128100": { - "content": "<|reserved_special_token_95|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128101": { - "content": "<|reserved_special_token_96|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128102": { - "content": "<|reserved_special_token_97|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128103": { - "content": "<|reserved_special_token_98|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128104": { - "content": "<|reserved_special_token_99|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128105": { - "content": "<|reserved_special_token_100|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128106": { - "content": "<|reserved_special_token_101|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128107": { - "content": "<|reserved_special_token_102|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128108": { - "content": "<|reserved_special_token_103|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128109": { - "content": "<|reserved_special_token_104|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128110": { - "content": "<|reserved_special_token_105|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128111": { - "content": "<|reserved_special_token_106|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128112": { - "content": "<|reserved_special_token_107|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128113": { - "content": "<|reserved_special_token_108|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128114": { - "content": "<|reserved_special_token_109|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128115": { - "content": "<|reserved_special_token_110|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128116": { - "content": "<|reserved_special_token_111|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128117": { - "content": "<|reserved_special_token_112|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128118": { - "content": "<|reserved_special_token_113|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128119": { - "content": "<|reserved_special_token_114|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128120": { - "content": "<|reserved_special_token_115|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128121": { - "content": "<|reserved_special_token_116|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128122": { - "content": "<|reserved_special_token_117|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128123": { - "content": "<|reserved_special_token_118|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128124": { - "content": "<|reserved_special_token_119|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128125": { - "content": "<|reserved_special_token_120|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128126": { - "content": "<|reserved_special_token_121|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128127": { - "content": "<|reserved_special_token_122|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128128": { - "content": "<|reserved_special_token_123|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128129": { - "content": "<|reserved_special_token_124|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128130": { - "content": "<|reserved_special_token_125|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128131": { - "content": "<|reserved_special_token_126|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128132": { - "content": "<|reserved_special_token_127|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128133": { - "content": "<|reserved_special_token_128|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128134": { - "content": "<|reserved_special_token_129|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128135": { - "content": "<|reserved_special_token_130|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128136": { - "content": "<|reserved_special_token_131|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128137": { - "content": "<|reserved_special_token_132|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128138": { - "content": "<|reserved_special_token_133|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128139": { - "content": "<|reserved_special_token_134|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128140": { - "content": "<|reserved_special_token_135|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128141": { - "content": "<|reserved_special_token_136|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128142": { - "content": "<|reserved_special_token_137|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128143": { - "content": "<|reserved_special_token_138|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128144": { - "content": "<|reserved_special_token_139|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128145": { - "content": "<|reserved_special_token_140|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128146": { - "content": "<|reserved_special_token_141|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128147": { - "content": "<|reserved_special_token_142|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128148": { - "content": "<|reserved_special_token_143|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128149": { - "content": "<|reserved_special_token_144|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128150": { - "content": "<|reserved_special_token_145|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128151": { - "content": "<|reserved_special_token_146|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128152": { - "content": "<|reserved_special_token_147|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128153": { - "content": "<|reserved_special_token_148|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128154": { - "content": "<|reserved_special_token_149|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128155": { - "content": "<|reserved_special_token_150|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128156": { - "content": "<|reserved_special_token_151|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128157": { - "content": "<|reserved_special_token_152|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128158": { - "content": "<|reserved_special_token_153|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128159": { - "content": "<|reserved_special_token_154|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128160": { - "content": "<|reserved_special_token_155|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128161": { - "content": "<|reserved_special_token_156|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128162": { - "content": "<|reserved_special_token_157|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128163": { - "content": "<|reserved_special_token_158|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128164": { - "content": "<|reserved_special_token_159|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128165": { - "content": "<|reserved_special_token_160|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128166": { - "content": "<|reserved_special_token_161|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128167": { - "content": "<|reserved_special_token_162|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128168": { - "content": "<|reserved_special_token_163|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128169": { - "content": "<|reserved_special_token_164|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128170": { - "content": "<|reserved_special_token_165|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128171": { - "content": "<|reserved_special_token_166|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128172": { - "content": "<|reserved_special_token_167|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128173": { - "content": "<|reserved_special_token_168|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128174": { - "content": "<|reserved_special_token_169|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128175": { - "content": "<|reserved_special_token_170|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128176": { - "content": "<|reserved_special_token_171|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128177": { - "content": "<|reserved_special_token_172|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128178": { - "content": "<|reserved_special_token_173|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128179": { - "content": "<|reserved_special_token_174|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128180": { - "content": "<|reserved_special_token_175|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128181": { - "content": "<|reserved_special_token_176|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128182": { - "content": "<|reserved_special_token_177|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128183": { - "content": "<|reserved_special_token_178|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128184": { - "content": "<|reserved_special_token_179|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128185": { - "content": "<|reserved_special_token_180|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128186": { - "content": "<|reserved_special_token_181|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128187": { - "content": "<|reserved_special_token_182|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128188": { - "content": "<|reserved_special_token_183|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128189": { - "content": "<|reserved_special_token_184|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128190": { - "content": "<|reserved_special_token_185|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128191": { - "content": "<|reserved_special_token_186|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128192": { - "content": "<|reserved_special_token_187|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128193": { - "content": "<|reserved_special_token_188|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128194": { - "content": "<|reserved_special_token_189|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128195": { - "content": "<|reserved_special_token_190|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128196": { - "content": "<|reserved_special_token_191|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128197": { - "content": "<|reserved_special_token_192|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128198": { - "content": "<|reserved_special_token_193|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128199": { - "content": "<|reserved_special_token_194|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128200": { - "content": "<|reserved_special_token_195|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128201": { - "content": "<|reserved_special_token_196|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128202": { - "content": "<|reserved_special_token_197|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128203": { - "content": "<|reserved_special_token_198|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128204": { - "content": "<|reserved_special_token_199|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128205": { - "content": "<|reserved_special_token_200|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128206": { - "content": "<|reserved_special_token_201|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128207": { - "content": "<|reserved_special_token_202|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128208": { - "content": "<|reserved_special_token_203|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128209": { - "content": "<|reserved_special_token_204|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128210": { - "content": "<|reserved_special_token_205|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128211": { - "content": "<|reserved_special_token_206|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128212": { - "content": "<|reserved_special_token_207|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128213": { - "content": "<|reserved_special_token_208|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128214": { - "content": "<|reserved_special_token_209|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128215": { - "content": "<|reserved_special_token_210|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128216": { - "content": "<|reserved_special_token_211|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128217": { - "content": "<|reserved_special_token_212|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128218": { - "content": "<|reserved_special_token_213|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128219": { - "content": "<|reserved_special_token_214|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128220": { - "content": "<|reserved_special_token_215|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128221": { - "content": "<|reserved_special_token_216|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128222": { - "content": "<|reserved_special_token_217|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128223": { - "content": "<|reserved_special_token_218|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128224": { - "content": "<|reserved_special_token_219|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128225": { - "content": "<|reserved_special_token_220|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128226": { - "content": "<|reserved_special_token_221|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128227": { - "content": "<|reserved_special_token_222|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128228": { - "content": "<|reserved_special_token_223|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128229": { - "content": "<|reserved_special_token_224|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128230": { - "content": "<|reserved_special_token_225|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128231": { - "content": "<|reserved_special_token_226|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128232": { - "content": "<|reserved_special_token_227|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128233": { - "content": "<|reserved_special_token_228|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128234": { - "content": "<|reserved_special_token_229|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128235": { - "content": "<|reserved_special_token_230|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128236": { - "content": "<|reserved_special_token_231|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128237": { - "content": "<|reserved_special_token_232|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128238": { - "content": "<|reserved_special_token_233|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128239": { - "content": "<|reserved_special_token_234|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128240": { - "content": "<|reserved_special_token_235|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128241": { - "content": "<|reserved_special_token_236|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128242": { - "content": "<|reserved_special_token_237|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128243": { - "content": "<|reserved_special_token_238|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128244": { - "content": "<|reserved_special_token_239|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128245": { - "content": "<|reserved_special_token_240|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128246": { - "content": "<|reserved_special_token_241|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128247": { - "content": "<|reserved_special_token_242|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128248": { - "content": "<|reserved_special_token_243|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128249": { - "content": "<|reserved_special_token_244|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128250": { - "content": "<|reserved_special_token_245|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128251": { - "content": "<|reserved_special_token_246|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128252": { - "content": "<|reserved_special_token_247|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128253": { - "content": "<|reserved_special_token_248|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128254": { - "content": "<|reserved_special_token_249|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128255": { - "content": "<|reserved_special_token_250|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128256": { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - } - }, - "additional_special_tokens": [ - "<|eom_id|>" - ], - "bos_token": "<|begin_of_text|>", - "clean_up_tokenization_spaces": true, - "eos_token": "<|eot_id|>", - "extra_special_tokens": {}, - "model_input_names": [ - "input_ids", - "attention_mask" - ], - "model_max_length": 1000000000000000019884624838656, - "pad_token": "<|eot_id|>", - "padding_side": "right", - "split_special_tokens": false, - "tokenizer_class": "PreTrainedTokenizerFast" -} diff --git a/metallama3_8b/limo/checkpoint-205/trainer_state.json b/metallama3_8b/limo/checkpoint-205/trainer_state.json deleted file mode 100644 index 1f9b2b58412005e224aefc3e6d846470d771c0e0..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-205/trainer_state.json +++ /dev/null @@ -1,1469 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 1.0, - "eval_steps": 500, - "global_step": 205, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.004878048780487805, - "grad_norm": 27.79998016357422, - "learning_rate": 5e-06, - "loss": 1.4179, - "step": 1 - }, - { - "epoch": 0.00975609756097561, - "grad_norm": 4.086409091949463, - "learning_rate": 4.999997064365715e-06, - "loss": 1.1405, - "step": 2 - }, - { - "epoch": 0.014634146341463415, - "grad_norm": 4.499151229858398, - "learning_rate": 4.999988257469751e-06, - "loss": 0.8682, - "step": 3 - }, - { - "epoch": 0.01951219512195122, - "grad_norm": 4.555822849273682, - "learning_rate": 4.999973579332793e-06, - "loss": 0.9961, - "step": 4 - }, - { - "epoch": 0.024390243902439025, - "grad_norm": 5.6235246658325195, - "learning_rate": 4.999953029989312e-06, - "loss": 1.0173, - "step": 5 - }, - { - "epoch": 0.02926829268292683, - "grad_norm": 3.9943182468414307, - "learning_rate": 4.999926609487568e-06, - "loss": 1.1083, - "step": 6 - }, - { - "epoch": 0.03414634146341464, - "grad_norm": 5.685941219329834, - "learning_rate": 4.9998943178896106e-06, - "loss": 1.1109, - "step": 7 - }, - { - "epoch": 0.03902439024390244, - "grad_norm": 15.914257049560547, - "learning_rate": 4.999856155271276e-06, - "loss": 1.821, - "step": 8 - }, - { - "epoch": 0.04390243902439024, - "grad_norm": 4.147185325622559, - "learning_rate": 4.999812121722191e-06, - "loss": 1.0417, - "step": 9 - }, - { - "epoch": 0.04878048780487805, - "grad_norm": 11.123332977294922, - "learning_rate": 4.999762217345766e-06, - "loss": 1.5672, - "step": 10 - }, - { - "epoch": 0.05365853658536585, - "grad_norm": 2.842331886291504, - "learning_rate": 4.999706442259205e-06, - "loss": 0.7297, - "step": 11 - }, - { - "epoch": 0.05853658536585366, - "grad_norm": 37.685062408447266, - "learning_rate": 4.999644796593492e-06, - "loss": 0.9112, - "step": 12 - }, - { - "epoch": 0.06341463414634146, - "grad_norm": 11.214252471923828, - "learning_rate": 4.999577280493407e-06, - "loss": 0.7854, - "step": 13 - }, - { - "epoch": 0.06829268292682927, - "grad_norm": 5.10387659072876, - "learning_rate": 4.99950389411751e-06, - "loss": 1.1317, - "step": 14 - }, - { - "epoch": 0.07317073170731707, - "grad_norm": 3.685403347015381, - "learning_rate": 4.999424637638148e-06, - "loss": 0.7864, - "step": 15 - }, - { - "epoch": 0.07804878048780488, - "grad_norm": 2.9567184448242188, - "learning_rate": 4.999339511241458e-06, - "loss": 0.8494, - "step": 16 - }, - { - "epoch": 0.08292682926829269, - "grad_norm": 11.396956443786621, - "learning_rate": 4.9992485151273584e-06, - "loss": 1.2189, - "step": 17 - }, - { - "epoch": 0.08780487804878048, - "grad_norm": 7.007385730743408, - "learning_rate": 4.999151649509554e-06, - "loss": 1.0532, - "step": 18 - }, - { - "epoch": 0.09268292682926829, - "grad_norm": 3.4347329139709473, - "learning_rate": 4.9990489146155356e-06, - "loss": 1.088, - "step": 19 - }, - { - "epoch": 0.0975609756097561, - "grad_norm": 3.1865031719207764, - "learning_rate": 4.9989403106865765e-06, - "loss": 1.0414, - "step": 20 - }, - { - "epoch": 0.1024390243902439, - "grad_norm": 3.4605791568756104, - "learning_rate": 4.9988258379777334e-06, - "loss": 0.8878, - "step": 21 - }, - { - "epoch": 0.1073170731707317, - "grad_norm": 2.860478639602661, - "learning_rate": 4.998705496757846e-06, - "loss": 0.9151, - "step": 22 - }, - { - "epoch": 0.11219512195121951, - "grad_norm": 9.101946830749512, - "learning_rate": 4.998579287309538e-06, - "loss": 1.4304, - "step": 23 - }, - { - "epoch": 0.11707317073170732, - "grad_norm": 24.21122169494629, - "learning_rate": 4.998447209929211e-06, - "loss": 1.0858, - "step": 24 - }, - { - "epoch": 0.12195121951219512, - "grad_norm": 3.286980152130127, - "learning_rate": 4.998309264927053e-06, - "loss": 0.6571, - "step": 25 - }, - { - "epoch": 0.12682926829268293, - "grad_norm": 4.0232062339782715, - "learning_rate": 4.998165452627025e-06, - "loss": 0.8493, - "step": 26 - }, - { - "epoch": 0.13170731707317074, - "grad_norm": 3.7688663005828857, - "learning_rate": 4.998015773366874e-06, - "loss": 0.9224, - "step": 27 - }, - { - "epoch": 0.13658536585365855, - "grad_norm": 2.9382026195526123, - "learning_rate": 4.997860227498122e-06, - "loss": 0.7588, - "step": 28 - }, - { - "epoch": 0.14146341463414633, - "grad_norm": 4.327457904815674, - "learning_rate": 4.99769881538607e-06, - "loss": 1.1817, - "step": 29 - }, - { - "epoch": 0.14634146341463414, - "grad_norm": 3.47487735748291, - "learning_rate": 4.997531537409794e-06, - "loss": 1.0737, - "step": 30 - }, - { - "epoch": 0.15121951219512195, - "grad_norm": 3.0616214275360107, - "learning_rate": 4.99735839396215e-06, - "loss": 0.7899, - "step": 31 - }, - { - "epoch": 0.15609756097560976, - "grad_norm": 3.065070152282715, - "learning_rate": 4.9971793854497655e-06, - "loss": 0.7745, - "step": 32 - }, - { - "epoch": 0.16097560975609757, - "grad_norm": 3.5202279090881348, - "learning_rate": 4.996994512293042e-06, - "loss": 0.984, - "step": 33 - }, - { - "epoch": 0.16585365853658537, - "grad_norm": 3.421769142150879, - "learning_rate": 4.996803774926157e-06, - "loss": 0.8235, - "step": 34 - }, - { - "epoch": 0.17073170731707318, - "grad_norm": 4.6582207679748535, - "learning_rate": 4.996607173797059e-06, - "loss": 1.3227, - "step": 35 - }, - { - "epoch": 0.17560975609756097, - "grad_norm": 2.9829282760620117, - "learning_rate": 4.996404709367466e-06, - "loss": 0.8854, - "step": 36 - }, - { - "epoch": 0.18048780487804877, - "grad_norm": 2.5982632637023926, - "learning_rate": 4.996196382112868e-06, - "loss": 0.6786, - "step": 37 - }, - { - "epoch": 0.18536585365853658, - "grad_norm": 2.9807393550872803, - "learning_rate": 4.9959821925225235e-06, - "loss": 0.9344, - "step": 38 - }, - { - "epoch": 0.1902439024390244, - "grad_norm": 2.7364351749420166, - "learning_rate": 4.995762141099456e-06, - "loss": 0.814, - "step": 39 - }, - { - "epoch": 0.1951219512195122, - "grad_norm": 3.4324638843536377, - "learning_rate": 4.995536228360461e-06, - "loss": 1.0276, - "step": 40 - }, - { - "epoch": 0.2, - "grad_norm": 2.911834716796875, - "learning_rate": 4.995304454836095e-06, - "loss": 0.9291, - "step": 41 - }, - { - "epoch": 0.2048780487804878, - "grad_norm": 3.0294723510742188, - "learning_rate": 4.9950668210706795e-06, - "loss": 0.8145, - "step": 42 - }, - { - "epoch": 0.2097560975609756, - "grad_norm": 4.681829452514648, - "learning_rate": 4.994823327622299e-06, - "loss": 0.8779, - "step": 43 - }, - { - "epoch": 0.2146341463414634, - "grad_norm": 3.643914222717285, - "learning_rate": 4.9945739750628e-06, - "loss": 0.8196, - "step": 44 - }, - { - "epoch": 0.21951219512195122, - "grad_norm": 2.7542076110839844, - "learning_rate": 4.994318763977789e-06, - "loss": 0.8443, - "step": 45 - }, - { - "epoch": 0.22439024390243903, - "grad_norm": 6.873605728149414, - "learning_rate": 4.994057694966632e-06, - "loss": 1.0328, - "step": 46 - }, - { - "epoch": 0.22926829268292684, - "grad_norm": 3.11810040473938, - "learning_rate": 4.993790768642449e-06, - "loss": 1.0673, - "step": 47 - }, - { - "epoch": 0.23414634146341465, - "grad_norm": 4.360548496246338, - "learning_rate": 4.99351798563212e-06, - "loss": 1.3198, - "step": 48 - }, - { - "epoch": 0.23902439024390243, - "grad_norm": 2.6894314289093018, - "learning_rate": 4.993239346576278e-06, - "loss": 0.8743, - "step": 49 - }, - { - "epoch": 0.24390243902439024, - "grad_norm": 3.2640421390533447, - "learning_rate": 4.99295485212931e-06, - "loss": 1.109, - "step": 50 - }, - { - "epoch": 0.24878048780487805, - "grad_norm": 3.1565866470336914, - "learning_rate": 4.992664502959351e-06, - "loss": 0.9291, - "step": 51 - }, - { - "epoch": 0.25365853658536586, - "grad_norm": 3.4829447269439697, - "learning_rate": 4.99236829974829e-06, - "loss": 0.8159, - "step": 52 - }, - { - "epoch": 0.25853658536585367, - "grad_norm": 2.7535626888275146, - "learning_rate": 4.992066243191762e-06, - "loss": 1.0359, - "step": 53 - }, - { - "epoch": 0.2634146341463415, - "grad_norm": 2.482935905456543, - "learning_rate": 4.991758333999148e-06, - "loss": 0.8091, - "step": 54 - }, - { - "epoch": 0.2682926829268293, - "grad_norm": 2.917445659637451, - "learning_rate": 4.991444572893575e-06, - "loss": 0.6925, - "step": 55 - }, - { - "epoch": 0.2731707317073171, - "grad_norm": 2.9802236557006836, - "learning_rate": 4.991124960611916e-06, - "loss": 0.6329, - "step": 56 - }, - { - "epoch": 0.2780487804878049, - "grad_norm": 2.9677224159240723, - "learning_rate": 4.99079949790478e-06, - "loss": 0.8069, - "step": 57 - }, - { - "epoch": 0.28292682926829266, - "grad_norm": 2.8304293155670166, - "learning_rate": 4.99046818553652e-06, - "loss": 0.8682, - "step": 58 - }, - { - "epoch": 0.28780487804878047, - "grad_norm": 5.253443717956543, - "learning_rate": 4.9901310242852246e-06, - "loss": 1.1069, - "step": 59 - }, - { - "epoch": 0.2926829268292683, - "grad_norm": 3.686016082763672, - "learning_rate": 4.9897880149427206e-06, - "loss": 0.9465, - "step": 60 - }, - { - "epoch": 0.2975609756097561, - "grad_norm": 3.6372263431549072, - "learning_rate": 4.989439158314566e-06, - "loss": 0.9738, - "step": 61 - }, - { - "epoch": 0.3024390243902439, - "grad_norm": 3.0756819248199463, - "learning_rate": 4.989084455220056e-06, - "loss": 0.6417, - "step": 62 - }, - { - "epoch": 0.3073170731707317, - "grad_norm": 3.379222869873047, - "learning_rate": 4.988723906492212e-06, - "loss": 1.0092, - "step": 63 - }, - { - "epoch": 0.3121951219512195, - "grad_norm": 3.4571032524108887, - "learning_rate": 4.988357512977785e-06, - "loss": 0.6691, - "step": 64 - }, - { - "epoch": 0.3170731707317073, - "grad_norm": 3.1982104778289795, - "learning_rate": 4.987985275537252e-06, - "loss": 0.6651, - "step": 65 - }, - { - "epoch": 0.32195121951219513, - "grad_norm": 2.9723124504089355, - "learning_rate": 4.9876071950448185e-06, - "loss": 0.9227, - "step": 66 - }, - { - "epoch": 0.32682926829268294, - "grad_norm": 2.5521399974823, - "learning_rate": 4.987223272388407e-06, - "loss": 0.6664, - "step": 67 - }, - { - "epoch": 0.33170731707317075, - "grad_norm": 2.8934121131896973, - "learning_rate": 4.986833508469663e-06, - "loss": 0.997, - "step": 68 - }, - { - "epoch": 0.33658536585365856, - "grad_norm": 4.7546586990356445, - "learning_rate": 4.98643790420395e-06, - "loss": 0.8551, - "step": 69 - }, - { - "epoch": 0.34146341463414637, - "grad_norm": 3.091616153717041, - "learning_rate": 4.986036460520348e-06, - "loss": 0.8874, - "step": 70 - }, - { - "epoch": 0.3463414634146341, - "grad_norm": 4.1724677085876465, - "learning_rate": 4.98562917836165e-06, - "loss": 1.1393, - "step": 71 - }, - { - "epoch": 0.35121951219512193, - "grad_norm": 2.6568572521209717, - "learning_rate": 4.985216058684362e-06, - "loss": 0.6379, - "step": 72 - }, - { - "epoch": 0.35609756097560974, - "grad_norm": 2.396416187286377, - "learning_rate": 4.984797102458697e-06, - "loss": 1.0292, - "step": 73 - }, - { - "epoch": 0.36097560975609755, - "grad_norm": 3.0667319297790527, - "learning_rate": 4.984372310668579e-06, - "loss": 0.7048, - "step": 74 - }, - { - "epoch": 0.36585365853658536, - "grad_norm": 2.4820518493652344, - "learning_rate": 4.983941684311633e-06, - "loss": 1.2353, - "step": 75 - }, - { - "epoch": 0.37073170731707317, - "grad_norm": 4.062836647033691, - "learning_rate": 4.983505224399188e-06, - "loss": 0.8933, - "step": 76 - }, - { - "epoch": 0.375609756097561, - "grad_norm": 2.4480767250061035, - "learning_rate": 4.983062931956275e-06, - "loss": 0.8221, - "step": 77 - }, - { - "epoch": 0.3804878048780488, - "grad_norm": 3.134138822555542, - "learning_rate": 4.9826148080216195e-06, - "loss": 0.8899, - "step": 78 - }, - { - "epoch": 0.3853658536585366, - "grad_norm": 2.8165836334228516, - "learning_rate": 4.9821608536476445e-06, - "loss": 1.2451, - "step": 79 - }, - { - "epoch": 0.3902439024390244, - "grad_norm": 3.734433650970459, - "learning_rate": 4.981701069900465e-06, - "loss": 0.8536, - "step": 80 - }, - { - "epoch": 0.3951219512195122, - "grad_norm": 2.853421449661255, - "learning_rate": 4.9812354578598876e-06, - "loss": 0.7857, - "step": 81 - }, - { - "epoch": 0.4, - "grad_norm": 2.541687250137329, - "learning_rate": 4.980764018619405e-06, - "loss": 0.8332, - "step": 82 - }, - { - "epoch": 0.40487804878048783, - "grad_norm": 4.405911445617676, - "learning_rate": 4.980286753286196e-06, - "loss": 0.9927, - "step": 83 - }, - { - "epoch": 0.4097560975609756, - "grad_norm": 3.3034985065460205, - "learning_rate": 4.97980366298112e-06, - "loss": 0.8161, - "step": 84 - }, - { - "epoch": 0.4146341463414634, - "grad_norm": 2.6678085327148438, - "learning_rate": 4.97931474883872e-06, - "loss": 0.8017, - "step": 85 - }, - { - "epoch": 0.4195121951219512, - "grad_norm": 2.58524227142334, - "learning_rate": 4.978820012007213e-06, - "loss": 0.8811, - "step": 86 - }, - { - "epoch": 0.424390243902439, - "grad_norm": 2.482597827911377, - "learning_rate": 4.978319453648495e-06, - "loss": 0.9461, - "step": 87 - }, - { - "epoch": 0.4292682926829268, - "grad_norm": 2.5731301307678223, - "learning_rate": 4.977813074938128e-06, - "loss": 0.8835, - "step": 88 - }, - { - "epoch": 0.43414634146341463, - "grad_norm": 2.7914488315582275, - "learning_rate": 4.977300877065347e-06, - "loss": 0.8466, - "step": 89 - }, - { - "epoch": 0.43902439024390244, - "grad_norm": 2.416043758392334, - "learning_rate": 4.976782861233053e-06, - "loss": 0.7132, - "step": 90 - }, - { - "epoch": 0.44390243902439025, - "grad_norm": 3.7616264820098877, - "learning_rate": 4.976259028657812e-06, - "loss": 0.7639, - "step": 91 - }, - { - "epoch": 0.44878048780487806, - "grad_norm": 2.6081621646881104, - "learning_rate": 4.975729380569845e-06, - "loss": 0.8055, - "step": 92 - }, - { - "epoch": 0.45365853658536587, - "grad_norm": 3.3343570232391357, - "learning_rate": 4.975193918213035e-06, - "loss": 0.6042, - "step": 93 - }, - { - "epoch": 0.4585365853658537, - "grad_norm": 2.517544746398926, - "learning_rate": 4.974652642844921e-06, - "loss": 0.7672, - "step": 94 - }, - { - "epoch": 0.4634146341463415, - "grad_norm": 4.173468589782715, - "learning_rate": 4.974105555736693e-06, - "loss": 1.0682, - "step": 95 - }, - { - "epoch": 0.4682926829268293, - "grad_norm": 2.8422317504882812, - "learning_rate": 4.973552658173186e-06, - "loss": 0.7841, - "step": 96 - }, - { - "epoch": 0.47317073170731705, - "grad_norm": 5.042182445526123, - "learning_rate": 4.972993951452887e-06, - "loss": 0.8851, - "step": 97 - }, - { - "epoch": 0.47804878048780486, - "grad_norm": 5.977590560913086, - "learning_rate": 4.9724294368879214e-06, - "loss": 0.9059, - "step": 98 - }, - { - "epoch": 0.48292682926829267, - "grad_norm": 4.227641582489014, - "learning_rate": 4.971859115804055e-06, - "loss": 1.0152, - "step": 99 - }, - { - "epoch": 0.4878048780487805, - "grad_norm": 3.180952548980713, - "learning_rate": 4.9712829895406935e-06, - "loss": 0.8092, - "step": 100 - }, - { - "epoch": 0.4926829268292683, - "grad_norm": 11.220394134521484, - "learning_rate": 4.970701059450872e-06, - "loss": 0.8239, - "step": 101 - }, - { - "epoch": 0.4975609756097561, - "grad_norm": 2.346975088119507, - "learning_rate": 4.970113326901258e-06, - "loss": 0.9283, - "step": 102 - }, - { - "epoch": 0.5024390243902439, - "grad_norm": 2.9470982551574707, - "learning_rate": 4.9695197932721455e-06, - "loss": 0.9429, - "step": 103 - }, - { - "epoch": 0.5073170731707317, - "grad_norm": 3.6048219203948975, - "learning_rate": 4.968920459957453e-06, - "loss": 0.9231, - "step": 104 - }, - { - "epoch": 0.5121951219512195, - "grad_norm": 2.8181886672973633, - "learning_rate": 4.968315328364719e-06, - "loss": 1.0005, - "step": 105 - }, - { - "epoch": 0.5170731707317073, - "grad_norm": 3.114147424697876, - "learning_rate": 4.9677043999151e-06, - "loss": 1.1326, - "step": 106 - }, - { - "epoch": 0.5219512195121951, - "grad_norm": 2.965885639190674, - "learning_rate": 4.967087676043366e-06, - "loss": 0.541, - "step": 107 - }, - { - "epoch": 0.526829268292683, - "grad_norm": 3.098677635192871, - "learning_rate": 4.966465158197897e-06, - "loss": 0.9473, - "step": 108 - }, - { - "epoch": 0.5317073170731708, - "grad_norm": 2.8640191555023193, - "learning_rate": 4.965836847840681e-06, - "loss": 0.6678, - "step": 109 - }, - { - "epoch": 0.5365853658536586, - "grad_norm": 3.0950934886932373, - "learning_rate": 4.96520274644731e-06, - "loss": 0.9251, - "step": 110 - }, - { - "epoch": 0.5414634146341464, - "grad_norm": 2.99444317817688, - "learning_rate": 4.964562855506976e-06, - "loss": 0.7807, - "step": 111 - }, - { - "epoch": 0.5463414634146342, - "grad_norm": 2.348639726638794, - "learning_rate": 4.963917176522466e-06, - "loss": 0.6395, - "step": 112 - }, - { - "epoch": 0.551219512195122, - "grad_norm": 3.5988354682922363, - "learning_rate": 4.963265711010164e-06, - "loss": 1.0658, - "step": 113 - }, - { - "epoch": 0.5560975609756098, - "grad_norm": 3.3423564434051514, - "learning_rate": 4.9626084605000395e-06, - "loss": 0.8974, - "step": 114 - }, - { - "epoch": 0.5609756097560976, - "grad_norm": 2.8353331089019775, - "learning_rate": 4.961945426535652e-06, - "loss": 0.6144, - "step": 115 - }, - { - "epoch": 0.5658536585365853, - "grad_norm": 2.752387046813965, - "learning_rate": 4.961276610674141e-06, - "loss": 0.9083, - "step": 116 - }, - { - "epoch": 0.5707317073170731, - "grad_norm": 2.2654404640197754, - "learning_rate": 4.960602014486225e-06, - "loss": 1.0101, - "step": 117 - }, - { - "epoch": 0.5756097560975609, - "grad_norm": 3.344377040863037, - "learning_rate": 4.959921639556199e-06, - "loss": 0.8391, - "step": 118 - }, - { - "epoch": 0.5804878048780487, - "grad_norm": 3.1620500087738037, - "learning_rate": 4.959235487481928e-06, - "loss": 1.0431, - "step": 119 - }, - { - "epoch": 0.5853658536585366, - "grad_norm": 2.857048273086548, - "learning_rate": 4.958543559874846e-06, - "loss": 0.5864, - "step": 120 - }, - { - "epoch": 0.5902439024390244, - "grad_norm": 3.1736063957214355, - "learning_rate": 4.9578458583599495e-06, - "loss": 0.7868, - "step": 121 - }, - { - "epoch": 0.5951219512195122, - "grad_norm": 3.5520827770233154, - "learning_rate": 4.957142384575795e-06, - "loss": 0.7901, - "step": 122 - }, - { - "epoch": 0.6, - "grad_norm": 3.265103578567505, - "learning_rate": 4.956433140174498e-06, - "loss": 0.9067, - "step": 123 - }, - { - "epoch": 0.6048780487804878, - "grad_norm": 3.1181187629699707, - "learning_rate": 4.9557181268217225e-06, - "loss": 0.8971, - "step": 124 - }, - { - "epoch": 0.6097560975609756, - "grad_norm": 2.4123694896698, - "learning_rate": 4.954997346196683e-06, - "loss": 1.2123, - "step": 125 - }, - { - "epoch": 0.6146341463414634, - "grad_norm": 2.9646875858306885, - "learning_rate": 4.954270799992138e-06, - "loss": 0.7696, - "step": 126 - }, - { - "epoch": 0.6195121951219512, - "grad_norm": 2.7457995414733887, - "learning_rate": 4.953538489914387e-06, - "loss": 0.7919, - "step": 127 - }, - { - "epoch": 0.624390243902439, - "grad_norm": 5.096850395202637, - "learning_rate": 4.9528004176832654e-06, - "loss": 0.6494, - "step": 128 - }, - { - "epoch": 0.6292682926829268, - "grad_norm": 3.124955177307129, - "learning_rate": 4.952056585032142e-06, - "loss": 1.0546, - "step": 129 - }, - { - "epoch": 0.6341463414634146, - "grad_norm": 2.4860167503356934, - "learning_rate": 4.951306993707913e-06, - "loss": 0.7907, - "step": 130 - }, - { - "epoch": 0.6390243902439025, - "grad_norm": 2.3380239009857178, - "learning_rate": 4.950551645470998e-06, - "loss": 0.7433, - "step": 131 - }, - { - "epoch": 0.6439024390243903, - "grad_norm": 2.8945236206054688, - "learning_rate": 4.9497905420953406e-06, - "loss": 0.7682, - "step": 132 - }, - { - "epoch": 0.6487804878048781, - "grad_norm": 3.429776430130005, - "learning_rate": 4.949023685368395e-06, - "loss": 0.8411, - "step": 133 - }, - { - "epoch": 0.6536585365853659, - "grad_norm": 2.8853516578674316, - "learning_rate": 4.948251077091131e-06, - "loss": 1.0792, - "step": 134 - }, - { - "epoch": 0.6585365853658537, - "grad_norm": 2.145598888397217, - "learning_rate": 4.947472719078025e-06, - "loss": 0.8033, - "step": 135 - }, - { - "epoch": 0.6634146341463415, - "grad_norm": 2.5064377784729004, - "learning_rate": 4.9466886131570565e-06, - "loss": 0.939, - "step": 136 - }, - { - "epoch": 0.6682926829268293, - "grad_norm": 2.5700225830078125, - "learning_rate": 4.945898761169704e-06, - "loss": 1.0418, - "step": 137 - }, - { - "epoch": 0.6731707317073171, - "grad_norm": 2.3390917778015137, - "learning_rate": 4.945103164970941e-06, - "loss": 0.6158, - "step": 138 - }, - { - "epoch": 0.6780487804878049, - "grad_norm": 2.1538751125335693, - "learning_rate": 4.9443018264292304e-06, - "loss": 0.6995, - "step": 139 - }, - { - "epoch": 0.6829268292682927, - "grad_norm": 5.255710601806641, - "learning_rate": 4.9434947474265225e-06, - "loss": 1.0382, - "step": 140 - }, - { - "epoch": 0.6878048780487804, - "grad_norm": 2.5547356605529785, - "learning_rate": 4.942681929858249e-06, - "loss": 1.037, - "step": 141 - }, - { - "epoch": 0.6926829268292682, - "grad_norm": 2.613280773162842, - "learning_rate": 4.941863375633315e-06, - "loss": 0.9071, - "step": 142 - }, - { - "epoch": 0.697560975609756, - "grad_norm": 2.9957327842712402, - "learning_rate": 4.9410390866741056e-06, - "loss": 0.7908, - "step": 143 - }, - { - "epoch": 0.7024390243902439, - "grad_norm": 2.410107374191284, - "learning_rate": 4.9402090649164655e-06, - "loss": 0.7739, - "step": 144 - }, - { - "epoch": 0.7073170731707317, - "grad_norm": 2.352013349533081, - "learning_rate": 4.9393733123097085e-06, - "loss": 0.939, - "step": 145 - }, - { - "epoch": 0.7121951219512195, - "grad_norm": 2.5164194107055664, - "learning_rate": 4.9385318308166065e-06, - "loss": 0.8729, - "step": 146 - }, - { - "epoch": 0.7170731707317073, - "grad_norm": 4.213881015777588, - "learning_rate": 4.937684622413385e-06, - "loss": 0.6124, - "step": 147 - }, - { - "epoch": 0.7219512195121951, - "grad_norm": 2.7950191497802734, - "learning_rate": 4.9368316890897185e-06, - "loss": 0.975, - "step": 148 - }, - { - "epoch": 0.7268292682926829, - "grad_norm": 2.8618874549865723, - "learning_rate": 4.9359730328487264e-06, - "loss": 0.5832, - "step": 149 - }, - { - "epoch": 0.7317073170731707, - "grad_norm": 2.6943812370300293, - "learning_rate": 4.935108655706972e-06, - "loss": 0.8124, - "step": 150 - }, - { - "epoch": 0.7365853658536585, - "grad_norm": 3.2164082527160645, - "learning_rate": 4.934238559694448e-06, - "loss": 1.1446, - "step": 151 - }, - { - "epoch": 0.7414634146341463, - "grad_norm": 3.05002498626709, - "learning_rate": 4.9333627468545845e-06, - "loss": 0.7884, - "step": 152 - }, - { - "epoch": 0.7463414634146341, - "grad_norm": 2.863351583480835, - "learning_rate": 4.932481219244231e-06, - "loss": 0.7918, - "step": 153 - }, - { - "epoch": 0.751219512195122, - "grad_norm": 2.4947102069854736, - "learning_rate": 4.931593978933666e-06, - "loss": 0.775, - "step": 154 - }, - { - "epoch": 0.7560975609756098, - "grad_norm": 2.918886184692383, - "learning_rate": 4.930701028006577e-06, - "loss": 0.993, - "step": 155 - }, - { - "epoch": 0.7609756097560976, - "grad_norm": 2.835956573486328, - "learning_rate": 4.929802368560066e-06, - "loss": 0.7911, - "step": 156 - }, - { - "epoch": 0.7658536585365854, - "grad_norm": 3.3073575496673584, - "learning_rate": 4.928898002704642e-06, - "loss": 0.9346, - "step": 157 - }, - { - "epoch": 0.7707317073170732, - "grad_norm": 3.086146354675293, - "learning_rate": 4.927987932564215e-06, - "loss": 0.817, - "step": 158 - }, - { - "epoch": 0.775609756097561, - "grad_norm": 2.5419743061065674, - "learning_rate": 4.927072160276092e-06, - "loss": 0.7918, - "step": 159 - }, - { - "epoch": 0.7804878048780488, - "grad_norm": 3.984297275543213, - "learning_rate": 4.926150687990969e-06, - "loss": 0.7153, - "step": 160 - }, - { - "epoch": 0.7853658536585366, - "grad_norm": 2.4703335762023926, - "learning_rate": 4.925223517872934e-06, - "loss": 0.8982, - "step": 161 - }, - { - "epoch": 0.7902439024390244, - "grad_norm": 2.81785249710083, - "learning_rate": 4.9242906520994484e-06, - "loss": 0.9839, - "step": 162 - }, - { - "epoch": 0.7951219512195122, - "grad_norm": 2.3304924964904785, - "learning_rate": 4.923352092861358e-06, - "loss": 0.8406, - "step": 163 - }, - { - "epoch": 0.8, - "grad_norm": 2.339498519897461, - "learning_rate": 4.922407842362875e-06, - "loss": 0.6602, - "step": 164 - }, - { - "epoch": 0.8048780487804879, - "grad_norm": 3.488255262374878, - "learning_rate": 4.921457902821578e-06, - "loss": 0.9779, - "step": 165 - }, - { - "epoch": 0.8097560975609757, - "grad_norm": 2.8528945446014404, - "learning_rate": 4.920502276468408e-06, - "loss": 0.8821, - "step": 166 - }, - { - "epoch": 0.8146341463414634, - "grad_norm": 3.4649784564971924, - "learning_rate": 4.9195409655476605e-06, - "loss": 0.7539, - "step": 167 - }, - { - "epoch": 0.8195121951219512, - "grad_norm": 2.3109042644500732, - "learning_rate": 4.918573972316982e-06, - "loss": 0.9807, - "step": 168 - }, - { - "epoch": 0.824390243902439, - "grad_norm": 2.678666353225708, - "learning_rate": 4.917601299047361e-06, - "loss": 0.8318, - "step": 169 - }, - { - "epoch": 0.8292682926829268, - "grad_norm": 2.730614185333252, - "learning_rate": 4.916622948023129e-06, - "loss": 0.7816, - "step": 170 - }, - { - "epoch": 0.8341463414634146, - "grad_norm": 2.9835665225982666, - "learning_rate": 4.915638921541952e-06, - "loss": 0.6633, - "step": 171 - }, - { - "epoch": 0.8390243902439024, - "grad_norm": 3.31217360496521, - "learning_rate": 4.914649221914822e-06, - "loss": 0.9296, - "step": 172 - }, - { - "epoch": 0.8439024390243902, - "grad_norm": 2.9021658897399902, - "learning_rate": 4.913653851466057e-06, - "loss": 0.6864, - "step": 173 - }, - { - "epoch": 0.848780487804878, - "grad_norm": 3.3672914505004883, - "learning_rate": 4.912652812533291e-06, - "loss": 0.8599, - "step": 174 - }, - { - "epoch": 0.8536585365853658, - "grad_norm": 2.4871644973754883, - "learning_rate": 4.911646107467472e-06, - "loss": 0.8949, - "step": 175 - }, - { - "epoch": 0.8585365853658536, - "grad_norm": 2.728022813796997, - "learning_rate": 4.9106337386328524e-06, - "loss": 0.9758, - "step": 176 - }, - { - "epoch": 0.8634146341463415, - "grad_norm": 2.704252243041992, - "learning_rate": 4.909615708406991e-06, - "loss": 0.8954, - "step": 177 - }, - { - "epoch": 0.8682926829268293, - "grad_norm": 2.4002223014831543, - "learning_rate": 4.908592019180738e-06, - "loss": 0.7157, - "step": 178 - }, - { - "epoch": 0.8731707317073171, - "grad_norm": 2.1927788257598877, - "learning_rate": 4.907562673358234e-06, - "loss": 0.6358, - "step": 179 - }, - { - "epoch": 0.8780487804878049, - "grad_norm": 2.458500623703003, - "learning_rate": 4.906527673356907e-06, - "loss": 0.6685, - "step": 180 - }, - { - "epoch": 0.8829268292682927, - "grad_norm": 2.5924787521362305, - "learning_rate": 4.905487021607462e-06, - "loss": 0.5686, - "step": 181 - }, - { - "epoch": 0.8878048780487805, - "grad_norm": 3.0923380851745605, - "learning_rate": 4.904440720553876e-06, - "loss": 0.8538, - "step": 182 - }, - { - "epoch": 0.8926829268292683, - "grad_norm": 2.8001527786254883, - "learning_rate": 4.903388772653396e-06, - "loss": 0.8292, - "step": 183 - }, - { - "epoch": 0.8975609756097561, - "grad_norm": 2.4344072341918945, - "learning_rate": 4.902331180376529e-06, - "loss": 0.7946, - "step": 184 - }, - { - "epoch": 0.9024390243902439, - "grad_norm": 2.6313226222991943, - "learning_rate": 4.901267946207038e-06, - "loss": 0.9269, - "step": 185 - }, - { - "epoch": 0.9073170731707317, - "grad_norm": 2.4776692390441895, - "learning_rate": 4.900199072641937e-06, - "loss": 0.7433, - "step": 186 - }, - { - "epoch": 0.9121951219512195, - "grad_norm": 2.339869260787964, - "learning_rate": 4.899124562191484e-06, - "loss": 0.6577, - "step": 187 - }, - { - "epoch": 0.9170731707317074, - "grad_norm": 3.076890468597412, - "learning_rate": 4.8980444173791735e-06, - "loss": 0.5989, - "step": 188 - }, - { - "epoch": 0.9219512195121952, - "grad_norm": 2.83957839012146, - "learning_rate": 4.896958640741735e-06, - "loss": 0.9364, - "step": 189 - }, - { - "epoch": 0.926829268292683, - "grad_norm": 2.770867347717285, - "learning_rate": 4.895867234829121e-06, - "loss": 1.0328, - "step": 190 - }, - { - "epoch": 0.9317073170731708, - "grad_norm": 2.7819619178771973, - "learning_rate": 4.894770202204509e-06, - "loss": 0.772, - "step": 191 - }, - { - "epoch": 0.9365853658536586, - "grad_norm": 3.925703763961792, - "learning_rate": 4.893667545444285e-06, - "loss": 0.8128, - "step": 192 - }, - { - "epoch": 0.9414634146341463, - "grad_norm": 3.034944534301758, - "learning_rate": 4.8925592671380495e-06, - "loss": 0.7418, - "step": 193 - }, - { - "epoch": 0.9463414634146341, - "grad_norm": 2.3350143432617188, - "learning_rate": 4.891445369888601e-06, - "loss": 0.5979, - "step": 194 - }, - { - "epoch": 0.9512195121951219, - "grad_norm": 2.6433160305023193, - "learning_rate": 4.890325856311936e-06, - "loss": 0.9664, - "step": 195 - }, - { - "epoch": 0.9560975609756097, - "grad_norm": 2.715142011642456, - "learning_rate": 4.889200729037241e-06, - "loss": 0.8482, - "step": 196 - }, - { - "epoch": 0.9609756097560975, - "grad_norm": 2.6157352924346924, - "learning_rate": 4.888069990706884e-06, - "loss": 0.7173, - "step": 197 - }, - { - "epoch": 0.9658536585365853, - "grad_norm": 3.7308952808380127, - "learning_rate": 4.886933643976414e-06, - "loss": 0.5433, - "step": 198 - }, - { - "epoch": 0.9707317073170731, - "grad_norm": 3.1134045124053955, - "learning_rate": 4.885791691514548e-06, - "loss": 0.5997, - "step": 199 - }, - { - "epoch": 0.975609756097561, - "grad_norm": 2.421365976333618, - "learning_rate": 4.884644136003172e-06, - "loss": 0.6477, - "step": 200 - }, - { - "epoch": 0.9804878048780488, - "grad_norm": 2.8676180839538574, - "learning_rate": 4.883490980137327e-06, - "loss": 1.3465, - "step": 201 - }, - { - "epoch": 0.9853658536585366, - "grad_norm": 2.236189603805542, - "learning_rate": 4.882332226625208e-06, - "loss": 0.7533, - "step": 202 - }, - { - "epoch": 0.9902439024390244, - "grad_norm": 2.2514970302581787, - "learning_rate": 4.881167878188158e-06, - "loss": 0.8555, - "step": 203 - }, - { - "epoch": 0.9951219512195122, - "grad_norm": 2.6856095790863037, - "learning_rate": 4.8799979375606565e-06, - "loss": 0.7634, - "step": 204 - }, - { - "epoch": 1.0, - "grad_norm": 2.5563852787017822, - "learning_rate": 4.878822407490319e-06, - "loss": 0.66, - "step": 205 - } - ], - "logging_steps": 1, - "max_steps": 2050, - "num_input_tokens_seen": 0, - "num_train_epochs": 10, - "save_steps": 206, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 5.897858756221338e+16, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/metallama3_8b/limo/checkpoint-2050/chat_template.jinja b/metallama3_8b/limo/checkpoint-2050/chat_template.jinja deleted file mode 100644 index 39bd0c9f7fe30aea14eda194fee17703da4a4dbf..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-2050/chat_template.jinja +++ /dev/null @@ -1,5 +0,0 @@ -{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> - -'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> - -' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo/checkpoint-2050/config.json b/metallama3_8b/limo/checkpoint-2050/config.json deleted file mode 100644 index ec5612543540085e09eed37e81b17ae51d1a6973..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-2050/config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 128000, - "eos_token_id": 128009, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 8192, - "mlp_bias": false, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 500000.0, - "tie_word_embeddings": false, - "torch_dtype": "float32", - "transformers_version": "4.55.0", - "use_cache": false, - "vocab_size": 128256 -} diff --git a/metallama3_8b/limo/checkpoint-2050/generation_config.json b/metallama3_8b/limo/checkpoint-2050/generation_config.json deleted file mode 100644 index f53ccb516e57388491adda6b9950bcfa872e93ae..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-2050/generation_config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 128000, - "eos_token_id": 128009, - "transformers_version": "4.55.0", - "use_cache": false -} diff --git a/metallama3_8b/limo/checkpoint-2050/model-00001-of-00007.safetensors b/metallama3_8b/limo/checkpoint-2050/model-00001-of-00007.safetensors deleted file mode 100644 index 4d5a84d922df222c932eac589c15ea3357de75fe..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-2050/model-00001-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:225412b1113ad13202cf4125509a242705ca9fc3d2662195a561bd8e48fd4db8 -size 4886466168 diff --git a/metallama3_8b/limo/checkpoint-2050/model-00002-of-00007.safetensors b/metallama3_8b/limo/checkpoint-2050/model-00002-of-00007.safetensors deleted file mode 100644 index 1c192eeae2946f3f0c35b0a90bf16005c749f19d..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-2050/model-00002-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:14e92e96e88dfee47e5a50580659bc951b35026bf2889984fa38d1923c957533 -size 4832007448 diff --git a/metallama3_8b/limo/checkpoint-2050/model-00003-of-00007.safetensors b/metallama3_8b/limo/checkpoint-2050/model-00003-of-00007.safetensors deleted file mode 100644 index f8e19a4384cdbd25ddec25cf62d2429514cccbd3..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-2050/model-00003-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:00617c2642eaec9556936462f59084becdfd958740be2fff056bc36201ae2c12 -size 4999813112 diff --git a/metallama3_8b/limo/checkpoint-2050/model-00004-of-00007.safetensors b/metallama3_8b/limo/checkpoint-2050/model-00004-of-00007.safetensors deleted file mode 100644 index 61e62a8f5bafd115a90a0dba23a535050307bde6..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-2050/model-00004-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ec16549b532ec841a3b942995c41e00a57fa5ca1c365037d63ac498736687686 -size 4999813128 diff --git a/metallama3_8b/limo/checkpoint-2050/model-00005-of-00007.safetensors b/metallama3_8b/limo/checkpoint-2050/model-00005-of-00007.safetensors deleted file mode 100644 index a98f8be8e21c31b18c87631a894ac368a70b96e8..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-2050/model-00005-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e1cc4772012f82ad7210046779c85e46d634c9fd6cd53880731c9606da82dd82 -size 4832007496 diff --git a/metallama3_8b/limo/checkpoint-2050/model-00006-of-00007.safetensors b/metallama3_8b/limo/checkpoint-2050/model-00006-of-00007.safetensors deleted file mode 100644 index a7fae9a03ca054ca2b90b9983337c88ecb5200a2..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-2050/model-00006-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dcfdf27103fdd59e237428fc57d57a5169f89c221728466484129a6537745c05 -size 4999813120 diff --git a/metallama3_8b/limo/checkpoint-2050/model-00007-of-00007.safetensors b/metallama3_8b/limo/checkpoint-2050/model-00007-of-00007.safetensors deleted file mode 100644 index c6674238c889867f82322311f8d2604e1025fd39..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-2050/model-00007-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7d9d40d45c5af709cc2a0dbdf405837ae0742d132527467ef97ab7210d5ffeff -size 2571158184 diff --git a/metallama3_8b/limo/checkpoint-2050/model.safetensors.index.json b/metallama3_8b/limo/checkpoint-2050/model.safetensors.index.json deleted file mode 100644 index 30d31d54f352f0c71ad48745af612a088822fa48..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-2050/model.safetensors.index.json +++ /dev/null @@ -1,299 +0,0 @@ -{ - "metadata": { - "total_parameters": 2007565312, - "total_size": 32121044992 - }, - "weight_map": { - "lm_head.weight": "model-00007-of-00007.safetensors", - "model.embed_tokens.weight": "model-00001-of-00007.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.norm.weight": "model-00007-of-00007.safetensors" - } -} diff --git a/metallama3_8b/limo/checkpoint-2050/rng_state_0.pth b/metallama3_8b/limo/checkpoint-2050/rng_state_0.pth deleted file mode 100644 index 0c73dd943c40497990387f5f3dacb08ddd27a929..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-2050/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7dbc6521b0b64cb12d818506108fcf257a4089ca8a9b1e453776ed3e032e7176 -size 15024 diff --git a/metallama3_8b/limo/checkpoint-2050/rng_state_1.pth b/metallama3_8b/limo/checkpoint-2050/rng_state_1.pth deleted file mode 100644 index f57618444fac32f854b52c01ec2e258a65bc4d96..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-2050/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2b13e3da1b0679cab1bab94f893e385a9a224d3335b5a6f62602f33c2be88d03 -size 15024 diff --git a/metallama3_8b/limo/checkpoint-2050/rng_state_2.pth b/metallama3_8b/limo/checkpoint-2050/rng_state_2.pth deleted file mode 100644 index fb07e7b31bd4b60cb7c279157d2ffa4f268cb36a..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-2050/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6a24f0e0f117b5a8236e0d12594c0c358f41ef00068d4460002e95ad1cc3cb1c -size 15024 diff --git a/metallama3_8b/limo/checkpoint-2050/rng_state_3.pth b/metallama3_8b/limo/checkpoint-2050/rng_state_3.pth deleted file mode 100644 index 87c3a97e34da9032aeacc1dafd124d92425dabdd..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-2050/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e46e4eab6c4a25d84ad36ddf1357401788adeeb6388c03cefa35a63b52ee7610 -size 15024 diff --git a/metallama3_8b/limo/checkpoint-2050/scheduler.pt b/metallama3_8b/limo/checkpoint-2050/scheduler.pt deleted file mode 100644 index b33ba118b3d4dbe4e67c3dafc5d67af3c0024443..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-2050/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:74173b9069cd21cf11a7f4a11dd8d923de64a413f319511e932793e4cbe5533f -size 1064 diff --git a/metallama3_8b/limo/checkpoint-2050/special_tokens_map.json b/metallama3_8b/limo/checkpoint-2050/special_tokens_map.json deleted file mode 100644 index 14daf4588e61b4e4983af0fccaba4d5500c0977c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-2050/special_tokens_map.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "additional_special_tokens": [ - { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } - ], - "bos_token": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": "<|eot_id|>" -} diff --git a/metallama3_8b/limo/checkpoint-2050/tokenizer.json b/metallama3_8b/limo/checkpoint-2050/tokenizer.json deleted file mode 100644 index 172311123ab62378f1f6d90f3068a676b7d939ed..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-2050/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c1dcab308e7cf5970ea38815e0a62887d705c5b436f869ca27a5dcdd40c36a6 -size 17210148 diff --git a/metallama3_8b/limo/checkpoint-2050/tokenizer_config.json b/metallama3_8b/limo/checkpoint-2050/tokenizer_config.json deleted file mode 100644 index 6739fcd129e717b71b64001dcb25a03c143d66f5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-2050/tokenizer_config.json +++ /dev/null @@ -1,2076 +0,0 @@ -{ - "added_tokens_decoder": { - "128000": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128001": { - "content": "<|end_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128002": { - "content": "<|reserved_special_token_0|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128003": { - "content": "<|reserved_special_token_1|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128004": { - "content": "<|reserved_special_token_2|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128005": { - "content": "<|reserved_special_token_3|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128006": { - "content": "<|start_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128007": { - "content": "<|end_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128008": { - "content": "<|reserved_special_token_4|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128009": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128010": { - "content": "<|reserved_special_token_5|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128011": { - "content": "<|reserved_special_token_6|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128012": { - "content": "<|reserved_special_token_7|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128013": { - "content": "<|reserved_special_token_8|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128014": { - "content": "<|reserved_special_token_9|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128015": { - "content": "<|reserved_special_token_10|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128016": { - "content": "<|reserved_special_token_11|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128017": { - "content": "<|reserved_special_token_12|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128018": { - "content": "<|reserved_special_token_13|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128019": { - "content": "<|reserved_special_token_14|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128020": { - "content": "<|reserved_special_token_15|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128021": { - "content": "<|reserved_special_token_16|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128022": { - "content": "<|reserved_special_token_17|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128023": { - "content": "<|reserved_special_token_18|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128024": { - "content": "<|reserved_special_token_19|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128025": { - "content": "<|reserved_special_token_20|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128026": { - "content": "<|reserved_special_token_21|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128027": { - "content": "<|reserved_special_token_22|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128028": { - "content": "<|reserved_special_token_23|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128029": { - "content": "<|reserved_special_token_24|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128030": { - "content": "<|reserved_special_token_25|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128031": { - "content": "<|reserved_special_token_26|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128032": { - "content": "<|reserved_special_token_27|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128033": { - "content": "<|reserved_special_token_28|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128034": { - "content": "<|reserved_special_token_29|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128035": { - "content": "<|reserved_special_token_30|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128036": { - "content": "<|reserved_special_token_31|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128037": { - "content": "<|reserved_special_token_32|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128038": { - "content": "<|reserved_special_token_33|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128039": { - "content": "<|reserved_special_token_34|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128040": { - "content": "<|reserved_special_token_35|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128041": { - "content": "<|reserved_special_token_36|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128042": { - "content": "<|reserved_special_token_37|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128043": { - "content": "<|reserved_special_token_38|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128044": { - "content": "<|reserved_special_token_39|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128045": { - "content": "<|reserved_special_token_40|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128046": { - "content": "<|reserved_special_token_41|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128047": { - "content": "<|reserved_special_token_42|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128048": { - "content": "<|reserved_special_token_43|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128049": { - "content": "<|reserved_special_token_44|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128050": { - "content": "<|reserved_special_token_45|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128051": { - "content": "<|reserved_special_token_46|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128052": { - "content": "<|reserved_special_token_47|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128053": { - "content": "<|reserved_special_token_48|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128054": { - "content": "<|reserved_special_token_49|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128055": { - "content": "<|reserved_special_token_50|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128056": { - "content": "<|reserved_special_token_51|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128057": { - "content": "<|reserved_special_token_52|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128058": { - "content": "<|reserved_special_token_53|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128059": { - "content": "<|reserved_special_token_54|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128060": { - "content": "<|reserved_special_token_55|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128061": { - "content": "<|reserved_special_token_56|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128062": { - "content": "<|reserved_special_token_57|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128063": { - "content": "<|reserved_special_token_58|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128064": { - "content": "<|reserved_special_token_59|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128065": { - "content": "<|reserved_special_token_60|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128066": { - "content": "<|reserved_special_token_61|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128067": { - "content": "<|reserved_special_token_62|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128068": { - "content": "<|reserved_special_token_63|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128069": { - "content": "<|reserved_special_token_64|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128070": { - "content": "<|reserved_special_token_65|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128071": { - "content": "<|reserved_special_token_66|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128072": { - "content": "<|reserved_special_token_67|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128073": { - "content": "<|reserved_special_token_68|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128074": { - "content": "<|reserved_special_token_69|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128075": { - "content": "<|reserved_special_token_70|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128076": { - "content": "<|reserved_special_token_71|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128077": { - "content": "<|reserved_special_token_72|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128078": { - "content": "<|reserved_special_token_73|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128079": { - "content": "<|reserved_special_token_74|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128080": { - "content": "<|reserved_special_token_75|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128081": { - "content": "<|reserved_special_token_76|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128082": { - "content": "<|reserved_special_token_77|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128083": { - "content": "<|reserved_special_token_78|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128084": { - "content": "<|reserved_special_token_79|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128085": { - "content": "<|reserved_special_token_80|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128086": { - "content": "<|reserved_special_token_81|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128087": { - "content": "<|reserved_special_token_82|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128088": { - "content": "<|reserved_special_token_83|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128089": { - "content": "<|reserved_special_token_84|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128090": { - "content": "<|reserved_special_token_85|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128091": { - "content": "<|reserved_special_token_86|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128092": { - "content": "<|reserved_special_token_87|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128093": { - "content": "<|reserved_special_token_88|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128094": { - "content": "<|reserved_special_token_89|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128095": { - "content": "<|reserved_special_token_90|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128096": { - "content": "<|reserved_special_token_91|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128097": { - "content": "<|reserved_special_token_92|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128098": { - "content": "<|reserved_special_token_93|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128099": { - "content": "<|reserved_special_token_94|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128100": { - "content": "<|reserved_special_token_95|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128101": { - "content": "<|reserved_special_token_96|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128102": { - "content": "<|reserved_special_token_97|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128103": { - "content": "<|reserved_special_token_98|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128104": { - "content": "<|reserved_special_token_99|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128105": { - "content": "<|reserved_special_token_100|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128106": { - "content": "<|reserved_special_token_101|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128107": { - "content": "<|reserved_special_token_102|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128108": { - "content": "<|reserved_special_token_103|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128109": { - "content": "<|reserved_special_token_104|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128110": { - "content": "<|reserved_special_token_105|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128111": { - "content": "<|reserved_special_token_106|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128112": { - "content": "<|reserved_special_token_107|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128113": { - "content": "<|reserved_special_token_108|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128114": { - "content": "<|reserved_special_token_109|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128115": { - "content": "<|reserved_special_token_110|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128116": { - "content": "<|reserved_special_token_111|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128117": { - "content": "<|reserved_special_token_112|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128118": { - "content": "<|reserved_special_token_113|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128119": { - "content": "<|reserved_special_token_114|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128120": { - "content": "<|reserved_special_token_115|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128121": { - "content": "<|reserved_special_token_116|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128122": { - "content": "<|reserved_special_token_117|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128123": { - "content": "<|reserved_special_token_118|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128124": { - "content": "<|reserved_special_token_119|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128125": { - "content": "<|reserved_special_token_120|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128126": { - "content": "<|reserved_special_token_121|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128127": { - "content": "<|reserved_special_token_122|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128128": { - "content": "<|reserved_special_token_123|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128129": { - "content": "<|reserved_special_token_124|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128130": { - "content": "<|reserved_special_token_125|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128131": { - "content": "<|reserved_special_token_126|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128132": { - "content": "<|reserved_special_token_127|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128133": { - "content": "<|reserved_special_token_128|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128134": { - "content": "<|reserved_special_token_129|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128135": { - "content": "<|reserved_special_token_130|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128136": { - "content": "<|reserved_special_token_131|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128137": { - "content": "<|reserved_special_token_132|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128138": { - "content": "<|reserved_special_token_133|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128139": { - "content": "<|reserved_special_token_134|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128140": { - "content": "<|reserved_special_token_135|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128141": { - "content": "<|reserved_special_token_136|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128142": { - "content": "<|reserved_special_token_137|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128143": { - "content": "<|reserved_special_token_138|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128144": { - "content": "<|reserved_special_token_139|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128145": { - "content": "<|reserved_special_token_140|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128146": { - "content": "<|reserved_special_token_141|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128147": { - "content": "<|reserved_special_token_142|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128148": { - "content": "<|reserved_special_token_143|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128149": { - "content": "<|reserved_special_token_144|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128150": { - "content": "<|reserved_special_token_145|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128151": { - "content": "<|reserved_special_token_146|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128152": { - "content": "<|reserved_special_token_147|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128153": { - "content": "<|reserved_special_token_148|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128154": { - "content": "<|reserved_special_token_149|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128155": { - "content": "<|reserved_special_token_150|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128156": { - "content": "<|reserved_special_token_151|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128157": { - "content": "<|reserved_special_token_152|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128158": { - "content": "<|reserved_special_token_153|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128159": { - "content": "<|reserved_special_token_154|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128160": { - "content": "<|reserved_special_token_155|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128161": { - "content": "<|reserved_special_token_156|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128162": { - "content": "<|reserved_special_token_157|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128163": { - "content": "<|reserved_special_token_158|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128164": { - "content": "<|reserved_special_token_159|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128165": { - "content": "<|reserved_special_token_160|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128166": { - "content": "<|reserved_special_token_161|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128167": { - "content": "<|reserved_special_token_162|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128168": { - "content": "<|reserved_special_token_163|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128169": { - "content": "<|reserved_special_token_164|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128170": { - "content": "<|reserved_special_token_165|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128171": { - "content": "<|reserved_special_token_166|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128172": { - "content": "<|reserved_special_token_167|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128173": { - "content": "<|reserved_special_token_168|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128174": { - "content": "<|reserved_special_token_169|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128175": { - "content": "<|reserved_special_token_170|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128176": { - "content": "<|reserved_special_token_171|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128177": { - "content": "<|reserved_special_token_172|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128178": { - "content": "<|reserved_special_token_173|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128179": { - "content": "<|reserved_special_token_174|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128180": { - "content": "<|reserved_special_token_175|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128181": { - "content": "<|reserved_special_token_176|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128182": { - "content": "<|reserved_special_token_177|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128183": { - "content": "<|reserved_special_token_178|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128184": { - "content": "<|reserved_special_token_179|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128185": { - "content": "<|reserved_special_token_180|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128186": { - "content": "<|reserved_special_token_181|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128187": { - "content": "<|reserved_special_token_182|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128188": { - "content": "<|reserved_special_token_183|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128189": { - "content": "<|reserved_special_token_184|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128190": { - "content": "<|reserved_special_token_185|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128191": { - "content": "<|reserved_special_token_186|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128192": { - "content": "<|reserved_special_token_187|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128193": { - "content": "<|reserved_special_token_188|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128194": { - "content": "<|reserved_special_token_189|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128195": { - "content": "<|reserved_special_token_190|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128196": { - "content": "<|reserved_special_token_191|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128197": { - "content": "<|reserved_special_token_192|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128198": { - "content": "<|reserved_special_token_193|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128199": { - "content": "<|reserved_special_token_194|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128200": { - "content": "<|reserved_special_token_195|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128201": { - "content": "<|reserved_special_token_196|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128202": { - "content": "<|reserved_special_token_197|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128203": { - "content": "<|reserved_special_token_198|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128204": { - "content": "<|reserved_special_token_199|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128205": { - "content": "<|reserved_special_token_200|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128206": { - "content": "<|reserved_special_token_201|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128207": { - "content": "<|reserved_special_token_202|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128208": { - "content": "<|reserved_special_token_203|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128209": { - "content": "<|reserved_special_token_204|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128210": { - "content": "<|reserved_special_token_205|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128211": { - "content": "<|reserved_special_token_206|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128212": { - "content": "<|reserved_special_token_207|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128213": { - "content": "<|reserved_special_token_208|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128214": { - "content": "<|reserved_special_token_209|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128215": { - "content": "<|reserved_special_token_210|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128216": { - "content": "<|reserved_special_token_211|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128217": { - "content": "<|reserved_special_token_212|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128218": { - "content": "<|reserved_special_token_213|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128219": { - "content": "<|reserved_special_token_214|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128220": { - "content": "<|reserved_special_token_215|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128221": { - "content": "<|reserved_special_token_216|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128222": { - "content": "<|reserved_special_token_217|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128223": { - "content": "<|reserved_special_token_218|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128224": { - "content": "<|reserved_special_token_219|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128225": { - "content": "<|reserved_special_token_220|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128226": { - "content": "<|reserved_special_token_221|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128227": { - "content": "<|reserved_special_token_222|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128228": { - "content": "<|reserved_special_token_223|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128229": { - "content": "<|reserved_special_token_224|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128230": { - "content": "<|reserved_special_token_225|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128231": { - "content": "<|reserved_special_token_226|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128232": { - "content": "<|reserved_special_token_227|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128233": { - "content": "<|reserved_special_token_228|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128234": { - "content": "<|reserved_special_token_229|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128235": { - "content": "<|reserved_special_token_230|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128236": { - "content": "<|reserved_special_token_231|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128237": { - "content": "<|reserved_special_token_232|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128238": { - "content": "<|reserved_special_token_233|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128239": { - "content": "<|reserved_special_token_234|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128240": { - "content": "<|reserved_special_token_235|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128241": { - "content": "<|reserved_special_token_236|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128242": { - "content": "<|reserved_special_token_237|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128243": { - "content": "<|reserved_special_token_238|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128244": { - "content": "<|reserved_special_token_239|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128245": { - "content": "<|reserved_special_token_240|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128246": { - "content": "<|reserved_special_token_241|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128247": { - "content": "<|reserved_special_token_242|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128248": { - "content": "<|reserved_special_token_243|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128249": { - "content": "<|reserved_special_token_244|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128250": { - "content": "<|reserved_special_token_245|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128251": { - "content": "<|reserved_special_token_246|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128252": { - "content": "<|reserved_special_token_247|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128253": { - "content": "<|reserved_special_token_248|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128254": { - "content": "<|reserved_special_token_249|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128255": { - "content": "<|reserved_special_token_250|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128256": { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - } - }, - "additional_special_tokens": [ - "<|eom_id|>" - ], - "bos_token": "<|begin_of_text|>", - "clean_up_tokenization_spaces": true, - "eos_token": "<|eot_id|>", - "extra_special_tokens": {}, - "model_input_names": [ - "input_ids", - "attention_mask" - ], - "model_max_length": 1000000000000000019884624838656, - "pad_token": "<|eot_id|>", - "padding_side": "right", - "split_special_tokens": false, - "tokenizer_class": "PreTrainedTokenizerFast" -} diff --git a/metallama3_8b/limo/checkpoint-2050/trainer_state.json b/metallama3_8b/limo/checkpoint-2050/trainer_state.json deleted file mode 100644 index 798cc074b7ffb479000459b99bd124a4af728811..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-2050/trainer_state.json +++ /dev/null @@ -1,14384 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 10.0, - "eval_steps": 500, - "global_step": 2050, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.004878048780487805, - "grad_norm": 27.79998016357422, - "learning_rate": 5e-06, - "loss": 1.4179, - "step": 1 - }, - { - "epoch": 0.00975609756097561, - "grad_norm": 4.086409091949463, - "learning_rate": 4.999997064365715e-06, - "loss": 1.1405, - "step": 2 - }, - { - "epoch": 0.014634146341463415, - "grad_norm": 4.499151229858398, - "learning_rate": 4.999988257469751e-06, - "loss": 0.8682, - "step": 3 - }, - { - "epoch": 0.01951219512195122, - "grad_norm": 4.555822849273682, - "learning_rate": 4.999973579332793e-06, - "loss": 0.9961, - "step": 4 - }, - { - "epoch": 0.024390243902439025, - "grad_norm": 5.6235246658325195, - "learning_rate": 4.999953029989312e-06, - "loss": 1.0173, - "step": 5 - }, - { - "epoch": 0.02926829268292683, - "grad_norm": 3.9943182468414307, - "learning_rate": 4.999926609487568e-06, - "loss": 1.1083, - "step": 6 - }, - { - "epoch": 0.03414634146341464, - "grad_norm": 5.685941219329834, - "learning_rate": 4.9998943178896106e-06, - "loss": 1.1109, - "step": 7 - }, - { - "epoch": 0.03902439024390244, - "grad_norm": 15.914257049560547, - "learning_rate": 4.999856155271276e-06, - "loss": 1.821, - "step": 8 - }, - { - "epoch": 0.04390243902439024, - "grad_norm": 4.147185325622559, - "learning_rate": 4.999812121722191e-06, - "loss": 1.0417, - "step": 9 - }, - { - "epoch": 0.04878048780487805, - "grad_norm": 11.123332977294922, - "learning_rate": 4.999762217345766e-06, - "loss": 1.5672, - "step": 10 - }, - { - "epoch": 0.05365853658536585, - "grad_norm": 2.842331886291504, - "learning_rate": 4.999706442259205e-06, - "loss": 0.7297, - "step": 11 - }, - { - "epoch": 0.05853658536585366, - "grad_norm": 37.685062408447266, - "learning_rate": 4.999644796593492e-06, - "loss": 0.9112, - "step": 12 - }, - { - "epoch": 0.06341463414634146, - "grad_norm": 11.214252471923828, - "learning_rate": 4.999577280493407e-06, - "loss": 0.7854, - "step": 13 - }, - { - "epoch": 0.06829268292682927, - "grad_norm": 5.10387659072876, - "learning_rate": 4.99950389411751e-06, - "loss": 1.1317, - "step": 14 - }, - { - "epoch": 0.07317073170731707, - "grad_norm": 3.685403347015381, - "learning_rate": 4.999424637638148e-06, - "loss": 0.7864, - "step": 15 - }, - { - "epoch": 0.07804878048780488, - "grad_norm": 2.9567184448242188, - "learning_rate": 4.999339511241458e-06, - "loss": 0.8494, - "step": 16 - }, - { - "epoch": 0.08292682926829269, - "grad_norm": 11.396956443786621, - "learning_rate": 4.9992485151273584e-06, - "loss": 1.2189, - "step": 17 - }, - { - "epoch": 0.08780487804878048, - "grad_norm": 7.007385730743408, - "learning_rate": 4.999151649509554e-06, - "loss": 1.0532, - "step": 18 - }, - { - "epoch": 0.09268292682926829, - "grad_norm": 3.4347329139709473, - "learning_rate": 4.9990489146155356e-06, - "loss": 1.088, - "step": 19 - }, - { - "epoch": 0.0975609756097561, - "grad_norm": 3.1865031719207764, - "learning_rate": 4.9989403106865765e-06, - "loss": 1.0414, - "step": 20 - }, - { - "epoch": 0.1024390243902439, - "grad_norm": 3.4605791568756104, - "learning_rate": 4.9988258379777334e-06, - "loss": 0.8878, - "step": 21 - }, - { - "epoch": 0.1073170731707317, - "grad_norm": 2.860478639602661, - "learning_rate": 4.998705496757846e-06, - "loss": 0.9151, - "step": 22 - }, - { - "epoch": 0.11219512195121951, - "grad_norm": 9.101946830749512, - "learning_rate": 4.998579287309538e-06, - "loss": 1.4304, - "step": 23 - }, - { - "epoch": 0.11707317073170732, - "grad_norm": 24.21122169494629, - "learning_rate": 4.998447209929211e-06, - "loss": 1.0858, - "step": 24 - }, - { - "epoch": 0.12195121951219512, - "grad_norm": 3.286980152130127, - "learning_rate": 4.998309264927053e-06, - "loss": 0.6571, - "step": 25 - }, - { - "epoch": 0.12682926829268293, - "grad_norm": 4.0232062339782715, - "learning_rate": 4.998165452627025e-06, - "loss": 0.8493, - "step": 26 - }, - { - "epoch": 0.13170731707317074, - "grad_norm": 3.7688663005828857, - "learning_rate": 4.998015773366874e-06, - "loss": 0.9224, - "step": 27 - }, - { - "epoch": 0.13658536585365855, - "grad_norm": 2.9382026195526123, - "learning_rate": 4.997860227498122e-06, - "loss": 0.7588, - "step": 28 - }, - { - "epoch": 0.14146341463414633, - "grad_norm": 4.327457904815674, - "learning_rate": 4.99769881538607e-06, - "loss": 1.1817, - "step": 29 - }, - { - "epoch": 0.14634146341463414, - "grad_norm": 3.47487735748291, - "learning_rate": 4.997531537409794e-06, - "loss": 1.0737, - "step": 30 - }, - { - "epoch": 0.15121951219512195, - "grad_norm": 3.0616214275360107, - "learning_rate": 4.99735839396215e-06, - "loss": 0.7899, - "step": 31 - }, - { - "epoch": 0.15609756097560976, - "grad_norm": 3.065070152282715, - "learning_rate": 4.9971793854497655e-06, - "loss": 0.7745, - "step": 32 - }, - { - "epoch": 0.16097560975609757, - "grad_norm": 3.5202279090881348, - "learning_rate": 4.996994512293042e-06, - "loss": 0.984, - "step": 33 - }, - { - "epoch": 0.16585365853658537, - "grad_norm": 3.421769142150879, - "learning_rate": 4.996803774926157e-06, - "loss": 0.8235, - "step": 34 - }, - { - "epoch": 0.17073170731707318, - "grad_norm": 4.6582207679748535, - "learning_rate": 4.996607173797059e-06, - "loss": 1.3227, - "step": 35 - }, - { - "epoch": 0.17560975609756097, - "grad_norm": 2.9829282760620117, - "learning_rate": 4.996404709367466e-06, - "loss": 0.8854, - "step": 36 - }, - { - "epoch": 0.18048780487804877, - "grad_norm": 2.5982632637023926, - "learning_rate": 4.996196382112868e-06, - "loss": 0.6786, - "step": 37 - }, - { - "epoch": 0.18536585365853658, - "grad_norm": 2.9807393550872803, - "learning_rate": 4.9959821925225235e-06, - "loss": 0.9344, - "step": 38 - }, - { - "epoch": 0.1902439024390244, - "grad_norm": 2.7364351749420166, - "learning_rate": 4.995762141099456e-06, - "loss": 0.814, - "step": 39 - }, - { - "epoch": 0.1951219512195122, - "grad_norm": 3.4324638843536377, - "learning_rate": 4.995536228360461e-06, - "loss": 1.0276, - "step": 40 - }, - { - "epoch": 0.2, - "grad_norm": 2.911834716796875, - "learning_rate": 4.995304454836095e-06, - "loss": 0.9291, - "step": 41 - }, - { - "epoch": 0.2048780487804878, - "grad_norm": 3.0294723510742188, - "learning_rate": 4.9950668210706795e-06, - "loss": 0.8145, - "step": 42 - }, - { - "epoch": 0.2097560975609756, - "grad_norm": 4.681829452514648, - "learning_rate": 4.994823327622299e-06, - "loss": 0.8779, - "step": 43 - }, - { - "epoch": 0.2146341463414634, - "grad_norm": 3.643914222717285, - "learning_rate": 4.9945739750628e-06, - "loss": 0.8196, - "step": 44 - }, - { - "epoch": 0.21951219512195122, - "grad_norm": 2.7542076110839844, - "learning_rate": 4.994318763977789e-06, - "loss": 0.8443, - "step": 45 - }, - { - "epoch": 0.22439024390243903, - "grad_norm": 6.873605728149414, - "learning_rate": 4.994057694966632e-06, - "loss": 1.0328, - "step": 46 - }, - { - "epoch": 0.22926829268292684, - "grad_norm": 3.11810040473938, - "learning_rate": 4.993790768642449e-06, - "loss": 1.0673, - "step": 47 - }, - { - "epoch": 0.23414634146341465, - "grad_norm": 4.360548496246338, - "learning_rate": 4.99351798563212e-06, - "loss": 1.3198, - "step": 48 - }, - { - "epoch": 0.23902439024390243, - "grad_norm": 2.6894314289093018, - "learning_rate": 4.993239346576278e-06, - "loss": 0.8743, - "step": 49 - }, - { - "epoch": 0.24390243902439024, - "grad_norm": 3.2640421390533447, - "learning_rate": 4.99295485212931e-06, - "loss": 1.109, - "step": 50 - }, - { - "epoch": 0.24878048780487805, - "grad_norm": 3.1565866470336914, - "learning_rate": 4.992664502959351e-06, - "loss": 0.9291, - "step": 51 - }, - { - "epoch": 0.25365853658536586, - "grad_norm": 3.4829447269439697, - "learning_rate": 4.99236829974829e-06, - "loss": 0.8159, - "step": 52 - }, - { - "epoch": 0.25853658536585367, - "grad_norm": 2.7535626888275146, - "learning_rate": 4.992066243191762e-06, - "loss": 1.0359, - "step": 53 - }, - { - "epoch": 0.2634146341463415, - "grad_norm": 2.482935905456543, - "learning_rate": 4.991758333999148e-06, - "loss": 0.8091, - "step": 54 - }, - { - "epoch": 0.2682926829268293, - "grad_norm": 2.917445659637451, - "learning_rate": 4.991444572893575e-06, - "loss": 0.6925, - "step": 55 - }, - { - "epoch": 0.2731707317073171, - "grad_norm": 2.9802236557006836, - "learning_rate": 4.991124960611916e-06, - "loss": 0.6329, - "step": 56 - }, - { - "epoch": 0.2780487804878049, - "grad_norm": 2.9677224159240723, - "learning_rate": 4.99079949790478e-06, - "loss": 0.8069, - "step": 57 - }, - { - "epoch": 0.28292682926829266, - "grad_norm": 2.8304293155670166, - "learning_rate": 4.99046818553652e-06, - "loss": 0.8682, - "step": 58 - }, - { - "epoch": 0.28780487804878047, - "grad_norm": 5.253443717956543, - "learning_rate": 4.9901310242852246e-06, - "loss": 1.1069, - "step": 59 - }, - { - "epoch": 0.2926829268292683, - "grad_norm": 3.686016082763672, - "learning_rate": 4.9897880149427206e-06, - "loss": 0.9465, - "step": 60 - }, - { - "epoch": 0.2975609756097561, - "grad_norm": 3.6372263431549072, - "learning_rate": 4.989439158314566e-06, - "loss": 0.9738, - "step": 61 - }, - { - "epoch": 0.3024390243902439, - "grad_norm": 3.0756819248199463, - "learning_rate": 4.989084455220056e-06, - "loss": 0.6417, - "step": 62 - }, - { - "epoch": 0.3073170731707317, - "grad_norm": 3.379222869873047, - "learning_rate": 4.988723906492212e-06, - "loss": 1.0092, - "step": 63 - }, - { - "epoch": 0.3121951219512195, - "grad_norm": 3.4571032524108887, - "learning_rate": 4.988357512977785e-06, - "loss": 0.6691, - "step": 64 - }, - { - "epoch": 0.3170731707317073, - "grad_norm": 3.1982104778289795, - "learning_rate": 4.987985275537252e-06, - "loss": 0.6651, - "step": 65 - }, - { - "epoch": 0.32195121951219513, - "grad_norm": 2.9723124504089355, - "learning_rate": 4.9876071950448185e-06, - "loss": 0.9227, - "step": 66 - }, - { - "epoch": 0.32682926829268294, - "grad_norm": 2.5521399974823, - "learning_rate": 4.987223272388407e-06, - "loss": 0.6664, - "step": 67 - }, - { - "epoch": 0.33170731707317075, - "grad_norm": 2.8934121131896973, - "learning_rate": 4.986833508469663e-06, - "loss": 0.997, - "step": 68 - }, - { - "epoch": 0.33658536585365856, - "grad_norm": 4.7546586990356445, - "learning_rate": 4.98643790420395e-06, - "loss": 0.8551, - "step": 69 - }, - { - "epoch": 0.34146341463414637, - "grad_norm": 3.091616153717041, - "learning_rate": 4.986036460520348e-06, - "loss": 0.8874, - "step": 70 - }, - { - "epoch": 0.3463414634146341, - "grad_norm": 4.1724677085876465, - "learning_rate": 4.98562917836165e-06, - "loss": 1.1393, - "step": 71 - }, - { - "epoch": 0.35121951219512193, - "grad_norm": 2.6568572521209717, - "learning_rate": 4.985216058684362e-06, - "loss": 0.6379, - "step": 72 - }, - { - "epoch": 0.35609756097560974, - "grad_norm": 2.396416187286377, - "learning_rate": 4.984797102458697e-06, - "loss": 1.0292, - "step": 73 - }, - { - "epoch": 0.36097560975609755, - "grad_norm": 3.0667319297790527, - "learning_rate": 4.984372310668579e-06, - "loss": 0.7048, - "step": 74 - }, - { - "epoch": 0.36585365853658536, - "grad_norm": 2.4820518493652344, - "learning_rate": 4.983941684311633e-06, - "loss": 1.2353, - "step": 75 - }, - { - "epoch": 0.37073170731707317, - "grad_norm": 4.062836647033691, - "learning_rate": 4.983505224399188e-06, - "loss": 0.8933, - "step": 76 - }, - { - "epoch": 0.375609756097561, - "grad_norm": 2.4480767250061035, - "learning_rate": 4.983062931956275e-06, - "loss": 0.8221, - "step": 77 - }, - { - "epoch": 0.3804878048780488, - "grad_norm": 3.134138822555542, - "learning_rate": 4.9826148080216195e-06, - "loss": 0.8899, - "step": 78 - }, - { - "epoch": 0.3853658536585366, - "grad_norm": 2.8165836334228516, - "learning_rate": 4.9821608536476445e-06, - "loss": 1.2451, - "step": 79 - }, - { - "epoch": 0.3902439024390244, - "grad_norm": 3.734433650970459, - "learning_rate": 4.981701069900465e-06, - "loss": 0.8536, - "step": 80 - }, - { - "epoch": 0.3951219512195122, - "grad_norm": 2.853421449661255, - "learning_rate": 4.9812354578598876e-06, - "loss": 0.7857, - "step": 81 - }, - { - "epoch": 0.4, - "grad_norm": 2.541687250137329, - "learning_rate": 4.980764018619405e-06, - "loss": 0.8332, - "step": 82 - }, - { - "epoch": 0.40487804878048783, - "grad_norm": 4.405911445617676, - "learning_rate": 4.980286753286196e-06, - "loss": 0.9927, - "step": 83 - }, - { - "epoch": 0.4097560975609756, - "grad_norm": 3.3034985065460205, - "learning_rate": 4.97980366298112e-06, - "loss": 0.8161, - "step": 84 - }, - { - "epoch": 0.4146341463414634, - "grad_norm": 2.6678085327148438, - "learning_rate": 4.97931474883872e-06, - "loss": 0.8017, - "step": 85 - }, - { - "epoch": 0.4195121951219512, - "grad_norm": 2.58524227142334, - "learning_rate": 4.978820012007213e-06, - "loss": 0.8811, - "step": 86 - }, - { - "epoch": 0.424390243902439, - "grad_norm": 2.482597827911377, - "learning_rate": 4.978319453648495e-06, - "loss": 0.9461, - "step": 87 - }, - { - "epoch": 0.4292682926829268, - "grad_norm": 2.5731301307678223, - "learning_rate": 4.977813074938128e-06, - "loss": 0.8835, - "step": 88 - }, - { - "epoch": 0.43414634146341463, - "grad_norm": 2.7914488315582275, - "learning_rate": 4.977300877065347e-06, - "loss": 0.8466, - "step": 89 - }, - { - "epoch": 0.43902439024390244, - "grad_norm": 2.416043758392334, - "learning_rate": 4.976782861233053e-06, - "loss": 0.7132, - "step": 90 - }, - { - "epoch": 0.44390243902439025, - "grad_norm": 3.7616264820098877, - "learning_rate": 4.976259028657812e-06, - "loss": 0.7639, - "step": 91 - }, - { - "epoch": 0.44878048780487806, - "grad_norm": 2.6081621646881104, - "learning_rate": 4.975729380569845e-06, - "loss": 0.8055, - "step": 92 - }, - { - "epoch": 0.45365853658536587, - "grad_norm": 3.3343570232391357, - "learning_rate": 4.975193918213035e-06, - "loss": 0.6042, - "step": 93 - }, - { - "epoch": 0.4585365853658537, - "grad_norm": 2.517544746398926, - "learning_rate": 4.974652642844921e-06, - "loss": 0.7672, - "step": 94 - }, - { - "epoch": 0.4634146341463415, - "grad_norm": 4.173468589782715, - "learning_rate": 4.974105555736693e-06, - "loss": 1.0682, - "step": 95 - }, - { - "epoch": 0.4682926829268293, - "grad_norm": 2.8422317504882812, - "learning_rate": 4.973552658173186e-06, - "loss": 0.7841, - "step": 96 - }, - { - "epoch": 0.47317073170731705, - "grad_norm": 5.042182445526123, - "learning_rate": 4.972993951452887e-06, - "loss": 0.8851, - "step": 97 - }, - { - "epoch": 0.47804878048780486, - "grad_norm": 5.977590560913086, - "learning_rate": 4.9724294368879214e-06, - "loss": 0.9059, - "step": 98 - }, - { - "epoch": 0.48292682926829267, - "grad_norm": 4.227641582489014, - "learning_rate": 4.971859115804055e-06, - "loss": 1.0152, - "step": 99 - }, - { - "epoch": 0.4878048780487805, - "grad_norm": 3.180952548980713, - "learning_rate": 4.9712829895406935e-06, - "loss": 0.8092, - "step": 100 - }, - { - "epoch": 0.4926829268292683, - "grad_norm": 11.220394134521484, - "learning_rate": 4.970701059450872e-06, - "loss": 0.8239, - "step": 101 - }, - { - "epoch": 0.4975609756097561, - "grad_norm": 2.346975088119507, - "learning_rate": 4.970113326901258e-06, - "loss": 0.9283, - "step": 102 - }, - { - "epoch": 0.5024390243902439, - "grad_norm": 2.9470982551574707, - "learning_rate": 4.9695197932721455e-06, - "loss": 0.9429, - "step": 103 - }, - { - "epoch": 0.5073170731707317, - "grad_norm": 3.6048219203948975, - "learning_rate": 4.968920459957453e-06, - "loss": 0.9231, - "step": 104 - }, - { - "epoch": 0.5121951219512195, - "grad_norm": 2.8181886672973633, - "learning_rate": 4.968315328364719e-06, - "loss": 1.0005, - "step": 105 - }, - { - "epoch": 0.5170731707317073, - "grad_norm": 3.114147424697876, - "learning_rate": 4.9677043999151e-06, - "loss": 1.1326, - "step": 106 - }, - { - "epoch": 0.5219512195121951, - "grad_norm": 2.965885639190674, - "learning_rate": 4.967087676043366e-06, - "loss": 0.541, - "step": 107 - }, - { - "epoch": 0.526829268292683, - "grad_norm": 3.098677635192871, - "learning_rate": 4.966465158197897e-06, - "loss": 0.9473, - "step": 108 - }, - { - "epoch": 0.5317073170731708, - "grad_norm": 2.8640191555023193, - "learning_rate": 4.965836847840681e-06, - "loss": 0.6678, - "step": 109 - }, - { - "epoch": 0.5365853658536586, - "grad_norm": 3.0950934886932373, - "learning_rate": 4.96520274644731e-06, - "loss": 0.9251, - "step": 110 - }, - { - "epoch": 0.5414634146341464, - "grad_norm": 2.99444317817688, - "learning_rate": 4.964562855506976e-06, - "loss": 0.7807, - "step": 111 - }, - { - "epoch": 0.5463414634146342, - "grad_norm": 2.348639726638794, - "learning_rate": 4.963917176522466e-06, - "loss": 0.6395, - "step": 112 - }, - { - "epoch": 0.551219512195122, - "grad_norm": 3.5988354682922363, - "learning_rate": 4.963265711010164e-06, - "loss": 1.0658, - "step": 113 - }, - { - "epoch": 0.5560975609756098, - "grad_norm": 3.3423564434051514, - "learning_rate": 4.9626084605000395e-06, - "loss": 0.8974, - "step": 114 - }, - { - "epoch": 0.5609756097560976, - "grad_norm": 2.8353331089019775, - "learning_rate": 4.961945426535652e-06, - "loss": 0.6144, - "step": 115 - }, - { - "epoch": 0.5658536585365853, - "grad_norm": 2.752387046813965, - "learning_rate": 4.961276610674141e-06, - "loss": 0.9083, - "step": 116 - }, - { - "epoch": 0.5707317073170731, - "grad_norm": 2.2654404640197754, - "learning_rate": 4.960602014486225e-06, - "loss": 1.0101, - "step": 117 - }, - { - "epoch": 0.5756097560975609, - "grad_norm": 3.344377040863037, - "learning_rate": 4.959921639556199e-06, - "loss": 0.8391, - "step": 118 - }, - { - "epoch": 0.5804878048780487, - "grad_norm": 3.1620500087738037, - "learning_rate": 4.959235487481928e-06, - "loss": 1.0431, - "step": 119 - }, - { - "epoch": 0.5853658536585366, - "grad_norm": 2.857048273086548, - "learning_rate": 4.958543559874846e-06, - "loss": 0.5864, - "step": 120 - }, - { - "epoch": 0.5902439024390244, - "grad_norm": 3.1736063957214355, - "learning_rate": 4.9578458583599495e-06, - "loss": 0.7868, - "step": 121 - }, - { - "epoch": 0.5951219512195122, - "grad_norm": 3.5520827770233154, - "learning_rate": 4.957142384575795e-06, - "loss": 0.7901, - "step": 122 - }, - { - "epoch": 0.6, - "grad_norm": 3.265103578567505, - "learning_rate": 4.956433140174498e-06, - "loss": 0.9067, - "step": 123 - }, - { - "epoch": 0.6048780487804878, - "grad_norm": 3.1181187629699707, - "learning_rate": 4.9557181268217225e-06, - "loss": 0.8971, - "step": 124 - }, - { - "epoch": 0.6097560975609756, - "grad_norm": 2.4123694896698, - "learning_rate": 4.954997346196683e-06, - "loss": 1.2123, - "step": 125 - }, - { - "epoch": 0.6146341463414634, - "grad_norm": 2.9646875858306885, - "learning_rate": 4.954270799992138e-06, - "loss": 0.7696, - "step": 126 - }, - { - "epoch": 0.6195121951219512, - "grad_norm": 2.7457995414733887, - "learning_rate": 4.953538489914387e-06, - "loss": 0.7919, - "step": 127 - }, - { - "epoch": 0.624390243902439, - "grad_norm": 5.096850395202637, - "learning_rate": 4.9528004176832654e-06, - "loss": 0.6494, - "step": 128 - }, - { - "epoch": 0.6292682926829268, - "grad_norm": 3.124955177307129, - "learning_rate": 4.952056585032142e-06, - "loss": 1.0546, - "step": 129 - }, - { - "epoch": 0.6341463414634146, - "grad_norm": 2.4860167503356934, - "learning_rate": 4.951306993707913e-06, - "loss": 0.7907, - "step": 130 - }, - { - "epoch": 0.6390243902439025, - "grad_norm": 2.3380239009857178, - "learning_rate": 4.950551645470998e-06, - "loss": 0.7433, - "step": 131 - }, - { - "epoch": 0.6439024390243903, - "grad_norm": 2.8945236206054688, - "learning_rate": 4.9497905420953406e-06, - "loss": 0.7682, - "step": 132 - }, - { - "epoch": 0.6487804878048781, - "grad_norm": 3.429776430130005, - "learning_rate": 4.949023685368395e-06, - "loss": 0.8411, - "step": 133 - }, - { - "epoch": 0.6536585365853659, - "grad_norm": 2.8853516578674316, - "learning_rate": 4.948251077091131e-06, - "loss": 1.0792, - "step": 134 - }, - { - "epoch": 0.6585365853658537, - "grad_norm": 2.145598888397217, - "learning_rate": 4.947472719078025e-06, - "loss": 0.8033, - "step": 135 - }, - { - "epoch": 0.6634146341463415, - "grad_norm": 2.5064377784729004, - "learning_rate": 4.9466886131570565e-06, - "loss": 0.939, - "step": 136 - }, - { - "epoch": 0.6682926829268293, - "grad_norm": 2.5700225830078125, - "learning_rate": 4.945898761169704e-06, - "loss": 1.0418, - "step": 137 - }, - { - "epoch": 0.6731707317073171, - "grad_norm": 2.3390917778015137, - "learning_rate": 4.945103164970941e-06, - "loss": 0.6158, - "step": 138 - }, - { - "epoch": 0.6780487804878049, - "grad_norm": 2.1538751125335693, - "learning_rate": 4.9443018264292304e-06, - "loss": 0.6995, - "step": 139 - }, - { - "epoch": 0.6829268292682927, - "grad_norm": 5.255710601806641, - "learning_rate": 4.9434947474265225e-06, - "loss": 1.0382, - "step": 140 - }, - { - "epoch": 0.6878048780487804, - "grad_norm": 2.5547356605529785, - "learning_rate": 4.942681929858249e-06, - "loss": 1.037, - "step": 141 - }, - { - "epoch": 0.6926829268292682, - "grad_norm": 2.613280773162842, - "learning_rate": 4.941863375633315e-06, - "loss": 0.9071, - "step": 142 - }, - { - "epoch": 0.697560975609756, - "grad_norm": 2.9957327842712402, - "learning_rate": 4.9410390866741056e-06, - "loss": 0.7908, - "step": 143 - }, - { - "epoch": 0.7024390243902439, - "grad_norm": 2.410107374191284, - "learning_rate": 4.9402090649164655e-06, - "loss": 0.7739, - "step": 144 - }, - { - "epoch": 0.7073170731707317, - "grad_norm": 2.352013349533081, - "learning_rate": 4.9393733123097085e-06, - "loss": 0.939, - "step": 145 - }, - { - "epoch": 0.7121951219512195, - "grad_norm": 2.5164194107055664, - "learning_rate": 4.9385318308166065e-06, - "loss": 0.8729, - "step": 146 - }, - { - "epoch": 0.7170731707317073, - "grad_norm": 4.213881015777588, - "learning_rate": 4.937684622413385e-06, - "loss": 0.6124, - "step": 147 - }, - { - "epoch": 0.7219512195121951, - "grad_norm": 2.7950191497802734, - "learning_rate": 4.9368316890897185e-06, - "loss": 0.975, - "step": 148 - }, - { - "epoch": 0.7268292682926829, - "grad_norm": 2.8618874549865723, - "learning_rate": 4.9359730328487264e-06, - "loss": 0.5832, - "step": 149 - }, - { - "epoch": 0.7317073170731707, - "grad_norm": 2.6943812370300293, - "learning_rate": 4.935108655706972e-06, - "loss": 0.8124, - "step": 150 - }, - { - "epoch": 0.7365853658536585, - "grad_norm": 3.2164082527160645, - "learning_rate": 4.934238559694448e-06, - "loss": 1.1446, - "step": 151 - }, - { - "epoch": 0.7414634146341463, - "grad_norm": 3.05002498626709, - "learning_rate": 4.9333627468545845e-06, - "loss": 0.7884, - "step": 152 - }, - { - "epoch": 0.7463414634146341, - "grad_norm": 2.863351583480835, - "learning_rate": 4.932481219244231e-06, - "loss": 0.7918, - "step": 153 - }, - { - "epoch": 0.751219512195122, - "grad_norm": 2.4947102069854736, - "learning_rate": 4.931593978933666e-06, - "loss": 0.775, - "step": 154 - }, - { - "epoch": 0.7560975609756098, - "grad_norm": 2.918886184692383, - "learning_rate": 4.930701028006577e-06, - "loss": 0.993, - "step": 155 - }, - { - "epoch": 0.7609756097560976, - "grad_norm": 2.835956573486328, - "learning_rate": 4.929802368560066e-06, - "loss": 0.7911, - "step": 156 - }, - { - "epoch": 0.7658536585365854, - "grad_norm": 3.3073575496673584, - "learning_rate": 4.928898002704642e-06, - "loss": 0.9346, - "step": 157 - }, - { - "epoch": 0.7707317073170732, - "grad_norm": 3.086146354675293, - "learning_rate": 4.927987932564215e-06, - "loss": 0.817, - "step": 158 - }, - { - "epoch": 0.775609756097561, - "grad_norm": 2.5419743061065674, - "learning_rate": 4.927072160276092e-06, - "loss": 0.7918, - "step": 159 - }, - { - "epoch": 0.7804878048780488, - "grad_norm": 3.984297275543213, - "learning_rate": 4.926150687990969e-06, - "loss": 0.7153, - "step": 160 - }, - { - "epoch": 0.7853658536585366, - "grad_norm": 2.4703335762023926, - "learning_rate": 4.925223517872934e-06, - "loss": 0.8982, - "step": 161 - }, - { - "epoch": 0.7902439024390244, - "grad_norm": 2.81785249710083, - "learning_rate": 4.9242906520994484e-06, - "loss": 0.9839, - "step": 162 - }, - { - "epoch": 0.7951219512195122, - "grad_norm": 2.3304924964904785, - "learning_rate": 4.923352092861358e-06, - "loss": 0.8406, - "step": 163 - }, - { - "epoch": 0.8, - "grad_norm": 2.339498519897461, - "learning_rate": 4.922407842362875e-06, - "loss": 0.6602, - "step": 164 - }, - { - "epoch": 0.8048780487804879, - "grad_norm": 3.488255262374878, - "learning_rate": 4.921457902821578e-06, - "loss": 0.9779, - "step": 165 - }, - { - "epoch": 0.8097560975609757, - "grad_norm": 2.8528945446014404, - "learning_rate": 4.920502276468408e-06, - "loss": 0.8821, - "step": 166 - }, - { - "epoch": 0.8146341463414634, - "grad_norm": 3.4649784564971924, - "learning_rate": 4.9195409655476605e-06, - "loss": 0.7539, - "step": 167 - }, - { - "epoch": 0.8195121951219512, - "grad_norm": 2.3109042644500732, - "learning_rate": 4.918573972316982e-06, - "loss": 0.9807, - "step": 168 - }, - { - "epoch": 0.824390243902439, - "grad_norm": 2.678666353225708, - "learning_rate": 4.917601299047361e-06, - "loss": 0.8318, - "step": 169 - }, - { - "epoch": 0.8292682926829268, - "grad_norm": 2.730614185333252, - "learning_rate": 4.916622948023129e-06, - "loss": 0.7816, - "step": 170 - }, - { - "epoch": 0.8341463414634146, - "grad_norm": 2.9835665225982666, - "learning_rate": 4.915638921541952e-06, - "loss": 0.6633, - "step": 171 - }, - { - "epoch": 0.8390243902439024, - "grad_norm": 3.31217360496521, - "learning_rate": 4.914649221914822e-06, - "loss": 0.9296, - "step": 172 - }, - { - "epoch": 0.8439024390243902, - "grad_norm": 2.9021658897399902, - "learning_rate": 4.913653851466057e-06, - "loss": 0.6864, - "step": 173 - }, - { - "epoch": 0.848780487804878, - "grad_norm": 3.3672914505004883, - "learning_rate": 4.912652812533291e-06, - "loss": 0.8599, - "step": 174 - }, - { - "epoch": 0.8536585365853658, - "grad_norm": 2.4871644973754883, - "learning_rate": 4.911646107467472e-06, - "loss": 0.8949, - "step": 175 - }, - { - "epoch": 0.8585365853658536, - "grad_norm": 2.728022813796997, - "learning_rate": 4.9106337386328524e-06, - "loss": 0.9758, - "step": 176 - }, - { - "epoch": 0.8634146341463415, - "grad_norm": 2.704252243041992, - "learning_rate": 4.909615708406991e-06, - "loss": 0.8954, - "step": 177 - }, - { - "epoch": 0.8682926829268293, - "grad_norm": 2.4002223014831543, - "learning_rate": 4.908592019180738e-06, - "loss": 0.7157, - "step": 178 - }, - { - "epoch": 0.8731707317073171, - "grad_norm": 2.1927788257598877, - "learning_rate": 4.907562673358234e-06, - "loss": 0.6358, - "step": 179 - }, - { - "epoch": 0.8780487804878049, - "grad_norm": 2.458500623703003, - "learning_rate": 4.906527673356907e-06, - "loss": 0.6685, - "step": 180 - }, - { - "epoch": 0.8829268292682927, - "grad_norm": 2.5924787521362305, - "learning_rate": 4.905487021607462e-06, - "loss": 0.5686, - "step": 181 - }, - { - "epoch": 0.8878048780487805, - "grad_norm": 3.0923380851745605, - "learning_rate": 4.904440720553876e-06, - "loss": 0.8538, - "step": 182 - }, - { - "epoch": 0.8926829268292683, - "grad_norm": 2.8001527786254883, - "learning_rate": 4.903388772653396e-06, - "loss": 0.8292, - "step": 183 - }, - { - "epoch": 0.8975609756097561, - "grad_norm": 2.4344072341918945, - "learning_rate": 4.902331180376529e-06, - "loss": 0.7946, - "step": 184 - }, - { - "epoch": 0.9024390243902439, - "grad_norm": 2.6313226222991943, - "learning_rate": 4.901267946207038e-06, - "loss": 0.9269, - "step": 185 - }, - { - "epoch": 0.9073170731707317, - "grad_norm": 2.4776692390441895, - "learning_rate": 4.900199072641937e-06, - "loss": 0.7433, - "step": 186 - }, - { - "epoch": 0.9121951219512195, - "grad_norm": 2.339869260787964, - "learning_rate": 4.899124562191484e-06, - "loss": 0.6577, - "step": 187 - }, - { - "epoch": 0.9170731707317074, - "grad_norm": 3.076890468597412, - "learning_rate": 4.8980444173791735e-06, - "loss": 0.5989, - "step": 188 - }, - { - "epoch": 0.9219512195121952, - "grad_norm": 2.83957839012146, - "learning_rate": 4.896958640741735e-06, - "loss": 0.9364, - "step": 189 - }, - { - "epoch": 0.926829268292683, - "grad_norm": 2.770867347717285, - "learning_rate": 4.895867234829121e-06, - "loss": 1.0328, - "step": 190 - }, - { - "epoch": 0.9317073170731708, - "grad_norm": 2.7819619178771973, - "learning_rate": 4.894770202204509e-06, - "loss": 0.772, - "step": 191 - }, - { - "epoch": 0.9365853658536586, - "grad_norm": 3.925703763961792, - "learning_rate": 4.893667545444285e-06, - "loss": 0.8128, - "step": 192 - }, - { - "epoch": 0.9414634146341463, - "grad_norm": 3.034944534301758, - "learning_rate": 4.8925592671380495e-06, - "loss": 0.7418, - "step": 193 - }, - { - "epoch": 0.9463414634146341, - "grad_norm": 2.3350143432617188, - "learning_rate": 4.891445369888601e-06, - "loss": 0.5979, - "step": 194 - }, - { - "epoch": 0.9512195121951219, - "grad_norm": 2.6433160305023193, - "learning_rate": 4.890325856311936e-06, - "loss": 0.9664, - "step": 195 - }, - { - "epoch": 0.9560975609756097, - "grad_norm": 2.715142011642456, - "learning_rate": 4.889200729037241e-06, - "loss": 0.8482, - "step": 196 - }, - { - "epoch": 0.9609756097560975, - "grad_norm": 2.6157352924346924, - "learning_rate": 4.888069990706884e-06, - "loss": 0.7173, - "step": 197 - }, - { - "epoch": 0.9658536585365853, - "grad_norm": 3.7308952808380127, - "learning_rate": 4.886933643976414e-06, - "loss": 0.5433, - "step": 198 - }, - { - "epoch": 0.9707317073170731, - "grad_norm": 3.1134045124053955, - "learning_rate": 4.885791691514548e-06, - "loss": 0.5997, - "step": 199 - }, - { - "epoch": 0.975609756097561, - "grad_norm": 2.421365976333618, - "learning_rate": 4.884644136003172e-06, - "loss": 0.6477, - "step": 200 - }, - { - "epoch": 0.9804878048780488, - "grad_norm": 2.8676180839538574, - "learning_rate": 4.883490980137327e-06, - "loss": 1.3465, - "step": 201 - }, - { - "epoch": 0.9853658536585366, - "grad_norm": 2.236189603805542, - "learning_rate": 4.882332226625208e-06, - "loss": 0.7533, - "step": 202 - }, - { - "epoch": 0.9902439024390244, - "grad_norm": 2.2514970302581787, - "learning_rate": 4.881167878188158e-06, - "loss": 0.8555, - "step": 203 - }, - { - "epoch": 0.9951219512195122, - "grad_norm": 2.6856095790863037, - "learning_rate": 4.8799979375606565e-06, - "loss": 0.7634, - "step": 204 - }, - { - "epoch": 1.0, - "grad_norm": 2.5563852787017822, - "learning_rate": 4.878822407490319e-06, - "loss": 0.66, - "step": 205 - }, - { - "epoch": 1.0048780487804878, - "grad_norm": 4.7092814445495605, - "learning_rate": 4.8776412907378845e-06, - "loss": 0.7429, - "step": 206 - }, - { - "epoch": 1.0097560975609756, - "grad_norm": 2.9133448600769043, - "learning_rate": 4.876454590077216e-06, - "loss": 0.5735, - "step": 207 - }, - { - "epoch": 1.0146341463414634, - "grad_norm": 2.7012641429901123, - "learning_rate": 4.875262308295289e-06, - "loss": 0.8065, - "step": 208 - }, - { - "epoch": 1.0195121951219512, - "grad_norm": 3.703998327255249, - "learning_rate": 4.874064448192185e-06, - "loss": 0.7148, - "step": 209 - }, - { - "epoch": 1.024390243902439, - "grad_norm": 3.044930934906006, - "learning_rate": 4.872861012581088e-06, - "loss": 0.5606, - "step": 210 - }, - { - "epoch": 1.0292682926829269, - "grad_norm": 3.661381244659424, - "learning_rate": 4.871652004288275e-06, - "loss": 0.6492, - "step": 211 - }, - { - "epoch": 1.0341463414634147, - "grad_norm": 3.18344783782959, - "learning_rate": 4.870437426153113e-06, - "loss": 0.633, - "step": 212 - }, - { - "epoch": 1.0390243902439025, - "grad_norm": 4.596707820892334, - "learning_rate": 4.869217281028045e-06, - "loss": 0.842, - "step": 213 - }, - { - "epoch": 1.0439024390243903, - "grad_norm": 4.116331577301025, - "learning_rate": 4.867991571778592e-06, - "loss": 0.8371, - "step": 214 - }, - { - "epoch": 1.048780487804878, - "grad_norm": 3.152939558029175, - "learning_rate": 4.866760301283342e-06, - "loss": 0.4728, - "step": 215 - }, - { - "epoch": 1.053658536585366, - "grad_norm": 2.8732805252075195, - "learning_rate": 4.865523472433942e-06, - "loss": 0.651, - "step": 216 - }, - { - "epoch": 1.0585365853658537, - "grad_norm": 2.967480421066284, - "learning_rate": 4.8642810881350935e-06, - "loss": 0.6361, - "step": 217 - }, - { - "epoch": 1.0634146341463415, - "grad_norm": 2.816798210144043, - "learning_rate": 4.863033151304546e-06, - "loss": 0.6206, - "step": 218 - }, - { - "epoch": 1.0682926829268293, - "grad_norm": 3.168349027633667, - "learning_rate": 4.861779664873088e-06, - "loss": 0.7782, - "step": 219 - }, - { - "epoch": 1.0731707317073171, - "grad_norm": 3.7496471405029297, - "learning_rate": 4.8605206317845425e-06, - "loss": 0.8504, - "step": 220 - }, - { - "epoch": 1.078048780487805, - "grad_norm": 2.7087056636810303, - "learning_rate": 4.859256054995758e-06, - "loss": 0.7771, - "step": 221 - }, - { - "epoch": 1.0829268292682928, - "grad_norm": 2.803703546524048, - "learning_rate": 4.8579859374766e-06, - "loss": 0.4308, - "step": 222 - }, - { - "epoch": 1.0878048780487806, - "grad_norm": 2.4199142456054688, - "learning_rate": 4.856710282209952e-06, - "loss": 0.3739, - "step": 223 - }, - { - "epoch": 1.0926829268292684, - "grad_norm": 2.384037494659424, - "learning_rate": 4.855429092191698e-06, - "loss": 0.6548, - "step": 224 - }, - { - "epoch": 1.0975609756097562, - "grad_norm": 3.0230021476745605, - "learning_rate": 4.854142370430725e-06, - "loss": 0.6932, - "step": 225 - }, - { - "epoch": 1.102439024390244, - "grad_norm": 3.0248661041259766, - "learning_rate": 4.8528501199489045e-06, - "loss": 0.6491, - "step": 226 - }, - { - "epoch": 1.1073170731707318, - "grad_norm": 4.046666145324707, - "learning_rate": 4.851552343781099e-06, - "loss": 0.7946, - "step": 227 - }, - { - "epoch": 1.1121951219512196, - "grad_norm": 2.8751168251037598, - "learning_rate": 4.850249044975145e-06, - "loss": 0.7629, - "step": 228 - }, - { - "epoch": 1.1170731707317074, - "grad_norm": 2.8649816513061523, - "learning_rate": 4.848940226591849e-06, - "loss": 0.9114, - "step": 229 - }, - { - "epoch": 1.1219512195121952, - "grad_norm": 3.2590744495391846, - "learning_rate": 4.847625891704982e-06, - "loss": 0.535, - "step": 230 - }, - { - "epoch": 1.126829268292683, - "grad_norm": 3.230659008026123, - "learning_rate": 4.846306043401268e-06, - "loss": 0.7134, - "step": 231 - }, - { - "epoch": 1.1317073170731708, - "grad_norm": 3.5220088958740234, - "learning_rate": 4.844980684780381e-06, - "loss": 0.5375, - "step": 232 - }, - { - "epoch": 1.1365853658536587, - "grad_norm": 3.074052095413208, - "learning_rate": 4.8436498189549345e-06, - "loss": 0.5486, - "step": 233 - }, - { - "epoch": 1.1414634146341462, - "grad_norm": 2.511216163635254, - "learning_rate": 4.842313449050477e-06, - "loss": 0.5203, - "step": 234 - }, - { - "epoch": 1.146341463414634, - "grad_norm": 2.6082136631011963, - "learning_rate": 4.840971578205486e-06, - "loss": 0.4978, - "step": 235 - }, - { - "epoch": 1.1512195121951219, - "grad_norm": 2.4481778144836426, - "learning_rate": 4.839624209571352e-06, - "loss": 0.348, - "step": 236 - }, - { - "epoch": 1.1560975609756097, - "grad_norm": 2.7532148361206055, - "learning_rate": 4.838271346312381e-06, - "loss": 0.8068, - "step": 237 - }, - { - "epoch": 1.1609756097560975, - "grad_norm": 2.6562349796295166, - "learning_rate": 4.836912991605782e-06, - "loss": 0.8823, - "step": 238 - }, - { - "epoch": 1.1658536585365853, - "grad_norm": 3.032168388366699, - "learning_rate": 4.835549148641663e-06, - "loss": 0.501, - "step": 239 - }, - { - "epoch": 1.170731707317073, - "grad_norm": 3.4816956520080566, - "learning_rate": 4.834179820623018e-06, - "loss": 0.6406, - "step": 240 - }, - { - "epoch": 1.175609756097561, - "grad_norm": 2.480642318725586, - "learning_rate": 4.832805010765724e-06, - "loss": 0.537, - "step": 241 - }, - { - "epoch": 1.1804878048780487, - "grad_norm": 2.7662222385406494, - "learning_rate": 4.831424722298531e-06, - "loss": 0.6464, - "step": 242 - }, - { - "epoch": 1.1853658536585365, - "grad_norm": 3.2929866313934326, - "learning_rate": 4.830038958463061e-06, - "loss": 0.6888, - "step": 243 - }, - { - "epoch": 1.1902439024390243, - "grad_norm": 5.094089031219482, - "learning_rate": 4.828647722513785e-06, - "loss": 0.8342, - "step": 244 - }, - { - "epoch": 1.1951219512195121, - "grad_norm": 3.6679818630218506, - "learning_rate": 4.827251017718034e-06, - "loss": 0.7849, - "step": 245 - }, - { - "epoch": 1.2, - "grad_norm": 3.97290301322937, - "learning_rate": 4.8258488473559794e-06, - "loss": 0.7995, - "step": 246 - }, - { - "epoch": 1.2048780487804878, - "grad_norm": 3.3555023670196533, - "learning_rate": 4.824441214720629e-06, - "loss": 0.8718, - "step": 247 - }, - { - "epoch": 1.2097560975609756, - "grad_norm": 2.309361219406128, - "learning_rate": 4.823028123117818e-06, - "loss": 0.3731, - "step": 248 - }, - { - "epoch": 1.2146341463414634, - "grad_norm": 2.607269763946533, - "learning_rate": 4.8216095758662015e-06, - "loss": 0.7321, - "step": 249 - }, - { - "epoch": 1.2195121951219512, - "grad_norm": 2.5667428970336914, - "learning_rate": 4.82018557629725e-06, - "loss": 0.7561, - "step": 250 - }, - { - "epoch": 1.224390243902439, - "grad_norm": 2.7664871215820312, - "learning_rate": 4.8187561277552376e-06, - "loss": 0.638, - "step": 251 - }, - { - "epoch": 1.2292682926829268, - "grad_norm": 2.2880401611328125, - "learning_rate": 4.817321233597232e-06, - "loss": 0.6996, - "step": 252 - }, - { - "epoch": 1.2341463414634146, - "grad_norm": 2.7615559101104736, - "learning_rate": 4.815880897193095e-06, - "loss": 0.5432, - "step": 253 - }, - { - "epoch": 1.2390243902439024, - "grad_norm": 2.9052155017852783, - "learning_rate": 4.814435121925466e-06, - "loss": 0.781, - "step": 254 - }, - { - "epoch": 1.2439024390243902, - "grad_norm": 3.2035205364227295, - "learning_rate": 4.812983911189761e-06, - "loss": 0.6884, - "step": 255 - }, - { - "epoch": 1.248780487804878, - "grad_norm": 2.8139917850494385, - "learning_rate": 4.811527268394157e-06, - "loss": 0.4984, - "step": 256 - }, - { - "epoch": 1.2536585365853659, - "grad_norm": 2.849602699279785, - "learning_rate": 4.810065196959591e-06, - "loss": 0.553, - "step": 257 - }, - { - "epoch": 1.2585365853658537, - "grad_norm": 2.8745057582855225, - "learning_rate": 4.8085977003197496e-06, - "loss": 0.7955, - "step": 258 - }, - { - "epoch": 1.2634146341463415, - "grad_norm": 3.4053122997283936, - "learning_rate": 4.807124781921059e-06, - "loss": 0.9715, - "step": 259 - }, - { - "epoch": 1.2682926829268293, - "grad_norm": 3.1741702556610107, - "learning_rate": 4.805646445222679e-06, - "loss": 0.6306, - "step": 260 - }, - { - "epoch": 1.273170731707317, - "grad_norm": 2.5348331928253174, - "learning_rate": 4.804162693696494e-06, - "loss": 0.5192, - "step": 261 - }, - { - "epoch": 1.278048780487805, - "grad_norm": 3.2491304874420166, - "learning_rate": 4.802673530827105e-06, - "loss": 0.5369, - "step": 262 - }, - { - "epoch": 1.2829268292682927, - "grad_norm": 2.670273780822754, - "learning_rate": 4.801178960111823e-06, - "loss": 0.5864, - "step": 263 - }, - { - "epoch": 1.2878048780487805, - "grad_norm": 2.5655579566955566, - "learning_rate": 4.799678985060658e-06, - "loss": 0.7864, - "step": 264 - }, - { - "epoch": 1.2926829268292683, - "grad_norm": 2.6352531909942627, - "learning_rate": 4.798173609196314e-06, - "loss": 0.8198, - "step": 265 - }, - { - "epoch": 1.2975609756097561, - "grad_norm": 3.028343677520752, - "learning_rate": 4.796662836054176e-06, - "loss": 0.4621, - "step": 266 - }, - { - "epoch": 1.302439024390244, - "grad_norm": 2.757690191268921, - "learning_rate": 4.795146669182304e-06, - "loss": 0.6237, - "step": 267 - }, - { - "epoch": 1.3073170731707318, - "grad_norm": 2.564842462539673, - "learning_rate": 4.793625112141431e-06, - "loss": 0.4981, - "step": 268 - }, - { - "epoch": 1.3121951219512196, - "grad_norm": 2.69234299659729, - "learning_rate": 4.792098168504943e-06, - "loss": 0.5384, - "step": 269 - }, - { - "epoch": 1.3170731707317074, - "grad_norm": 2.794144868850708, - "learning_rate": 4.790565841858879e-06, - "loss": 0.5535, - "step": 270 - }, - { - "epoch": 1.3219512195121952, - "grad_norm": 2.850296974182129, - "learning_rate": 4.789028135801919e-06, - "loss": 0.7492, - "step": 271 - }, - { - "epoch": 1.326829268292683, - "grad_norm": 3.287806987762451, - "learning_rate": 4.787485053945377e-06, - "loss": 0.8367, - "step": 272 - }, - { - "epoch": 1.3317073170731708, - "grad_norm": 2.479343891143799, - "learning_rate": 4.785936599913193e-06, - "loss": 0.6875, - "step": 273 - }, - { - "epoch": 1.3365853658536586, - "grad_norm": 3.171198844909668, - "learning_rate": 4.784382777341922e-06, - "loss": 0.733, - "step": 274 - }, - { - "epoch": 1.3414634146341464, - "grad_norm": 2.866610050201416, - "learning_rate": 4.782823589880729e-06, - "loss": 0.9719, - "step": 275 - }, - { - "epoch": 1.346341463414634, - "grad_norm": 2.3714404106140137, - "learning_rate": 4.7812590411913755e-06, - "loss": 0.6979, - "step": 276 - }, - { - "epoch": 1.3512195121951218, - "grad_norm": 2.3838706016540527, - "learning_rate": 4.779689134948217e-06, - "loss": 0.9697, - "step": 277 - }, - { - "epoch": 1.3560975609756096, - "grad_norm": 3.2992005348205566, - "learning_rate": 4.77811387483819e-06, - "loss": 0.4799, - "step": 278 - }, - { - "epoch": 1.3609756097560974, - "grad_norm": 3.403024435043335, - "learning_rate": 4.776533264560804e-06, - "loss": 0.7478, - "step": 279 - }, - { - "epoch": 1.3658536585365852, - "grad_norm": 2.669820785522461, - "learning_rate": 4.774947307828134e-06, - "loss": 0.8622, - "step": 280 - }, - { - "epoch": 1.370731707317073, - "grad_norm": 2.4695041179656982, - "learning_rate": 4.773356008364812e-06, - "loss": 0.5792, - "step": 281 - }, - { - "epoch": 1.3756097560975609, - "grad_norm": 3.1744325160980225, - "learning_rate": 4.771759369908017e-06, - "loss": 0.4368, - "step": 282 - }, - { - "epoch": 1.3804878048780487, - "grad_norm": 2.8564929962158203, - "learning_rate": 4.7701573962074635e-06, - "loss": 0.6337, - "step": 283 - }, - { - "epoch": 1.3853658536585365, - "grad_norm": 2.4109890460968018, - "learning_rate": 4.7685500910254015e-06, - "loss": 0.5042, - "step": 284 - }, - { - "epoch": 1.3902439024390243, - "grad_norm": 2.389765977859497, - "learning_rate": 4.766937458136598e-06, - "loss": 0.7427, - "step": 285 - }, - { - "epoch": 1.395121951219512, - "grad_norm": 2.412153720855713, - "learning_rate": 4.765319501328332e-06, - "loss": 0.6956, - "step": 286 - }, - { - "epoch": 1.4, - "grad_norm": 2.6756227016448975, - "learning_rate": 4.763696224400391e-06, - "loss": 0.5152, - "step": 287 - }, - { - "epoch": 1.4048780487804877, - "grad_norm": 2.4644389152526855, - "learning_rate": 4.762067631165049e-06, - "loss": 0.5583, - "step": 288 - }, - { - "epoch": 1.4097560975609755, - "grad_norm": 2.6496896743774414, - "learning_rate": 4.760433725447071e-06, - "loss": 0.6824, - "step": 289 - }, - { - "epoch": 1.4146341463414633, - "grad_norm": 2.9843268394470215, - "learning_rate": 4.758794511083697e-06, - "loss": 0.7914, - "step": 290 - }, - { - "epoch": 1.4195121951219511, - "grad_norm": 3.639101266860962, - "learning_rate": 4.757149991924633e-06, - "loss": 0.6827, - "step": 291 - }, - { - "epoch": 1.424390243902439, - "grad_norm": 3.2047319412231445, - "learning_rate": 4.755500171832045e-06, - "loss": 0.5908, - "step": 292 - }, - { - "epoch": 1.4292682926829268, - "grad_norm": 2.463202953338623, - "learning_rate": 4.753845054680548e-06, - "loss": 0.6469, - "step": 293 - }, - { - "epoch": 1.4341463414634146, - "grad_norm": 2.711195945739746, - "learning_rate": 4.752184644357197e-06, - "loss": 0.5412, - "step": 294 - }, - { - "epoch": 1.4390243902439024, - "grad_norm": 2.239082098007202, - "learning_rate": 4.750518944761477e-06, - "loss": 0.5324, - "step": 295 - }, - { - "epoch": 1.4439024390243902, - "grad_norm": 2.711050271987915, - "learning_rate": 4.748847959805297e-06, - "loss": 0.5317, - "step": 296 - }, - { - "epoch": 1.448780487804878, - "grad_norm": 2.4389946460723877, - "learning_rate": 4.7471716934129774e-06, - "loss": 0.5199, - "step": 297 - }, - { - "epoch": 1.4536585365853658, - "grad_norm": 2.6532390117645264, - "learning_rate": 4.745490149521242e-06, - "loss": 0.4874, - "step": 298 - }, - { - "epoch": 1.4585365853658536, - "grad_norm": 2.2970616817474365, - "learning_rate": 4.743803332079209e-06, - "loss": 0.5416, - "step": 299 - }, - { - "epoch": 1.4634146341463414, - "grad_norm": 2.4206762313842773, - "learning_rate": 4.742111245048382e-06, - "loss": 0.5628, - "step": 300 - }, - { - "epoch": 1.4682926829268292, - "grad_norm": 2.7086844444274902, - "learning_rate": 4.740413892402639e-06, - "loss": 0.5847, - "step": 301 - }, - { - "epoch": 1.473170731707317, - "grad_norm": 2.848602771759033, - "learning_rate": 4.738711278128228e-06, - "loss": 0.5889, - "step": 302 - }, - { - "epoch": 1.4780487804878049, - "grad_norm": 3.5257909297943115, - "learning_rate": 4.7370034062237476e-06, - "loss": 0.3917, - "step": 303 - }, - { - "epoch": 1.4829268292682927, - "grad_norm": 6.47664213180542, - "learning_rate": 4.73529028070015e-06, - "loss": 0.5592, - "step": 304 - }, - { - "epoch": 1.4878048780487805, - "grad_norm": 2.8833930492401123, - "learning_rate": 4.733571905580723e-06, - "loss": 0.843, - "step": 305 - }, - { - "epoch": 1.4926829268292683, - "grad_norm": 2.9924156665802, - "learning_rate": 4.731848284901082e-06, - "loss": 0.7041, - "step": 306 - }, - { - "epoch": 1.497560975609756, - "grad_norm": 2.9858405590057373, - "learning_rate": 4.730119422709165e-06, - "loss": 0.4914, - "step": 307 - }, - { - "epoch": 1.502439024390244, - "grad_norm": 3.4032366275787354, - "learning_rate": 4.728385323065215e-06, - "loss": 0.644, - "step": 308 - }, - { - "epoch": 1.5073170731707317, - "grad_norm": 2.86360502243042, - "learning_rate": 4.7266459900417815e-06, - "loss": 0.5335, - "step": 309 - }, - { - "epoch": 1.5121951219512195, - "grad_norm": 3.183012008666992, - "learning_rate": 4.724901427723698e-06, - "loss": 0.8275, - "step": 310 - }, - { - "epoch": 1.5170731707317073, - "grad_norm": 3.4128706455230713, - "learning_rate": 4.723151640208084e-06, - "loss": 0.4091, - "step": 311 - }, - { - "epoch": 1.5219512195121951, - "grad_norm": 2.765897512435913, - "learning_rate": 4.721396631604327e-06, - "loss": 0.4414, - "step": 312 - }, - { - "epoch": 1.526829268292683, - "grad_norm": 3.2348268032073975, - "learning_rate": 4.7196364060340785e-06, - "loss": 0.5423, - "step": 313 - }, - { - "epoch": 1.5317073170731708, - "grad_norm": 2.7270045280456543, - "learning_rate": 4.7178709676312416e-06, - "loss": 0.8072, - "step": 314 - }, - { - "epoch": 1.5365853658536586, - "grad_norm": 2.525298833847046, - "learning_rate": 4.716100320541961e-06, - "loss": 1.0254, - "step": 315 - }, - { - "epoch": 1.5414634146341464, - "grad_norm": 2.371321678161621, - "learning_rate": 4.714324468924614e-06, - "loss": 0.6541, - "step": 316 - }, - { - "epoch": 1.5463414634146342, - "grad_norm": 3.0820438861846924, - "learning_rate": 4.712543416949803e-06, - "loss": 0.7519, - "step": 317 - }, - { - "epoch": 1.551219512195122, - "grad_norm": 2.710369348526001, - "learning_rate": 4.71075716880034e-06, - "loss": 0.7232, - "step": 318 - }, - { - "epoch": 1.5560975609756098, - "grad_norm": 2.4568352699279785, - "learning_rate": 4.708965728671243e-06, - "loss": 0.8059, - "step": 319 - }, - { - "epoch": 1.5609756097560976, - "grad_norm": 2.7511191368103027, - "learning_rate": 4.7071691007697214e-06, - "loss": 0.6579, - "step": 320 - }, - { - "epoch": 1.5658536585365854, - "grad_norm": 2.6519858837127686, - "learning_rate": 4.705367289315172e-06, - "loss": 0.6989, - "step": 321 - }, - { - "epoch": 1.5707317073170732, - "grad_norm": 2.763019323348999, - "learning_rate": 4.703560298539158e-06, - "loss": 0.4916, - "step": 322 - }, - { - "epoch": 1.575609756097561, - "grad_norm": 2.6480252742767334, - "learning_rate": 4.701748132685415e-06, - "loss": 0.5076, - "step": 323 - }, - { - "epoch": 1.5804878048780489, - "grad_norm": 2.4289543628692627, - "learning_rate": 4.699930796009825e-06, - "loss": 0.559, - "step": 324 - }, - { - "epoch": 1.5853658536585367, - "grad_norm": 4.0515899658203125, - "learning_rate": 4.698108292780418e-06, - "loss": 0.7388, - "step": 325 - }, - { - "epoch": 1.5902439024390245, - "grad_norm": 2.5959129333496094, - "learning_rate": 4.696280627277356e-06, - "loss": 0.5469, - "step": 326 - }, - { - "epoch": 1.5951219512195123, - "grad_norm": 2.3453526496887207, - "learning_rate": 4.6944478037929255e-06, - "loss": 0.5494, - "step": 327 - }, - { - "epoch": 1.6, - "grad_norm": 3.7527170181274414, - "learning_rate": 4.692609826631525e-06, - "loss": 0.7536, - "step": 328 - }, - { - "epoch": 1.604878048780488, - "grad_norm": 3.423588275909424, - "learning_rate": 4.690766700109659e-06, - "loss": 0.4586, - "step": 329 - }, - { - "epoch": 1.6097560975609757, - "grad_norm": 2.620429754257202, - "learning_rate": 4.6889184285559234e-06, - "loss": 0.4799, - "step": 330 - }, - { - "epoch": 1.6146341463414635, - "grad_norm": 6.416718006134033, - "learning_rate": 4.687065016310996e-06, - "loss": 0.7502, - "step": 331 - }, - { - "epoch": 1.6195121951219513, - "grad_norm": 2.7324717044830322, - "learning_rate": 4.685206467727631e-06, - "loss": 0.5923, - "step": 332 - }, - { - "epoch": 1.6243902439024391, - "grad_norm": 2.582935333251953, - "learning_rate": 4.683342787170644e-06, - "loss": 0.5619, - "step": 333 - }, - { - "epoch": 1.629268292682927, - "grad_norm": 2.8339877128601074, - "learning_rate": 4.6814739790169006e-06, - "loss": 0.55, - "step": 334 - }, - { - "epoch": 1.6341463414634148, - "grad_norm": 2.733982563018799, - "learning_rate": 4.679600047655313e-06, - "loss": 0.7243, - "step": 335 - }, - { - "epoch": 1.6390243902439026, - "grad_norm": 3.192747116088867, - "learning_rate": 4.6777209974868194e-06, - "loss": 1.132, - "step": 336 - }, - { - "epoch": 1.6439024390243904, - "grad_norm": 2.5185582637786865, - "learning_rate": 4.675836832924387e-06, - "loss": 0.55, - "step": 337 - }, - { - "epoch": 1.6487804878048782, - "grad_norm": 2.7306225299835205, - "learning_rate": 4.673947558392989e-06, - "loss": 0.4418, - "step": 338 - }, - { - "epoch": 1.653658536585366, - "grad_norm": 2.7026166915893555, - "learning_rate": 4.6720531783296e-06, - "loss": 0.5897, - "step": 339 - }, - { - "epoch": 1.6585365853658538, - "grad_norm": 2.5981674194335938, - "learning_rate": 4.670153697183185e-06, - "loss": 0.5889, - "step": 340 - }, - { - "epoch": 1.6634146341463416, - "grad_norm": 3.0985405445098877, - "learning_rate": 4.668249119414692e-06, - "loss": 0.5607, - "step": 341 - }, - { - "epoch": 1.6682926829268294, - "grad_norm": 2.7609124183654785, - "learning_rate": 4.666339449497033e-06, - "loss": 0.6284, - "step": 342 - }, - { - "epoch": 1.6731707317073172, - "grad_norm": 3.186077356338501, - "learning_rate": 4.664424691915084e-06, - "loss": 0.5751, - "step": 343 - }, - { - "epoch": 1.678048780487805, - "grad_norm": 3.644227981567383, - "learning_rate": 4.6625048511656675e-06, - "loss": 0.586, - "step": 344 - }, - { - "epoch": 1.6829268292682928, - "grad_norm": 3.196373462677002, - "learning_rate": 4.660579931757543e-06, - "loss": 0.5086, - "step": 345 - }, - { - "epoch": 1.6878048780487804, - "grad_norm": 2.7773900032043457, - "learning_rate": 4.6586499382113985e-06, - "loss": 0.5934, - "step": 346 - }, - { - "epoch": 1.6926829268292682, - "grad_norm": 2.3397631645202637, - "learning_rate": 4.6567148750598375e-06, - "loss": 0.7654, - "step": 347 - }, - { - "epoch": 1.697560975609756, - "grad_norm": 2.5567805767059326, - "learning_rate": 4.6547747468473705e-06, - "loss": 0.8908, - "step": 348 - }, - { - "epoch": 1.7024390243902439, - "grad_norm": 2.9218900203704834, - "learning_rate": 4.652829558130404e-06, - "loss": 0.4383, - "step": 349 - }, - { - "epoch": 1.7073170731707317, - "grad_norm": 2.962965250015259, - "learning_rate": 4.6508793134772265e-06, - "loss": 0.6031, - "step": 350 - }, - { - "epoch": 1.7121951219512195, - "grad_norm": 2.487739324569702, - "learning_rate": 4.648924017468003e-06, - "loss": 0.533, - "step": 351 - }, - { - "epoch": 1.7170731707317073, - "grad_norm": 2.769474506378174, - "learning_rate": 4.646963674694761e-06, - "loss": 0.8125, - "step": 352 - }, - { - "epoch": 1.721951219512195, - "grad_norm": 2.678243398666382, - "learning_rate": 4.64499828976138e-06, - "loss": 0.386, - "step": 353 - }, - { - "epoch": 1.726829268292683, - "grad_norm": 3.2764477729797363, - "learning_rate": 4.64302786728358e-06, - "loss": 0.4792, - "step": 354 - }, - { - "epoch": 1.7317073170731707, - "grad_norm": 2.6092708110809326, - "learning_rate": 4.641052411888913e-06, - "loss": 0.5031, - "step": 355 - }, - { - "epoch": 1.7365853658536585, - "grad_norm": 3.4002952575683594, - "learning_rate": 4.6390719282167515e-06, - "loss": 0.4726, - "step": 356 - }, - { - "epoch": 1.7414634146341463, - "grad_norm": 2.7558157444000244, - "learning_rate": 4.637086420918276e-06, - "loss": 0.7794, - "step": 357 - }, - { - "epoch": 1.7463414634146341, - "grad_norm": 2.239021062850952, - "learning_rate": 4.635095894656465e-06, - "loss": 0.6202, - "step": 358 - }, - { - "epoch": 1.751219512195122, - "grad_norm": 2.0502119064331055, - "learning_rate": 4.633100354106085e-06, - "loss": 0.3743, - "step": 359 - }, - { - "epoch": 1.7560975609756098, - "grad_norm": 2.842203140258789, - "learning_rate": 4.631099803953677e-06, - "loss": 0.8143, - "step": 360 - }, - { - "epoch": 1.7609756097560976, - "grad_norm": 2.8408772945404053, - "learning_rate": 4.629094248897546e-06, - "loss": 0.4986, - "step": 361 - }, - { - "epoch": 1.7658536585365854, - "grad_norm": 2.755530595779419, - "learning_rate": 4.627083693647757e-06, - "loss": 0.5833, - "step": 362 - }, - { - "epoch": 1.7707317073170732, - "grad_norm": 2.717116355895996, - "learning_rate": 4.625068142926111e-06, - "loss": 0.885, - "step": 363 - }, - { - "epoch": 1.775609756097561, - "grad_norm": 2.2784435749053955, - "learning_rate": 4.623047601466144e-06, - "loss": 0.7351, - "step": 364 - }, - { - "epoch": 1.7804878048780488, - "grad_norm": 2.3133914470672607, - "learning_rate": 4.621022074013114e-06, - "loss": 0.6426, - "step": 365 - }, - { - "epoch": 1.7853658536585366, - "grad_norm": 3.13562273979187, - "learning_rate": 4.618991565323987e-06, - "loss": 0.5588, - "step": 366 - }, - { - "epoch": 1.7902439024390244, - "grad_norm": 2.458186388015747, - "learning_rate": 4.616956080167426e-06, - "loss": 0.5424, - "step": 367 - }, - { - "epoch": 1.7951219512195122, - "grad_norm": 2.4780080318450928, - "learning_rate": 4.614915623323786e-06, - "loss": 0.8664, - "step": 368 - }, - { - "epoch": 1.8, - "grad_norm": 2.623966932296753, - "learning_rate": 4.612870199585092e-06, - "loss": 0.4495, - "step": 369 - }, - { - "epoch": 1.8048780487804879, - "grad_norm": 2.7326242923736572, - "learning_rate": 4.610819813755038e-06, - "loss": 0.5099, - "step": 370 - }, - { - "epoch": 1.8097560975609757, - "grad_norm": 2.951014757156372, - "learning_rate": 4.608764470648971e-06, - "loss": 0.4322, - "step": 371 - }, - { - "epoch": 1.8146341463414632, - "grad_norm": 2.869870185852051, - "learning_rate": 4.606704175093879e-06, - "loss": 0.4744, - "step": 372 - }, - { - "epoch": 1.819512195121951, - "grad_norm": 2.686054229736328, - "learning_rate": 4.604638931928383e-06, - "loss": 0.797, - "step": 373 - }, - { - "epoch": 1.8243902439024389, - "grad_norm": 2.6421749591827393, - "learning_rate": 4.602568746002718e-06, - "loss": 0.4904, - "step": 374 - }, - { - "epoch": 1.8292682926829267, - "grad_norm": 2.949144124984741, - "learning_rate": 4.600493622178734e-06, - "loss": 0.8682, - "step": 375 - }, - { - "epoch": 1.8341463414634145, - "grad_norm": 2.554733991622925, - "learning_rate": 4.598413565329876e-06, - "loss": 0.5426, - "step": 376 - }, - { - "epoch": 1.8390243902439023, - "grad_norm": 2.3334367275238037, - "learning_rate": 4.596328580341169e-06, - "loss": 0.5628, - "step": 377 - }, - { - "epoch": 1.84390243902439, - "grad_norm": 2.577664613723755, - "learning_rate": 4.5942386721092195e-06, - "loss": 0.7073, - "step": 378 - }, - { - "epoch": 1.848780487804878, - "grad_norm": 3.1247141361236572, - "learning_rate": 4.592143845542189e-06, - "loss": 0.6526, - "step": 379 - }, - { - "epoch": 1.8536585365853657, - "grad_norm": 2.7015256881713867, - "learning_rate": 4.590044105559797e-06, - "loss": 0.8377, - "step": 380 - }, - { - "epoch": 1.8585365853658535, - "grad_norm": 2.573819398880005, - "learning_rate": 4.587939457093296e-06, - "loss": 0.5485, - "step": 381 - }, - { - "epoch": 1.8634146341463413, - "grad_norm": 2.8607687950134277, - "learning_rate": 4.585829905085468e-06, - "loss": 0.6065, - "step": 382 - }, - { - "epoch": 1.8682926829268292, - "grad_norm": 2.526625394821167, - "learning_rate": 4.5837154544906135e-06, - "loss": 0.7812, - "step": 383 - }, - { - "epoch": 1.873170731707317, - "grad_norm": 2.4161314964294434, - "learning_rate": 4.581596110274535e-06, - "loss": 0.7061, - "step": 384 - }, - { - "epoch": 1.8780487804878048, - "grad_norm": 2.34195876121521, - "learning_rate": 4.579471877414527e-06, - "loss": 0.9446, - "step": 385 - }, - { - "epoch": 1.8829268292682926, - "grad_norm": 3.7710156440734863, - "learning_rate": 4.577342760899368e-06, - "loss": 0.78, - "step": 386 - }, - { - "epoch": 1.8878048780487804, - "grad_norm": 2.5192313194274902, - "learning_rate": 4.575208765729302e-06, - "loss": 0.5205, - "step": 387 - }, - { - "epoch": 1.8926829268292682, - "grad_norm": 2.467484951019287, - "learning_rate": 4.573069896916035e-06, - "loss": 0.7827, - "step": 388 - }, - { - "epoch": 1.897560975609756, - "grad_norm": 2.640676259994507, - "learning_rate": 4.5709261594827125e-06, - "loss": 0.6512, - "step": 389 - }, - { - "epoch": 1.9024390243902438, - "grad_norm": 2.976623296737671, - "learning_rate": 4.568777558463922e-06, - "loss": 0.5548, - "step": 390 - }, - { - "epoch": 1.9073170731707316, - "grad_norm": 2.289722442626953, - "learning_rate": 4.566624098905665e-06, - "loss": 0.7038, - "step": 391 - }, - { - "epoch": 1.9121951219512194, - "grad_norm": 2.9512040615081787, - "learning_rate": 4.564465785865359e-06, - "loss": 0.5416, - "step": 392 - }, - { - "epoch": 1.9170731707317072, - "grad_norm": 2.394874095916748, - "learning_rate": 4.56230262441182e-06, - "loss": 0.4068, - "step": 393 - }, - { - "epoch": 1.921951219512195, - "grad_norm": 6.885486602783203, - "learning_rate": 4.560134619625247e-06, - "loss": 0.6197, - "step": 394 - }, - { - "epoch": 1.9268292682926829, - "grad_norm": 2.311272144317627, - "learning_rate": 4.5579617765972155e-06, - "loss": 0.5692, - "step": 395 - }, - { - "epoch": 1.9317073170731707, - "grad_norm": 2.4662933349609375, - "learning_rate": 4.555784100430662e-06, - "loss": 0.4836, - "step": 396 - }, - { - "epoch": 1.9365853658536585, - "grad_norm": 2.602741241455078, - "learning_rate": 4.553601596239877e-06, - "loss": 0.4594, - "step": 397 - }, - { - "epoch": 1.9414634146341463, - "grad_norm": 3.443909168243408, - "learning_rate": 4.551414269150489e-06, - "loss": 0.6053, - "step": 398 - }, - { - "epoch": 1.946341463414634, - "grad_norm": 2.5391502380371094, - "learning_rate": 4.54922212429945e-06, - "loss": 0.5133, - "step": 399 - }, - { - "epoch": 1.951219512195122, - "grad_norm": 2.7105700969696045, - "learning_rate": 4.547025166835027e-06, - "loss": 0.6984, - "step": 400 - }, - { - "epoch": 1.9560975609756097, - "grad_norm": 2.6098098754882812, - "learning_rate": 4.544823401916794e-06, - "loss": 0.7944, - "step": 401 - }, - { - "epoch": 1.9609756097560975, - "grad_norm": 2.7527425289154053, - "learning_rate": 4.542616834715612e-06, - "loss": 0.639, - "step": 402 - }, - { - "epoch": 1.9658536585365853, - "grad_norm": 2.760303258895874, - "learning_rate": 4.540405470413618e-06, - "loss": 0.4229, - "step": 403 - }, - { - "epoch": 1.9707317073170731, - "grad_norm": 2.4989006519317627, - "learning_rate": 4.53818931420422e-06, - "loss": 0.7482, - "step": 404 - }, - { - "epoch": 1.975609756097561, - "grad_norm": 2.3687169551849365, - "learning_rate": 4.535968371292076e-06, - "loss": 0.6146, - "step": 405 - }, - { - "epoch": 1.9804878048780488, - "grad_norm": 2.4285244941711426, - "learning_rate": 4.533742646893086e-06, - "loss": 0.6964, - "step": 406 - }, - { - "epoch": 1.9853658536585366, - "grad_norm": 2.337266206741333, - "learning_rate": 4.531512146234383e-06, - "loss": 0.6248, - "step": 407 - }, - { - "epoch": 1.9902439024390244, - "grad_norm": 2.704972743988037, - "learning_rate": 4.529276874554312e-06, - "loss": 0.8715, - "step": 408 - }, - { - "epoch": 1.9951219512195122, - "grad_norm": 2.2151944637298584, - "learning_rate": 4.527036837102426e-06, - "loss": 0.4945, - "step": 409 - }, - { - "epoch": 2.0, - "grad_norm": 2.691330671310425, - "learning_rate": 4.524792039139471e-06, - "loss": 0.7085, - "step": 410 - }, - { - "epoch": 2.004878048780488, - "grad_norm": 2.9423086643218994, - "learning_rate": 4.522542485937369e-06, - "loss": 0.3178, - "step": 411 - }, - { - "epoch": 2.0097560975609756, - "grad_norm": 2.860677719116211, - "learning_rate": 4.520288182779214e-06, - "loss": 0.5092, - "step": 412 - }, - { - "epoch": 2.0146341463414634, - "grad_norm": 2.7503843307495117, - "learning_rate": 4.518029134959253e-06, - "loss": 0.314, - "step": 413 - }, - { - "epoch": 2.0195121951219512, - "grad_norm": 4.541809558868408, - "learning_rate": 4.515765347782878e-06, - "loss": 0.5287, - "step": 414 - }, - { - "epoch": 2.024390243902439, - "grad_norm": 9.126826286315918, - "learning_rate": 4.5134968265666085e-06, - "loss": 0.8221, - "step": 415 - }, - { - "epoch": 2.029268292682927, - "grad_norm": 4.4358229637146, - "learning_rate": 4.511223576638084e-06, - "loss": 0.5402, - "step": 416 - }, - { - "epoch": 2.0341463414634147, - "grad_norm": 3.1090731620788574, - "learning_rate": 4.508945603336049e-06, - "loss": 0.617, - "step": 417 - }, - { - "epoch": 2.0390243902439025, - "grad_norm": 2.6933369636535645, - "learning_rate": 4.50666291201034e-06, - "loss": 0.3541, - "step": 418 - }, - { - "epoch": 2.0439024390243903, - "grad_norm": 5.898099899291992, - "learning_rate": 4.504375508021876e-06, - "loss": 0.4842, - "step": 419 - }, - { - "epoch": 2.048780487804878, - "grad_norm": 2.950939178466797, - "learning_rate": 4.50208339674264e-06, - "loss": 0.6168, - "step": 420 - }, - { - "epoch": 2.053658536585366, - "grad_norm": 3.2513322830200195, - "learning_rate": 4.499786583555675e-06, - "loss": 0.6425, - "step": 421 - }, - { - "epoch": 2.0585365853658537, - "grad_norm": 2.911562442779541, - "learning_rate": 4.497485073855061e-06, - "loss": 0.364, - "step": 422 - }, - { - "epoch": 2.0634146341463415, - "grad_norm": 4.2179274559021, - "learning_rate": 4.495178873045913e-06, - "loss": 0.3687, - "step": 423 - }, - { - "epoch": 2.0682926829268293, - "grad_norm": 3.2010395526885986, - "learning_rate": 4.4928679865443605e-06, - "loss": 0.4068, - "step": 424 - }, - { - "epoch": 2.073170731707317, - "grad_norm": 3.2425589561462402, - "learning_rate": 4.4905524197775366e-06, - "loss": 0.4759, - "step": 425 - }, - { - "epoch": 2.078048780487805, - "grad_norm": 2.9252519607543945, - "learning_rate": 4.4882321781835666e-06, - "loss": 0.4197, - "step": 426 - }, - { - "epoch": 2.0829268292682928, - "grad_norm": 2.7859911918640137, - "learning_rate": 4.4859072672115565e-06, - "loss": 0.2294, - "step": 427 - }, - { - "epoch": 2.0878048780487806, - "grad_norm": 3.138796091079712, - "learning_rate": 4.483577692321577e-06, - "loss": 0.7572, - "step": 428 - }, - { - "epoch": 2.0926829268292684, - "grad_norm": 3.1447339057922363, - "learning_rate": 4.481243458984651e-06, - "loss": 0.4035, - "step": 429 - }, - { - "epoch": 2.097560975609756, - "grad_norm": 3.1876862049102783, - "learning_rate": 4.478904572682743e-06, - "loss": 0.5776, - "step": 430 - }, - { - "epoch": 2.102439024390244, - "grad_norm": 2.934257745742798, - "learning_rate": 4.476561038908745e-06, - "loss": 0.4005, - "step": 431 - }, - { - "epoch": 2.107317073170732, - "grad_norm": 2.904954433441162, - "learning_rate": 4.474212863166464e-06, - "loss": 0.5689, - "step": 432 - }, - { - "epoch": 2.1121951219512196, - "grad_norm": 3.6023731231689453, - "learning_rate": 4.471860050970608e-06, - "loss": 0.5068, - "step": 433 - }, - { - "epoch": 2.1170731707317074, - "grad_norm": 4.073422431945801, - "learning_rate": 4.469502607846774e-06, - "loss": 0.8349, - "step": 434 - }, - { - "epoch": 2.1219512195121952, - "grad_norm": 2.813789129257202, - "learning_rate": 4.467140539331434e-06, - "loss": 0.3641, - "step": 435 - }, - { - "epoch": 2.126829268292683, - "grad_norm": 3.874516248703003, - "learning_rate": 4.464773850971924e-06, - "loss": 0.222, - "step": 436 - }, - { - "epoch": 2.131707317073171, - "grad_norm": 3.1221084594726562, - "learning_rate": 4.46240254832643e-06, - "loss": 0.3799, - "step": 437 - }, - { - "epoch": 2.1365853658536587, - "grad_norm": 3.298933267593384, - "learning_rate": 4.460026636963971e-06, - "loss": 0.4759, - "step": 438 - }, - { - "epoch": 2.1414634146341465, - "grad_norm": 2.456233024597168, - "learning_rate": 4.4576461224643965e-06, - "loss": 0.384, - "step": 439 - }, - { - "epoch": 2.1463414634146343, - "grad_norm": 2.8427460193634033, - "learning_rate": 4.455261010418359e-06, - "loss": 0.391, - "step": 440 - }, - { - "epoch": 2.151219512195122, - "grad_norm": 3.0267624855041504, - "learning_rate": 4.452871306427314e-06, - "loss": 0.6177, - "step": 441 - }, - { - "epoch": 2.15609756097561, - "grad_norm": 3.437302827835083, - "learning_rate": 4.450477016103498e-06, - "loss": 0.5143, - "step": 442 - }, - { - "epoch": 2.1609756097560977, - "grad_norm": 3.152210235595703, - "learning_rate": 4.4480781450699205e-06, - "loss": 0.3783, - "step": 443 - }, - { - "epoch": 2.1658536585365855, - "grad_norm": 3.507753372192383, - "learning_rate": 4.4456746989603464e-06, - "loss": 0.3574, - "step": 444 - }, - { - "epoch": 2.1707317073170733, - "grad_norm": 2.8855366706848145, - "learning_rate": 4.443266683419289e-06, - "loss": 0.5088, - "step": 445 - }, - { - "epoch": 2.175609756097561, - "grad_norm": 2.7776072025299072, - "learning_rate": 4.440854104101988e-06, - "loss": 0.3773, - "step": 446 - }, - { - "epoch": 2.180487804878049, - "grad_norm": 3.019484281539917, - "learning_rate": 4.438436966674406e-06, - "loss": 0.5002, - "step": 447 - }, - { - "epoch": 2.1853658536585368, - "grad_norm": 3.6962451934814453, - "learning_rate": 4.436015276813208e-06, - "loss": 0.4601, - "step": 448 - }, - { - "epoch": 2.1902439024390246, - "grad_norm": 3.1288888454437256, - "learning_rate": 4.4335890402057505e-06, - "loss": 0.5422, - "step": 449 - }, - { - "epoch": 2.1951219512195124, - "grad_norm": 3.7083234786987305, - "learning_rate": 4.431158262550067e-06, - "loss": 0.4684, - "step": 450 - }, - { - "epoch": 2.2, - "grad_norm": 3.1714789867401123, - "learning_rate": 4.428722949554858e-06, - "loss": 0.2528, - "step": 451 - }, - { - "epoch": 2.204878048780488, - "grad_norm": 3.0773637294769287, - "learning_rate": 4.426283106939474e-06, - "loss": 0.4061, - "step": 452 - }, - { - "epoch": 2.209756097560976, - "grad_norm": 2.604093551635742, - "learning_rate": 4.423838740433903e-06, - "loss": 0.4779, - "step": 453 - }, - { - "epoch": 2.2146341463414636, - "grad_norm": 2.9293880462646484, - "learning_rate": 4.4213898557787586e-06, - "loss": 0.233, - "step": 454 - }, - { - "epoch": 2.2195121951219514, - "grad_norm": 2.9195125102996826, - "learning_rate": 4.4189364587252636e-06, - "loss": 0.7756, - "step": 455 - }, - { - "epoch": 2.2243902439024392, - "grad_norm": 3.2263920307159424, - "learning_rate": 4.416478555035241e-06, - "loss": 0.2806, - "step": 456 - }, - { - "epoch": 2.229268292682927, - "grad_norm": 2.8109211921691895, - "learning_rate": 4.4140161504810935e-06, - "loss": 0.3923, - "step": 457 - }, - { - "epoch": 2.234146341463415, - "grad_norm": 2.645853281021118, - "learning_rate": 4.4115492508457986e-06, - "loss": 0.289, - "step": 458 - }, - { - "epoch": 2.2390243902439027, - "grad_norm": 3.3712451457977295, - "learning_rate": 4.409077861922887e-06, - "loss": 0.5053, - "step": 459 - }, - { - "epoch": 2.2439024390243905, - "grad_norm": 2.6892387866973877, - "learning_rate": 4.406601989516435e-06, - "loss": 0.3363, - "step": 460 - }, - { - "epoch": 2.2487804878048783, - "grad_norm": 2.3195693492889404, - "learning_rate": 4.404121639441047e-06, - "loss": 0.2367, - "step": 461 - }, - { - "epoch": 2.253658536585366, - "grad_norm": 3.0115339756011963, - "learning_rate": 4.401636817521843e-06, - "loss": 0.4942, - "step": 462 - }, - { - "epoch": 2.258536585365854, - "grad_norm": 2.9528865814208984, - "learning_rate": 4.399147529594447e-06, - "loss": 0.3328, - "step": 463 - }, - { - "epoch": 2.2634146341463417, - "grad_norm": 3.110799551010132, - "learning_rate": 4.3966537815049686e-06, - "loss": 0.3917, - "step": 464 - }, - { - "epoch": 2.2682926829268295, - "grad_norm": 3.2973792552948, - "learning_rate": 4.394155579109994e-06, - "loss": 0.5203, - "step": 465 - }, - { - "epoch": 2.2731707317073173, - "grad_norm": 4.7184038162231445, - "learning_rate": 4.391652928276572e-06, - "loss": 0.729, - "step": 466 - }, - { - "epoch": 2.278048780487805, - "grad_norm": 3.1992053985595703, - "learning_rate": 4.389145834882195e-06, - "loss": 0.4822, - "step": 467 - }, - { - "epoch": 2.2829268292682925, - "grad_norm": 4.320055961608887, - "learning_rate": 4.386634304814789e-06, - "loss": 0.3962, - "step": 468 - }, - { - "epoch": 2.2878048780487803, - "grad_norm": 3.704524517059326, - "learning_rate": 4.384118343972704e-06, - "loss": 0.5996, - "step": 469 - }, - { - "epoch": 2.292682926829268, - "grad_norm": 2.8172974586486816, - "learning_rate": 4.381597958264692e-06, - "loss": 0.6328, - "step": 470 - }, - { - "epoch": 2.297560975609756, - "grad_norm": 2.7418763637542725, - "learning_rate": 4.379073153609896e-06, - "loss": 0.6254, - "step": 471 - }, - { - "epoch": 2.3024390243902437, - "grad_norm": 5.364504337310791, - "learning_rate": 4.37654393593784e-06, - "loss": 0.6793, - "step": 472 - }, - { - "epoch": 2.3073170731707315, - "grad_norm": 2.935291290283203, - "learning_rate": 4.3740103111884096e-06, - "loss": 0.4161, - "step": 473 - }, - { - "epoch": 2.3121951219512193, - "grad_norm": 3.085155963897705, - "learning_rate": 4.371472285311842e-06, - "loss": 0.3329, - "step": 474 - }, - { - "epoch": 2.317073170731707, - "grad_norm": 2.2218778133392334, - "learning_rate": 4.368929864268709e-06, - "loss": 0.2687, - "step": 475 - }, - { - "epoch": 2.321951219512195, - "grad_norm": 3.3985276222229004, - "learning_rate": 4.366383054029907e-06, - "loss": 0.5934, - "step": 476 - }, - { - "epoch": 2.3268292682926828, - "grad_norm": 3.0726048946380615, - "learning_rate": 4.363831860576638e-06, - "loss": 0.5033, - "step": 477 - }, - { - "epoch": 2.3317073170731706, - "grad_norm": 2.728628635406494, - "learning_rate": 4.361276289900396e-06, - "loss": 0.4492, - "step": 478 - }, - { - "epoch": 2.3365853658536584, - "grad_norm": 3.1294424533843994, - "learning_rate": 4.358716348002962e-06, - "loss": 0.619, - "step": 479 - }, - { - "epoch": 2.341463414634146, - "grad_norm": 3.5564961433410645, - "learning_rate": 4.356152040896376e-06, - "loss": 0.4018, - "step": 480 - }, - { - "epoch": 2.346341463414634, - "grad_norm": 2.9329910278320312, - "learning_rate": 4.3535833746029335e-06, - "loss": 0.3062, - "step": 481 - }, - { - "epoch": 2.351219512195122, - "grad_norm": 3.744480848312378, - "learning_rate": 4.351010355155165e-06, - "loss": 0.3387, - "step": 482 - }, - { - "epoch": 2.3560975609756096, - "grad_norm": 2.537912130355835, - "learning_rate": 4.348432988595828e-06, - "loss": 0.3103, - "step": 483 - }, - { - "epoch": 2.3609756097560974, - "grad_norm": 3.232128858566284, - "learning_rate": 4.345851280977885e-06, - "loss": 0.6782, - "step": 484 - }, - { - "epoch": 2.3658536585365852, - "grad_norm": 3.601463794708252, - "learning_rate": 4.343265238364496e-06, - "loss": 0.3195, - "step": 485 - }, - { - "epoch": 2.370731707317073, - "grad_norm": 4.05529260635376, - "learning_rate": 4.340674866829001e-06, - "loss": 0.4639, - "step": 486 - }, - { - "epoch": 2.375609756097561, - "grad_norm": 4.128161430358887, - "learning_rate": 4.338080172454908e-06, - "loss": 0.7229, - "step": 487 - }, - { - "epoch": 2.3804878048780487, - "grad_norm": 2.665430784225464, - "learning_rate": 4.335481161335875e-06, - "loss": 0.4334, - "step": 488 - }, - { - "epoch": 2.3853658536585365, - "grad_norm": 3.777899742126465, - "learning_rate": 4.332877839575699e-06, - "loss": 0.3409, - "step": 489 - }, - { - "epoch": 2.3902439024390243, - "grad_norm": 2.9942116737365723, - "learning_rate": 4.330270213288301e-06, - "loss": 0.5221, - "step": 490 - }, - { - "epoch": 2.395121951219512, - "grad_norm": 3.518601417541504, - "learning_rate": 4.32765828859771e-06, - "loss": 0.7078, - "step": 491 - }, - { - "epoch": 2.4, - "grad_norm": 3.452350378036499, - "learning_rate": 4.325042071638051e-06, - "loss": 0.5902, - "step": 492 - }, - { - "epoch": 2.4048780487804877, - "grad_norm": 3.072655200958252, - "learning_rate": 4.322421568553529e-06, - "loss": 0.3746, - "step": 493 - }, - { - "epoch": 2.4097560975609755, - "grad_norm": 2.8621394634246826, - "learning_rate": 4.319796785498416e-06, - "loss": 0.3474, - "step": 494 - }, - { - "epoch": 2.4146341463414633, - "grad_norm": 3.3891537189483643, - "learning_rate": 4.317167728637032e-06, - "loss": 0.5171, - "step": 495 - }, - { - "epoch": 2.419512195121951, - "grad_norm": 2.505720376968384, - "learning_rate": 4.314534404143738e-06, - "loss": 0.4263, - "step": 496 - }, - { - "epoch": 2.424390243902439, - "grad_norm": 2.6280455589294434, - "learning_rate": 4.3118968182029155e-06, - "loss": 0.5072, - "step": 497 - }, - { - "epoch": 2.4292682926829268, - "grad_norm": 2.703711748123169, - "learning_rate": 4.3092549770089566e-06, - "loss": 0.2742, - "step": 498 - }, - { - "epoch": 2.4341463414634146, - "grad_norm": 3.0358169078826904, - "learning_rate": 4.306608886766243e-06, - "loss": 0.4814, - "step": 499 - }, - { - "epoch": 2.4390243902439024, - "grad_norm": 3.263326406478882, - "learning_rate": 4.303958553689137e-06, - "loss": 0.4188, - "step": 500 - }, - { - "epoch": 2.44390243902439, - "grad_norm": 2.833951950073242, - "learning_rate": 4.3013039840019675e-06, - "loss": 0.6436, - "step": 501 - }, - { - "epoch": 2.448780487804878, - "grad_norm": 3.6790921688079834, - "learning_rate": 4.2986451839390105e-06, - "loss": 0.2862, - "step": 502 - }, - { - "epoch": 2.453658536585366, - "grad_norm": 2.7376418113708496, - "learning_rate": 4.295982159744476e-06, - "loss": 0.4926, - "step": 503 - }, - { - "epoch": 2.4585365853658536, - "grad_norm": 3.575244665145874, - "learning_rate": 4.293314917672498e-06, - "loss": 0.5717, - "step": 504 - }, - { - "epoch": 2.4634146341463414, - "grad_norm": 2.8722269535064697, - "learning_rate": 4.290643463987114e-06, - "loss": 0.2707, - "step": 505 - }, - { - "epoch": 2.4682926829268292, - "grad_norm": 2.8118090629577637, - "learning_rate": 4.287967804962252e-06, - "loss": 0.347, - "step": 506 - }, - { - "epoch": 2.473170731707317, - "grad_norm": 3.345698356628418, - "learning_rate": 4.285287946881718e-06, - "loss": 0.2103, - "step": 507 - }, - { - "epoch": 2.478048780487805, - "grad_norm": 3.0156590938568115, - "learning_rate": 4.282603896039178e-06, - "loss": 0.6405, - "step": 508 - }, - { - "epoch": 2.4829268292682927, - "grad_norm": 3.102205753326416, - "learning_rate": 4.279915658738145e-06, - "loss": 0.4027, - "step": 509 - }, - { - "epoch": 2.4878048780487805, - "grad_norm": 2.8665261268615723, - "learning_rate": 4.277223241291966e-06, - "loss": 0.6503, - "step": 510 - }, - { - "epoch": 2.4926829268292683, - "grad_norm": 2.5396728515625, - "learning_rate": 4.274526650023801e-06, - "loss": 0.5006, - "step": 511 - }, - { - "epoch": 2.497560975609756, - "grad_norm": 3.4846577644348145, - "learning_rate": 4.271825891266617e-06, - "loss": 0.479, - "step": 512 - }, - { - "epoch": 2.502439024390244, - "grad_norm": 4.5995612144470215, - "learning_rate": 4.269120971363164e-06, - "loss": 0.6667, - "step": 513 - }, - { - "epoch": 2.5073170731707317, - "grad_norm": 3.2117559909820557, - "learning_rate": 4.266411896665967e-06, - "loss": 0.2977, - "step": 514 - }, - { - "epoch": 2.5121951219512195, - "grad_norm": 2.798161268234253, - "learning_rate": 4.263698673537309e-06, - "loss": 0.3912, - "step": 515 - }, - { - "epoch": 2.5170731707317073, - "grad_norm": 3.593287944793701, - "learning_rate": 4.260981308349214e-06, - "loss": 0.615, - "step": 516 - }, - { - "epoch": 2.521951219512195, - "grad_norm": 3.06075119972229, - "learning_rate": 4.258259807483434e-06, - "loss": 0.4559, - "step": 517 - }, - { - "epoch": 2.526829268292683, - "grad_norm": 2.893202543258667, - "learning_rate": 4.255534177331435e-06, - "loss": 0.4993, - "step": 518 - }, - { - "epoch": 2.5317073170731708, - "grad_norm": 3.613308906555176, - "learning_rate": 4.252804424294378e-06, - "loss": 0.4581, - "step": 519 - }, - { - "epoch": 2.5365853658536586, - "grad_norm": 3.1191842555999756, - "learning_rate": 4.25007055478311e-06, - "loss": 0.5403, - "step": 520 - }, - { - "epoch": 2.5414634146341464, - "grad_norm": 3.653355836868286, - "learning_rate": 4.247332575218144e-06, - "loss": 0.3658, - "step": 521 - }, - { - "epoch": 2.546341463414634, - "grad_norm": 3.1386306285858154, - "learning_rate": 4.244590492029643e-06, - "loss": 0.6342, - "step": 522 - }, - { - "epoch": 2.551219512195122, - "grad_norm": 3.0894742012023926, - "learning_rate": 4.241844311657411e-06, - "loss": 0.3411, - "step": 523 - }, - { - "epoch": 2.55609756097561, - "grad_norm": 3.205916404724121, - "learning_rate": 4.239094040550875e-06, - "loss": 0.2829, - "step": 524 - }, - { - "epoch": 2.5609756097560976, - "grad_norm": 2.378857374191284, - "learning_rate": 4.236339685169065e-06, - "loss": 0.4749, - "step": 525 - }, - { - "epoch": 2.5658536585365854, - "grad_norm": 3.8657875061035156, - "learning_rate": 4.233581251980604e-06, - "loss": 0.2485, - "step": 526 - }, - { - "epoch": 2.5707317073170732, - "grad_norm": 3.565807580947876, - "learning_rate": 4.230818747463696e-06, - "loss": 0.4488, - "step": 527 - }, - { - "epoch": 2.575609756097561, - "grad_norm": 2.6909685134887695, - "learning_rate": 4.228052178106101e-06, - "loss": 0.4495, - "step": 528 - }, - { - "epoch": 2.580487804878049, - "grad_norm": 2.937680244445801, - "learning_rate": 4.2252815504051285e-06, - "loss": 0.2396, - "step": 529 - }, - { - "epoch": 2.5853658536585367, - "grad_norm": 5.55731201171875, - "learning_rate": 4.222506870867618e-06, - "loss": 0.6784, - "step": 530 - }, - { - "epoch": 2.5902439024390245, - "grad_norm": 2.7388782501220703, - "learning_rate": 4.2197281460099245e-06, - "loss": 0.5543, - "step": 531 - }, - { - "epoch": 2.5951219512195123, - "grad_norm": 3.311134099960327, - "learning_rate": 4.216945382357905e-06, - "loss": 0.5281, - "step": 532 - }, - { - "epoch": 2.6, - "grad_norm": 3.511232376098633, - "learning_rate": 4.214158586446901e-06, - "loss": 0.8019, - "step": 533 - }, - { - "epoch": 2.604878048780488, - "grad_norm": 4.416641712188721, - "learning_rate": 4.211367764821722e-06, - "loss": 0.7769, - "step": 534 - }, - { - "epoch": 2.6097560975609757, - "grad_norm": 2.9849908351898193, - "learning_rate": 4.208572924036634e-06, - "loss": 0.4077, - "step": 535 - }, - { - "epoch": 2.6146341463414635, - "grad_norm": 2.8512160778045654, - "learning_rate": 4.2057740706553415e-06, - "loss": 0.433, - "step": 536 - }, - { - "epoch": 2.6195121951219513, - "grad_norm": 2.6729629039764404, - "learning_rate": 4.202971211250971e-06, - "loss": 0.5957, - "step": 537 - }, - { - "epoch": 2.624390243902439, - "grad_norm": 2.4570281505584717, - "learning_rate": 4.200164352406061e-06, - "loss": 0.3013, - "step": 538 - }, - { - "epoch": 2.629268292682927, - "grad_norm": 3.3771679401397705, - "learning_rate": 4.197353500712539e-06, - "loss": 0.5646, - "step": 539 - }, - { - "epoch": 2.6341463414634148, - "grad_norm": 3.163496494293213, - "learning_rate": 4.1945386627717115e-06, - "loss": 0.4529, - "step": 540 - }, - { - "epoch": 2.6390243902439026, - "grad_norm": 8.32056713104248, - "learning_rate": 4.191719845194246e-06, - "loss": 0.6076, - "step": 541 - }, - { - "epoch": 2.6439024390243904, - "grad_norm": 2.7657363414764404, - "learning_rate": 4.188897054600156e-06, - "loss": 0.4855, - "step": 542 - }, - { - "epoch": 2.648780487804878, - "grad_norm": 3.299283504486084, - "learning_rate": 4.186070297618787e-06, - "loss": 0.5836, - "step": 543 - }, - { - "epoch": 2.653658536585366, - "grad_norm": 2.3928205966949463, - "learning_rate": 4.183239580888799e-06, - "loss": 0.6266, - "step": 544 - }, - { - "epoch": 2.658536585365854, - "grad_norm": 3.395251750946045, - "learning_rate": 4.18040491105815e-06, - "loss": 0.429, - "step": 545 - }, - { - "epoch": 2.6634146341463416, - "grad_norm": 2.690936803817749, - "learning_rate": 4.177566294784085e-06, - "loss": 0.391, - "step": 546 - }, - { - "epoch": 2.6682926829268294, - "grad_norm": 3.7687628269195557, - "learning_rate": 4.174723738733114e-06, - "loss": 0.6548, - "step": 547 - }, - { - "epoch": 2.6731707317073172, - "grad_norm": 2.7884976863861084, - "learning_rate": 4.171877249581001e-06, - "loss": 0.5188, - "step": 548 - }, - { - "epoch": 2.678048780487805, - "grad_norm": 3.0811641216278076, - "learning_rate": 4.169026834012748e-06, - "loss": 0.3494, - "step": 549 - }, - { - "epoch": 2.682926829268293, - "grad_norm": 3.090078592300415, - "learning_rate": 4.166172498722577e-06, - "loss": 0.3621, - "step": 550 - }, - { - "epoch": 2.68780487804878, - "grad_norm": 3.925424098968506, - "learning_rate": 4.163314250413913e-06, - "loss": 0.7187, - "step": 551 - }, - { - "epoch": 2.692682926829268, - "grad_norm": 3.3590312004089355, - "learning_rate": 4.160452095799378e-06, - "loss": 0.428, - "step": 552 - }, - { - "epoch": 2.697560975609756, - "grad_norm": 3.08093523979187, - "learning_rate": 4.157586041600759e-06, - "loss": 0.202, - "step": 553 - }, - { - "epoch": 2.7024390243902436, - "grad_norm": 2.9391448497772217, - "learning_rate": 4.154716094549008e-06, - "loss": 0.5238, - "step": 554 - }, - { - "epoch": 2.7073170731707314, - "grad_norm": 2.9869461059570312, - "learning_rate": 4.151842261384217e-06, - "loss": 0.3073, - "step": 555 - }, - { - "epoch": 2.7121951219512193, - "grad_norm": 3.8973608016967773, - "learning_rate": 4.148964548855603e-06, - "loss": 0.8435, - "step": 556 - }, - { - "epoch": 2.717073170731707, - "grad_norm": 2.3596479892730713, - "learning_rate": 4.146082963721496e-06, - "loss": 0.2562, - "step": 557 - }, - { - "epoch": 2.721951219512195, - "grad_norm": 3.4964873790740967, - "learning_rate": 4.143197512749322e-06, - "loss": 1.0144, - "step": 558 - }, - { - "epoch": 2.7268292682926827, - "grad_norm": 2.8925280570983887, - "learning_rate": 4.140308202715581e-06, - "loss": 0.7581, - "step": 559 - }, - { - "epoch": 2.7317073170731705, - "grad_norm": 2.622724771499634, - "learning_rate": 4.13741504040584e-06, - "loss": 0.3114, - "step": 560 - }, - { - "epoch": 2.7365853658536583, - "grad_norm": 3.775834321975708, - "learning_rate": 4.134518032614713e-06, - "loss": 0.4384, - "step": 561 - }, - { - "epoch": 2.741463414634146, - "grad_norm": 2.691236972808838, - "learning_rate": 4.1316171861458445e-06, - "loss": 0.3141, - "step": 562 - }, - { - "epoch": 2.746341463414634, - "grad_norm": 3.059152841567993, - "learning_rate": 4.128712507811893e-06, - "loss": 0.5777, - "step": 563 - }, - { - "epoch": 2.7512195121951217, - "grad_norm": 2.867432117462158, - "learning_rate": 4.125804004434517e-06, - "loss": 0.5542, - "step": 564 - }, - { - "epoch": 2.7560975609756095, - "grad_norm": 2.796438694000244, - "learning_rate": 4.12289168284436e-06, - "loss": 0.3442, - "step": 565 - }, - { - "epoch": 2.7609756097560973, - "grad_norm": 3.052199125289917, - "learning_rate": 4.119975549881029e-06, - "loss": 0.4754, - "step": 566 - }, - { - "epoch": 2.765853658536585, - "grad_norm": 2.5463602542877197, - "learning_rate": 4.1170556123930846e-06, - "loss": 0.2988, - "step": 567 - }, - { - "epoch": 2.770731707317073, - "grad_norm": 3.003124475479126, - "learning_rate": 4.114131877238021e-06, - "loss": 0.4642, - "step": 568 - }, - { - "epoch": 2.7756097560975608, - "grad_norm": 2.4988298416137695, - "learning_rate": 4.111204351282254e-06, - "loss": 0.3493, - "step": 569 - }, - { - "epoch": 2.7804878048780486, - "grad_norm": 2.7403693199157715, - "learning_rate": 4.108273041401098e-06, - "loss": 0.4007, - "step": 570 - }, - { - "epoch": 2.7853658536585364, - "grad_norm": 4.101940155029297, - "learning_rate": 4.105337954478756e-06, - "loss": 0.7815, - "step": 571 - }, - { - "epoch": 2.790243902439024, - "grad_norm": 3.229969024658203, - "learning_rate": 4.102399097408304e-06, - "loss": 0.6099, - "step": 572 - }, - { - "epoch": 2.795121951219512, - "grad_norm": 3.234693765640259, - "learning_rate": 4.099456477091667e-06, - "loss": 0.2478, - "step": 573 - }, - { - "epoch": 2.8, - "grad_norm": 2.9824702739715576, - "learning_rate": 4.096510100439611e-06, - "loss": 0.6403, - "step": 574 - }, - { - "epoch": 2.8048780487804876, - "grad_norm": 2.8012478351593018, - "learning_rate": 4.093559974371725e-06, - "loss": 0.2509, - "step": 575 - }, - { - "epoch": 2.8097560975609754, - "grad_norm": 2.915400743484497, - "learning_rate": 4.0906061058164e-06, - "loss": 0.7552, - "step": 576 - }, - { - "epoch": 2.8146341463414632, - "grad_norm": 3.467665672302246, - "learning_rate": 4.087648501710819e-06, - "loss": 0.3146, - "step": 577 - }, - { - "epoch": 2.819512195121951, - "grad_norm": 3.1628401279449463, - "learning_rate": 4.084687169000938e-06, - "loss": 0.507, - "step": 578 - }, - { - "epoch": 2.824390243902439, - "grad_norm": 2.4069066047668457, - "learning_rate": 4.081722114641469e-06, - "loss": 0.4116, - "step": 579 - }, - { - "epoch": 2.8292682926829267, - "grad_norm": 3.698174238204956, - "learning_rate": 4.0787533455958626e-06, - "loss": 0.2264, - "step": 580 - }, - { - "epoch": 2.8341463414634145, - "grad_norm": 3.0896191596984863, - "learning_rate": 4.075780868836296e-06, - "loss": 0.3197, - "step": 581 - }, - { - "epoch": 2.8390243902439023, - "grad_norm": 3.098562240600586, - "learning_rate": 4.072804691343653e-06, - "loss": 0.4045, - "step": 582 - }, - { - "epoch": 2.84390243902439, - "grad_norm": 3.9232118129730225, - "learning_rate": 4.069824820107507e-06, - "loss": 0.9564, - "step": 583 - }, - { - "epoch": 2.848780487804878, - "grad_norm": 2.7176268100738525, - "learning_rate": 4.06684126212611e-06, - "loss": 0.2703, - "step": 584 - }, - { - "epoch": 2.8536585365853657, - "grad_norm": 2.4905827045440674, - "learning_rate": 4.063854024406369e-06, - "loss": 0.4828, - "step": 585 - }, - { - "epoch": 2.8585365853658535, - "grad_norm": 2.848784923553467, - "learning_rate": 4.060863113963835e-06, - "loss": 0.4131, - "step": 586 - }, - { - "epoch": 2.8634146341463413, - "grad_norm": 2.599665403366089, - "learning_rate": 4.057868537822683e-06, - "loss": 0.4464, - "step": 587 - }, - { - "epoch": 2.868292682926829, - "grad_norm": 3.1770827770233154, - "learning_rate": 4.054870303015695e-06, - "loss": 0.2825, - "step": 588 - }, - { - "epoch": 2.873170731707317, - "grad_norm": 3.18332839012146, - "learning_rate": 4.05186841658425e-06, - "loss": 0.4438, - "step": 589 - }, - { - "epoch": 2.8780487804878048, - "grad_norm": 2.7485718727111816, - "learning_rate": 4.048862885578301e-06, - "loss": 0.4817, - "step": 590 - }, - { - "epoch": 2.8829268292682926, - "grad_norm": 2.9712934494018555, - "learning_rate": 4.045853717056358e-06, - "loss": 0.5157, - "step": 591 - }, - { - "epoch": 2.8878048780487804, - "grad_norm": 2.246858835220337, - "learning_rate": 4.0428409180854775e-06, - "loss": 0.4029, - "step": 592 - }, - { - "epoch": 2.892682926829268, - "grad_norm": 2.683434247970581, - "learning_rate": 4.039824495741238e-06, - "loss": 0.3796, - "step": 593 - }, - { - "epoch": 2.897560975609756, - "grad_norm": 2.6297569274902344, - "learning_rate": 4.036804457107733e-06, - "loss": 0.4467, - "step": 594 - }, - { - "epoch": 2.902439024390244, - "grad_norm": 5.318776607513428, - "learning_rate": 4.0337808092775435e-06, - "loss": 0.7007, - "step": 595 - }, - { - "epoch": 2.9073170731707316, - "grad_norm": 3.069889783859253, - "learning_rate": 4.030753559351728e-06, - "loss": 0.3219, - "step": 596 - }, - { - "epoch": 2.9121951219512194, - "grad_norm": 1.9730123281478882, - "learning_rate": 4.027722714439808e-06, - "loss": 0.3038, - "step": 597 - }, - { - "epoch": 2.9170731707317072, - "grad_norm": 3.7959916591644287, - "learning_rate": 4.024688281659743e-06, - "loss": 0.7768, - "step": 598 - }, - { - "epoch": 2.921951219512195, - "grad_norm": 3.900886297225952, - "learning_rate": 4.021650268137924e-06, - "loss": 0.4667, - "step": 599 - }, - { - "epoch": 2.926829268292683, - "grad_norm": 2.6155691146850586, - "learning_rate": 4.018608681009143e-06, - "loss": 0.3852, - "step": 600 - }, - { - "epoch": 2.9317073170731707, - "grad_norm": 3.2715704441070557, - "learning_rate": 4.015563527416596e-06, - "loss": 0.4804, - "step": 601 - }, - { - "epoch": 2.9365853658536585, - "grad_norm": 3.001425266265869, - "learning_rate": 4.012514814511844e-06, - "loss": 0.4152, - "step": 602 - }, - { - "epoch": 2.9414634146341463, - "grad_norm": 2.685360908508301, - "learning_rate": 4.009462549454816e-06, - "loss": 0.5029, - "step": 603 - }, - { - "epoch": 2.946341463414634, - "grad_norm": 3.4670183658599854, - "learning_rate": 4.006406739413775e-06, - "loss": 0.4857, - "step": 604 - }, - { - "epoch": 2.951219512195122, - "grad_norm": 3.0613298416137695, - "learning_rate": 4.003347391565317e-06, - "loss": 0.4449, - "step": 605 - }, - { - "epoch": 2.9560975609756097, - "grad_norm": 3.207186698913574, - "learning_rate": 4.000284513094342e-06, - "loss": 0.4808, - "step": 606 - }, - { - "epoch": 2.9609756097560975, - "grad_norm": 2.910578727722168, - "learning_rate": 3.997218111194042e-06, - "loss": 0.4395, - "step": 607 - }, - { - "epoch": 2.9658536585365853, - "grad_norm": 2.581918954849243, - "learning_rate": 3.994148193065886e-06, - "loss": 0.3264, - "step": 608 - }, - { - "epoch": 2.970731707317073, - "grad_norm": 2.6517748832702637, - "learning_rate": 3.991074765919598e-06, - "loss": 0.3285, - "step": 609 - }, - { - "epoch": 2.975609756097561, - "grad_norm": 3.509756088256836, - "learning_rate": 3.987997836973147e-06, - "loss": 0.3638, - "step": 610 - }, - { - "epoch": 2.9804878048780488, - "grad_norm": 2.7382352352142334, - "learning_rate": 3.984917413452721e-06, - "loss": 0.3853, - "step": 611 - }, - { - "epoch": 2.9853658536585366, - "grad_norm": 3.998974323272705, - "learning_rate": 3.981833502592717e-06, - "loss": 0.6411, - "step": 612 - }, - { - "epoch": 2.9902439024390244, - "grad_norm": 3.305126428604126, - "learning_rate": 3.978746111635725e-06, - "loss": 0.2759, - "step": 613 - }, - { - "epoch": 2.995121951219512, - "grad_norm": 3.137300968170166, - "learning_rate": 3.9756552478325045e-06, - "loss": 0.4566, - "step": 614 - }, - { - "epoch": 3.0, - "grad_norm": 2.617291212081909, - "learning_rate": 3.972560918441972e-06, - "loss": 0.2221, - "step": 615 - }, - { - "epoch": 3.004878048780488, - "grad_norm": 2.787429094314575, - "learning_rate": 3.969463130731183e-06, - "loss": 0.2403, - "step": 616 - }, - { - "epoch": 3.0097560975609756, - "grad_norm": 3.0412075519561768, - "learning_rate": 3.966361891975316e-06, - "loss": 0.2635, - "step": 617 - }, - { - "epoch": 3.0146341463414634, - "grad_norm": 2.9949851036071777, - "learning_rate": 3.963257209457652e-06, - "loss": 0.3294, - "step": 618 - }, - { - "epoch": 3.0195121951219512, - "grad_norm": 3.0510809421539307, - "learning_rate": 3.960149090469561e-06, - "loss": 0.1338, - "step": 619 - }, - { - "epoch": 3.024390243902439, - "grad_norm": 3.669482707977295, - "learning_rate": 3.957037542310484e-06, - "loss": 0.1469, - "step": 620 - }, - { - "epoch": 3.029268292682927, - "grad_norm": 4.677116870880127, - "learning_rate": 3.953922572287915e-06, - "loss": 0.2788, - "step": 621 - }, - { - "epoch": 3.0341463414634147, - "grad_norm": 4.33144474029541, - "learning_rate": 3.950804187717384e-06, - "loss": 0.4521, - "step": 622 - }, - { - "epoch": 3.0390243902439025, - "grad_norm": 3.466639757156372, - "learning_rate": 3.947682395922439e-06, - "loss": 0.5113, - "step": 623 - }, - { - "epoch": 3.0439024390243903, - "grad_norm": 3.2332122325897217, - "learning_rate": 3.9445572042346346e-06, - "loss": 0.0968, - "step": 624 - }, - { - "epoch": 3.048780487804878, - "grad_norm": 2.6108055114746094, - "learning_rate": 3.941428619993505e-06, - "loss": 0.2462, - "step": 625 - }, - { - "epoch": 3.053658536585366, - "grad_norm": 3.2512595653533936, - "learning_rate": 3.938296650546552e-06, - "loss": 0.1782, - "step": 626 - }, - { - "epoch": 3.0585365853658537, - "grad_norm": 3.4350366592407227, - "learning_rate": 3.935161303249231e-06, - "loss": 0.2955, - "step": 627 - }, - { - "epoch": 3.0634146341463415, - "grad_norm": 3.42012619972229, - "learning_rate": 3.932022585464928e-06, - "loss": 0.3259, - "step": 628 - }, - { - "epoch": 3.0682926829268293, - "grad_norm": 3.458043336868286, - "learning_rate": 3.928880504564943e-06, - "loss": 0.2306, - "step": 629 - }, - { - "epoch": 3.073170731707317, - "grad_norm": 2.646616220474243, - "learning_rate": 3.92573506792848e-06, - "loss": 0.2197, - "step": 630 - }, - { - "epoch": 3.078048780487805, - "grad_norm": 3.5558857917785645, - "learning_rate": 3.9225862829426184e-06, - "loss": 0.1607, - "step": 631 - }, - { - "epoch": 3.0829268292682928, - "grad_norm": 3.6011338233947754, - "learning_rate": 3.919434157002303e-06, - "loss": 0.3087, - "step": 632 - }, - { - "epoch": 3.0878048780487806, - "grad_norm": 2.339879035949707, - "learning_rate": 3.916278697510325e-06, - "loss": 0.2213, - "step": 633 - }, - { - "epoch": 3.0926829268292684, - "grad_norm": 3.268162488937378, - "learning_rate": 3.913119911877305e-06, - "loss": 0.318, - "step": 634 - }, - { - "epoch": 3.097560975609756, - "grad_norm": 4.062571048736572, - "learning_rate": 3.909957807521674e-06, - "loss": 0.1757, - "step": 635 - }, - { - "epoch": 3.102439024390244, - "grad_norm": 2.997659683227539, - "learning_rate": 3.906792391869657e-06, - "loss": 0.2391, - "step": 636 - }, - { - "epoch": 3.107317073170732, - "grad_norm": 3.7037394046783447, - "learning_rate": 3.903623672355258e-06, - "loss": 0.2548, - "step": 637 - }, - { - "epoch": 3.1121951219512196, - "grad_norm": 3.110579252243042, - "learning_rate": 3.900451656420237e-06, - "loss": 0.2389, - "step": 638 - }, - { - "epoch": 3.1170731707317074, - "grad_norm": 3.3332321643829346, - "learning_rate": 3.897276351514097e-06, - "loss": 0.1371, - "step": 639 - }, - { - "epoch": 3.1219512195121952, - "grad_norm": 3.8275935649871826, - "learning_rate": 3.894097765094065e-06, - "loss": 0.3363, - "step": 640 - }, - { - "epoch": 3.126829268292683, - "grad_norm": 2.3731374740600586, - "learning_rate": 3.890915904625075e-06, - "loss": 0.1314, - "step": 641 - }, - { - "epoch": 3.131707317073171, - "grad_norm": 3.1511282920837402, - "learning_rate": 3.887730777579751e-06, - "loss": 0.3563, - "step": 642 - }, - { - "epoch": 3.1365853658536587, - "grad_norm": 4.2254862785339355, - "learning_rate": 3.884542391438387e-06, - "loss": 0.5053, - "step": 643 - }, - { - "epoch": 3.1414634146341465, - "grad_norm": 4.579670429229736, - "learning_rate": 3.88135075368893e-06, - "loss": 0.6259, - "step": 644 - }, - { - "epoch": 3.1463414634146343, - "grad_norm": 3.2102746963500977, - "learning_rate": 3.878155871826968e-06, - "loss": 0.2599, - "step": 645 - }, - { - "epoch": 3.151219512195122, - "grad_norm": 2.5569686889648438, - "learning_rate": 3.874957753355701e-06, - "loss": 0.2075, - "step": 646 - }, - { - "epoch": 3.15609756097561, - "grad_norm": 3.588925838470459, - "learning_rate": 3.8717564057859365e-06, - "loss": 0.4577, - "step": 647 - }, - { - "epoch": 3.1609756097560977, - "grad_norm": 3.6163878440856934, - "learning_rate": 3.868551836636063e-06, - "loss": 0.4023, - "step": 648 - }, - { - "epoch": 3.1658536585365855, - "grad_norm": 3.8688390254974365, - "learning_rate": 3.865344053432035e-06, - "loss": 0.1669, - "step": 649 - }, - { - "epoch": 3.1707317073170733, - "grad_norm": 3.419734001159668, - "learning_rate": 3.862133063707353e-06, - "loss": 0.2766, - "step": 650 - }, - { - "epoch": 3.175609756097561, - "grad_norm": 2.9860243797302246, - "learning_rate": 3.858918875003053e-06, - "loss": 0.1788, - "step": 651 - }, - { - "epoch": 3.180487804878049, - "grad_norm": 3.0619022846221924, - "learning_rate": 3.855701494867679e-06, - "loss": 0.224, - "step": 652 - }, - { - "epoch": 3.1853658536585368, - "grad_norm": 3.3668978214263916, - "learning_rate": 3.852480930857275e-06, - "loss": 0.4029, - "step": 653 - }, - { - "epoch": 3.1902439024390246, - "grad_norm": 3.543147563934326, - "learning_rate": 3.849257190535356e-06, - "loss": 0.2096, - "step": 654 - }, - { - "epoch": 3.1951219512195124, - "grad_norm": 3.793619155883789, - "learning_rate": 3.846030281472902e-06, - "loss": 0.5574, - "step": 655 - }, - { - "epoch": 3.2, - "grad_norm": 3.021289110183716, - "learning_rate": 3.842800211248333e-06, - "loss": 0.2233, - "step": 656 - }, - { - "epoch": 3.204878048780488, - "grad_norm": 4.582934856414795, - "learning_rate": 3.839566987447492e-06, - "loss": 0.3871, - "step": 657 - }, - { - "epoch": 3.209756097560976, - "grad_norm": 2.996340274810791, - "learning_rate": 3.8363306176636296e-06, - "loss": 0.4325, - "step": 658 - }, - { - "epoch": 3.2146341463414636, - "grad_norm": 3.3190877437591553, - "learning_rate": 3.833091109497384e-06, - "loss": 0.5321, - "step": 659 - }, - { - "epoch": 3.2195121951219514, - "grad_norm": 3.2532856464385986, - "learning_rate": 3.829848470556765e-06, - "loss": 0.1359, - "step": 660 - }, - { - "epoch": 3.2243902439024392, - "grad_norm": 2.7875044345855713, - "learning_rate": 3.8266027084571335e-06, - "loss": 0.3145, - "step": 661 - }, - { - "epoch": 3.229268292682927, - "grad_norm": 3.748253583908081, - "learning_rate": 3.823353830821187e-06, - "loss": 0.1252, - "step": 662 - }, - { - "epoch": 3.234146341463415, - "grad_norm": 2.858293294906616, - "learning_rate": 3.820101845278937e-06, - "loss": 0.2589, - "step": 663 - }, - { - "epoch": 3.2390243902439027, - "grad_norm": 3.7470967769622803, - "learning_rate": 3.816846759467696e-06, - "loss": 0.2594, - "step": 664 - }, - { - "epoch": 3.2439024390243905, - "grad_norm": 3.676196813583374, - "learning_rate": 3.8135885810320587e-06, - "loss": 0.2998, - "step": 665 - }, - { - "epoch": 3.2487804878048783, - "grad_norm": 3.0943140983581543, - "learning_rate": 3.810327317623881e-06, - "loss": 0.2238, - "step": 666 - }, - { - "epoch": 3.253658536585366, - "grad_norm": 3.5907349586486816, - "learning_rate": 3.8070629769022628e-06, - "loss": 0.3381, - "step": 667 - }, - { - "epoch": 3.258536585365854, - "grad_norm": 3.1195285320281982, - "learning_rate": 3.8037955665335335e-06, - "loss": 0.2407, - "step": 668 - }, - { - "epoch": 3.2634146341463417, - "grad_norm": 3.422292947769165, - "learning_rate": 3.800525094191231e-06, - "loss": 0.2957, - "step": 669 - }, - { - "epoch": 3.2682926829268295, - "grad_norm": 2.5264663696289062, - "learning_rate": 3.797251567556083e-06, - "loss": 0.2493, - "step": 670 - }, - { - "epoch": 3.2731707317073173, - "grad_norm": 3.350219964981079, - "learning_rate": 3.793974994315991e-06, - "loss": 0.1186, - "step": 671 - }, - { - "epoch": 3.278048780487805, - "grad_norm": 4.175906181335449, - "learning_rate": 3.790695382166013e-06, - "loss": 0.3453, - "step": 672 - }, - { - "epoch": 3.2829268292682925, - "grad_norm": 3.006072521209717, - "learning_rate": 3.7874127388083415e-06, - "loss": 0.1981, - "step": 673 - }, - { - "epoch": 3.2878048780487803, - "grad_norm": 3.368561029434204, - "learning_rate": 3.7841270719522895e-06, - "loss": 0.2934, - "step": 674 - }, - { - "epoch": 3.292682926829268, - "grad_norm": 4.374331951141357, - "learning_rate": 3.7808383893142692e-06, - "loss": 0.1359, - "step": 675 - }, - { - "epoch": 3.297560975609756, - "grad_norm": 3.297102451324463, - "learning_rate": 3.7775466986177763e-06, - "loss": 0.2498, - "step": 676 - }, - { - "epoch": 3.3024390243902437, - "grad_norm": 2.8914761543273926, - "learning_rate": 3.774252007593371e-06, - "loss": 0.1308, - "step": 677 - }, - { - "epoch": 3.3073170731707315, - "grad_norm": 3.1550722122192383, - "learning_rate": 3.7709543239786593e-06, - "loss": 0.3915, - "step": 678 - }, - { - "epoch": 3.3121951219512193, - "grad_norm": 3.2302658557891846, - "learning_rate": 3.767653655518277e-06, - "loss": 0.2558, - "step": 679 - }, - { - "epoch": 3.317073170731707, - "grad_norm": 4.4321770668029785, - "learning_rate": 3.7643500099638673e-06, - "loss": 0.1988, - "step": 680 - }, - { - "epoch": 3.321951219512195, - "grad_norm": 2.970566749572754, - "learning_rate": 3.7610433950740667e-06, - "loss": 0.4908, - "step": 681 - }, - { - "epoch": 3.3268292682926828, - "grad_norm": 3.5516228675842285, - "learning_rate": 3.757733818614485e-06, - "loss": 0.304, - "step": 682 - }, - { - "epoch": 3.3317073170731706, - "grad_norm": 2.7555387020111084, - "learning_rate": 3.7544212883576856e-06, - "loss": 0.2533, - "step": 683 - }, - { - "epoch": 3.3365853658536584, - "grad_norm": 3.61226749420166, - "learning_rate": 3.751105812083172e-06, - "loss": 0.1771, - "step": 684 - }, - { - "epoch": 3.341463414634146, - "grad_norm": 3.0466206073760986, - "learning_rate": 3.7477873975773655e-06, - "loss": 0.4213, - "step": 685 - }, - { - "epoch": 3.346341463414634, - "grad_norm": 3.6091527938842773, - "learning_rate": 3.7444660526335853e-06, - "loss": 0.3808, - "step": 686 - }, - { - "epoch": 3.351219512195122, - "grad_norm": 3.8443002700805664, - "learning_rate": 3.741141785052036e-06, - "loss": 0.6438, - "step": 687 - }, - { - "epoch": 3.3560975609756096, - "grad_norm": 3.845909833908081, - "learning_rate": 3.737814602639784e-06, - "loss": 0.3686, - "step": 688 - }, - { - "epoch": 3.3609756097560974, - "grad_norm": 2.904892921447754, - "learning_rate": 3.7344845132107427e-06, - "loss": 0.2934, - "step": 689 - }, - { - "epoch": 3.3658536585365852, - "grad_norm": 3.4766387939453125, - "learning_rate": 3.731151524585651e-06, - "loss": 0.3299, - "step": 690 - }, - { - "epoch": 3.370731707317073, - "grad_norm": 4.236767768859863, - "learning_rate": 3.7278156445920584e-06, - "loss": 0.6303, - "step": 691 - }, - { - "epoch": 3.375609756097561, - "grad_norm": 3.1122591495513916, - "learning_rate": 3.724476881064303e-06, - "loss": 0.2432, - "step": 692 - }, - { - "epoch": 3.3804878048780487, - "grad_norm": 3.0971457958221436, - "learning_rate": 3.721135241843496e-06, - "loss": 0.3131, - "step": 693 - }, - { - "epoch": 3.3853658536585365, - "grad_norm": 3.9365804195404053, - "learning_rate": 3.7177907347775016e-06, - "loss": 0.3372, - "step": 694 - }, - { - "epoch": 3.3902439024390243, - "grad_norm": 3.760373115539551, - "learning_rate": 3.71444336772092e-06, - "loss": 0.5055, - "step": 695 - }, - { - "epoch": 3.395121951219512, - "grad_norm": 4.360848426818848, - "learning_rate": 3.711093148535068e-06, - "loss": 0.6183, - "step": 696 - }, - { - "epoch": 3.4, - "grad_norm": 3.7713537216186523, - "learning_rate": 3.707740085087959e-06, - "loss": 0.1568, - "step": 697 - }, - { - "epoch": 3.4048780487804877, - "grad_norm": 3.8532230854034424, - "learning_rate": 3.7043841852542884e-06, - "loss": 0.2826, - "step": 698 - }, - { - "epoch": 3.4097560975609755, - "grad_norm": 3.0548605918884277, - "learning_rate": 3.701025456915411e-06, - "loss": 0.1918, - "step": 699 - }, - { - "epoch": 3.4146341463414633, - "grad_norm": 3.2431821823120117, - "learning_rate": 3.697663907959327e-06, - "loss": 0.2493, - "step": 700 - }, - { - "epoch": 3.419512195121951, - "grad_norm": 3.7301864624023438, - "learning_rate": 3.6942995462806574e-06, - "loss": 0.4913, - "step": 701 - }, - { - "epoch": 3.424390243902439, - "grad_norm": 2.5468900203704834, - "learning_rate": 3.6909323797806314e-06, - "loss": 0.1788, - "step": 702 - }, - { - "epoch": 3.4292682926829268, - "grad_norm": 3.3719515800476074, - "learning_rate": 3.6875624163670635e-06, - "loss": 0.4162, - "step": 703 - }, - { - "epoch": 3.4341463414634146, - "grad_norm": 3.528010368347168, - "learning_rate": 3.6841896639543394e-06, - "loss": 0.1924, - "step": 704 - }, - { - "epoch": 3.4390243902439024, - "grad_norm": 3.3636631965637207, - "learning_rate": 3.6808141304633924e-06, - "loss": 0.3177, - "step": 705 - }, - { - "epoch": 3.44390243902439, - "grad_norm": 3.418705463409424, - "learning_rate": 3.6774358238216878e-06, - "loss": 0.2301, - "step": 706 - }, - { - "epoch": 3.448780487804878, - "grad_norm": 4.720373630523682, - "learning_rate": 3.6740547519632048e-06, - "loss": 0.1894, - "step": 707 - }, - { - "epoch": 3.453658536585366, - "grad_norm": 2.9635703563690186, - "learning_rate": 3.670670922828414e-06, - "loss": 0.2642, - "step": 708 - }, - { - "epoch": 3.4585365853658536, - "grad_norm": 4.934754371643066, - "learning_rate": 3.667284344364264e-06, - "loss": 0.2275, - "step": 709 - }, - { - "epoch": 3.4634146341463414, - "grad_norm": 3.090585231781006, - "learning_rate": 3.6638950245241604e-06, - "loss": 0.4447, - "step": 710 - }, - { - "epoch": 3.4682926829268292, - "grad_norm": 4.360495090484619, - "learning_rate": 3.660502971267945e-06, - "loss": 0.2415, - "step": 711 - }, - { - "epoch": 3.473170731707317, - "grad_norm": 3.4893476963043213, - "learning_rate": 3.65710819256188e-06, - "loss": 0.0921, - "step": 712 - }, - { - "epoch": 3.478048780487805, - "grad_norm": 3.2423770427703857, - "learning_rate": 3.65371069637863e-06, - "loss": 0.2371, - "step": 713 - }, - { - "epoch": 3.4829268292682927, - "grad_norm": 3.0775890350341797, - "learning_rate": 3.650310490697238e-06, - "loss": 0.4026, - "step": 714 - }, - { - "epoch": 3.4878048780487805, - "grad_norm": 3.906625270843506, - "learning_rate": 3.646907583503114e-06, - "loss": 0.4312, - "step": 715 - }, - { - "epoch": 3.4926829268292683, - "grad_norm": 3.2140414714813232, - "learning_rate": 3.6435019827880093e-06, - "loss": 0.2309, - "step": 716 - }, - { - "epoch": 3.497560975609756, - "grad_norm": 3.048523426055908, - "learning_rate": 3.640093696550003e-06, - "loss": 0.296, - "step": 717 - }, - { - "epoch": 3.502439024390244, - "grad_norm": 2.9669039249420166, - "learning_rate": 3.6366827327934817e-06, - "loss": 0.2723, - "step": 718 - }, - { - "epoch": 3.5073170731707317, - "grad_norm": 3.6941726207733154, - "learning_rate": 3.6332690995291176e-06, - "loss": 0.3797, - "step": 719 - }, - { - "epoch": 3.5121951219512195, - "grad_norm": 5.135766506195068, - "learning_rate": 3.6298528047738545e-06, - "loss": 0.9868, - "step": 720 - }, - { - "epoch": 3.5170731707317073, - "grad_norm": 3.2021052837371826, - "learning_rate": 3.626433856550886e-06, - "loss": 0.4069, - "step": 721 - }, - { - "epoch": 3.521951219512195, - "grad_norm": 3.094444513320923, - "learning_rate": 3.623012262889637e-06, - "loss": 0.3368, - "step": 722 - }, - { - "epoch": 3.526829268292683, - "grad_norm": 3.609285354614258, - "learning_rate": 3.6195880318257465e-06, - "loss": 0.3972, - "step": 723 - }, - { - "epoch": 3.5317073170731708, - "grad_norm": 4.236501216888428, - "learning_rate": 3.616161171401046e-06, - "loss": 0.52, - "step": 724 - }, - { - "epoch": 3.5365853658536586, - "grad_norm": 3.504526376724243, - "learning_rate": 3.612731689663542e-06, - "loss": 0.23, - "step": 725 - }, - { - "epoch": 3.5414634146341464, - "grad_norm": 3.233591079711914, - "learning_rate": 3.6092995946673996e-06, - "loss": 0.4151, - "step": 726 - }, - { - "epoch": 3.546341463414634, - "grad_norm": 3.6701886653900146, - "learning_rate": 3.605864894472918e-06, - "loss": 0.2798, - "step": 727 - }, - { - "epoch": 3.551219512195122, - "grad_norm": 3.8713181018829346, - "learning_rate": 3.602427597146516e-06, - "loss": 0.4336, - "step": 728 - }, - { - "epoch": 3.55609756097561, - "grad_norm": 5.49612283706665, - "learning_rate": 3.5989877107607134e-06, - "loss": 0.4803, - "step": 729 - }, - { - "epoch": 3.5609756097560976, - "grad_norm": 3.771005392074585, - "learning_rate": 3.5955452433941075e-06, - "loss": 0.3698, - "step": 730 - }, - { - "epoch": 3.5658536585365854, - "grad_norm": 2.970822334289551, - "learning_rate": 3.5921002031313586e-06, - "loss": 0.2373, - "step": 731 - }, - { - "epoch": 3.5707317073170732, - "grad_norm": 3.517249584197998, - "learning_rate": 3.58865259806317e-06, - "loss": 0.1908, - "step": 732 - }, - { - "epoch": 3.575609756097561, - "grad_norm": 3.6825428009033203, - "learning_rate": 3.585202436286267e-06, - "loss": 0.3993, - "step": 733 - }, - { - "epoch": 3.580487804878049, - "grad_norm": 3.387479066848755, - "learning_rate": 3.581749725903381e-06, - "loss": 0.4237, - "step": 734 - }, - { - "epoch": 3.5853658536585367, - "grad_norm": 3.5004806518554688, - "learning_rate": 3.5782944750232274e-06, - "loss": 0.3011, - "step": 735 - }, - { - "epoch": 3.5902439024390245, - "grad_norm": 3.461731433868408, - "learning_rate": 3.574836691760489e-06, - "loss": 0.0896, - "step": 736 - }, - { - "epoch": 3.5951219512195123, - "grad_norm": 3.9598381519317627, - "learning_rate": 3.571376384235795e-06, - "loss": 0.2751, - "step": 737 - }, - { - "epoch": 3.6, - "grad_norm": 4.053933143615723, - "learning_rate": 3.5679135605757035e-06, - "loss": 0.2086, - "step": 738 - }, - { - "epoch": 3.604878048780488, - "grad_norm": 2.9683544635772705, - "learning_rate": 3.564448228912682e-06, - "loss": 0.1659, - "step": 739 - }, - { - "epoch": 3.6097560975609757, - "grad_norm": 3.6598448753356934, - "learning_rate": 3.5609803973850877e-06, - "loss": 0.2469, - "step": 740 - }, - { - "epoch": 3.6146341463414635, - "grad_norm": 3.449335813522339, - "learning_rate": 3.557510074137147e-06, - "loss": 0.375, - "step": 741 - }, - { - "epoch": 3.6195121951219513, - "grad_norm": 2.7666923999786377, - "learning_rate": 3.554037267318942e-06, - "loss": 0.3133, - "step": 742 - }, - { - "epoch": 3.624390243902439, - "grad_norm": 2.8951869010925293, - "learning_rate": 3.5505619850863847e-06, - "loss": 0.2243, - "step": 743 - }, - { - "epoch": 3.629268292682927, - "grad_norm": 3.477747678756714, - "learning_rate": 3.5470842356012007e-06, - "loss": 0.1321, - "step": 744 - }, - { - "epoch": 3.6341463414634148, - "grad_norm": 3.810480833053589, - "learning_rate": 3.5436040270309113e-06, - "loss": 0.361, - "step": 745 - }, - { - "epoch": 3.6390243902439026, - "grad_norm": 3.0730793476104736, - "learning_rate": 3.540121367548811e-06, - "loss": 0.1523, - "step": 746 - }, - { - "epoch": 3.6439024390243904, - "grad_norm": 3.6878390312194824, - "learning_rate": 3.5366362653339524e-06, - "loss": 0.4898, - "step": 747 - }, - { - "epoch": 3.648780487804878, - "grad_norm": 3.6432242393493652, - "learning_rate": 3.533148728571124e-06, - "loss": 0.1397, - "step": 748 - }, - { - "epoch": 3.653658536585366, - "grad_norm": 3.7047760486602783, - "learning_rate": 3.5296587654508317e-06, - "loss": 0.323, - "step": 749 - }, - { - "epoch": 3.658536585365854, - "grad_norm": 3.777132749557495, - "learning_rate": 3.526166384169279e-06, - "loss": 0.5577, - "step": 750 - }, - { - "epoch": 3.6634146341463416, - "grad_norm": 3.7970924377441406, - "learning_rate": 3.5226715929283507e-06, - "loss": 0.245, - "step": 751 - }, - { - "epoch": 3.6682926829268294, - "grad_norm": 2.8203537464141846, - "learning_rate": 3.519174399935588e-06, - "loss": 0.1619, - "step": 752 - }, - { - "epoch": 3.6731707317073172, - "grad_norm": 3.4040987491607666, - "learning_rate": 3.5156748134041767e-06, - "loss": 0.1047, - "step": 753 - }, - { - "epoch": 3.678048780487805, - "grad_norm": 3.927960157394409, - "learning_rate": 3.5121728415529203e-06, - "loss": 0.5713, - "step": 754 - }, - { - "epoch": 3.682926829268293, - "grad_norm": 3.3833277225494385, - "learning_rate": 3.5086684926062266e-06, - "loss": 0.2174, - "step": 755 - }, - { - "epoch": 3.68780487804878, - "grad_norm": 3.989307403564453, - "learning_rate": 3.505161774794085e-06, - "loss": 0.285, - "step": 756 - }, - { - "epoch": 3.692682926829268, - "grad_norm": 2.742429494857788, - "learning_rate": 3.5016526963520474e-06, - "loss": 0.1602, - "step": 757 - }, - { - "epoch": 3.697560975609756, - "grad_norm": 3.7082698345184326, - "learning_rate": 3.498141265521212e-06, - "loss": 0.666, - "step": 758 - }, - { - "epoch": 3.7024390243902436, - "grad_norm": 3.033196210861206, - "learning_rate": 3.4946274905481997e-06, - "loss": 0.2024, - "step": 759 - }, - { - "epoch": 3.7073170731707314, - "grad_norm": 3.7145371437072754, - "learning_rate": 3.4911113796851364e-06, - "loss": 0.2719, - "step": 760 - }, - { - "epoch": 3.7121951219512193, - "grad_norm": 3.580298900604248, - "learning_rate": 3.487592941189636e-06, - "loss": 0.1537, - "step": 761 - }, - { - "epoch": 3.717073170731707, - "grad_norm": 4.753757953643799, - "learning_rate": 3.484072183324776e-06, - "loss": 0.6149, - "step": 762 - }, - { - "epoch": 3.721951219512195, - "grad_norm": 3.5575687885284424, - "learning_rate": 3.4805491143590823e-06, - "loss": 0.4241, - "step": 763 - }, - { - "epoch": 3.7268292682926827, - "grad_norm": 3.215224266052246, - "learning_rate": 3.4770237425665103e-06, - "loss": 0.3037, - "step": 764 - }, - { - "epoch": 3.7317073170731705, - "grad_norm": 2.9899685382843018, - "learning_rate": 3.4734960762264204e-06, - "loss": 0.4854, - "step": 765 - }, - { - "epoch": 3.7365853658536583, - "grad_norm": 3.5880227088928223, - "learning_rate": 3.469966123623563e-06, - "loss": 0.3849, - "step": 766 - }, - { - "epoch": 3.741463414634146, - "grad_norm": 3.472750186920166, - "learning_rate": 3.46643389304806e-06, - "loss": 0.3159, - "step": 767 - }, - { - "epoch": 3.746341463414634, - "grad_norm": 4.355650901794434, - "learning_rate": 3.4628993927953786e-06, - "loss": 0.7527, - "step": 768 - }, - { - "epoch": 3.7512195121951217, - "grad_norm": 2.94575834274292, - "learning_rate": 3.45936263116632e-06, - "loss": 0.1716, - "step": 769 - }, - { - "epoch": 3.7560975609756095, - "grad_norm": 2.991525173187256, - "learning_rate": 3.4558236164669957e-06, - "loss": 0.2061, - "step": 770 - }, - { - "epoch": 3.7609756097560973, - "grad_norm": 3.134000301361084, - "learning_rate": 3.4522823570088073e-06, - "loss": 0.1338, - "step": 771 - }, - { - "epoch": 3.765853658536585, - "grad_norm": 3.722140312194824, - "learning_rate": 3.4487388611084295e-06, - "loss": 0.2615, - "step": 772 - }, - { - "epoch": 3.770731707317073, - "grad_norm": 3.7941153049468994, - "learning_rate": 3.445193137087788e-06, - "loss": 0.1401, - "step": 773 - }, - { - "epoch": 3.7756097560975608, - "grad_norm": 2.872941732406616, - "learning_rate": 3.4416451932740424e-06, - "loss": 0.2934, - "step": 774 - }, - { - "epoch": 3.7804878048780486, - "grad_norm": 4.5019941329956055, - "learning_rate": 3.4380950379995652e-06, - "loss": 0.4579, - "step": 775 - }, - { - "epoch": 3.7853658536585364, - "grad_norm": 2.682884931564331, - "learning_rate": 3.434542679601922e-06, - "loss": 0.2979, - "step": 776 - }, - { - "epoch": 3.790243902439024, - "grad_norm": 3.3044273853302, - "learning_rate": 3.4309881264238538e-06, - "loss": 0.1196, - "step": 777 - }, - { - "epoch": 3.795121951219512, - "grad_norm": 3.102760076522827, - "learning_rate": 3.4274313868132547e-06, - "loss": 0.2026, - "step": 778 - }, - { - "epoch": 3.8, - "grad_norm": 3.3304500579833984, - "learning_rate": 3.4238724691231534e-06, - "loss": 0.2135, - "step": 779 - }, - { - "epoch": 3.8048780487804876, - "grad_norm": 3.295119047164917, - "learning_rate": 3.4203113817116955e-06, - "loss": 0.4418, - "step": 780 - }, - { - "epoch": 3.8097560975609754, - "grad_norm": 3.6655640602111816, - "learning_rate": 3.4167481329421204e-06, - "loss": 0.203, - "step": 781 - }, - { - "epoch": 3.8146341463414632, - "grad_norm": 3.387830972671509, - "learning_rate": 3.4131827311827447e-06, - "loss": 0.3225, - "step": 782 - }, - { - "epoch": 3.819512195121951, - "grad_norm": 2.621633529663086, - "learning_rate": 3.4096151848069416e-06, - "loss": 0.1704, - "step": 783 - }, - { - "epoch": 3.824390243902439, - "grad_norm": 2.974344491958618, - "learning_rate": 3.4060455021931195e-06, - "loss": 0.2785, - "step": 784 - }, - { - "epoch": 3.8292682926829267, - "grad_norm": 3.452131748199463, - "learning_rate": 3.402473691724704e-06, - "loss": 0.223, - "step": 785 - }, - { - "epoch": 3.8341463414634145, - "grad_norm": 2.6373705863952637, - "learning_rate": 3.39889976179012e-06, - "loss": 0.2368, - "step": 786 - }, - { - "epoch": 3.8390243902439023, - "grad_norm": 2.863184928894043, - "learning_rate": 3.3953237207827673e-06, - "loss": 0.3294, - "step": 787 - }, - { - "epoch": 3.84390243902439, - "grad_norm": 5.104704856872559, - "learning_rate": 3.391745577101005e-06, - "loss": 0.5431, - "step": 788 - }, - { - "epoch": 3.848780487804878, - "grad_norm": 3.951310634613037, - "learning_rate": 3.3881653391481306e-06, - "loss": 0.2546, - "step": 789 - }, - { - "epoch": 3.8536585365853657, - "grad_norm": 3.9903225898742676, - "learning_rate": 3.384583015332359e-06, - "loss": 0.3293, - "step": 790 - }, - { - "epoch": 3.8585365853658535, - "grad_norm": 3.3149220943450928, - "learning_rate": 3.380998614066805e-06, - "loss": 0.1861, - "step": 791 - }, - { - "epoch": 3.8634146341463413, - "grad_norm": 3.6755223274230957, - "learning_rate": 3.3774121437694606e-06, - "loss": 0.2498, - "step": 792 - }, - { - "epoch": 3.868292682926829, - "grad_norm": 3.192918300628662, - "learning_rate": 3.3738236128631786e-06, - "loss": 0.1525, - "step": 793 - }, - { - "epoch": 3.873170731707317, - "grad_norm": 3.5358777046203613, - "learning_rate": 3.3702330297756503e-06, - "loss": 0.3622, - "step": 794 - }, - { - "epoch": 3.8780487804878048, - "grad_norm": 3.619878053665161, - "learning_rate": 3.366640402939387e-06, - "loss": 0.1051, - "step": 795 - }, - { - "epoch": 3.8829268292682926, - "grad_norm": 7.085352420806885, - "learning_rate": 3.363045740791698e-06, - "loss": 0.4606, - "step": 796 - }, - { - "epoch": 3.8878048780487804, - "grad_norm": 2.523165464401245, - "learning_rate": 3.3594490517746774e-06, - "loss": 0.2267, - "step": 797 - }, - { - "epoch": 3.892682926829268, - "grad_norm": 2.7026922702789307, - "learning_rate": 3.3558503443351733e-06, - "loss": 0.2792, - "step": 798 - }, - { - "epoch": 3.897560975609756, - "grad_norm": 2.9232428073883057, - "learning_rate": 3.352249626924777e-06, - "loss": 0.2579, - "step": 799 - }, - { - "epoch": 3.902439024390244, - "grad_norm": 4.760788440704346, - "learning_rate": 3.348646907999801e-06, - "loss": 0.6983, - "step": 800 - }, - { - "epoch": 3.9073170731707316, - "grad_norm": 3.198249578475952, - "learning_rate": 3.345042196021257e-06, - "loss": 0.3265, - "step": 801 - }, - { - "epoch": 3.9121951219512194, - "grad_norm": 4.069286823272705, - "learning_rate": 3.3414354994548385e-06, - "loss": 0.497, - "step": 802 - }, - { - "epoch": 3.9170731707317072, - "grad_norm": 3.4435410499572754, - "learning_rate": 3.337826826770898e-06, - "loss": 0.2812, - "step": 803 - }, - { - "epoch": 3.921951219512195, - "grad_norm": 3.9805212020874023, - "learning_rate": 3.3342161864444312e-06, - "loss": 0.2277, - "step": 804 - }, - { - "epoch": 3.926829268292683, - "grad_norm": 3.348925828933716, - "learning_rate": 3.3306035869550534e-06, - "loss": 0.1614, - "step": 805 - }, - { - "epoch": 3.9317073170731707, - "grad_norm": 4.7613701820373535, - "learning_rate": 3.326989036786981e-06, - "loss": 0.3269, - "step": 806 - }, - { - "epoch": 3.9365853658536585, - "grad_norm": 3.807502508163452, - "learning_rate": 3.3233725444290126e-06, - "loss": 0.2619, - "step": 807 - }, - { - "epoch": 3.9414634146341463, - "grad_norm": 3.2690203189849854, - "learning_rate": 3.3197541183745065e-06, - "loss": 0.4334, - "step": 808 - }, - { - "epoch": 3.946341463414634, - "grad_norm": 3.396993398666382, - "learning_rate": 3.3161337671213634e-06, - "loss": 0.2738, - "step": 809 - }, - { - "epoch": 3.951219512195122, - "grad_norm": 3.086669921875, - "learning_rate": 3.312511499172006e-06, - "loss": 0.1597, - "step": 810 - }, - { - "epoch": 3.9560975609756097, - "grad_norm": 3.5688745975494385, - "learning_rate": 3.3088873230333562e-06, - "loss": 0.3195, - "step": 811 - }, - { - "epoch": 3.9609756097560975, - "grad_norm": 3.4843621253967285, - "learning_rate": 3.3052612472168193e-06, - "loss": 0.1865, - "step": 812 - }, - { - "epoch": 3.9658536585365853, - "grad_norm": 2.8479580879211426, - "learning_rate": 3.3016332802382618e-06, - "loss": 0.3108, - "step": 813 - }, - { - "epoch": 3.970731707317073, - "grad_norm": 3.3241543769836426, - "learning_rate": 3.2980034306179897e-06, - "loss": 0.2099, - "step": 814 - }, - { - "epoch": 3.975609756097561, - "grad_norm": 2.817675828933716, - "learning_rate": 3.294371706880733e-06, - "loss": 0.3073, - "step": 815 - }, - { - "epoch": 3.9804878048780488, - "grad_norm": 2.9535388946533203, - "learning_rate": 3.290738117555622e-06, - "loss": 0.2024, - "step": 816 - }, - { - "epoch": 3.9853658536585366, - "grad_norm": 5.021281719207764, - "learning_rate": 3.2871026711761666e-06, - "loss": 0.508, - "step": 817 - }, - { - "epoch": 3.9902439024390244, - "grad_norm": 3.3377649784088135, - "learning_rate": 3.2834653762802414e-06, - "loss": 0.2116, - "step": 818 - }, - { - "epoch": 3.995121951219512, - "grad_norm": 4.412073135375977, - "learning_rate": 3.2798262414100594e-06, - "loss": 0.2177, - "step": 819 - }, - { - "epoch": 4.0, - "grad_norm": 3.174323797225952, - "learning_rate": 3.2761852751121566e-06, - "loss": 0.1737, - "step": 820 - }, - { - "epoch": 4.004878048780488, - "grad_norm": 2.921494960784912, - "learning_rate": 3.272542485937369e-06, - "loss": 0.2569, - "step": 821 - }, - { - "epoch": 4.009756097560976, - "grad_norm": 2.693495512008667, - "learning_rate": 3.2688978824408136e-06, - "loss": 0.1621, - "step": 822 - }, - { - "epoch": 4.014634146341463, - "grad_norm": 2.705796718597412, - "learning_rate": 3.2652514731818698e-06, - "loss": 0.1121, - "step": 823 - }, - { - "epoch": 4.019512195121951, - "grad_norm": 3.2621448040008545, - "learning_rate": 3.2616032667241564e-06, - "loss": 0.0835, - "step": 824 - }, - { - "epoch": 4.024390243902439, - "grad_norm": 3.6205084323883057, - "learning_rate": 3.257953271635513e-06, - "loss": 0.3731, - "step": 825 - }, - { - "epoch": 4.029268292682927, - "grad_norm": 3.2600371837615967, - "learning_rate": 3.2543014964879814e-06, - "loss": 0.1051, - "step": 826 - }, - { - "epoch": 4.034146341463415, - "grad_norm": 3.865178346633911, - "learning_rate": 3.250647949857781e-06, - "loss": 0.0916, - "step": 827 - }, - { - "epoch": 4.0390243902439025, - "grad_norm": 6.9700927734375, - "learning_rate": 3.2469926403252932e-06, - "loss": 0.4037, - "step": 828 - }, - { - "epoch": 4.04390243902439, - "grad_norm": 3.658712148666382, - "learning_rate": 3.2433355764750417e-06, - "loss": 0.0523, - "step": 829 - }, - { - "epoch": 4.048780487804878, - "grad_norm": 4.911301612854004, - "learning_rate": 3.2396767668956656e-06, - "loss": 0.2616, - "step": 830 - }, - { - "epoch": 4.053658536585366, - "grad_norm": 5.019360542297363, - "learning_rate": 3.2360162201799085e-06, - "loss": 0.195, - "step": 831 - }, - { - "epoch": 4.058536585365854, - "grad_norm": 3.493767261505127, - "learning_rate": 3.2323539449245906e-06, - "loss": 0.1245, - "step": 832 - }, - { - "epoch": 4.0634146341463415, - "grad_norm": 4.246248722076416, - "learning_rate": 3.2286899497305917e-06, - "loss": 0.1147, - "step": 833 - }, - { - "epoch": 4.068292682926829, - "grad_norm": 2.993704319000244, - "learning_rate": 3.2250242432028335e-06, - "loss": 0.2189, - "step": 834 - }, - { - "epoch": 4.073170731707317, - "grad_norm": 4.695023059844971, - "learning_rate": 3.221356833950254e-06, - "loss": 0.4685, - "step": 835 - }, - { - "epoch": 4.078048780487805, - "grad_norm": 2.777644634246826, - "learning_rate": 3.21768773058579e-06, - "loss": 0.1245, - "step": 836 - }, - { - "epoch": 4.082926829268293, - "grad_norm": 3.3545901775360107, - "learning_rate": 3.21401694172636e-06, - "loss": 0.1342, - "step": 837 - }, - { - "epoch": 4.087804878048781, - "grad_norm": 2.2222652435302734, - "learning_rate": 3.2103444759928383e-06, - "loss": 0.0484, - "step": 838 - }, - { - "epoch": 4.092682926829268, - "grad_norm": 2.580345630645752, - "learning_rate": 3.2066703420100377e-06, - "loss": 0.0592, - "step": 839 - }, - { - "epoch": 4.097560975609756, - "grad_norm": 3.8652923107147217, - "learning_rate": 3.2029945484066883e-06, - "loss": 0.2536, - "step": 840 - }, - { - "epoch": 4.102439024390244, - "grad_norm": 3.0441582202911377, - "learning_rate": 3.1993171038154203e-06, - "loss": 0.1221, - "step": 841 - }, - { - "epoch": 4.107317073170732, - "grad_norm": 2.2795114517211914, - "learning_rate": 3.1956380168727385e-06, - "loss": 0.1231, - "step": 842 - }, - { - "epoch": 4.11219512195122, - "grad_norm": 3.701009750366211, - "learning_rate": 3.191957296219007e-06, - "loss": 0.2144, - "step": 843 - }, - { - "epoch": 4.117073170731707, - "grad_norm": 3.452637195587158, - "learning_rate": 3.1882749504984247e-06, - "loss": 0.1026, - "step": 844 - }, - { - "epoch": 4.121951219512195, - "grad_norm": 2.4208810329437256, - "learning_rate": 3.1845909883590076e-06, - "loss": 0.1124, - "step": 845 - }, - { - "epoch": 4.126829268292683, - "grad_norm": 4.353063583374023, - "learning_rate": 3.180905418452569e-06, - "loss": 0.2804, - "step": 846 - }, - { - "epoch": 4.131707317073171, - "grad_norm": 3.1151084899902344, - "learning_rate": 3.1772182494346963e-06, - "loss": 0.1748, - "step": 847 - }, - { - "epoch": 4.136585365853659, - "grad_norm": 3.457940101623535, - "learning_rate": 3.1735294899647344e-06, - "loss": 0.1984, - "step": 848 - }, - { - "epoch": 4.1414634146341465, - "grad_norm": 3.3556935787200928, - "learning_rate": 3.169839148705762e-06, - "loss": 0.1332, - "step": 849 - }, - { - "epoch": 4.146341463414634, - "grad_norm": 3.5510823726654053, - "learning_rate": 3.1661472343245725e-06, - "loss": 0.4788, - "step": 850 - }, - { - "epoch": 4.151219512195122, - "grad_norm": 4.036712646484375, - "learning_rate": 3.162453755491655e-06, - "loss": 0.2437, - "step": 851 - }, - { - "epoch": 4.15609756097561, - "grad_norm": 4.417062282562256, - "learning_rate": 3.158758720881171e-06, - "loss": 0.203, - "step": 852 - }, - { - "epoch": 4.160975609756098, - "grad_norm": 3.920558214187622, - "learning_rate": 3.155062139170937e-06, - "loss": 0.1462, - "step": 853 - }, - { - "epoch": 4.1658536585365855, - "grad_norm": 6.472081661224365, - "learning_rate": 3.1513640190424034e-06, - "loss": 0.0972, - "step": 854 - }, - { - "epoch": 4.170731707317073, - "grad_norm": 3.975947141647339, - "learning_rate": 3.147664369180632e-06, - "loss": 0.1092, - "step": 855 - }, - { - "epoch": 4.175609756097561, - "grad_norm": 4.977376937866211, - "learning_rate": 3.143963198274278e-06, - "loss": 0.2215, - "step": 856 - }, - { - "epoch": 4.180487804878049, - "grad_norm": 3.595460891723633, - "learning_rate": 3.140260515015569e-06, - "loss": 0.1771, - "step": 857 - }, - { - "epoch": 4.185365853658537, - "grad_norm": 3.1085658073425293, - "learning_rate": 3.136556328100284e-06, - "loss": 0.1995, - "step": 858 - }, - { - "epoch": 4.190243902439025, - "grad_norm": 4.355626583099365, - "learning_rate": 3.132850646227734e-06, - "loss": 0.4048, - "step": 859 - }, - { - "epoch": 4.195121951219512, - "grad_norm": 3.8079614639282227, - "learning_rate": 3.12914347810074e-06, - "loss": 0.1914, - "step": 860 - }, - { - "epoch": 4.2, - "grad_norm": 3.725804328918457, - "learning_rate": 3.125434832425613e-06, - "loss": 0.1579, - "step": 861 - }, - { - "epoch": 4.204878048780488, - "grad_norm": 2.974649667739868, - "learning_rate": 3.121724717912138e-06, - "loss": 0.1814, - "step": 862 - }, - { - "epoch": 4.209756097560976, - "grad_norm": 3.6391279697418213, - "learning_rate": 3.118013143273542e-06, - "loss": 0.1481, - "step": 863 - }, - { - "epoch": 4.214634146341464, - "grad_norm": 3.216643810272217, - "learning_rate": 3.1143001172264893e-06, - "loss": 0.113, - "step": 864 - }, - { - "epoch": 4.219512195121951, - "grad_norm": 3.605855941772461, - "learning_rate": 3.1105856484910474e-06, - "loss": 0.1405, - "step": 865 - }, - { - "epoch": 4.224390243902439, - "grad_norm": 2.7186765670776367, - "learning_rate": 3.1068697457906736e-06, - "loss": 0.097, - "step": 866 - }, - { - "epoch": 4.229268292682927, - "grad_norm": 3.980973243713379, - "learning_rate": 3.1031524178521938e-06, - "loss": 0.2207, - "step": 867 - }, - { - "epoch": 4.234146341463415, - "grad_norm": 3.4623806476593018, - "learning_rate": 3.0994336734057804e-06, - "loss": 0.0552, - "step": 868 - }, - { - "epoch": 4.239024390243903, - "grad_norm": 3.7556748390197754, - "learning_rate": 3.0957135211849315e-06, - "loss": 0.1743, - "step": 869 - }, - { - "epoch": 4.2439024390243905, - "grad_norm": 3.3547914028167725, - "learning_rate": 3.0919919699264535e-06, - "loss": 0.1195, - "step": 870 - }, - { - "epoch": 4.248780487804878, - "grad_norm": 4.392014503479004, - "learning_rate": 3.0882690283704355e-06, - "loss": 0.6174, - "step": 871 - }, - { - "epoch": 4.253658536585366, - "grad_norm": 2.7031409740448, - "learning_rate": 3.084544705260234e-06, - "loss": 0.1359, - "step": 872 - }, - { - "epoch": 4.258536585365854, - "grad_norm": 2.3518481254577637, - "learning_rate": 3.080819009342451e-06, - "loss": 0.0786, - "step": 873 - }, - { - "epoch": 4.263414634146342, - "grad_norm": 2.636204481124878, - "learning_rate": 3.077091949366908e-06, - "loss": 0.0677, - "step": 874 - }, - { - "epoch": 4.2682926829268295, - "grad_norm": 2.8670942783355713, - "learning_rate": 3.073363534086636e-06, - "loss": 0.1084, - "step": 875 - }, - { - "epoch": 4.273170731707317, - "grad_norm": 2.7044737339019775, - "learning_rate": 3.0696337722578444e-06, - "loss": 0.0681, - "step": 876 - }, - { - "epoch": 4.278048780487805, - "grad_norm": 3.481539487838745, - "learning_rate": 3.0659026726399072e-06, - "loss": 0.2262, - "step": 877 - }, - { - "epoch": 4.282926829268293, - "grad_norm": 3.7746224403381348, - "learning_rate": 3.0621702439953393e-06, - "loss": 0.2169, - "step": 878 - }, - { - "epoch": 4.287804878048781, - "grad_norm": 3.6386263370513916, - "learning_rate": 3.0584364950897768e-06, - "loss": 0.0581, - "step": 879 - }, - { - "epoch": 4.2926829268292686, - "grad_norm": 3.389408588409424, - "learning_rate": 3.0547014346919574e-06, - "loss": 0.1687, - "step": 880 - }, - { - "epoch": 4.297560975609756, - "grad_norm": 3.6510157585144043, - "learning_rate": 3.0509650715736977e-06, - "loss": 0.1362, - "step": 881 - }, - { - "epoch": 4.302439024390244, - "grad_norm": 3.334210157394409, - "learning_rate": 3.0472274145098744e-06, - "loss": 0.1865, - "step": 882 - }, - { - "epoch": 4.307317073170732, - "grad_norm": 4.747341632843018, - "learning_rate": 3.0434884722784026e-06, - "loss": 0.2385, - "step": 883 - }, - { - "epoch": 4.31219512195122, - "grad_norm": 3.9266858100891113, - "learning_rate": 3.0397482536602168e-06, - "loss": 0.1004, - "step": 884 - }, - { - "epoch": 4.317073170731708, - "grad_norm": 2.984821081161499, - "learning_rate": 3.0360067674392475e-06, - "loss": 0.1469, - "step": 885 - }, - { - "epoch": 4.321951219512195, - "grad_norm": 2.6379380226135254, - "learning_rate": 3.0322640224024024e-06, - "loss": 0.0829, - "step": 886 - }, - { - "epoch": 4.326829268292683, - "grad_norm": 3.885495185852051, - "learning_rate": 3.0285200273395478e-06, - "loss": 0.2256, - "step": 887 - }, - { - "epoch": 4.331707317073171, - "grad_norm": 3.950394868850708, - "learning_rate": 3.024774791043481e-06, - "loss": 0.2402, - "step": 888 - }, - { - "epoch": 4.336585365853659, - "grad_norm": 4.147830963134766, - "learning_rate": 3.021028322309921e-06, - "loss": 0.2198, - "step": 889 - }, - { - "epoch": 4.341463414634147, - "grad_norm": 4.0821638107299805, - "learning_rate": 3.0172806299374734e-06, - "loss": 0.2304, - "step": 890 - }, - { - "epoch": 4.3463414634146345, - "grad_norm": 4.142312049865723, - "learning_rate": 3.0135317227276247e-06, - "loss": 0.2864, - "step": 891 - }, - { - "epoch": 4.351219512195122, - "grad_norm": 3.008504867553711, - "learning_rate": 3.0097816094847104e-06, - "loss": 0.2045, - "step": 892 - }, - { - "epoch": 4.35609756097561, - "grad_norm": 3.1674623489379883, - "learning_rate": 3.0060302990158984e-06, - "loss": 0.0864, - "step": 893 - }, - { - "epoch": 4.360975609756098, - "grad_norm": 3.3412492275238037, - "learning_rate": 3.002277800131171e-06, - "loss": 0.076, - "step": 894 - }, - { - "epoch": 4.365853658536586, - "grad_norm": 3.067330837249756, - "learning_rate": 2.998524121643298e-06, - "loss": 0.1724, - "step": 895 - }, - { - "epoch": 4.3707317073170735, - "grad_norm": 3.9015982151031494, - "learning_rate": 2.994769272367822e-06, - "loss": 0.2, - "step": 896 - }, - { - "epoch": 4.375609756097561, - "grad_norm": 3.0136911869049072, - "learning_rate": 2.991013261123035e-06, - "loss": 0.0852, - "step": 897 - }, - { - "epoch": 4.380487804878049, - "grad_norm": 3.6834237575531006, - "learning_rate": 2.9872560967299554e-06, - "loss": 0.1449, - "step": 898 - }, - { - "epoch": 4.385365853658537, - "grad_norm": 3.3486039638519287, - "learning_rate": 2.9834977880123132e-06, - "loss": 0.0659, - "step": 899 - }, - { - "epoch": 4.390243902439025, - "grad_norm": 2.971315622329712, - "learning_rate": 2.9797383437965243e-06, - "loss": 0.1114, - "step": 900 - }, - { - "epoch": 4.3951219512195125, - "grad_norm": 2.683359146118164, - "learning_rate": 2.975977772911671e-06, - "loss": 0.0822, - "step": 901 - }, - { - "epoch": 4.4, - "grad_norm": 2.9941935539245605, - "learning_rate": 2.972216084189482e-06, - "loss": 0.0858, - "step": 902 - }, - { - "epoch": 4.404878048780488, - "grad_norm": 2.4938626289367676, - "learning_rate": 2.9684532864643123e-06, - "loss": 0.1162, - "step": 903 - }, - { - "epoch": 4.409756097560976, - "grad_norm": 2.9364712238311768, - "learning_rate": 2.964689388573118e-06, - "loss": 0.0821, - "step": 904 - }, - { - "epoch": 4.414634146341464, - "grad_norm": 3.3638134002685547, - "learning_rate": 2.9609243993554434e-06, - "loss": 0.25, - "step": 905 - }, - { - "epoch": 4.419512195121952, - "grad_norm": 3.657277822494507, - "learning_rate": 2.9571583276533923e-06, - "loss": 0.0852, - "step": 906 - }, - { - "epoch": 4.424390243902439, - "grad_norm": 5.486263275146484, - "learning_rate": 2.9533911823116124e-06, - "loss": 0.5123, - "step": 907 - }, - { - "epoch": 4.429268292682927, - "grad_norm": 5.194574356079102, - "learning_rate": 2.9496229721772734e-06, - "loss": 0.1854, - "step": 908 - }, - { - "epoch": 4.434146341463415, - "grad_norm": 3.520110845565796, - "learning_rate": 2.9458537061000435e-06, - "loss": 0.1785, - "step": 909 - }, - { - "epoch": 4.439024390243903, - "grad_norm": 3.417991876602173, - "learning_rate": 2.9420833929320726e-06, - "loss": 0.1603, - "step": 910 - }, - { - "epoch": 4.443902439024391, - "grad_norm": 5.225805282592773, - "learning_rate": 2.93831204152797e-06, - "loss": 0.3046, - "step": 911 - }, - { - "epoch": 4.4487804878048784, - "grad_norm": 3.541433572769165, - "learning_rate": 2.9345396607447807e-06, - "loss": 0.0631, - "step": 912 - }, - { - "epoch": 4.453658536585366, - "grad_norm": 3.909377098083496, - "learning_rate": 2.9307662594419704e-06, - "loss": 0.125, - "step": 913 - }, - { - "epoch": 4.458536585365854, - "grad_norm": 3.6604416370391846, - "learning_rate": 2.9269918464814e-06, - "loss": 0.156, - "step": 914 - }, - { - "epoch": 4.463414634146342, - "grad_norm": 3.7413833141326904, - "learning_rate": 2.923216430727306e-06, - "loss": 0.3334, - "step": 915 - }, - { - "epoch": 4.46829268292683, - "grad_norm": 3.531996011734009, - "learning_rate": 2.9194400210462808e-06, - "loss": 0.2534, - "step": 916 - }, - { - "epoch": 4.473170731707317, - "grad_norm": 4.163621425628662, - "learning_rate": 2.91566262630725e-06, - "loss": 0.352, - "step": 917 - }, - { - "epoch": 4.478048780487805, - "grad_norm": 3.923635482788086, - "learning_rate": 2.9118842553814526e-06, - "loss": 0.1132, - "step": 918 - }, - { - "epoch": 4.482926829268292, - "grad_norm": 2.833768844604492, - "learning_rate": 2.9081049171424223e-06, - "loss": 0.086, - "step": 919 - }, - { - "epoch": 4.487804878048781, - "grad_norm": 2.9006292819976807, - "learning_rate": 2.9043246204659624e-06, - "loss": 0.0693, - "step": 920 - }, - { - "epoch": 4.492682926829268, - "grad_norm": 3.699376344680786, - "learning_rate": 2.9005433742301274e-06, - "loss": 0.2463, - "step": 921 - }, - { - "epoch": 4.4975609756097565, - "grad_norm": 4.882141590118408, - "learning_rate": 2.8967611873152037e-06, - "loss": 0.2275, - "step": 922 - }, - { - "epoch": 4.5024390243902435, - "grad_norm": 3.0554678440093994, - "learning_rate": 2.892978068603683e-06, - "loss": 0.0752, - "step": 923 - }, - { - "epoch": 4.507317073170732, - "grad_norm": 3.1225268840789795, - "learning_rate": 2.889194026980249e-06, - "loss": 0.1649, - "step": 924 - }, - { - "epoch": 4.512195121951219, - "grad_norm": 17.75234031677246, - "learning_rate": 2.8854090713317514e-06, - "loss": 0.0437, - "step": 925 - }, - { - "epoch": 4.517073170731708, - "grad_norm": 3.011223554611206, - "learning_rate": 2.8816232105471864e-06, - "loss": 0.0747, - "step": 926 - }, - { - "epoch": 4.521951219512195, - "grad_norm": 4.327573299407959, - "learning_rate": 2.877836453517677e-06, - "loss": 0.3884, - "step": 927 - }, - { - "epoch": 4.526829268292683, - "grad_norm": 3.8694965839385986, - "learning_rate": 2.8740488091364492e-06, - "loss": 0.2741, - "step": 928 - }, - { - "epoch": 4.53170731707317, - "grad_norm": 5.375877380371094, - "learning_rate": 2.870260286298814e-06, - "loss": 0.364, - "step": 929 - }, - { - "epoch": 4.536585365853659, - "grad_norm": 3.380891799926758, - "learning_rate": 2.866470893902147e-06, - "loss": 0.1495, - "step": 930 - }, - { - "epoch": 4.541463414634146, - "grad_norm": 3.723992109298706, - "learning_rate": 2.8626806408458626e-06, - "loss": 0.1403, - "step": 931 - }, - { - "epoch": 4.546341463414635, - "grad_norm": 3.0534417629241943, - "learning_rate": 2.8588895360313983e-06, - "loss": 0.0946, - "step": 932 - }, - { - "epoch": 4.5512195121951216, - "grad_norm": 2.8875234127044678, - "learning_rate": 2.8550975883621935e-06, - "loss": 0.1851, - "step": 933 - }, - { - "epoch": 4.55609756097561, - "grad_norm": 3.532166004180908, - "learning_rate": 2.8513048067436644e-06, - "loss": 0.178, - "step": 934 - }, - { - "epoch": 4.560975609756097, - "grad_norm": 2.942798376083374, - "learning_rate": 2.847511200083187e-06, - "loss": 0.1131, - "step": 935 - }, - { - "epoch": 4.565853658536585, - "grad_norm": 2.926874876022339, - "learning_rate": 2.843716777290074e-06, - "loss": 0.1251, - "step": 936 - }, - { - "epoch": 4.570731707317073, - "grad_norm": 3.525895357131958, - "learning_rate": 2.839921547275556e-06, - "loss": 0.0946, - "step": 937 - }, - { - "epoch": 4.575609756097561, - "grad_norm": 3.7033681869506836, - "learning_rate": 2.836125518952759e-06, - "loss": 0.1529, - "step": 938 - }, - { - "epoch": 4.580487804878048, - "grad_norm": 3.235154867172241, - "learning_rate": 2.8323287012366845e-06, - "loss": 0.2511, - "step": 939 - }, - { - "epoch": 4.585365853658536, - "grad_norm": 3.5275583267211914, - "learning_rate": 2.828531103044186e-06, - "loss": 0.1474, - "step": 940 - }, - { - "epoch": 4.590243902439024, - "grad_norm": 3.1356353759765625, - "learning_rate": 2.8247327332939512e-06, - "loss": 0.2249, - "step": 941 - }, - { - "epoch": 4.595121951219512, - "grad_norm": 3.789210081100464, - "learning_rate": 2.82093360090648e-06, - "loss": 0.2258, - "step": 942 - }, - { - "epoch": 4.6, - "grad_norm": 4.841623306274414, - "learning_rate": 2.8171337148040636e-06, - "loss": 0.2235, - "step": 943 - }, - { - "epoch": 4.6048780487804875, - "grad_norm": 3.161630630493164, - "learning_rate": 2.813333083910761e-06, - "loss": 0.1562, - "step": 944 - }, - { - "epoch": 4.609756097560975, - "grad_norm": 2.8718132972717285, - "learning_rate": 2.8095317171523835e-06, - "loss": 0.0625, - "step": 945 - }, - { - "epoch": 4.614634146341463, - "grad_norm": 3.6432454586029053, - "learning_rate": 2.805729623456469e-06, - "loss": 0.2205, - "step": 946 - }, - { - "epoch": 4.619512195121951, - "grad_norm": 4.382034778594971, - "learning_rate": 2.8019268117522624e-06, - "loss": 0.3241, - "step": 947 - }, - { - "epoch": 4.624390243902439, - "grad_norm": 3.2998175621032715, - "learning_rate": 2.798123290970695e-06, - "loss": 0.1983, - "step": 948 - }, - { - "epoch": 4.6292682926829265, - "grad_norm": 3.8665990829467773, - "learning_rate": 2.794319070044365e-06, - "loss": 0.3391, - "step": 949 - }, - { - "epoch": 4.634146341463414, - "grad_norm": 3.628403425216675, - "learning_rate": 2.790514157907512e-06, - "loss": 0.1329, - "step": 950 - }, - { - "epoch": 4.639024390243902, - "grad_norm": 2.8889615535736084, - "learning_rate": 2.786708563496002e-06, - "loss": 0.141, - "step": 951 - }, - { - "epoch": 4.64390243902439, - "grad_norm": 4.07351541519165, - "learning_rate": 2.782902295747299e-06, - "loss": 0.2935, - "step": 952 - }, - { - "epoch": 4.648780487804878, - "grad_norm": 4.220067024230957, - "learning_rate": 2.7790953636004536e-06, - "loss": 0.318, - "step": 953 - }, - { - "epoch": 4.6536585365853655, - "grad_norm": 3.8444325923919678, - "learning_rate": 2.775287775996074e-06, - "loss": 0.3388, - "step": 954 - }, - { - "epoch": 4.658536585365853, - "grad_norm": 3.197313070297241, - "learning_rate": 2.7714795418763067e-06, - "loss": 0.0925, - "step": 955 - }, - { - "epoch": 4.663414634146341, - "grad_norm": 4.0050811767578125, - "learning_rate": 2.7676706701848187e-06, - "loss": 0.2811, - "step": 956 - }, - { - "epoch": 4.668292682926829, - "grad_norm": 3.217160224914551, - "learning_rate": 2.763861169866774e-06, - "loss": 0.311, - "step": 957 - }, - { - "epoch": 4.673170731707317, - "grad_norm": 2.9892494678497314, - "learning_rate": 2.7600510498688104e-06, - "loss": 0.0582, - "step": 958 - }, - { - "epoch": 4.678048780487805, - "grad_norm": 3.954805374145508, - "learning_rate": 2.7562403191390246e-06, - "loss": 0.1238, - "step": 959 - }, - { - "epoch": 4.682926829268292, - "grad_norm": 2.9582695960998535, - "learning_rate": 2.7524289866269467e-06, - "loss": 0.1243, - "step": 960 - }, - { - "epoch": 4.68780487804878, - "grad_norm": 2.807002544403076, - "learning_rate": 2.748617061283518e-06, - "loss": 0.1388, - "step": 961 - }, - { - "epoch": 4.692682926829268, - "grad_norm": 3.980499505996704, - "learning_rate": 2.744804552061074e-06, - "loss": 0.1144, - "step": 962 - }, - { - "epoch": 4.697560975609756, - "grad_norm": 3.6389007568359375, - "learning_rate": 2.740991467913321e-06, - "loss": 0.2155, - "step": 963 - }, - { - "epoch": 4.702439024390244, - "grad_norm": 3.0950801372528076, - "learning_rate": 2.737177817795315e-06, - "loss": 0.0983, - "step": 964 - }, - { - "epoch": 4.7073170731707314, - "grad_norm": 3.1723053455352783, - "learning_rate": 2.7333636106634414e-06, - "loss": 0.1365, - "step": 965 - }, - { - "epoch": 4.712195121951219, - "grad_norm": 3.83921217918396, - "learning_rate": 2.7295488554753957e-06, - "loss": 0.1977, - "step": 966 - }, - { - "epoch": 4.717073170731707, - "grad_norm": 3.348057746887207, - "learning_rate": 2.725733561190157e-06, - "loss": 0.1311, - "step": 967 - }, - { - "epoch": 4.721951219512195, - "grad_norm": 3.828483819961548, - "learning_rate": 2.721917736767973e-06, - "loss": 0.2464, - "step": 968 - }, - { - "epoch": 4.726829268292683, - "grad_norm": 2.6004624366760254, - "learning_rate": 2.7181013911703357e-06, - "loss": 0.1088, - "step": 969 - }, - { - "epoch": 4.7317073170731705, - "grad_norm": 3.316990852355957, - "learning_rate": 2.714284533359961e-06, - "loss": 0.1492, - "step": 970 - }, - { - "epoch": 4.736585365853658, - "grad_norm": 3.8770010471343994, - "learning_rate": 2.710467172300768e-06, - "loss": 0.218, - "step": 971 - }, - { - "epoch": 4.741463414634146, - "grad_norm": 4.456376552581787, - "learning_rate": 2.706649316957857e-06, - "loss": 0.2199, - "step": 972 - }, - { - "epoch": 4.746341463414634, - "grad_norm": 3.3376309871673584, - "learning_rate": 2.7028309762974897e-06, - "loss": 0.0595, - "step": 973 - }, - { - "epoch": 4.751219512195122, - "grad_norm": 3.6755495071411133, - "learning_rate": 2.699012159287069e-06, - "loss": 0.1653, - "step": 974 - }, - { - "epoch": 4.7560975609756095, - "grad_norm": 2.939887046813965, - "learning_rate": 2.6951928748951125e-06, - "loss": 0.0681, - "step": 975 - }, - { - "epoch": 4.760975609756097, - "grad_norm": 3.4101195335388184, - "learning_rate": 2.69137313209124e-06, - "loss": 0.2046, - "step": 976 - }, - { - "epoch": 4.765853658536585, - "grad_norm": 3.9811208248138428, - "learning_rate": 2.687552939846145e-06, - "loss": 0.2255, - "step": 977 - }, - { - "epoch": 4.770731707317073, - "grad_norm": 3.484255313873291, - "learning_rate": 2.6837323071315766e-06, - "loss": 0.0512, - "step": 978 - }, - { - "epoch": 4.775609756097561, - "grad_norm": 3.9005143642425537, - "learning_rate": 2.679911242920321e-06, - "loss": 0.162, - "step": 979 - }, - { - "epoch": 4.780487804878049, - "grad_norm": 4.933374881744385, - "learning_rate": 2.6760897561861742e-06, - "loss": 0.398, - "step": 980 - }, - { - "epoch": 4.785365853658536, - "grad_norm": 3.0741539001464844, - "learning_rate": 2.672267855903927e-06, - "loss": 0.0507, - "step": 981 - }, - { - "epoch": 4.790243902439024, - "grad_norm": 3.023772716522217, - "learning_rate": 2.6684455510493413e-06, - "loss": 0.2066, - "step": 982 - }, - { - "epoch": 4.795121951219512, - "grad_norm": 3.0102407932281494, - "learning_rate": 2.6646228505991267e-06, - "loss": 0.2296, - "step": 983 - }, - { - "epoch": 4.8, - "grad_norm": 3.902200222015381, - "learning_rate": 2.6607997635309246e-06, - "loss": 0.14, - "step": 984 - }, - { - "epoch": 4.804878048780488, - "grad_norm": 3.836185932159424, - "learning_rate": 2.6569762988232838e-06, - "loss": 0.1583, - "step": 985 - }, - { - "epoch": 4.809756097560975, - "grad_norm": 3.539628744125366, - "learning_rate": 2.653152465455639e-06, - "loss": 0.2619, - "step": 986 - }, - { - "epoch": 4.814634146341463, - "grad_norm": 4.716914653778076, - "learning_rate": 2.6493282724082913e-06, - "loss": 0.3029, - "step": 987 - }, - { - "epoch": 4.819512195121951, - "grad_norm": 3.466914176940918, - "learning_rate": 2.6455037286623864e-06, - "loss": 0.095, - "step": 988 - }, - { - "epoch": 4.824390243902439, - "grad_norm": 2.1798667907714844, - "learning_rate": 2.6416788431998935e-06, - "loss": 0.1232, - "step": 989 - }, - { - "epoch": 4.829268292682927, - "grad_norm": 3.309039354324341, - "learning_rate": 2.637853625003585e-06, - "loss": 0.3671, - "step": 990 - }, - { - "epoch": 4.8341463414634145, - "grad_norm": 3.2619435787200928, - "learning_rate": 2.6340280830570142e-06, - "loss": 0.194, - "step": 991 - }, - { - "epoch": 4.839024390243902, - "grad_norm": 3.601161003112793, - "learning_rate": 2.6302022263444947e-06, - "loss": 0.1214, - "step": 992 - }, - { - "epoch": 4.84390243902439, - "grad_norm": 4.13787841796875, - "learning_rate": 2.6263760638510793e-06, - "loss": 0.311, - "step": 993 - }, - { - "epoch": 4.848780487804878, - "grad_norm": 3.0474166870117188, - "learning_rate": 2.6225496045625394e-06, - "loss": 0.1853, - "step": 994 - }, - { - "epoch": 4.853658536585366, - "grad_norm": 4.481237411499023, - "learning_rate": 2.6187228574653428e-06, - "loss": 0.2088, - "step": 995 - }, - { - "epoch": 4.8585365853658535, - "grad_norm": 3.235966444015503, - "learning_rate": 2.614895831546633e-06, - "loss": 0.1439, - "step": 996 - }, - { - "epoch": 4.863414634146341, - "grad_norm": 4.103270053863525, - "learning_rate": 2.6110685357942096e-06, - "loss": 0.2823, - "step": 997 - }, - { - "epoch": 4.868292682926829, - "grad_norm": 4.134536266326904, - "learning_rate": 2.6072409791965048e-06, - "loss": 0.2963, - "step": 998 - }, - { - "epoch": 4.873170731707317, - "grad_norm": 4.124892711639404, - "learning_rate": 2.6034131707425638e-06, - "loss": 0.4127, - "step": 999 - }, - { - "epoch": 4.878048780487805, - "grad_norm": 3.565139055252075, - "learning_rate": 2.5995851194220223e-06, - "loss": 0.1601, - "step": 1000 - }, - { - "epoch": 4.882926829268293, - "grad_norm": 2.7548017501831055, - "learning_rate": 2.595756834225089e-06, - "loss": 0.161, - "step": 1001 - }, - { - "epoch": 4.88780487804878, - "grad_norm": 3.9297611713409424, - "learning_rate": 2.5919283241425188e-06, - "loss": 0.1013, - "step": 1002 - }, - { - "epoch": 4.892682926829268, - "grad_norm": 2.4904236793518066, - "learning_rate": 2.5880995981655965e-06, - "loss": 0.1177, - "step": 1003 - }, - { - "epoch": 4.897560975609756, - "grad_norm": 3.513308048248291, - "learning_rate": 2.584270665286113e-06, - "loss": 0.0682, - "step": 1004 - }, - { - "epoch": 4.902439024390244, - "grad_norm": 4.221067428588867, - "learning_rate": 2.580441534496346e-06, - "loss": 0.1502, - "step": 1005 - }, - { - "epoch": 4.907317073170732, - "grad_norm": 3.4298903942108154, - "learning_rate": 2.576612214789039e-06, - "loss": 0.1772, - "step": 1006 - }, - { - "epoch": 4.912195121951219, - "grad_norm": 4.402887344360352, - "learning_rate": 2.5727827151573747e-06, - "loss": 0.2029, - "step": 1007 - }, - { - "epoch": 4.917073170731707, - "grad_norm": 4.194999694824219, - "learning_rate": 2.568953044594964e-06, - "loss": 0.1269, - "step": 1008 - }, - { - "epoch": 4.921951219512195, - "grad_norm": 3.657607078552246, - "learning_rate": 2.5651232120958157e-06, - "loss": 0.1311, - "step": 1009 - }, - { - "epoch": 4.926829268292683, - "grad_norm": 4.092184543609619, - "learning_rate": 2.56129322665432e-06, - "loss": 0.1085, - "step": 1010 - }, - { - "epoch": 4.931707317073171, - "grad_norm": 3.3648242950439453, - "learning_rate": 2.5574630972652263e-06, - "loss": 0.0782, - "step": 1011 - }, - { - "epoch": 4.9365853658536585, - "grad_norm": 3.7215166091918945, - "learning_rate": 2.553632832923622e-06, - "loss": 0.1391, - "step": 1012 - }, - { - "epoch": 4.941463414634146, - "grad_norm": 4.045740127563477, - "learning_rate": 2.5498024426249107e-06, - "loss": 0.3141, - "step": 1013 - }, - { - "epoch": 4.946341463414634, - "grad_norm": 3.2363107204437256, - "learning_rate": 2.545971935364794e-06, - "loss": 0.0679, - "step": 1014 - }, - { - "epoch": 4.951219512195122, - "grad_norm": 3.057283639907837, - "learning_rate": 2.5421413201392443e-06, - "loss": 0.1382, - "step": 1015 - }, - { - "epoch": 4.95609756097561, - "grad_norm": 3.591535806655884, - "learning_rate": 2.538310605944491e-06, - "loss": 0.112, - "step": 1016 - }, - { - "epoch": 4.9609756097560975, - "grad_norm": 3.1629281044006348, - "learning_rate": 2.534479801776996e-06, - "loss": 0.1261, - "step": 1017 - }, - { - "epoch": 4.965853658536585, - "grad_norm": 2.691740036010742, - "learning_rate": 2.53064891663343e-06, - "loss": 0.2328, - "step": 1018 - }, - { - "epoch": 4.970731707317073, - "grad_norm": 3.2620503902435303, - "learning_rate": 2.526817959510655e-06, - "loss": 0.193, - "step": 1019 - }, - { - "epoch": 4.975609756097561, - "grad_norm": 3.0721535682678223, - "learning_rate": 2.5229869394057038e-06, - "loss": 0.2444, - "step": 1020 - }, - { - "epoch": 4.980487804878049, - "grad_norm": 2.6279208660125732, - "learning_rate": 2.5191558653157542e-06, - "loss": 0.1103, - "step": 1021 - }, - { - "epoch": 4.985365853658537, - "grad_norm": 2.9295670986175537, - "learning_rate": 2.515324746238113e-06, - "loss": 0.0553, - "step": 1022 - }, - { - "epoch": 4.990243902439024, - "grad_norm": 3.3960084915161133, - "learning_rate": 2.511493591170191e-06, - "loss": 0.1686, - "step": 1023 - }, - { - "epoch": 4.995121951219512, - "grad_norm": 4.138705253601074, - "learning_rate": 2.5076624091094846e-06, - "loss": 0.1208, - "step": 1024 - }, - { - "epoch": 5.0, - "grad_norm": 2.603870391845703, - "learning_rate": 2.503831209053554e-06, - "loss": 0.1216, - "step": 1025 - }, - { - "epoch": 5.004878048780488, - "grad_norm": 2.525205612182617, - "learning_rate": 2.5e-06, - "loss": 0.0984, - "step": 1026 - }, - { - "epoch": 5.009756097560976, - "grad_norm": 3.2502501010894775, - "learning_rate": 2.4961687909464462e-06, - "loss": 0.1323, - "step": 1027 - }, - { - "epoch": 5.014634146341463, - "grad_norm": 5.363409519195557, - "learning_rate": 2.492337590890516e-06, - "loss": 0.3516, - "step": 1028 - }, - { - "epoch": 5.019512195121951, - "grad_norm": 2.887723445892334, - "learning_rate": 2.4885064088298097e-06, - "loss": 0.1931, - "step": 1029 - }, - { - "epoch": 5.024390243902439, - "grad_norm": 3.4529435634613037, - "learning_rate": 2.4846752537618875e-06, - "loss": 0.0675, - "step": 1030 - }, - { - "epoch": 5.029268292682927, - "grad_norm": 4.202361106872559, - "learning_rate": 2.480844134684246e-06, - "loss": 0.1643, - "step": 1031 - }, - { - "epoch": 5.034146341463415, - "grad_norm": 2.910275459289551, - "learning_rate": 2.4770130605942966e-06, - "loss": 0.11, - "step": 1032 - }, - { - "epoch": 5.0390243902439025, - "grad_norm": 3.5430362224578857, - "learning_rate": 2.4731820404893457e-06, - "loss": 0.0614, - "step": 1033 - }, - { - "epoch": 5.04390243902439, - "grad_norm": 4.501879692077637, - "learning_rate": 2.469351083366571e-06, - "loss": 0.0954, - "step": 1034 - }, - { - "epoch": 5.048780487804878, - "grad_norm": 2.732261896133423, - "learning_rate": 2.4655201982230044e-06, - "loss": 0.0275, - "step": 1035 - }, - { - "epoch": 5.053658536585366, - "grad_norm": 3.5926437377929688, - "learning_rate": 2.4616893940555094e-06, - "loss": 0.0661, - "step": 1036 - }, - { - "epoch": 5.058536585365854, - "grad_norm": 4.790312767028809, - "learning_rate": 2.457858679860757e-06, - "loss": 0.2976, - "step": 1037 - }, - { - "epoch": 5.0634146341463415, - "grad_norm": 4.453246116638184, - "learning_rate": 2.4540280646352072e-06, - "loss": 0.1216, - "step": 1038 - }, - { - "epoch": 5.068292682926829, - "grad_norm": 3.288011074066162, - "learning_rate": 2.45019755737509e-06, - "loss": 0.0877, - "step": 1039 - }, - { - "epoch": 5.073170731707317, - "grad_norm": 3.566927671432495, - "learning_rate": 2.4463671670763787e-06, - "loss": 0.1661, - "step": 1040 - }, - { - "epoch": 5.078048780487805, - "grad_norm": 3.250047206878662, - "learning_rate": 2.4425369027347746e-06, - "loss": 0.211, - "step": 1041 - }, - { - "epoch": 5.082926829268293, - "grad_norm": 3.0214977264404297, - "learning_rate": 2.4387067733456804e-06, - "loss": 0.093, - "step": 1042 - }, - { - "epoch": 5.087804878048781, - "grad_norm": 3.8162097930908203, - "learning_rate": 2.4348767879041847e-06, - "loss": 0.0777, - "step": 1043 - }, - { - "epoch": 5.092682926829268, - "grad_norm": 3.8071560859680176, - "learning_rate": 2.4310469554050366e-06, - "loss": 0.087, - "step": 1044 - }, - { - "epoch": 5.097560975609756, - "grad_norm": 3.1032073497772217, - "learning_rate": 2.4272172848426257e-06, - "loss": 0.1105, - "step": 1045 - }, - { - "epoch": 5.102439024390244, - "grad_norm": 2.8980185985565186, - "learning_rate": 2.423387785210962e-06, - "loss": 0.0704, - "step": 1046 - }, - { - "epoch": 5.107317073170732, - "grad_norm": 3.9110755920410156, - "learning_rate": 2.4195584655036544e-06, - "loss": 0.2118, - "step": 1047 - }, - { - "epoch": 5.11219512195122, - "grad_norm": 2.678884506225586, - "learning_rate": 2.4157293347138877e-06, - "loss": 0.0664, - "step": 1048 - }, - { - "epoch": 5.117073170731707, - "grad_norm": 3.183046340942383, - "learning_rate": 2.4119004018344043e-06, - "loss": 0.1767, - "step": 1049 - }, - { - "epoch": 5.121951219512195, - "grad_norm": 3.9198925495147705, - "learning_rate": 2.408071675857482e-06, - "loss": 0.1288, - "step": 1050 - }, - { - "epoch": 5.126829268292683, - "grad_norm": 4.378621578216553, - "learning_rate": 2.404243165774912e-06, - "loss": 0.1724, - "step": 1051 - }, - { - "epoch": 5.131707317073171, - "grad_norm": 2.5509133338928223, - "learning_rate": 2.4004148805779785e-06, - "loss": 0.0382, - "step": 1052 - }, - { - "epoch": 5.136585365853659, - "grad_norm": 3.692396402359009, - "learning_rate": 2.3965868292574375e-06, - "loss": 0.0942, - "step": 1053 - }, - { - "epoch": 5.1414634146341465, - "grad_norm": 3.8537800312042236, - "learning_rate": 2.392759020803496e-06, - "loss": 0.0819, - "step": 1054 - }, - { - "epoch": 5.146341463414634, - "grad_norm": 4.02876091003418, - "learning_rate": 2.3889314642057916e-06, - "loss": 0.0866, - "step": 1055 - }, - { - "epoch": 5.151219512195122, - "grad_norm": 3.531857490539551, - "learning_rate": 2.3851041684533677e-06, - "loss": 0.1557, - "step": 1056 - }, - { - "epoch": 5.15609756097561, - "grad_norm": 2.231265068054199, - "learning_rate": 2.381277142534658e-06, - "loss": 0.0421, - "step": 1057 - }, - { - "epoch": 5.160975609756098, - "grad_norm": 3.159226894378662, - "learning_rate": 2.3774503954374614e-06, - "loss": 0.0395, - "step": 1058 - }, - { - "epoch": 5.1658536585365855, - "grad_norm": 3.0375123023986816, - "learning_rate": 2.373623936148921e-06, - "loss": 0.1869, - "step": 1059 - }, - { - "epoch": 5.170731707317073, - "grad_norm": 5.4905900955200195, - "learning_rate": 2.369797773655506e-06, - "loss": 0.1426, - "step": 1060 - }, - { - "epoch": 5.175609756097561, - "grad_norm": 2.8739638328552246, - "learning_rate": 2.3659719169429866e-06, - "loss": 0.0788, - "step": 1061 - }, - { - "epoch": 5.180487804878049, - "grad_norm": 2.612183094024658, - "learning_rate": 2.3621463749964153e-06, - "loss": 0.0449, - "step": 1062 - }, - { - "epoch": 5.185365853658537, - "grad_norm": 2.0573198795318604, - "learning_rate": 2.3583211568001073e-06, - "loss": 0.0264, - "step": 1063 - }, - { - "epoch": 5.190243902439025, - "grad_norm": 2.3667244911193848, - "learning_rate": 2.3544962713376144e-06, - "loss": 0.0507, - "step": 1064 - }, - { - "epoch": 5.195121951219512, - "grad_norm": 2.1223740577697754, - "learning_rate": 2.3506717275917095e-06, - "loss": 0.0576, - "step": 1065 - }, - { - "epoch": 5.2, - "grad_norm": 2.2630319595336914, - "learning_rate": 2.346847534544362e-06, - "loss": 0.0523, - "step": 1066 - }, - { - "epoch": 5.204878048780488, - "grad_norm": 3.201913595199585, - "learning_rate": 2.3430237011767166e-06, - "loss": 0.0847, - "step": 1067 - }, - { - "epoch": 5.209756097560976, - "grad_norm": 2.2149481773376465, - "learning_rate": 2.3392002364690762e-06, - "loss": 0.0215, - "step": 1068 - }, - { - "epoch": 5.214634146341464, - "grad_norm": 4.425244331359863, - "learning_rate": 2.335377149400874e-06, - "loss": 0.1018, - "step": 1069 - }, - { - "epoch": 5.219512195121951, - "grad_norm": 4.548358917236328, - "learning_rate": 2.3315544489506596e-06, - "loss": 0.1485, - "step": 1070 - }, - { - "epoch": 5.224390243902439, - "grad_norm": 3.635796546936035, - "learning_rate": 2.3277321440960733e-06, - "loss": 0.111, - "step": 1071 - }, - { - "epoch": 5.229268292682927, - "grad_norm": 2.3180043697357178, - "learning_rate": 2.323910243813826e-06, - "loss": 0.0267, - "step": 1072 - }, - { - "epoch": 5.234146341463415, - "grad_norm": 3.675490379333496, - "learning_rate": 2.3200887570796798e-06, - "loss": 0.153, - "step": 1073 - }, - { - "epoch": 5.239024390243903, - "grad_norm": 2.883225202560425, - "learning_rate": 2.316267692868424e-06, - "loss": 0.0968, - "step": 1074 - }, - { - "epoch": 5.2439024390243905, - "grad_norm": 3.0320188999176025, - "learning_rate": 2.312447060153856e-06, - "loss": 0.0786, - "step": 1075 - }, - { - "epoch": 5.248780487804878, - "grad_norm": 2.682695150375366, - "learning_rate": 2.308626867908761e-06, - "loss": 0.0677, - "step": 1076 - }, - { - "epoch": 5.253658536585366, - "grad_norm": 3.941967010498047, - "learning_rate": 2.3048071251048884e-06, - "loss": 0.1059, - "step": 1077 - }, - { - "epoch": 5.258536585365854, - "grad_norm": 6.485599517822266, - "learning_rate": 2.300987840712932e-06, - "loss": 0.1331, - "step": 1078 - }, - { - "epoch": 5.263414634146342, - "grad_norm": 3.809269905090332, - "learning_rate": 2.297169023702511e-06, - "loss": 0.169, - "step": 1079 - }, - { - "epoch": 5.2682926829268295, - "grad_norm": 3.115626573562622, - "learning_rate": 2.2933506830421436e-06, - "loss": 0.1349, - "step": 1080 - }, - { - "epoch": 5.273170731707317, - "grad_norm": 2.2234909534454346, - "learning_rate": 2.2895328276992325e-06, - "loss": 0.0191, - "step": 1081 - }, - { - "epoch": 5.278048780487805, - "grad_norm": 3.896925926208496, - "learning_rate": 2.28571546664004e-06, - "loss": 0.1961, - "step": 1082 - }, - { - "epoch": 5.282926829268293, - "grad_norm": 2.4134509563446045, - "learning_rate": 2.281898608829665e-06, - "loss": 0.02, - "step": 1083 - }, - { - "epoch": 5.287804878048781, - "grad_norm": 2.7599191665649414, - "learning_rate": 2.2780822632320273e-06, - "loss": 0.0763, - "step": 1084 - }, - { - "epoch": 5.2926829268292686, - "grad_norm": 2.465637683868408, - "learning_rate": 2.2742664388098435e-06, - "loss": 0.0403, - "step": 1085 - }, - { - "epoch": 5.297560975609756, - "grad_norm": 2.4026618003845215, - "learning_rate": 2.270451144524605e-06, - "loss": 0.0982, - "step": 1086 - }, - { - "epoch": 5.302439024390244, - "grad_norm": 3.3339459896087646, - "learning_rate": 2.266636389336559e-06, - "loss": 0.09, - "step": 1087 - }, - { - "epoch": 5.307317073170732, - "grad_norm": 2.113255023956299, - "learning_rate": 2.262822182204686e-06, - "loss": 0.0267, - "step": 1088 - }, - { - "epoch": 5.31219512195122, - "grad_norm": 3.1760852336883545, - "learning_rate": 2.2590085320866798e-06, - "loss": 0.0295, - "step": 1089 - }, - { - "epoch": 5.317073170731708, - "grad_norm": 2.9674434661865234, - "learning_rate": 2.255195447938927e-06, - "loss": 0.0261, - "step": 1090 - }, - { - "epoch": 5.321951219512195, - "grad_norm": 3.4384074211120605, - "learning_rate": 2.251382938716482e-06, - "loss": 0.0936, - "step": 1091 - }, - { - "epoch": 5.326829268292683, - "grad_norm": 3.3814568519592285, - "learning_rate": 2.2475710133730533e-06, - "loss": 0.0426, - "step": 1092 - }, - { - "epoch": 5.331707317073171, - "grad_norm": 3.081317663192749, - "learning_rate": 2.243759680860975e-06, - "loss": 0.0799, - "step": 1093 - }, - { - "epoch": 5.336585365853659, - "grad_norm": 3.5608482360839844, - "learning_rate": 2.2399489501311896e-06, - "loss": 0.0906, - "step": 1094 - }, - { - "epoch": 5.341463414634147, - "grad_norm": 3.7886314392089844, - "learning_rate": 2.2361388301332265e-06, - "loss": 0.2152, - "step": 1095 - }, - { - "epoch": 5.3463414634146345, - "grad_norm": 1.9531102180480957, - "learning_rate": 2.2323293298151817e-06, - "loss": 0.0359, - "step": 1096 - }, - { - "epoch": 5.351219512195122, - "grad_norm": 2.2828023433685303, - "learning_rate": 2.2285204581236937e-06, - "loss": 0.0368, - "step": 1097 - }, - { - "epoch": 5.35609756097561, - "grad_norm": 3.110262870788574, - "learning_rate": 2.2247122240039268e-06, - "loss": 0.0426, - "step": 1098 - }, - { - "epoch": 5.360975609756098, - "grad_norm": 2.3293566703796387, - "learning_rate": 2.2209046363995464e-06, - "loss": 0.0223, - "step": 1099 - }, - { - "epoch": 5.365853658536586, - "grad_norm": 2.990884780883789, - "learning_rate": 2.217097704252701e-06, - "loss": 0.1276, - "step": 1100 - }, - { - "epoch": 5.3707317073170735, - "grad_norm": 2.568014144897461, - "learning_rate": 2.2132914365039993e-06, - "loss": 0.0639, - "step": 1101 - }, - { - "epoch": 5.375609756097561, - "grad_norm": 2.618478536605835, - "learning_rate": 2.2094858420924882e-06, - "loss": 0.0166, - "step": 1102 - }, - { - "epoch": 5.380487804878049, - "grad_norm": 4.526919364929199, - "learning_rate": 2.205680929955635e-06, - "loss": 0.144, - "step": 1103 - }, - { - "epoch": 5.385365853658537, - "grad_norm": 2.7236886024475098, - "learning_rate": 2.201876709029305e-06, - "loss": 0.1004, - "step": 1104 - }, - { - "epoch": 5.390243902439025, - "grad_norm": 2.1577632427215576, - "learning_rate": 2.198073188247738e-06, - "loss": 0.0453, - "step": 1105 - }, - { - "epoch": 5.3951219512195125, - "grad_norm": 2.5170321464538574, - "learning_rate": 2.1942703765435317e-06, - "loss": 0.0195, - "step": 1106 - }, - { - "epoch": 5.4, - "grad_norm": 3.962658643722534, - "learning_rate": 2.190468282847617e-06, - "loss": 0.1512, - "step": 1107 - }, - { - "epoch": 5.404878048780488, - "grad_norm": 4.297860622406006, - "learning_rate": 2.186666916089239e-06, - "loss": 0.2572, - "step": 1108 - }, - { - "epoch": 5.409756097560976, - "grad_norm": 2.8933565616607666, - "learning_rate": 2.1828662851959377e-06, - "loss": 0.0536, - "step": 1109 - }, - { - "epoch": 5.414634146341464, - "grad_norm": 2.9397451877593994, - "learning_rate": 2.1790663990935203e-06, - "loss": 0.0778, - "step": 1110 - }, - { - "epoch": 5.419512195121952, - "grad_norm": 3.5210094451904297, - "learning_rate": 2.1752672667060488e-06, - "loss": 0.0558, - "step": 1111 - }, - { - "epoch": 5.424390243902439, - "grad_norm": 2.9027626514434814, - "learning_rate": 2.1714688969558146e-06, - "loss": 0.041, - "step": 1112 - }, - { - "epoch": 5.429268292682927, - "grad_norm": 3.7691168785095215, - "learning_rate": 2.167671298763316e-06, - "loss": 0.1644, - "step": 1113 - }, - { - "epoch": 5.434146341463415, - "grad_norm": 3.493008852005005, - "learning_rate": 2.1638744810472414e-06, - "loss": 0.1587, - "step": 1114 - }, - { - "epoch": 5.439024390243903, - "grad_norm": 2.711196184158325, - "learning_rate": 2.1600784527244445e-06, - "loss": 0.0605, - "step": 1115 - }, - { - "epoch": 5.443902439024391, - "grad_norm": 4.365038871765137, - "learning_rate": 2.1562832227099266e-06, - "loss": 0.1897, - "step": 1116 - }, - { - "epoch": 5.4487804878048784, - "grad_norm": 4.621466159820557, - "learning_rate": 2.152488799916814e-06, - "loss": 0.1525, - "step": 1117 - }, - { - "epoch": 5.453658536585366, - "grad_norm": 4.8721089363098145, - "learning_rate": 2.148695193256336e-06, - "loss": 0.189, - "step": 1118 - }, - { - "epoch": 5.458536585365854, - "grad_norm": 2.8999173641204834, - "learning_rate": 2.1449024116378064e-06, - "loss": 0.095, - "step": 1119 - }, - { - "epoch": 5.463414634146342, - "grad_norm": 2.4865314960479736, - "learning_rate": 2.1411104639686013e-06, - "loss": 0.0432, - "step": 1120 - }, - { - "epoch": 5.46829268292683, - "grad_norm": 3.8497228622436523, - "learning_rate": 2.137319359154138e-06, - "loss": 0.0954, - "step": 1121 - }, - { - "epoch": 5.473170731707317, - "grad_norm": 2.3643507957458496, - "learning_rate": 2.133529106097853e-06, - "loss": 0.0362, - "step": 1122 - }, - { - "epoch": 5.478048780487805, - "grad_norm": 3.017826795578003, - "learning_rate": 2.1297397137011862e-06, - "loss": 0.0875, - "step": 1123 - }, - { - "epoch": 5.482926829268292, - "grad_norm": 3.239320755004883, - "learning_rate": 2.125951190863551e-06, - "loss": 0.0758, - "step": 1124 - }, - { - "epoch": 5.487804878048781, - "grad_norm": 2.566241979598999, - "learning_rate": 2.1221635464823237e-06, - "loss": 0.0605, - "step": 1125 - }, - { - "epoch": 5.492682926829268, - "grad_norm": 4.810088157653809, - "learning_rate": 2.1183767894528135e-06, - "loss": 0.2403, - "step": 1126 - }, - { - "epoch": 5.4975609756097565, - "grad_norm": 2.083263397216797, - "learning_rate": 2.114590928668249e-06, - "loss": 0.0223, - "step": 1127 - }, - { - "epoch": 5.5024390243902435, - "grad_norm": 2.6812374591827393, - "learning_rate": 2.1108059730197517e-06, - "loss": 0.0617, - "step": 1128 - }, - { - "epoch": 5.507317073170732, - "grad_norm": 3.196735143661499, - "learning_rate": 2.1070219313963173e-06, - "loss": 0.043, - "step": 1129 - }, - { - "epoch": 5.512195121951219, - "grad_norm": 2.775470495223999, - "learning_rate": 2.1032388126847967e-06, - "loss": 0.0595, - "step": 1130 - }, - { - "epoch": 5.517073170731708, - "grad_norm": 2.8632407188415527, - "learning_rate": 2.099456625769872e-06, - "loss": 0.0186, - "step": 1131 - }, - { - "epoch": 5.521951219512195, - "grad_norm": 4.075018405914307, - "learning_rate": 2.0956753795340376e-06, - "loss": 0.0616, - "step": 1132 - }, - { - "epoch": 5.526829268292683, - "grad_norm": 3.206327199935913, - "learning_rate": 2.091895082857578e-06, - "loss": 0.1895, - "step": 1133 - }, - { - "epoch": 5.53170731707317, - "grad_norm": 2.967588186264038, - "learning_rate": 2.0881157446185474e-06, - "loss": 0.0484, - "step": 1134 - }, - { - "epoch": 5.536585365853659, - "grad_norm": 2.850929021835327, - "learning_rate": 2.0843373736927506e-06, - "loss": 0.037, - "step": 1135 - }, - { - "epoch": 5.541463414634146, - "grad_norm": 2.2505147457122803, - "learning_rate": 2.08055997895372e-06, - "loss": 0.0227, - "step": 1136 - }, - { - "epoch": 5.546341463414635, - "grad_norm": 2.5258476734161377, - "learning_rate": 2.0767835692726944e-06, - "loss": 0.0296, - "step": 1137 - }, - { - "epoch": 5.5512195121951216, - "grad_norm": 3.498741388320923, - "learning_rate": 2.0730081535186e-06, - "loss": 0.16, - "step": 1138 - }, - { - "epoch": 5.55609756097561, - "grad_norm": 2.8635222911834717, - "learning_rate": 2.06923374055803e-06, - "loss": 0.0725, - "step": 1139 - }, - { - "epoch": 5.560975609756097, - "grad_norm": 2.2779290676116943, - "learning_rate": 2.0654603392552193e-06, - "loss": 0.0198, - "step": 1140 - }, - { - "epoch": 5.565853658536585, - "grad_norm": 3.1651058197021484, - "learning_rate": 2.0616879584720305e-06, - "loss": 0.1144, - "step": 1141 - }, - { - "epoch": 5.570731707317073, - "grad_norm": 2.4238595962524414, - "learning_rate": 2.057916607067928e-06, - "loss": 0.0491, - "step": 1142 - }, - { - "epoch": 5.575609756097561, - "grad_norm": 2.3248515129089355, - "learning_rate": 2.054146293899957e-06, - "loss": 0.035, - "step": 1143 - }, - { - "epoch": 5.580487804878048, - "grad_norm": 2.9506516456604004, - "learning_rate": 2.0503770278227274e-06, - "loss": 0.0639, - "step": 1144 - }, - { - "epoch": 5.585365853658536, - "grad_norm": 2.6403958797454834, - "learning_rate": 2.0466088176883876e-06, - "loss": 0.0258, - "step": 1145 - }, - { - "epoch": 5.590243902439024, - "grad_norm": 3.150115728378296, - "learning_rate": 2.042841672346608e-06, - "loss": 0.0634, - "step": 1146 - }, - { - "epoch": 5.595121951219512, - "grad_norm": 2.742691993713379, - "learning_rate": 2.039075600644557e-06, - "loss": 0.0464, - "step": 1147 - }, - { - "epoch": 5.6, - "grad_norm": 2.733694076538086, - "learning_rate": 2.0353106114268824e-06, - "loss": 0.0829, - "step": 1148 - }, - { - "epoch": 5.6048780487804875, - "grad_norm": 2.511229991912842, - "learning_rate": 2.031546713535688e-06, - "loss": 0.0321, - "step": 1149 - }, - { - "epoch": 5.609756097560975, - "grad_norm": 3.019669532775879, - "learning_rate": 2.027783915810518e-06, - "loss": 0.05, - "step": 1150 - }, - { - "epoch": 5.614634146341463, - "grad_norm": 3.497159242630005, - "learning_rate": 2.024022227088329e-06, - "loss": 0.1984, - "step": 1151 - }, - { - "epoch": 5.619512195121951, - "grad_norm": 3.4637508392333984, - "learning_rate": 2.020261656203476e-06, - "loss": 0.1673, - "step": 1152 - }, - { - "epoch": 5.624390243902439, - "grad_norm": 2.4312477111816406, - "learning_rate": 2.016502211987687e-06, - "loss": 0.1106, - "step": 1153 - }, - { - "epoch": 5.6292682926829265, - "grad_norm": 2.7801673412323, - "learning_rate": 2.0127439032700446e-06, - "loss": 0.0374, - "step": 1154 - }, - { - "epoch": 5.634146341463414, - "grad_norm": 2.9346680641174316, - "learning_rate": 2.0089867388769664e-06, - "loss": 0.0674, - "step": 1155 - }, - { - "epoch": 5.639024390243902, - "grad_norm": 2.274888277053833, - "learning_rate": 2.0052307276321793e-06, - "loss": 0.0365, - "step": 1156 - }, - { - "epoch": 5.64390243902439, - "grad_norm": 3.069890022277832, - "learning_rate": 2.001475878356703e-06, - "loss": 0.0758, - "step": 1157 - }, - { - "epoch": 5.648780487804878, - "grad_norm": 3.8594915866851807, - "learning_rate": 1.99772219986883e-06, - "loss": 0.176, - "step": 1158 - }, - { - "epoch": 5.6536585365853655, - "grad_norm": 3.4886410236358643, - "learning_rate": 1.9939697009841024e-06, - "loss": 0.0491, - "step": 1159 - }, - { - "epoch": 5.658536585365853, - "grad_norm": 2.697946786880493, - "learning_rate": 1.990218390515291e-06, - "loss": 0.0741, - "step": 1160 - }, - { - "epoch": 5.663414634146341, - "grad_norm": 3.5290887355804443, - "learning_rate": 1.9864682772723757e-06, - "loss": 0.0826, - "step": 1161 - }, - { - "epoch": 5.668292682926829, - "grad_norm": 2.0601298809051514, - "learning_rate": 1.9827193700625274e-06, - "loss": 0.0378, - "step": 1162 - }, - { - "epoch": 5.673170731707317, - "grad_norm": 3.8458635807037354, - "learning_rate": 1.978971677690081e-06, - "loss": 0.2466, - "step": 1163 - }, - { - "epoch": 5.678048780487805, - "grad_norm": 2.788210153579712, - "learning_rate": 1.97522520895652e-06, - "loss": 0.0205, - "step": 1164 - }, - { - "epoch": 5.682926829268292, - "grad_norm": 3.1904587745666504, - "learning_rate": 1.971479972660454e-06, - "loss": 0.0998, - "step": 1165 - }, - { - "epoch": 5.68780487804878, - "grad_norm": 2.4664318561553955, - "learning_rate": 1.967735977597598e-06, - "loss": 0.0217, - "step": 1166 - }, - { - "epoch": 5.692682926829268, - "grad_norm": 2.1392667293548584, - "learning_rate": 1.9639932325607538e-06, - "loss": 0.048, - "step": 1167 - }, - { - "epoch": 5.697560975609756, - "grad_norm": 3.7127058506011963, - "learning_rate": 1.9602517463397845e-06, - "loss": 0.0302, - "step": 1168 - }, - { - "epoch": 5.702439024390244, - "grad_norm": 2.916168689727783, - "learning_rate": 1.9565115277215978e-06, - "loss": 0.0724, - "step": 1169 - }, - { - "epoch": 5.7073170731707314, - "grad_norm": 2.4352428913116455, - "learning_rate": 1.952772585490127e-06, - "loss": 0.0464, - "step": 1170 - }, - { - "epoch": 5.712195121951219, - "grad_norm": 2.8311455249786377, - "learning_rate": 1.9490349284263036e-06, - "loss": 0.0239, - "step": 1171 - }, - { - "epoch": 5.717073170731707, - "grad_norm": 3.3592801094055176, - "learning_rate": 1.9452985653080443e-06, - "loss": 0.0719, - "step": 1172 - }, - { - "epoch": 5.721951219512195, - "grad_norm": 2.450922966003418, - "learning_rate": 1.9415635049102245e-06, - "loss": 0.0408, - "step": 1173 - }, - { - "epoch": 5.726829268292683, - "grad_norm": 4.750118255615234, - "learning_rate": 1.937829756004662e-06, - "loss": 0.2049, - "step": 1174 - }, - { - "epoch": 5.7317073170731705, - "grad_norm": 3.0643811225891113, - "learning_rate": 1.9340973273600944e-06, - "loss": 0.0636, - "step": 1175 - }, - { - "epoch": 5.736585365853658, - "grad_norm": 3.313904047012329, - "learning_rate": 1.930366227742157e-06, - "loss": 0.1252, - "step": 1176 - }, - { - "epoch": 5.741463414634146, - "grad_norm": 3.8996808528900146, - "learning_rate": 1.9266364659133653e-06, - "loss": 0.0687, - "step": 1177 - }, - { - "epoch": 5.746341463414634, - "grad_norm": 2.727555274963379, - "learning_rate": 1.922908050633093e-06, - "loss": 0.0333, - "step": 1178 - }, - { - "epoch": 5.751219512195122, - "grad_norm": 3.270087718963623, - "learning_rate": 1.919180990657551e-06, - "loss": 0.0792, - "step": 1179 - }, - { - "epoch": 5.7560975609756095, - "grad_norm": 2.6631274223327637, - "learning_rate": 1.9154552947397668e-06, - "loss": 0.069, - "step": 1180 - }, - { - "epoch": 5.760975609756097, - "grad_norm": 4.4460554122924805, - "learning_rate": 1.9117309716295658e-06, - "loss": 0.115, - "step": 1181 - }, - { - "epoch": 5.765853658536585, - "grad_norm": 2.5652341842651367, - "learning_rate": 1.9080080300735478e-06, - "loss": 0.0537, - "step": 1182 - }, - { - "epoch": 5.770731707317073, - "grad_norm": 3.046436071395874, - "learning_rate": 1.9042864788150695e-06, - "loss": 0.0817, - "step": 1183 - }, - { - "epoch": 5.775609756097561, - "grad_norm": 2.121629238128662, - "learning_rate": 1.9005663265942206e-06, - "loss": 0.0289, - "step": 1184 - }, - { - "epoch": 5.780487804878049, - "grad_norm": 2.271918535232544, - "learning_rate": 1.8968475821478066e-06, - "loss": 0.0357, - "step": 1185 - }, - { - "epoch": 5.785365853658536, - "grad_norm": 2.582473039627075, - "learning_rate": 1.8931302542093274e-06, - "loss": 0.0584, - "step": 1186 - }, - { - "epoch": 5.790243902439024, - "grad_norm": 2.502952814102173, - "learning_rate": 1.8894143515089539e-06, - "loss": 0.0324, - "step": 1187 - }, - { - "epoch": 5.795121951219512, - "grad_norm": 1.9735453128814697, - "learning_rate": 1.8856998827735118e-06, - "loss": 0.0338, - "step": 1188 - }, - { - "epoch": 5.8, - "grad_norm": 4.441845893859863, - "learning_rate": 1.8819868567264588e-06, - "loss": 0.1706, - "step": 1189 - }, - { - "epoch": 5.804878048780488, - "grad_norm": 2.5450692176818848, - "learning_rate": 1.8782752820878636e-06, - "loss": 0.0463, - "step": 1190 - }, - { - "epoch": 5.809756097560975, - "grad_norm": 3.718183755874634, - "learning_rate": 1.8745651675743876e-06, - "loss": 0.1188, - "step": 1191 - }, - { - "epoch": 5.814634146341463, - "grad_norm": 3.246532678604126, - "learning_rate": 1.870856521899261e-06, - "loss": 0.0984, - "step": 1192 - }, - { - "epoch": 5.819512195121951, - "grad_norm": 2.9522783756256104, - "learning_rate": 1.867149353772267e-06, - "loss": 0.0195, - "step": 1193 - }, - { - "epoch": 5.824390243902439, - "grad_norm": 2.3266429901123047, - "learning_rate": 1.863443671899717e-06, - "loss": 0.0236, - "step": 1194 - }, - { - "epoch": 5.829268292682927, - "grad_norm": 3.696749448776245, - "learning_rate": 1.8597394849844319e-06, - "loss": 0.1108, - "step": 1195 - }, - { - "epoch": 5.8341463414634145, - "grad_norm": 2.375624179840088, - "learning_rate": 1.8560368017257229e-06, - "loss": 0.0388, - "step": 1196 - }, - { - "epoch": 5.839024390243902, - "grad_norm": 4.0437092781066895, - "learning_rate": 1.8523356308193696e-06, - "loss": 0.3098, - "step": 1197 - }, - { - "epoch": 5.84390243902439, - "grad_norm": 3.165165424346924, - "learning_rate": 1.8486359809575977e-06, - "loss": 0.0775, - "step": 1198 - }, - { - "epoch": 5.848780487804878, - "grad_norm": 4.1991190910339355, - "learning_rate": 1.8449378608290638e-06, - "loss": 0.1222, - "step": 1199 - }, - { - "epoch": 5.853658536585366, - "grad_norm": 4.6657819747924805, - "learning_rate": 1.8412412791188306e-06, - "loss": 0.1146, - "step": 1200 - }, - { - "epoch": 5.8585365853658535, - "grad_norm": 4.569516181945801, - "learning_rate": 1.8375462445083464e-06, - "loss": 0.1113, - "step": 1201 - }, - { - "epoch": 5.863414634146341, - "grad_norm": 3.1565654277801514, - "learning_rate": 1.8338527656754285e-06, - "loss": 0.0416, - "step": 1202 - }, - { - "epoch": 5.868292682926829, - "grad_norm": 3.3474619388580322, - "learning_rate": 1.830160851294239e-06, - "loss": 0.0613, - "step": 1203 - }, - { - "epoch": 5.873170731707317, - "grad_norm": 4.30797004699707, - "learning_rate": 1.8264705100352662e-06, - "loss": 0.197, - "step": 1204 - }, - { - "epoch": 5.878048780487805, - "grad_norm": 2.7259573936462402, - "learning_rate": 1.8227817505653045e-06, - "loss": 0.0821, - "step": 1205 - }, - { - "epoch": 5.882926829268293, - "grad_norm": 3.515812873840332, - "learning_rate": 1.8190945815474323e-06, - "loss": 0.1246, - "step": 1206 - }, - { - "epoch": 5.88780487804878, - "grad_norm": 2.9223313331604004, - "learning_rate": 1.8154090116409934e-06, - "loss": 0.0703, - "step": 1207 - }, - { - "epoch": 5.892682926829268, - "grad_norm": 3.9529640674591064, - "learning_rate": 1.811725049501577e-06, - "loss": 0.1078, - "step": 1208 - }, - { - "epoch": 5.897560975609756, - "grad_norm": 4.1674580574035645, - "learning_rate": 1.8080427037809941e-06, - "loss": 0.1648, - "step": 1209 - }, - { - "epoch": 5.902439024390244, - "grad_norm": 3.1308021545410156, - "learning_rate": 1.8043619831272623e-06, - "loss": 0.061, - "step": 1210 - }, - { - "epoch": 5.907317073170732, - "grad_norm": 3.9667179584503174, - "learning_rate": 1.8006828961845807e-06, - "loss": 0.1863, - "step": 1211 - }, - { - "epoch": 5.912195121951219, - "grad_norm": 5.438168048858643, - "learning_rate": 1.7970054515933124e-06, - "loss": 0.2387, - "step": 1212 - }, - { - "epoch": 5.917073170731707, - "grad_norm": 5.505797863006592, - "learning_rate": 1.793329657989964e-06, - "loss": 0.2053, - "step": 1213 - }, - { - "epoch": 5.921951219512195, - "grad_norm": 2.8043150901794434, - "learning_rate": 1.7896555240071627e-06, - "loss": 0.026, - "step": 1214 - }, - { - "epoch": 5.926829268292683, - "grad_norm": 2.836164712905884, - "learning_rate": 1.7859830582736406e-06, - "loss": 0.0735, - "step": 1215 - }, - { - "epoch": 5.931707317073171, - "grad_norm": 2.8286306858062744, - "learning_rate": 1.782312269414211e-06, - "loss": 0.0586, - "step": 1216 - }, - { - "epoch": 5.9365853658536585, - "grad_norm": 4.4354329109191895, - "learning_rate": 1.7786431660497474e-06, - "loss": 0.3086, - "step": 1217 - }, - { - "epoch": 5.941463414634146, - "grad_norm": 4.0963640213012695, - "learning_rate": 1.7749757567971678e-06, - "loss": 0.0978, - "step": 1218 - }, - { - "epoch": 5.946341463414634, - "grad_norm": 2.726062536239624, - "learning_rate": 1.7713100502694091e-06, - "loss": 0.0976, - "step": 1219 - }, - { - "epoch": 5.951219512195122, - "grad_norm": 2.6566951274871826, - "learning_rate": 1.7676460550754104e-06, - "loss": 0.02, - "step": 1220 - }, - { - "epoch": 5.95609756097561, - "grad_norm": 2.7710952758789062, - "learning_rate": 1.7639837798200923e-06, - "loss": 0.0741, - "step": 1221 - }, - { - "epoch": 5.9609756097560975, - "grad_norm": 2.3678600788116455, - "learning_rate": 1.7603232331043346e-06, - "loss": 0.0542, - "step": 1222 - }, - { - "epoch": 5.965853658536585, - "grad_norm": 6.45259428024292, - "learning_rate": 1.7566644235249591e-06, - "loss": 0.3552, - "step": 1223 - }, - { - "epoch": 5.970731707317073, - "grad_norm": 1.8916475772857666, - "learning_rate": 1.7530073596747072e-06, - "loss": 0.0405, - "step": 1224 - }, - { - "epoch": 5.975609756097561, - "grad_norm": 2.1637566089630127, - "learning_rate": 1.74935205014222e-06, - "loss": 0.0178, - "step": 1225 - }, - { - "epoch": 5.980487804878049, - "grad_norm": 2.5959200859069824, - "learning_rate": 1.7456985035120194e-06, - "loss": 0.0264, - "step": 1226 - }, - { - "epoch": 5.985365853658537, - "grad_norm": 2.50264573097229, - "learning_rate": 1.7420467283644877e-06, - "loss": 0.0555, - "step": 1227 - }, - { - "epoch": 5.990243902439024, - "grad_norm": 2.4692020416259766, - "learning_rate": 1.738396733275844e-06, - "loss": 0.0546, - "step": 1228 - }, - { - "epoch": 5.995121951219512, - "grad_norm": 5.540846824645996, - "learning_rate": 1.7347485268181309e-06, - "loss": 0.1967, - "step": 1229 - }, - { - "epoch": 6.0, - "grad_norm": 1.8322839736938477, - "learning_rate": 1.7311021175591868e-06, - "loss": 0.0491, - "step": 1230 - }, - { - "epoch": 6.004878048780488, - "grad_norm": 2.719622850418091, - "learning_rate": 1.7274575140626318e-06, - "loss": 0.0359, - "step": 1231 - }, - { - "epoch": 6.009756097560976, - "grad_norm": 2.859675884246826, - "learning_rate": 1.7238147248878444e-06, - "loss": 0.0585, - "step": 1232 - }, - { - "epoch": 6.014634146341463, - "grad_norm": 1.6761114597320557, - "learning_rate": 1.7201737585899415e-06, - "loss": 0.0188, - "step": 1233 - }, - { - "epoch": 6.019512195121951, - "grad_norm": 2.1588776111602783, - "learning_rate": 1.7165346237197594e-06, - "loss": 0.0484, - "step": 1234 - }, - { - "epoch": 6.024390243902439, - "grad_norm": 4.209983825683594, - "learning_rate": 1.7128973288238344e-06, - "loss": 0.0776, - "step": 1235 - }, - { - "epoch": 6.029268292682927, - "grad_norm": 2.3979365825653076, - "learning_rate": 1.709261882444379e-06, - "loss": 0.0338, - "step": 1236 - }, - { - "epoch": 6.034146341463415, - "grad_norm": 3.0030531883239746, - "learning_rate": 1.705628293119268e-06, - "loss": 0.0385, - "step": 1237 - }, - { - "epoch": 6.0390243902439025, - "grad_norm": 9.65616512298584, - "learning_rate": 1.701996569382011e-06, - "loss": 0.2601, - "step": 1238 - }, - { - "epoch": 6.04390243902439, - "grad_norm": 3.0590052604675293, - "learning_rate": 1.6983667197617386e-06, - "loss": 0.034, - "step": 1239 - }, - { - "epoch": 6.048780487804878, - "grad_norm": 3.6949822902679443, - "learning_rate": 1.6947387527831813e-06, - "loss": 0.0155, - "step": 1240 - }, - { - "epoch": 6.053658536585366, - "grad_norm": 1.2870460748672485, - "learning_rate": 1.6911126769666442e-06, - "loss": 0.0078, - "step": 1241 - }, - { - "epoch": 6.058536585365854, - "grad_norm": 4.307460784912109, - "learning_rate": 1.6874885008279945e-06, - "loss": 0.1429, - "step": 1242 - }, - { - "epoch": 6.0634146341463415, - "grad_norm": 2.334972858428955, - "learning_rate": 1.683866232878637e-06, - "loss": 0.0123, - "step": 1243 - }, - { - "epoch": 6.068292682926829, - "grad_norm": 2.4121835231781006, - "learning_rate": 1.6802458816254941e-06, - "loss": 0.0139, - "step": 1244 - }, - { - "epoch": 6.073170731707317, - "grad_norm": 1.9224514961242676, - "learning_rate": 1.676627455570988e-06, - "loss": 0.0312, - "step": 1245 - }, - { - "epoch": 6.078048780487805, - "grad_norm": 2.8293309211730957, - "learning_rate": 1.6730109632130199e-06, - "loss": 0.0464, - "step": 1246 - }, - { - "epoch": 6.082926829268293, - "grad_norm": 1.6368179321289062, - "learning_rate": 1.6693964130449472e-06, - "loss": 0.0085, - "step": 1247 - }, - { - "epoch": 6.087804878048781, - "grad_norm": 2.5535073280334473, - "learning_rate": 1.6657838135555696e-06, - "loss": 0.0482, - "step": 1248 - }, - { - "epoch": 6.092682926829268, - "grad_norm": 3.7743096351623535, - "learning_rate": 1.6621731732291024e-06, - "loss": 0.0235, - "step": 1249 - }, - { - "epoch": 6.097560975609756, - "grad_norm": 2.9921820163726807, - "learning_rate": 1.6585645005451623e-06, - "loss": 0.0455, - "step": 1250 - }, - { - "epoch": 6.102439024390244, - "grad_norm": 2.369581937789917, - "learning_rate": 1.6549578039787436e-06, - "loss": 0.0499, - "step": 1251 - }, - { - "epoch": 6.107317073170732, - "grad_norm": 2.163815498352051, - "learning_rate": 1.6513530920001998e-06, - "loss": 0.0118, - "step": 1252 - }, - { - "epoch": 6.11219512195122, - "grad_norm": 2.034928560256958, - "learning_rate": 1.6477503730752237e-06, - "loss": 0.0189, - "step": 1253 - }, - { - "epoch": 6.117073170731707, - "grad_norm": 2.7306160926818848, - "learning_rate": 1.6441496556648278e-06, - "loss": 0.0492, - "step": 1254 - }, - { - "epoch": 6.121951219512195, - "grad_norm": 3.7521040439605713, - "learning_rate": 1.6405509482253234e-06, - "loss": 0.1717, - "step": 1255 - }, - { - "epoch": 6.126829268292683, - "grad_norm": 1.8965831995010376, - "learning_rate": 1.636954259208302e-06, - "loss": 0.0194, - "step": 1256 - }, - { - "epoch": 6.131707317073171, - "grad_norm": 3.010024070739746, - "learning_rate": 1.6333595970606143e-06, - "loss": 0.0334, - "step": 1257 - }, - { - "epoch": 6.136585365853659, - "grad_norm": 3.7091450691223145, - "learning_rate": 1.62976697022435e-06, - "loss": 0.0705, - "step": 1258 - }, - { - "epoch": 6.1414634146341465, - "grad_norm": 3.5719785690307617, - "learning_rate": 1.6261763871368225e-06, - "loss": 0.0322, - "step": 1259 - }, - { - "epoch": 6.146341463414634, - "grad_norm": 3.3224213123321533, - "learning_rate": 1.6225878562305403e-06, - "loss": 0.0653, - "step": 1260 - }, - { - "epoch": 6.151219512195122, - "grad_norm": 3.78924822807312, - "learning_rate": 1.6190013859331958e-06, - "loss": 0.0557, - "step": 1261 - }, - { - "epoch": 6.15609756097561, - "grad_norm": 2.429412841796875, - "learning_rate": 1.6154169846676415e-06, - "loss": 0.0277, - "step": 1262 - }, - { - "epoch": 6.160975609756098, - "grad_norm": 2.626167058944702, - "learning_rate": 1.6118346608518698e-06, - "loss": 0.0305, - "step": 1263 - }, - { - "epoch": 6.1658536585365855, - "grad_norm": 2.44846248626709, - "learning_rate": 1.6082544228989958e-06, - "loss": 0.0093, - "step": 1264 - }, - { - "epoch": 6.170731707317073, - "grad_norm": 2.9345643520355225, - "learning_rate": 1.6046762792172336e-06, - "loss": 0.0198, - "step": 1265 - }, - { - "epoch": 6.175609756097561, - "grad_norm": 3.224313497543335, - "learning_rate": 1.6011002382098806e-06, - "loss": 0.0673, - "step": 1266 - }, - { - "epoch": 6.180487804878049, - "grad_norm": 1.9066869020462036, - "learning_rate": 1.5975263082752968e-06, - "loss": 0.0115, - "step": 1267 - }, - { - "epoch": 6.185365853658537, - "grad_norm": 2.7153308391571045, - "learning_rate": 1.5939544978068816e-06, - "loss": 0.0529, - "step": 1268 - }, - { - "epoch": 6.190243902439025, - "grad_norm": 2.2173709869384766, - "learning_rate": 1.590384815193059e-06, - "loss": 0.0643, - "step": 1269 - }, - { - "epoch": 6.195121951219512, - "grad_norm": 3.1238555908203125, - "learning_rate": 1.5868172688172559e-06, - "loss": 0.064, - "step": 1270 - }, - { - "epoch": 6.2, - "grad_norm": 2.7765870094299316, - "learning_rate": 1.5832518670578802e-06, - "loss": 0.0676, - "step": 1271 - }, - { - "epoch": 6.204878048780488, - "grad_norm": 2.9892525672912598, - "learning_rate": 1.5796886182883053e-06, - "loss": 0.074, - "step": 1272 - }, - { - "epoch": 6.209756097560976, - "grad_norm": 2.0955512523651123, - "learning_rate": 1.5761275308768476e-06, - "loss": 0.0311, - "step": 1273 - }, - { - "epoch": 6.214634146341464, - "grad_norm": 1.8085861206054688, - "learning_rate": 1.5725686131867462e-06, - "loss": 0.0108, - "step": 1274 - }, - { - "epoch": 6.219512195121951, - "grad_norm": 3.026421308517456, - "learning_rate": 1.569011873576147e-06, - "loss": 0.0464, - "step": 1275 - }, - { - "epoch": 6.224390243902439, - "grad_norm": 2.3395111560821533, - "learning_rate": 1.5654573203980782e-06, - "loss": 0.0221, - "step": 1276 - }, - { - "epoch": 6.229268292682927, - "grad_norm": 3.6158692836761475, - "learning_rate": 1.5619049620004354e-06, - "loss": 0.0693, - "step": 1277 - }, - { - "epoch": 6.234146341463415, - "grad_norm": 1.6186567544937134, - "learning_rate": 1.5583548067259584e-06, - "loss": 0.0198, - "step": 1278 - }, - { - "epoch": 6.239024390243903, - "grad_norm": 2.7193195819854736, - "learning_rate": 1.5548068629122126e-06, - "loss": 0.0687, - "step": 1279 - }, - { - "epoch": 6.2439024390243905, - "grad_norm": 2.7472658157348633, - "learning_rate": 1.5512611388915711e-06, - "loss": 0.053, - "step": 1280 - }, - { - "epoch": 6.248780487804878, - "grad_norm": 4.694706439971924, - "learning_rate": 1.5477176429911934e-06, - "loss": 0.2076, - "step": 1281 - }, - { - "epoch": 6.253658536585366, - "grad_norm": 1.609309434890747, - "learning_rate": 1.5441763835330048e-06, - "loss": 0.0108, - "step": 1282 - }, - { - "epoch": 6.258536585365854, - "grad_norm": 1.7064504623413086, - "learning_rate": 1.5406373688336807e-06, - "loss": 0.0114, - "step": 1283 - }, - { - "epoch": 6.263414634146342, - "grad_norm": 1.967726469039917, - "learning_rate": 1.5371006072046225e-06, - "loss": 0.0209, - "step": 1284 - }, - { - "epoch": 6.2682926829268295, - "grad_norm": 2.4065544605255127, - "learning_rate": 1.5335661069519408e-06, - "loss": 0.0741, - "step": 1285 - }, - { - "epoch": 6.273170731707317, - "grad_norm": 2.2167603969573975, - "learning_rate": 1.5300338763764371e-06, - "loss": 0.0121, - "step": 1286 - }, - { - "epoch": 6.278048780487805, - "grad_norm": 3.229228973388672, - "learning_rate": 1.5265039237735804e-06, - "loss": 0.0226, - "step": 1287 - }, - { - "epoch": 6.282926829268293, - "grad_norm": 1.889419674873352, - "learning_rate": 1.5229762574334903e-06, - "loss": 0.0116, - "step": 1288 - }, - { - "epoch": 6.287804878048781, - "grad_norm": 3.7595815658569336, - "learning_rate": 1.5194508856409181e-06, - "loss": 0.0775, - "step": 1289 - }, - { - "epoch": 6.2926829268292686, - "grad_norm": 2.527560234069824, - "learning_rate": 1.515927816675225e-06, - "loss": 0.0355, - "step": 1290 - }, - { - "epoch": 6.297560975609756, - "grad_norm": 1.9718955755233765, - "learning_rate": 1.5124070588103648e-06, - "loss": 0.0127, - "step": 1291 - }, - { - "epoch": 6.302439024390244, - "grad_norm": 1.9010120630264282, - "learning_rate": 1.5088886203148643e-06, - "loss": 0.0188, - "step": 1292 - }, - { - "epoch": 6.307317073170732, - "grad_norm": 3.2093472480773926, - "learning_rate": 1.505372509451801e-06, - "loss": 0.0845, - "step": 1293 - }, - { - "epoch": 6.31219512195122, - "grad_norm": 1.6723257303237915, - "learning_rate": 1.5018587344787888e-06, - "loss": 0.0265, - "step": 1294 - }, - { - "epoch": 6.317073170731708, - "grad_norm": 3.246812343597412, - "learning_rate": 1.498347303647953e-06, - "loss": 0.0833, - "step": 1295 - }, - { - "epoch": 6.321951219512195, - "grad_norm": 2.887834072113037, - "learning_rate": 1.4948382252059158e-06, - "loss": 0.0416, - "step": 1296 - }, - { - "epoch": 6.326829268292683, - "grad_norm": 2.5762557983398438, - "learning_rate": 1.4913315073937742e-06, - "loss": 0.0614, - "step": 1297 - }, - { - "epoch": 6.331707317073171, - "grad_norm": 3.3746497631073, - "learning_rate": 1.4878271584470805e-06, - "loss": 0.0601, - "step": 1298 - }, - { - "epoch": 6.336585365853659, - "grad_norm": 2.4984664916992188, - "learning_rate": 1.4843251865958242e-06, - "loss": 0.0189, - "step": 1299 - }, - { - "epoch": 6.341463414634147, - "grad_norm": 3.178300619125366, - "learning_rate": 1.4808256000644128e-06, - "loss": 0.038, - "step": 1300 - }, - { - "epoch": 6.3463414634146345, - "grad_norm": 2.6362273693084717, - "learning_rate": 1.4773284070716504e-06, - "loss": 0.041, - "step": 1301 - }, - { - "epoch": 6.351219512195122, - "grad_norm": 2.1512129306793213, - "learning_rate": 1.473833615830722e-06, - "loss": 0.0227, - "step": 1302 - }, - { - "epoch": 6.35609756097561, - "grad_norm": 2.2898178100585938, - "learning_rate": 1.4703412345491692e-06, - "loss": 0.039, - "step": 1303 - }, - { - "epoch": 6.360975609756098, - "grad_norm": 2.6641080379486084, - "learning_rate": 1.4668512714288763e-06, - "loss": 0.0431, - "step": 1304 - }, - { - "epoch": 6.365853658536586, - "grad_norm": 1.7466667890548706, - "learning_rate": 1.4633637346660478e-06, - "loss": 0.013, - "step": 1305 - }, - { - "epoch": 6.3707317073170735, - "grad_norm": 2.437889575958252, - "learning_rate": 1.4598786324511892e-06, - "loss": 0.0181, - "step": 1306 - }, - { - "epoch": 6.375609756097561, - "grad_norm": 2.5054142475128174, - "learning_rate": 1.456395972969089e-06, - "loss": 0.0248, - "step": 1307 - }, - { - "epoch": 6.380487804878049, - "grad_norm": 3.2294511795043945, - "learning_rate": 1.4529157643987995e-06, - "loss": 0.0561, - "step": 1308 - }, - { - "epoch": 6.385365853658537, - "grad_norm": 2.260188341140747, - "learning_rate": 1.4494380149136162e-06, - "loss": 0.0593, - "step": 1309 - }, - { - "epoch": 6.390243902439025, - "grad_norm": 2.4961163997650146, - "learning_rate": 1.4459627326810576e-06, - "loss": 0.0257, - "step": 1310 - }, - { - "epoch": 6.3951219512195125, - "grad_norm": 3.4153239727020264, - "learning_rate": 1.4424899258628533e-06, - "loss": 0.0223, - "step": 1311 - }, - { - "epoch": 6.4, - "grad_norm": 2.6308839321136475, - "learning_rate": 1.439019602614914e-06, - "loss": 0.0112, - "step": 1312 - }, - { - "epoch": 6.404878048780488, - "grad_norm": 2.754530191421509, - "learning_rate": 1.4355517710873184e-06, - "loss": 0.068, - "step": 1313 - }, - { - "epoch": 6.409756097560976, - "grad_norm": 4.473151683807373, - "learning_rate": 1.432086439424297e-06, - "loss": 0.0825, - "step": 1314 - }, - { - "epoch": 6.414634146341464, - "grad_norm": 4.85701322555542, - "learning_rate": 1.428623615764206e-06, - "loss": 0.1812, - "step": 1315 - }, - { - "epoch": 6.419512195121952, - "grad_norm": 1.6678224802017212, - "learning_rate": 1.4251633082395117e-06, - "loss": 0.0207, - "step": 1316 - }, - { - "epoch": 6.424390243902439, - "grad_norm": 2.9730937480926514, - "learning_rate": 1.4217055249767734e-06, - "loss": 0.0617, - "step": 1317 - }, - { - "epoch": 6.429268292682927, - "grad_norm": 2.503786563873291, - "learning_rate": 1.4182502740966203e-06, - "loss": 0.0137, - "step": 1318 - }, - { - "epoch": 6.434146341463415, - "grad_norm": 3.0798017978668213, - "learning_rate": 1.4147975637137334e-06, - "loss": 0.0329, - "step": 1319 - }, - { - "epoch": 6.439024390243903, - "grad_norm": 3.008155345916748, - "learning_rate": 1.411347401936831e-06, - "loss": 0.0487, - "step": 1320 - }, - { - "epoch": 6.443902439024391, - "grad_norm": 2.5451765060424805, - "learning_rate": 1.4078997968686425e-06, - "loss": 0.0582, - "step": 1321 - }, - { - "epoch": 6.4487804878048784, - "grad_norm": 2.042696475982666, - "learning_rate": 1.404454756605893e-06, - "loss": 0.0336, - "step": 1322 - }, - { - "epoch": 6.453658536585366, - "grad_norm": 3.0421411991119385, - "learning_rate": 1.4010122892392872e-06, - "loss": 0.1372, - "step": 1323 - }, - { - "epoch": 6.458536585365854, - "grad_norm": 2.0793251991271973, - "learning_rate": 1.3975724028534842e-06, - "loss": 0.0452, - "step": 1324 - }, - { - "epoch": 6.463414634146342, - "grad_norm": 2.6149914264678955, - "learning_rate": 1.394135105527083e-06, - "loss": 0.0431, - "step": 1325 - }, - { - "epoch": 6.46829268292683, - "grad_norm": 2.818507671356201, - "learning_rate": 1.3907004053326006e-06, - "loss": 0.0242, - "step": 1326 - }, - { - "epoch": 6.473170731707317, - "grad_norm": 2.328993558883667, - "learning_rate": 1.387268310336458e-06, - "loss": 0.0293, - "step": 1327 - }, - { - "epoch": 6.478048780487805, - "grad_norm": 2.2032642364501953, - "learning_rate": 1.3838388285989552e-06, - "loss": 0.0232, - "step": 1328 - }, - { - "epoch": 6.482926829268292, - "grad_norm": 2.039983034133911, - "learning_rate": 1.380411968174254e-06, - "loss": 0.0256, - "step": 1329 - }, - { - "epoch": 6.487804878048781, - "grad_norm": 3.7261271476745605, - "learning_rate": 1.3769877371103635e-06, - "loss": 0.1285, - "step": 1330 - }, - { - "epoch": 6.492682926829268, - "grad_norm": 3.7156264781951904, - "learning_rate": 1.373566143449115e-06, - "loss": 0.1621, - "step": 1331 - }, - { - "epoch": 6.4975609756097565, - "grad_norm": 1.5905455350875854, - "learning_rate": 1.3701471952261457e-06, - "loss": 0.0126, - "step": 1332 - }, - { - "epoch": 6.5024390243902435, - "grad_norm": 2.8808465003967285, - "learning_rate": 1.3667309004708832e-06, - "loss": 0.0211, - "step": 1333 - }, - { - "epoch": 6.507317073170732, - "grad_norm": 3.9190757274627686, - "learning_rate": 1.3633172672065195e-06, - "loss": 0.062, - "step": 1334 - }, - { - "epoch": 6.512195121951219, - "grad_norm": 1.6948635578155518, - "learning_rate": 1.359906303449997e-06, - "loss": 0.0126, - "step": 1335 - }, - { - "epoch": 6.517073170731708, - "grad_norm": 2.3967642784118652, - "learning_rate": 1.3564980172119913e-06, - "loss": 0.0111, - "step": 1336 - }, - { - "epoch": 6.521951219512195, - "grad_norm": 3.5275399684906006, - "learning_rate": 1.3530924164968873e-06, - "loss": 0.1024, - "step": 1337 - }, - { - "epoch": 6.526829268292683, - "grad_norm": 2.0768814086914062, - "learning_rate": 1.3496895093027617e-06, - "loss": 0.0254, - "step": 1338 - }, - { - "epoch": 6.53170731707317, - "grad_norm": 1.8964029550552368, - "learning_rate": 1.3462893036213706e-06, - "loss": 0.0188, - "step": 1339 - }, - { - "epoch": 6.536585365853659, - "grad_norm": 1.679545283317566, - "learning_rate": 1.3428918074381203e-06, - "loss": 0.0195, - "step": 1340 - }, - { - "epoch": 6.541463414634146, - "grad_norm": 2.204637050628662, - "learning_rate": 1.3394970287320553e-06, - "loss": 0.0317, - "step": 1341 - }, - { - "epoch": 6.546341463414635, - "grad_norm": 2.014052629470825, - "learning_rate": 1.3361049754758404e-06, - "loss": 0.0191, - "step": 1342 - }, - { - "epoch": 6.5512195121951216, - "grad_norm": 1.4630589485168457, - "learning_rate": 1.3327156556357369e-06, - "loss": 0.0079, - "step": 1343 - }, - { - "epoch": 6.55609756097561, - "grad_norm": 2.876132011413574, - "learning_rate": 1.3293290771715875e-06, - "loss": 0.0345, - "step": 1344 - }, - { - "epoch": 6.560975609756097, - "grad_norm": 1.793338656425476, - "learning_rate": 1.3259452480367963e-06, - "loss": 0.0409, - "step": 1345 - }, - { - "epoch": 6.565853658536585, - "grad_norm": 2.2791552543640137, - "learning_rate": 1.3225641761783126e-06, - "loss": 0.0494, - "step": 1346 - }, - { - "epoch": 6.570731707317073, - "grad_norm": 4.255206108093262, - "learning_rate": 1.3191858695366084e-06, - "loss": 0.0842, - "step": 1347 - }, - { - "epoch": 6.575609756097561, - "grad_norm": 2.449460506439209, - "learning_rate": 1.3158103360456603e-06, - "loss": 0.0399, - "step": 1348 - }, - { - "epoch": 6.580487804878048, - "grad_norm": 2.780730724334717, - "learning_rate": 1.3124375836329362e-06, - "loss": 0.0272, - "step": 1349 - }, - { - "epoch": 6.585365853658536, - "grad_norm": 1.925681233406067, - "learning_rate": 1.3090676202193692e-06, - "loss": 0.007, - "step": 1350 - }, - { - "epoch": 6.590243902439024, - "grad_norm": 2.069791555404663, - "learning_rate": 1.3057004537193424e-06, - "loss": 0.016, - "step": 1351 - }, - { - "epoch": 6.595121951219512, - "grad_norm": 1.863872766494751, - "learning_rate": 1.302336092040673e-06, - "loss": 0.016, - "step": 1352 - }, - { - "epoch": 6.6, - "grad_norm": 2.351259231567383, - "learning_rate": 1.298974543084589e-06, - "loss": 0.0172, - "step": 1353 - }, - { - "epoch": 6.6048780487804875, - "grad_norm": 1.848115086555481, - "learning_rate": 1.2956158147457116e-06, - "loss": 0.0412, - "step": 1354 - }, - { - "epoch": 6.609756097560975, - "grad_norm": 1.6395928859710693, - "learning_rate": 1.2922599149120412e-06, - "loss": 0.0181, - "step": 1355 - }, - { - "epoch": 6.614634146341463, - "grad_norm": 2.1267426013946533, - "learning_rate": 1.2889068514649328e-06, - "loss": 0.04, - "step": 1356 - }, - { - "epoch": 6.619512195121951, - "grad_norm": 1.6603496074676514, - "learning_rate": 1.2855566322790796e-06, - "loss": 0.0108, - "step": 1357 - }, - { - "epoch": 6.624390243902439, - "grad_norm": 2.2724838256835938, - "learning_rate": 1.2822092652224989e-06, - "loss": 0.0284, - "step": 1358 - }, - { - "epoch": 6.6292682926829265, - "grad_norm": 2.222623825073242, - "learning_rate": 1.2788647581565048e-06, - "loss": 0.0128, - "step": 1359 - }, - { - "epoch": 6.634146341463414, - "grad_norm": 2.710681676864624, - "learning_rate": 1.275523118935697e-06, - "loss": 0.0184, - "step": 1360 - }, - { - "epoch": 6.639024390243902, - "grad_norm": 2.354264736175537, - "learning_rate": 1.2721843554079418e-06, - "loss": 0.0313, - "step": 1361 - }, - { - "epoch": 6.64390243902439, - "grad_norm": 3.886909008026123, - "learning_rate": 1.2688484754143493e-06, - "loss": 0.1184, - "step": 1362 - }, - { - "epoch": 6.648780487804878, - "grad_norm": 3.088468313217163, - "learning_rate": 1.2655154867892577e-06, - "loss": 0.0353, - "step": 1363 - }, - { - "epoch": 6.6536585365853655, - "grad_norm": 2.987576484680176, - "learning_rate": 1.2621853973602158e-06, - "loss": 0.0349, - "step": 1364 - }, - { - "epoch": 6.658536585365853, - "grad_norm": 1.719212293624878, - "learning_rate": 1.2588582149479645e-06, - "loss": 0.0081, - "step": 1365 - }, - { - "epoch": 6.663414634146341, - "grad_norm": 2.1641178131103516, - "learning_rate": 1.2555339473664151e-06, - "loss": 0.0279, - "step": 1366 - }, - { - "epoch": 6.668292682926829, - "grad_norm": 2.9424984455108643, - "learning_rate": 1.2522126024226347e-06, - "loss": 0.0492, - "step": 1367 - }, - { - "epoch": 6.673170731707317, - "grad_norm": 1.961077332496643, - "learning_rate": 1.2488941879168278e-06, - "loss": 0.0084, - "step": 1368 - }, - { - "epoch": 6.678048780487805, - "grad_norm": 2.302565097808838, - "learning_rate": 1.2455787116423148e-06, - "loss": 0.0486, - "step": 1369 - }, - { - "epoch": 6.682926829268292, - "grad_norm": 2.187194347381592, - "learning_rate": 1.2422661813855158e-06, - "loss": 0.0319, - "step": 1370 - }, - { - "epoch": 6.68780487804878, - "grad_norm": 2.0076377391815186, - "learning_rate": 1.238956604925934e-06, - "loss": 0.016, - "step": 1371 - }, - { - "epoch": 6.692682926829268, - "grad_norm": 4.137681484222412, - "learning_rate": 1.2356499900361333e-06, - "loss": 0.0557, - "step": 1372 - }, - { - "epoch": 6.697560975609756, - "grad_norm": 2.0039637088775635, - "learning_rate": 1.2323463444817227e-06, - "loss": 0.0219, - "step": 1373 - }, - { - "epoch": 6.702439024390244, - "grad_norm": 2.943314552307129, - "learning_rate": 1.2290456760213405e-06, - "loss": 0.0849, - "step": 1374 - }, - { - "epoch": 6.7073170731707314, - "grad_norm": 2.715120553970337, - "learning_rate": 1.2257479924066296e-06, - "loss": 0.0857, - "step": 1375 - }, - { - "epoch": 6.712195121951219, - "grad_norm": 3.144104480743408, - "learning_rate": 1.2224533013822237e-06, - "loss": 0.0648, - "step": 1376 - }, - { - "epoch": 6.717073170731707, - "grad_norm": 2.830066680908203, - "learning_rate": 1.2191616106857312e-06, - "loss": 0.0426, - "step": 1377 - }, - { - "epoch": 6.721951219512195, - "grad_norm": 3.1005899906158447, - "learning_rate": 1.2158729280477112e-06, - "loss": 0.0478, - "step": 1378 - }, - { - "epoch": 6.726829268292683, - "grad_norm": 2.2102460861206055, - "learning_rate": 1.2125872611916578e-06, - "loss": 0.0273, - "step": 1379 - }, - { - "epoch": 6.7317073170731705, - "grad_norm": 2.860288619995117, - "learning_rate": 1.2093046178339869e-06, - "loss": 0.0201, - "step": 1380 - }, - { - "epoch": 6.736585365853658, - "grad_norm": 1.5914067029953003, - "learning_rate": 1.206025005684009e-06, - "loss": 0.0148, - "step": 1381 - }, - { - "epoch": 6.741463414634146, - "grad_norm": 1.8609223365783691, - "learning_rate": 1.202748432443918e-06, - "loss": 0.0073, - "step": 1382 - }, - { - "epoch": 6.746341463414634, - "grad_norm": 3.0532407760620117, - "learning_rate": 1.1994749058087695e-06, - "loss": 0.0344, - "step": 1383 - }, - { - "epoch": 6.751219512195122, - "grad_norm": 4.0601677894592285, - "learning_rate": 1.196204433466467e-06, - "loss": 0.0837, - "step": 1384 - }, - { - "epoch": 6.7560975609756095, - "grad_norm": 2.6982672214508057, - "learning_rate": 1.192937023097738e-06, - "loss": 0.0425, - "step": 1385 - }, - { - "epoch": 6.760975609756097, - "grad_norm": 1.431360125541687, - "learning_rate": 1.1896726823761195e-06, - "loss": 0.0065, - "step": 1386 - }, - { - "epoch": 6.765853658536585, - "grad_norm": 2.116907835006714, - "learning_rate": 1.1864114189679413e-06, - "loss": 0.0133, - "step": 1387 - }, - { - "epoch": 6.770731707317073, - "grad_norm": 2.6869874000549316, - "learning_rate": 1.183153240532304e-06, - "loss": 0.0188, - "step": 1388 - }, - { - "epoch": 6.775609756097561, - "grad_norm": 2.0294089317321777, - "learning_rate": 1.179898154721063e-06, - "loss": 0.0234, - "step": 1389 - }, - { - "epoch": 6.780487804878049, - "grad_norm": 2.3081958293914795, - "learning_rate": 1.1766461691788137e-06, - "loss": 0.0208, - "step": 1390 - }, - { - "epoch": 6.785365853658536, - "grad_norm": 3.4795000553131104, - "learning_rate": 1.1733972915428665e-06, - "loss": 0.0728, - "step": 1391 - }, - { - "epoch": 6.790243902439024, - "grad_norm": 2.5121219158172607, - "learning_rate": 1.1701515294432348e-06, - "loss": 0.0291, - "step": 1392 - }, - { - "epoch": 6.795121951219512, - "grad_norm": 5.1100172996521, - "learning_rate": 1.1669088905026156e-06, - "loss": 0.0988, - "step": 1393 - }, - { - "epoch": 6.8, - "grad_norm": 2.5434396266937256, - "learning_rate": 1.163669382336371e-06, - "loss": 0.0399, - "step": 1394 - }, - { - "epoch": 6.804878048780488, - "grad_norm": 2.7811660766601562, - "learning_rate": 1.160433012552508e-06, - "loss": 0.0134, - "step": 1395 - }, - { - "epoch": 6.809756097560975, - "grad_norm": 3.2409870624542236, - "learning_rate": 1.1571997887516672e-06, - "loss": 0.0795, - "step": 1396 - }, - { - "epoch": 6.814634146341463, - "grad_norm": 2.5300986766815186, - "learning_rate": 1.1539697185270982e-06, - "loss": 0.0329, - "step": 1397 - }, - { - "epoch": 6.819512195121951, - "grad_norm": 1.8510549068450928, - "learning_rate": 1.1507428094646448e-06, - "loss": 0.0213, - "step": 1398 - }, - { - "epoch": 6.824390243902439, - "grad_norm": 1.8820618391036987, - "learning_rate": 1.1475190691427255e-06, - "loss": 0.0172, - "step": 1399 - }, - { - "epoch": 6.829268292682927, - "grad_norm": 1.3415460586547852, - "learning_rate": 1.1442985051323205e-06, - "loss": 0.0029, - "step": 1400 - }, - { - "epoch": 6.8341463414634145, - "grad_norm": 6.033786296844482, - "learning_rate": 1.1410811249969475e-06, - "loss": 0.1638, - "step": 1401 - }, - { - "epoch": 6.839024390243902, - "grad_norm": 2.990328311920166, - "learning_rate": 1.1378669362926468e-06, - "loss": 0.0779, - "step": 1402 - }, - { - "epoch": 6.84390243902439, - "grad_norm": 3.2766308784484863, - "learning_rate": 1.1346559465679656e-06, - "loss": 0.0528, - "step": 1403 - }, - { - "epoch": 6.848780487804878, - "grad_norm": 1.266032338142395, - "learning_rate": 1.1314481633639374e-06, - "loss": 0.0057, - "step": 1404 - }, - { - "epoch": 6.853658536585366, - "grad_norm": 3.1048431396484375, - "learning_rate": 1.1282435942140632e-06, - "loss": 0.1772, - "step": 1405 - }, - { - "epoch": 6.8585365853658535, - "grad_norm": 2.264822483062744, - "learning_rate": 1.1250422466442992e-06, - "loss": 0.0176, - "step": 1406 - }, - { - "epoch": 6.863414634146341, - "grad_norm": 2.0890846252441406, - "learning_rate": 1.1218441281730334e-06, - "loss": 0.0184, - "step": 1407 - }, - { - "epoch": 6.868292682926829, - "grad_norm": 1.8351202011108398, - "learning_rate": 1.1186492463110696e-06, - "loss": 0.0127, - "step": 1408 - }, - { - "epoch": 6.873170731707317, - "grad_norm": 1.447196125984192, - "learning_rate": 1.1154576085616135e-06, - "loss": 0.0094, - "step": 1409 - }, - { - "epoch": 6.878048780487805, - "grad_norm": 1.6414039134979248, - "learning_rate": 1.1122692224202491e-06, - "loss": 0.0138, - "step": 1410 - }, - { - "epoch": 6.882926829268293, - "grad_norm": 2.87068772315979, - "learning_rate": 1.1090840953749253e-06, - "loss": 0.0821, - "step": 1411 - }, - { - "epoch": 6.88780487804878, - "grad_norm": 2.0476415157318115, - "learning_rate": 1.1059022349059362e-06, - "loss": 0.0222, - "step": 1412 - }, - { - "epoch": 6.892682926829268, - "grad_norm": 4.169386863708496, - "learning_rate": 1.102723648485905e-06, - "loss": 0.1183, - "step": 1413 - }, - { - "epoch": 6.897560975609756, - "grad_norm": 4.47883415222168, - "learning_rate": 1.0995483435797643e-06, - "loss": 0.0528, - "step": 1414 - }, - { - "epoch": 6.902439024390244, - "grad_norm": 2.0025508403778076, - "learning_rate": 1.0963763276447435e-06, - "loss": 0.0106, - "step": 1415 - }, - { - "epoch": 6.907317073170732, - "grad_norm": 2.4212136268615723, - "learning_rate": 1.0932076081303442e-06, - "loss": 0.0454, - "step": 1416 - }, - { - "epoch": 6.912195121951219, - "grad_norm": 1.7873961925506592, - "learning_rate": 1.0900421924783272e-06, - "loss": 0.022, - "step": 1417 - }, - { - "epoch": 6.917073170731707, - "grad_norm": 2.0345218181610107, - "learning_rate": 1.0868800881226962e-06, - "loss": 0.0261, - "step": 1418 - }, - { - "epoch": 6.921951219512195, - "grad_norm": 3.086538314819336, - "learning_rate": 1.0837213024896764e-06, - "loss": 0.0257, - "step": 1419 - }, - { - "epoch": 6.926829268292683, - "grad_norm": 2.9401397705078125, - "learning_rate": 1.080565842997698e-06, - "loss": 0.087, - "step": 1420 - }, - { - "epoch": 6.931707317073171, - "grad_norm": 1.305415153503418, - "learning_rate": 1.0774137170573826e-06, - "loss": 0.0147, - "step": 1421 - }, - { - "epoch": 6.9365853658536585, - "grad_norm": 3.0256683826446533, - "learning_rate": 1.074264932071521e-06, - "loss": 0.1183, - "step": 1422 - }, - { - "epoch": 6.941463414634146, - "grad_norm": 2.3618743419647217, - "learning_rate": 1.0711194954350568e-06, - "loss": 0.0186, - "step": 1423 - }, - { - "epoch": 6.946341463414634, - "grad_norm": 2.004451036453247, - "learning_rate": 1.0679774145350735e-06, - "loss": 0.0222, - "step": 1424 - }, - { - "epoch": 6.951219512195122, - "grad_norm": 3.089723587036133, - "learning_rate": 1.0648386967507703e-06, - "loss": 0.0824, - "step": 1425 - }, - { - "epoch": 6.95609756097561, - "grad_norm": 1.9310235977172852, - "learning_rate": 1.0617033494534486e-06, - "loss": 0.0247, - "step": 1426 - }, - { - "epoch": 6.9609756097560975, - "grad_norm": 1.973836898803711, - "learning_rate": 1.0585713800064964e-06, - "loss": 0.0142, - "step": 1427 - }, - { - "epoch": 6.965853658536585, - "grad_norm": 2.9914112091064453, - "learning_rate": 1.0554427957653663e-06, - "loss": 0.0681, - "step": 1428 - }, - { - "epoch": 6.970731707317073, - "grad_norm": 3.356689691543579, - "learning_rate": 1.0523176040775615e-06, - "loss": 0.0916, - "step": 1429 - }, - { - "epoch": 6.975609756097561, - "grad_norm": 2.3305246829986572, - "learning_rate": 1.0491958122826173e-06, - "loss": 0.0611, - "step": 1430 - }, - { - "epoch": 6.980487804878049, - "grad_norm": 1.7383835315704346, - "learning_rate": 1.0460774277120866e-06, - "loss": 0.0182, - "step": 1431 - }, - { - "epoch": 6.985365853658537, - "grad_norm": 2.585674524307251, - "learning_rate": 1.0429624576895177e-06, - "loss": 0.0084, - "step": 1432 - }, - { - "epoch": 6.990243902439024, - "grad_norm": 3.023864269256592, - "learning_rate": 1.03985090953044e-06, - "loss": 0.0411, - "step": 1433 - }, - { - "epoch": 6.995121951219512, - "grad_norm": 2.281674861907959, - "learning_rate": 1.0367427905423497e-06, - "loss": 0.0464, - "step": 1434 - }, - { - "epoch": 7.0, - "grad_norm": 1.4372339248657227, - "learning_rate": 1.0336381080246858e-06, - "loss": 0.0124, - "step": 1435 - }, - { - "epoch": 7.004878048780488, - "grad_norm": 1.9526969194412231, - "learning_rate": 1.0305368692688175e-06, - "loss": 0.0179, - "step": 1436 - }, - { - "epoch": 7.009756097560976, - "grad_norm": 1.7297903299331665, - "learning_rate": 1.027439081558029e-06, - "loss": 0.0119, - "step": 1437 - }, - { - "epoch": 7.014634146341463, - "grad_norm": 2.2754275798797607, - "learning_rate": 1.0243447521674967e-06, - "loss": 0.0278, - "step": 1438 - }, - { - "epoch": 7.019512195121951, - "grad_norm": 5.485769271850586, - "learning_rate": 1.021253888364276e-06, - "loss": 0.1259, - "step": 1439 - }, - { - "epoch": 7.024390243902439, - "grad_norm": 0.9085121750831604, - "learning_rate": 1.018166497407284e-06, - "loss": 0.0047, - "step": 1440 - }, - { - "epoch": 7.029268292682927, - "grad_norm": 1.0291047096252441, - "learning_rate": 1.0150825865472813e-06, - "loss": 0.0044, - "step": 1441 - }, - { - "epoch": 7.034146341463415, - "grad_norm": 0.8040009140968323, - "learning_rate": 1.0120021630268542e-06, - "loss": 0.0044, - "step": 1442 - }, - { - "epoch": 7.0390243902439025, - "grad_norm": 1.3701342344284058, - "learning_rate": 1.0089252340804025e-06, - "loss": 0.0081, - "step": 1443 - }, - { - "epoch": 7.04390243902439, - "grad_norm": 2.89591646194458, - "learning_rate": 1.0058518069341152e-06, - "loss": 0.0318, - "step": 1444 - }, - { - "epoch": 7.048780487804878, - "grad_norm": 1.3153692483901978, - "learning_rate": 1.002781888805958e-06, - "loss": 0.0067, - "step": 1445 - }, - { - "epoch": 7.053658536585366, - "grad_norm": 1.4490022659301758, - "learning_rate": 9.997154869056588e-07, - "loss": 0.0064, - "step": 1446 - }, - { - "epoch": 7.058536585365854, - "grad_norm": 1.7938638925552368, - "learning_rate": 9.966526084346837e-07, - "loss": 0.0057, - "step": 1447 - }, - { - "epoch": 7.0634146341463415, - "grad_norm": 3.7182836532592773, - "learning_rate": 9.935932605862258e-07, - "loss": 0.0365, - "step": 1448 - }, - { - "epoch": 7.068292682926829, - "grad_norm": 1.7843579053878784, - "learning_rate": 9.905374505451853e-07, - "loss": 0.0345, - "step": 1449 - }, - { - "epoch": 7.073170731707317, - "grad_norm": 2.9557483196258545, - "learning_rate": 9.874851854881565e-07, - "loss": 0.0384, - "step": 1450 - }, - { - "epoch": 7.078048780487805, - "grad_norm": 1.6237356662750244, - "learning_rate": 9.844364725834058e-07, - "loss": 0.0116, - "step": 1451 - }, - { - "epoch": 7.082926829268293, - "grad_norm": 3.7120912075042725, - "learning_rate": 9.813913189908571e-07, - "loss": 0.0267, - "step": 1452 - }, - { - "epoch": 7.087804878048781, - "grad_norm": 1.9991087913513184, - "learning_rate": 9.783497318620783e-07, - "loss": 0.0376, - "step": 1453 - }, - { - "epoch": 7.092682926829268, - "grad_norm": 1.5474026203155518, - "learning_rate": 9.75311718340258e-07, - "loss": 0.0057, - "step": 1454 - }, - { - "epoch": 7.097560975609756, - "grad_norm": 2.060807943344116, - "learning_rate": 9.722772855601927e-07, - "loss": 0.0386, - "step": 1455 - }, - { - "epoch": 7.102439024390244, - "grad_norm": 1.1991411447525024, - "learning_rate": 9.692464406482727e-07, - "loss": 0.006, - "step": 1456 - }, - { - "epoch": 7.107317073170732, - "grad_norm": 1.8907703161239624, - "learning_rate": 9.662191907224582e-07, - "loss": 0.0066, - "step": 1457 - }, - { - "epoch": 7.11219512195122, - "grad_norm": 2.0351309776306152, - "learning_rate": 9.63195542892268e-07, - "loss": 0.0201, - "step": 1458 - }, - { - "epoch": 7.117073170731707, - "grad_norm": 1.3973944187164307, - "learning_rate": 9.601755042587624e-07, - "loss": 0.0112, - "step": 1459 - }, - { - "epoch": 7.121951219512195, - "grad_norm": 1.3639394044876099, - "learning_rate": 9.571590819145244e-07, - "loss": 0.0066, - "step": 1460 - }, - { - "epoch": 7.126829268292683, - "grad_norm": 1.7362885475158691, - "learning_rate": 9.541462829436426e-07, - "loss": 0.0136, - "step": 1461 - }, - { - "epoch": 7.131707317073171, - "grad_norm": 2.9414384365081787, - "learning_rate": 9.511371144217005e-07, - "loss": 0.0228, - "step": 1462 - }, - { - "epoch": 7.136585365853659, - "grad_norm": 2.944575548171997, - "learning_rate": 9.481315834157512e-07, - "loss": 0.027, - "step": 1463 - }, - { - "epoch": 7.1414634146341465, - "grad_norm": 2.4692747592926025, - "learning_rate": 9.451296969843058e-07, - "loss": 0.0152, - "step": 1464 - }, - { - "epoch": 7.146341463414634, - "grad_norm": 1.804129719734192, - "learning_rate": 9.42131462177319e-07, - "loss": 0.0071, - "step": 1465 - }, - { - "epoch": 7.151219512195122, - "grad_norm": 1.8012168407440186, - "learning_rate": 9.39136886036166e-07, - "loss": 0.0054, - "step": 1466 - }, - { - "epoch": 7.15609756097561, - "grad_norm": 1.9471648931503296, - "learning_rate": 9.361459755936316e-07, - "loss": 0.0067, - "step": 1467 - }, - { - "epoch": 7.160975609756098, - "grad_norm": 1.8837870359420776, - "learning_rate": 9.331587378738902e-07, - "loss": 0.0105, - "step": 1468 - }, - { - "epoch": 7.1658536585365855, - "grad_norm": 2.358891487121582, - "learning_rate": 9.301751798924935e-07, - "loss": 0.0331, - "step": 1469 - }, - { - "epoch": 7.170731707317073, - "grad_norm": 1.1501671075820923, - "learning_rate": 9.27195308656349e-07, - "loss": 0.0076, - "step": 1470 - }, - { - "epoch": 7.175609756097561, - "grad_norm": 2.3329083919525146, - "learning_rate": 9.24219131163705e-07, - "loss": 0.0243, - "step": 1471 - }, - { - "epoch": 7.180487804878049, - "grad_norm": 1.6030691862106323, - "learning_rate": 9.212466544041385e-07, - "loss": 0.0051, - "step": 1472 - }, - { - "epoch": 7.185365853658537, - "grad_norm": 2.005582094192505, - "learning_rate": 9.182778853585325e-07, - "loss": 0.0146, - "step": 1473 - }, - { - "epoch": 7.190243902439025, - "grad_norm": 1.86012601852417, - "learning_rate": 9.153128309990622e-07, - "loss": 0.0273, - "step": 1474 - }, - { - "epoch": 7.195121951219512, - "grad_norm": 2.218923568725586, - "learning_rate": 9.123514982891813e-07, - "loss": 0.0225, - "step": 1475 - }, - { - "epoch": 7.2, - "grad_norm": 1.9950376749038696, - "learning_rate": 9.093938941836012e-07, - "loss": 0.0156, - "step": 1476 - }, - { - "epoch": 7.204878048780488, - "grad_norm": 1.6428661346435547, - "learning_rate": 9.064400256282757e-07, - "loss": 0.0158, - "step": 1477 - }, - { - "epoch": 7.209756097560976, - "grad_norm": 1.7983390092849731, - "learning_rate": 9.034898995603894e-07, - "loss": 0.0138, - "step": 1478 - }, - { - "epoch": 7.214634146341464, - "grad_norm": 2.2069218158721924, - "learning_rate": 9.00543522908334e-07, - "loss": 0.0308, - "step": 1479 - }, - { - "epoch": 7.219512195121951, - "grad_norm": 1.4668920040130615, - "learning_rate": 8.976009025916962e-07, - "loss": 0.006, - "step": 1480 - }, - { - "epoch": 7.224390243902439, - "grad_norm": 1.8956354856491089, - "learning_rate": 8.946620455212438e-07, - "loss": 0.0121, - "step": 1481 - }, - { - "epoch": 7.229268292682927, - "grad_norm": 2.5479676723480225, - "learning_rate": 8.917269585989027e-07, - "loss": 0.0424, - "step": 1482 - }, - { - "epoch": 7.234146341463415, - "grad_norm": 1.7482987642288208, - "learning_rate": 8.887956487177462e-07, - "loss": 0.0189, - "step": 1483 - }, - { - "epoch": 7.239024390243903, - "grad_norm": 1.5023657083511353, - "learning_rate": 8.858681227619789e-07, - "loss": 0.0118, - "step": 1484 - }, - { - "epoch": 7.2439024390243905, - "grad_norm": 1.2069121599197388, - "learning_rate": 8.829443876069163e-07, - "loss": 0.0043, - "step": 1485 - }, - { - "epoch": 7.248780487804878, - "grad_norm": 1.5843572616577148, - "learning_rate": 8.800244501189722e-07, - "loss": 0.0111, - "step": 1486 - }, - { - "epoch": 7.253658536585366, - "grad_norm": 2.541588544845581, - "learning_rate": 8.771083171556407e-07, - "loss": 0.0582, - "step": 1487 - }, - { - "epoch": 7.258536585365854, - "grad_norm": 0.9306992292404175, - "learning_rate": 8.741959955654833e-07, - "loss": 0.0051, - "step": 1488 - }, - { - "epoch": 7.263414634146342, - "grad_norm": 1.4105901718139648, - "learning_rate": 8.712874921881082e-07, - "loss": 0.0175, - "step": 1489 - }, - { - "epoch": 7.2682926829268295, - "grad_norm": 2.8943028450012207, - "learning_rate": 8.683828138541559e-07, - "loss": 0.0827, - "step": 1490 - }, - { - "epoch": 7.273170731707317, - "grad_norm": 2.512991428375244, - "learning_rate": 8.654819673852874e-07, - "loss": 0.0347, - "step": 1491 - }, - { - "epoch": 7.278048780487805, - "grad_norm": 1.6571681499481201, - "learning_rate": 8.625849595941608e-07, - "loss": 0.0055, - "step": 1492 - }, - { - "epoch": 7.282926829268293, - "grad_norm": 1.3162294626235962, - "learning_rate": 8.596917972844199e-07, - "loss": 0.0043, - "step": 1493 - }, - { - "epoch": 7.287804878048781, - "grad_norm": 1.761405110359192, - "learning_rate": 8.568024872506792e-07, - "loss": 0.0176, - "step": 1494 - }, - { - "epoch": 7.2926829268292686, - "grad_norm": 0.7546011805534363, - "learning_rate": 8.539170362785043e-07, - "loss": 0.0025, - "step": 1495 - }, - { - "epoch": 7.297560975609756, - "grad_norm": 1.6910885572433472, - "learning_rate": 8.510354511443975e-07, - "loss": 0.0093, - "step": 1496 - }, - { - "epoch": 7.302439024390244, - "grad_norm": 1.6627765893936157, - "learning_rate": 8.48157738615784e-07, - "loss": 0.0066, - "step": 1497 - }, - { - "epoch": 7.307317073170732, - "grad_norm": 0.8881242871284485, - "learning_rate": 8.452839054509926e-07, - "loss": 0.0055, - "step": 1498 - }, - { - "epoch": 7.31219512195122, - "grad_norm": 1.0791494846343994, - "learning_rate": 8.42413958399241e-07, - "loss": 0.0059, - "step": 1499 - }, - { - "epoch": 7.317073170731708, - "grad_norm": 1.5198945999145508, - "learning_rate": 8.39547904200623e-07, - "loss": 0.0049, - "step": 1500 - }, - { - "epoch": 7.321951219512195, - "grad_norm": 1.7168906927108765, - "learning_rate": 8.366857495860869e-07, - "loss": 0.0204, - "step": 1501 - }, - { - "epoch": 7.326829268292683, - "grad_norm": 1.70030677318573, - "learning_rate": 8.338275012774247e-07, - "loss": 0.0161, - "step": 1502 - }, - { - "epoch": 7.331707317073171, - "grad_norm": 2.1044130325317383, - "learning_rate": 8.309731659872522e-07, - "loss": 0.0088, - "step": 1503 - }, - { - "epoch": 7.336585365853659, - "grad_norm": 1.5040123462677002, - "learning_rate": 8.281227504189992e-07, - "loss": 0.0204, - "step": 1504 - }, - { - "epoch": 7.341463414634147, - "grad_norm": 1.6814212799072266, - "learning_rate": 8.252762612668869e-07, - "loss": 0.0238, - "step": 1505 - }, - { - "epoch": 7.3463414634146345, - "grad_norm": 2.2541606426239014, - "learning_rate": 8.224337052159154e-07, - "loss": 0.0063, - "step": 1506 - }, - { - "epoch": 7.351219512195122, - "grad_norm": 2.3999500274658203, - "learning_rate": 8.195950889418503e-07, - "loss": 0.0123, - "step": 1507 - }, - { - "epoch": 7.35609756097561, - "grad_norm": 2.8464221954345703, - "learning_rate": 8.167604191112021e-07, - "loss": 0.0296, - "step": 1508 - }, - { - "epoch": 7.360975609756098, - "grad_norm": 2.178104877471924, - "learning_rate": 8.139297023812131e-07, - "loss": 0.0148, - "step": 1509 - }, - { - "epoch": 7.365853658536586, - "grad_norm": 1.6489804983139038, - "learning_rate": 8.111029453998448e-07, - "loss": 0.0057, - "step": 1510 - }, - { - "epoch": 7.3707317073170735, - "grad_norm": 1.9705169200897217, - "learning_rate": 8.082801548057553e-07, - "loss": 0.0098, - "step": 1511 - }, - { - "epoch": 7.375609756097561, - "grad_norm": 1.2231075763702393, - "learning_rate": 8.05461337228289e-07, - "loss": 0.007, - "step": 1512 - }, - { - "epoch": 7.380487804878049, - "grad_norm": 1.5212552547454834, - "learning_rate": 8.026464992874617e-07, - "loss": 0.0058, - "step": 1513 - }, - { - "epoch": 7.385365853658537, - "grad_norm": 0.5752282738685608, - "learning_rate": 7.998356475939398e-07, - "loss": 0.0011, - "step": 1514 - }, - { - "epoch": 7.390243902439025, - "grad_norm": 1.3227447271347046, - "learning_rate": 7.970287887490289e-07, - "loss": 0.0041, - "step": 1515 - }, - { - "epoch": 7.3951219512195125, - "grad_norm": 1.2051570415496826, - "learning_rate": 7.942259293446594e-07, - "loss": 0.0027, - "step": 1516 - }, - { - "epoch": 7.4, - "grad_norm": 1.4740777015686035, - "learning_rate": 7.914270759633669e-07, - "loss": 0.006, - "step": 1517 - }, - { - "epoch": 7.404878048780488, - "grad_norm": 1.8853001594543457, - "learning_rate": 7.886322351782782e-07, - "loss": 0.0066, - "step": 1518 - }, - { - "epoch": 7.409756097560976, - "grad_norm": 1.907251238822937, - "learning_rate": 7.858414135530995e-07, - "loss": 0.0133, - "step": 1519 - }, - { - "epoch": 7.414634146341464, - "grad_norm": 1.3397895097732544, - "learning_rate": 7.83054617642095e-07, - "loss": 0.0092, - "step": 1520 - }, - { - "epoch": 7.419512195121952, - "grad_norm": 2.878927707672119, - "learning_rate": 7.802718539900761e-07, - "loss": 0.0113, - "step": 1521 - }, - { - "epoch": 7.424390243902439, - "grad_norm": 1.0312106609344482, - "learning_rate": 7.774931291323826e-07, - "loss": 0.0045, - "step": 1522 - }, - { - "epoch": 7.429268292682927, - "grad_norm": 2.2703888416290283, - "learning_rate": 7.747184495948723e-07, - "loss": 0.0692, - "step": 1523 - }, - { - "epoch": 7.434146341463415, - "grad_norm": 3.0323078632354736, - "learning_rate": 7.719478218939e-07, - "loss": 0.0462, - "step": 1524 - }, - { - "epoch": 7.439024390243903, - "grad_norm": 1.4211952686309814, - "learning_rate": 7.691812525363044e-07, - "loss": 0.008, - "step": 1525 - }, - { - "epoch": 7.443902439024391, - "grad_norm": 0.9588236808776855, - "learning_rate": 7.66418748019396e-07, - "loss": 0.0042, - "step": 1526 - }, - { - "epoch": 7.4487804878048784, - "grad_norm": 2.837219476699829, - "learning_rate": 7.636603148309363e-07, - "loss": 0.0033, - "step": 1527 - }, - { - "epoch": 7.453658536585366, - "grad_norm": 1.8552638292312622, - "learning_rate": 7.609059594491253e-07, - "loss": 0.0181, - "step": 1528 - }, - { - "epoch": 7.458536585365854, - "grad_norm": 4.836069583892822, - "learning_rate": 7.581556883425886e-07, - "loss": 0.1868, - "step": 1529 - }, - { - "epoch": 7.463414634146342, - "grad_norm": 2.180760622024536, - "learning_rate": 7.55409507970358e-07, - "loss": 0.0305, - "step": 1530 - }, - { - "epoch": 7.46829268292683, - "grad_norm": 1.0799378156661987, - "learning_rate": 7.526674247818569e-07, - "loss": 0.0027, - "step": 1531 - }, - { - "epoch": 7.473170731707317, - "grad_norm": 2.1196658611297607, - "learning_rate": 7.499294452168904e-07, - "loss": 0.019, - "step": 1532 - }, - { - "epoch": 7.478048780487805, - "grad_norm": 1.6932553052902222, - "learning_rate": 7.471955757056227e-07, - "loss": 0.0101, - "step": 1533 - }, - { - "epoch": 7.482926829268292, - "grad_norm": 1.3473751544952393, - "learning_rate": 7.444658226685656e-07, - "loss": 0.0066, - "step": 1534 - }, - { - "epoch": 7.487804878048781, - "grad_norm": 2.3404016494750977, - "learning_rate": 7.417401925165666e-07, - "loss": 0.0139, - "step": 1535 - }, - { - "epoch": 7.492682926829268, - "grad_norm": 1.2845433950424194, - "learning_rate": 7.390186916507869e-07, - "loss": 0.0053, - "step": 1536 - }, - { - "epoch": 7.4975609756097565, - "grad_norm": 1.0809649229049683, - "learning_rate": 7.363013264626914e-07, - "loss": 0.0031, - "step": 1537 - }, - { - "epoch": 7.5024390243902435, - "grad_norm": 2.2649292945861816, - "learning_rate": 7.335881033340334e-07, - "loss": 0.0257, - "step": 1538 - }, - { - "epoch": 7.507317073170732, - "grad_norm": 1.3488918542861938, - "learning_rate": 7.308790286368373e-07, - "loss": 0.0092, - "step": 1539 - }, - { - "epoch": 7.512195121951219, - "grad_norm": 2.239190101623535, - "learning_rate": 7.281741087333846e-07, - "loss": 0.024, - "step": 1540 - }, - { - "epoch": 7.517073170731708, - "grad_norm": 1.9454522132873535, - "learning_rate": 7.254733499761993e-07, - "loss": 0.0177, - "step": 1541 - }, - { - "epoch": 7.521951219512195, - "grad_norm": 1.9299415349960327, - "learning_rate": 7.22776758708035e-07, - "loss": 0.0439, - "step": 1542 - }, - { - "epoch": 7.526829268292683, - "grad_norm": 2.2676074504852295, - "learning_rate": 7.200843412618555e-07, - "loss": 0.0387, - "step": 1543 - }, - { - "epoch": 7.53170731707317, - "grad_norm": 1.2385426759719849, - "learning_rate": 7.173961039608227e-07, - "loss": 0.0082, - "step": 1544 - }, - { - "epoch": 7.536585365853659, - "grad_norm": 1.8637615442276, - "learning_rate": 7.147120531182828e-07, - "loss": 0.0194, - "step": 1545 - }, - { - "epoch": 7.541463414634146, - "grad_norm": 1.6695958375930786, - "learning_rate": 7.120321950377487e-07, - "loss": 0.006, - "step": 1546 - }, - { - "epoch": 7.546341463414635, - "grad_norm": 1.916746735572815, - "learning_rate": 7.093565360128863e-07, - "loss": 0.0104, - "step": 1547 - }, - { - "epoch": 7.5512195121951216, - "grad_norm": 1.6002378463745117, - "learning_rate": 7.066850823275024e-07, - "loss": 0.0173, - "step": 1548 - }, - { - "epoch": 7.55609756097561, - "grad_norm": 1.5249438285827637, - "learning_rate": 7.040178402555245e-07, - "loss": 0.0088, - "step": 1549 - }, - { - "epoch": 7.560975609756097, - "grad_norm": 2.1726534366607666, - "learning_rate": 7.013548160609901e-07, - "loss": 0.0098, - "step": 1550 - }, - { - "epoch": 7.565853658536585, - "grad_norm": 1.901904582977295, - "learning_rate": 6.986960159980327e-07, - "loss": 0.0196, - "step": 1551 - }, - { - "epoch": 7.570731707317073, - "grad_norm": 2.577242136001587, - "learning_rate": 6.960414463108631e-07, - "loss": 0.021, - "step": 1552 - }, - { - "epoch": 7.575609756097561, - "grad_norm": 1.4463082551956177, - "learning_rate": 6.933911132337575e-07, - "loss": 0.0076, - "step": 1553 - }, - { - "epoch": 7.580487804878048, - "grad_norm": 2.5811946392059326, - "learning_rate": 6.907450229910443e-07, - "loss": 0.0204, - "step": 1554 - }, - { - "epoch": 7.585365853658536, - "grad_norm": 1.0530297756195068, - "learning_rate": 6.881031817970848e-07, - "loss": 0.0046, - "step": 1555 - }, - { - "epoch": 7.590243902439024, - "grad_norm": 2.995915651321411, - "learning_rate": 6.854655958562625e-07, - "loss": 0.0566, - "step": 1556 - }, - { - "epoch": 7.595121951219512, - "grad_norm": 1.253089189529419, - "learning_rate": 6.82832271362969e-07, - "loss": 0.0048, - "step": 1557 - }, - { - "epoch": 7.6, - "grad_norm": 2.830667495727539, - "learning_rate": 6.802032145015855e-07, - "loss": 0.0351, - "step": 1558 - }, - { - "epoch": 7.6048780487804875, - "grad_norm": 2.8280539512634277, - "learning_rate": 6.775784314464717e-07, - "loss": 0.0171, - "step": 1559 - }, - { - "epoch": 7.609756097560975, - "grad_norm": 1.7876580953598022, - "learning_rate": 6.749579283619492e-07, - "loss": 0.01, - "step": 1560 - }, - { - "epoch": 7.614634146341463, - "grad_norm": 1.540212869644165, - "learning_rate": 6.723417114022907e-07, - "loss": 0.0162, - "step": 1561 - }, - { - "epoch": 7.619512195121951, - "grad_norm": 2.5126969814300537, - "learning_rate": 6.697297867117e-07, - "loss": 0.0237, - "step": 1562 - }, - { - "epoch": 7.624390243902439, - "grad_norm": 1.5419458150863647, - "learning_rate": 6.671221604243014e-07, - "loss": 0.0116, - "step": 1563 - }, - { - "epoch": 7.6292682926829265, - "grad_norm": 3.469961404800415, - "learning_rate": 6.645188386641257e-07, - "loss": 0.0506, - "step": 1564 - }, - { - "epoch": 7.634146341463414, - "grad_norm": 0.8771130442619324, - "learning_rate": 6.61919827545093e-07, - "loss": 0.002, - "step": 1565 - }, - { - "epoch": 7.639024390243902, - "grad_norm": 3.036559820175171, - "learning_rate": 6.593251331709993e-07, - "loss": 0.0673, - "step": 1566 - }, - { - "epoch": 7.64390243902439, - "grad_norm": 3.379220724105835, - "learning_rate": 6.567347616355049e-07, - "loss": 0.063, - "step": 1567 - }, - { - "epoch": 7.648780487804878, - "grad_norm": 0.7666990756988525, - "learning_rate": 6.541487190221163e-07, - "loss": 0.003, - "step": 1568 - }, - { - "epoch": 7.6536585365853655, - "grad_norm": 1.2181665897369385, - "learning_rate": 6.515670114041725e-07, - "loss": 0.0037, - "step": 1569 - }, - { - "epoch": 7.658536585365853, - "grad_norm": 1.0194541215896606, - "learning_rate": 6.489896448448349e-07, - "loss": 0.0043, - "step": 1570 - }, - { - "epoch": 7.663414634146341, - "grad_norm": 2.2625741958618164, - "learning_rate": 6.464166253970672e-07, - "loss": 0.0144, - "step": 1571 - }, - { - "epoch": 7.668292682926829, - "grad_norm": 1.0256692171096802, - "learning_rate": 6.43847959103624e-07, - "loss": 0.0029, - "step": 1572 - }, - { - "epoch": 7.673170731707317, - "grad_norm": 2.0418128967285156, - "learning_rate": 6.412836519970383e-07, - "loss": 0.0144, - "step": 1573 - }, - { - "epoch": 7.678048780487805, - "grad_norm": 0.8498746752738953, - "learning_rate": 6.387237100996041e-07, - "loss": 0.0026, - "step": 1574 - }, - { - "epoch": 7.682926829268292, - "grad_norm": 1.1043775081634521, - "learning_rate": 6.361681394233631e-07, - "loss": 0.0093, - "step": 1575 - }, - { - "epoch": 7.68780487804878, - "grad_norm": 1.064835786819458, - "learning_rate": 6.336169459700933e-07, - "loss": 0.0081, - "step": 1576 - }, - { - "epoch": 7.692682926829268, - "grad_norm": 1.2024056911468506, - "learning_rate": 6.310701357312909e-07, - "loss": 0.0054, - "step": 1577 - }, - { - "epoch": 7.697560975609756, - "grad_norm": 1.9509804248809814, - "learning_rate": 6.285277146881588e-07, - "loss": 0.0051, - "step": 1578 - }, - { - "epoch": 7.702439024390244, - "grad_norm": 1.8738386631011963, - "learning_rate": 6.259896888115904e-07, - "loss": 0.0118, - "step": 1579 - }, - { - "epoch": 7.7073170731707314, - "grad_norm": 1.356726884841919, - "learning_rate": 6.234560640621606e-07, - "loss": 0.009, - "step": 1580 - }, - { - "epoch": 7.712195121951219, - "grad_norm": 0.6530736684799194, - "learning_rate": 6.209268463901047e-07, - "loss": 0.0015, - "step": 1581 - }, - { - "epoch": 7.717073170731707, - "grad_norm": 1.3714262247085571, - "learning_rate": 6.184020417353084e-07, - "loss": 0.0051, - "step": 1582 - }, - { - "epoch": 7.721951219512195, - "grad_norm": 3.015583038330078, - "learning_rate": 6.158816560272962e-07, - "loss": 0.0383, - "step": 1583 - }, - { - "epoch": 7.726829268292683, - "grad_norm": 3.2355704307556152, - "learning_rate": 6.133656951852113e-07, - "loss": 0.0422, - "step": 1584 - }, - { - "epoch": 7.7317073170731705, - "grad_norm": 1.2933087348937988, - "learning_rate": 6.10854165117806e-07, - "loss": 0.0082, - "step": 1585 - }, - { - "epoch": 7.736585365853658, - "grad_norm": 1.6866157054901123, - "learning_rate": 6.083470717234285e-07, - "loss": 0.0052, - "step": 1586 - }, - { - "epoch": 7.741463414634146, - "grad_norm": 1.4597362279891968, - "learning_rate": 6.058444208900061e-07, - "loss": 0.0094, - "step": 1587 - }, - { - "epoch": 7.746341463414634, - "grad_norm": 0.9200596213340759, - "learning_rate": 6.033462184950317e-07, - "loss": 0.0034, - "step": 1588 - }, - { - "epoch": 7.751219512195122, - "grad_norm": 1.707422137260437, - "learning_rate": 6.008524704055535e-07, - "loss": 0.0141, - "step": 1589 - }, - { - "epoch": 7.7560975609756095, - "grad_norm": 1.8554565906524658, - "learning_rate": 5.983631824781572e-07, - "loss": 0.0108, - "step": 1590 - }, - { - "epoch": 7.760975609756097, - "grad_norm": 1.5421279668807983, - "learning_rate": 5.95878360558953e-07, - "loss": 0.0075, - "step": 1591 - }, - { - "epoch": 7.765853658536585, - "grad_norm": 1.5643326044082642, - "learning_rate": 5.933980104835652e-07, - "loss": 0.018, - "step": 1592 - }, - { - "epoch": 7.770731707317073, - "grad_norm": 1.7024025917053223, - "learning_rate": 5.909221380771132e-07, - "loss": 0.0207, - "step": 1593 - }, - { - "epoch": 7.775609756097561, - "grad_norm": 1.820544719696045, - "learning_rate": 5.884507491542024e-07, - "loss": 0.0217, - "step": 1594 - }, - { - "epoch": 7.780487804878049, - "grad_norm": 1.6761897802352905, - "learning_rate": 5.859838495189068e-07, - "loss": 0.0055, - "step": 1595 - }, - { - "epoch": 7.785365853658536, - "grad_norm": 2.3035616874694824, - "learning_rate": 5.835214449647602e-07, - "loss": 0.0147, - "step": 1596 - }, - { - "epoch": 7.790243902439024, - "grad_norm": 2.0507681369781494, - "learning_rate": 5.810635412747373e-07, - "loss": 0.0065, - "step": 1597 - }, - { - "epoch": 7.795121951219512, - "grad_norm": 1.3789564371109009, - "learning_rate": 5.786101442212422e-07, - "loss": 0.0077, - "step": 1598 - }, - { - "epoch": 7.8, - "grad_norm": 3.313107490539551, - "learning_rate": 5.761612595660979e-07, - "loss": 0.0699, - "step": 1599 - }, - { - "epoch": 7.804878048780488, - "grad_norm": 1.2391237020492554, - "learning_rate": 5.737168930605272e-07, - "loss": 0.0017, - "step": 1600 - }, - { - "epoch": 7.809756097560975, - "grad_norm": 1.1187714338302612, - "learning_rate": 5.712770504451426e-07, - "loss": 0.0101, - "step": 1601 - }, - { - "epoch": 7.814634146341463, - "grad_norm": 2.7611069679260254, - "learning_rate": 5.688417374499336e-07, - "loss": 0.0143, - "step": 1602 - }, - { - "epoch": 7.819512195121951, - "grad_norm": 1.627295732498169, - "learning_rate": 5.664109597942504e-07, - "loss": 0.0062, - "step": 1603 - }, - { - "epoch": 7.824390243902439, - "grad_norm": 4.538354396820068, - "learning_rate": 5.639847231867917e-07, - "loss": 0.1058, - "step": 1604 - }, - { - "epoch": 7.829268292682927, - "grad_norm": 1.783469319343567, - "learning_rate": 5.61563033325594e-07, - "loss": 0.0178, - "step": 1605 - }, - { - "epoch": 7.8341463414634145, - "grad_norm": 2.259584665298462, - "learning_rate": 5.591458958980123e-07, - "loss": 0.0204, - "step": 1606 - }, - { - "epoch": 7.839024390243902, - "grad_norm": 2.0741965770721436, - "learning_rate": 5.567333165807115e-07, - "loss": 0.0201, - "step": 1607 - }, - { - "epoch": 7.84390243902439, - "grad_norm": 0.8751707077026367, - "learning_rate": 5.543253010396538e-07, - "loss": 0.0077, - "step": 1608 - }, - { - "epoch": 7.848780487804878, - "grad_norm": 1.7383732795715332, - "learning_rate": 5.519218549300806e-07, - "loss": 0.0176, - "step": 1609 - }, - { - "epoch": 7.853658536585366, - "grad_norm": 2.0462191104888916, - "learning_rate": 5.495229838965021e-07, - "loss": 0.031, - "step": 1610 - }, - { - "epoch": 7.8585365853658535, - "grad_norm": 1.3201459646224976, - "learning_rate": 5.471286935726866e-07, - "loss": 0.0062, - "step": 1611 - }, - { - "epoch": 7.863414634146341, - "grad_norm": 2.9285616874694824, - "learning_rate": 5.447389895816416e-07, - "loss": 0.0615, - "step": 1612 - }, - { - "epoch": 7.868292682926829, - "grad_norm": 3.1918647289276123, - "learning_rate": 5.423538775356049e-07, - "loss": 0.0377, - "step": 1613 - }, - { - "epoch": 7.873170731707317, - "grad_norm": 1.406246542930603, - "learning_rate": 5.399733630360287e-07, - "loss": 0.0122, - "step": 1614 - }, - { - "epoch": 7.878048780487805, - "grad_norm": 1.7651537656784058, - "learning_rate": 5.375974516735713e-07, - "loss": 0.015, - "step": 1615 - }, - { - "epoch": 7.882926829268293, - "grad_norm": 1.9614673852920532, - "learning_rate": 5.352261490280767e-07, - "loss": 0.0058, - "step": 1616 - }, - { - "epoch": 7.88780487804878, - "grad_norm": 1.6031639575958252, - "learning_rate": 5.328594606685661e-07, - "loss": 0.0041, - "step": 1617 - }, - { - "epoch": 7.892682926829268, - "grad_norm": 0.9787303805351257, - "learning_rate": 5.304973921532264e-07, - "loss": 0.0067, - "step": 1618 - }, - { - "epoch": 7.897560975609756, - "grad_norm": 1.2693779468536377, - "learning_rate": 5.281399490293923e-07, - "loss": 0.0064, - "step": 1619 - }, - { - "epoch": 7.902439024390244, - "grad_norm": 1.8421361446380615, - "learning_rate": 5.257871368335357e-07, - "loss": 0.0182, - "step": 1620 - }, - { - "epoch": 7.907317073170732, - "grad_norm": 0.9667096138000488, - "learning_rate": 5.234389610912552e-07, - "loss": 0.0024, - "step": 1621 - }, - { - "epoch": 7.912195121951219, - "grad_norm": 3.2266018390655518, - "learning_rate": 5.210954273172578e-07, - "loss": 0.02, - "step": 1622 - }, - { - "epoch": 7.917073170731707, - "grad_norm": 1.5821634531021118, - "learning_rate": 5.187565410153497e-07, - "loss": 0.024, - "step": 1623 - }, - { - "epoch": 7.921951219512195, - "grad_norm": 1.9864275455474854, - "learning_rate": 5.164223076784239e-07, - "loss": 0.0103, - "step": 1624 - }, - { - "epoch": 7.926829268292683, - "grad_norm": 1.866466999053955, - "learning_rate": 5.14092732788444e-07, - "loss": 0.0268, - "step": 1625 - }, - { - "epoch": 7.931707317073171, - "grad_norm": 1.165686011314392, - "learning_rate": 5.117678218164337e-07, - "loss": 0.0085, - "step": 1626 - }, - { - "epoch": 7.9365853658536585, - "grad_norm": 1.1883208751678467, - "learning_rate": 5.094475802224644e-07, - "loss": 0.006, - "step": 1627 - }, - { - "epoch": 7.941463414634146, - "grad_norm": 1.5121057033538818, - "learning_rate": 5.071320134556404e-07, - "loss": 0.003, - "step": 1628 - }, - { - "epoch": 7.946341463414634, - "grad_norm": 1.1923614740371704, - "learning_rate": 5.048211269540868e-07, - "loss": 0.0064, - "step": 1629 - }, - { - "epoch": 7.951219512195122, - "grad_norm": 1.33751380443573, - "learning_rate": 5.025149261449391e-07, - "loss": 0.0082, - "step": 1630 - }, - { - "epoch": 7.95609756097561, - "grad_norm": 1.9143925905227661, - "learning_rate": 5.002134164443262e-07, - "loss": 0.0202, - "step": 1631 - }, - { - "epoch": 7.9609756097560975, - "grad_norm": 1.2547078132629395, - "learning_rate": 4.979166032573607e-07, - "loss": 0.0033, - "step": 1632 - }, - { - "epoch": 7.965853658536585, - "grad_norm": 2.3050332069396973, - "learning_rate": 4.956244919781247e-07, - "loss": 0.052, - "step": 1633 - }, - { - "epoch": 7.970731707317073, - "grad_norm": 1.4462478160858154, - "learning_rate": 4.933370879896604e-07, - "loss": 0.0049, - "step": 1634 - }, - { - "epoch": 7.975609756097561, - "grad_norm": 1.519913911819458, - "learning_rate": 4.91054396663952e-07, - "loss": 0.0102, - "step": 1635 - }, - { - "epoch": 7.980487804878049, - "grad_norm": 2.9544193744659424, - "learning_rate": 4.887764233619163e-07, - "loss": 0.0112, - "step": 1636 - }, - { - "epoch": 7.985365853658537, - "grad_norm": 0.9778392314910889, - "learning_rate": 4.865031734333919e-07, - "loss": 0.0032, - "step": 1637 - }, - { - "epoch": 7.990243902439024, - "grad_norm": 2.783501386642456, - "learning_rate": 4.842346522171226e-07, - "loss": 0.012, - "step": 1638 - }, - { - "epoch": 7.995121951219512, - "grad_norm": 1.5644093751907349, - "learning_rate": 4.819708650407467e-07, - "loss": 0.0184, - "step": 1639 - }, - { - "epoch": 8.0, - "grad_norm": 1.5741018056869507, - "learning_rate": 4.797118172207863e-07, - "loss": 0.0112, - "step": 1640 - }, - { - "epoch": 8.004878048780487, - "grad_norm": 0.9010241031646729, - "learning_rate": 4.774575140626317e-07, - "loss": 0.0064, - "step": 1641 - }, - { - "epoch": 8.009756097560976, - "grad_norm": 0.8204272985458374, - "learning_rate": 4.752079608605295e-07, - "loss": 0.003, - "step": 1642 - }, - { - "epoch": 8.014634146341463, - "grad_norm": 1.8131763935089111, - "learning_rate": 4.7296316289757366e-07, - "loss": 0.0063, - "step": 1643 - }, - { - "epoch": 8.019512195121951, - "grad_norm": 0.9918075799942017, - "learning_rate": 4.7072312544568844e-07, - "loss": 0.0039, - "step": 1644 - }, - { - "epoch": 8.024390243902438, - "grad_norm": 0.5097177028656006, - "learning_rate": 4.6848785376561733e-07, - "loss": 0.0028, - "step": 1645 - }, - { - "epoch": 8.029268292682927, - "grad_norm": 0.3497299253940582, - "learning_rate": 4.6625735310691396e-07, - "loss": 0.0021, - "step": 1646 - }, - { - "epoch": 8.034146341463414, - "grad_norm": 0.9271900057792664, - "learning_rate": 4.6403162870792524e-07, - "loss": 0.005, - "step": 1647 - }, - { - "epoch": 8.039024390243902, - "grad_norm": 0.951755940914154, - "learning_rate": 4.618106857957805e-07, - "loss": 0.0042, - "step": 1648 - }, - { - "epoch": 8.04390243902439, - "grad_norm": 0.6863508820533752, - "learning_rate": 4.5959452958638213e-07, - "loss": 0.0014, - "step": 1649 - }, - { - "epoch": 8.048780487804878, - "grad_norm": 0.45382270216941833, - "learning_rate": 4.573831652843888e-07, - "loss": 0.0012, - "step": 1650 - }, - { - "epoch": 8.053658536585365, - "grad_norm": 1.8319289684295654, - "learning_rate": 4.55176598083206e-07, - "loss": 0.0234, - "step": 1651 - }, - { - "epoch": 8.058536585365854, - "grad_norm": 1.2312507629394531, - "learning_rate": 4.5297483316497276e-07, - "loss": 0.0042, - "step": 1652 - }, - { - "epoch": 8.06341463414634, - "grad_norm": 1.4057971239089966, - "learning_rate": 4.5077787570055097e-07, - "loss": 0.0085, - "step": 1653 - }, - { - "epoch": 8.06829268292683, - "grad_norm": 3.3510940074920654, - "learning_rate": 4.4858573084951173e-07, - "loss": 0.0628, - "step": 1654 - }, - { - "epoch": 8.073170731707316, - "grad_norm": 0.6469231247901917, - "learning_rate": 4.463984037601224e-07, - "loss": 0.0026, - "step": 1655 - }, - { - "epoch": 8.078048780487805, - "grad_norm": 0.9491491317749023, - "learning_rate": 4.4421589956933827e-07, - "loss": 0.0021, - "step": 1656 - }, - { - "epoch": 8.082926829268292, - "grad_norm": 1.0847301483154297, - "learning_rate": 4.420382234027859e-07, - "loss": 0.0042, - "step": 1657 - }, - { - "epoch": 8.08780487804878, - "grad_norm": 0.5364987254142761, - "learning_rate": 4.398653803747532e-07, - "loss": 0.0045, - "step": 1658 - }, - { - "epoch": 8.092682926829267, - "grad_norm": 1.057804822921753, - "learning_rate": 4.3769737558817996e-07, - "loss": 0.0015, - "step": 1659 - }, - { - "epoch": 8.097560975609756, - "grad_norm": 1.2050957679748535, - "learning_rate": 4.355342141346405e-07, - "loss": 0.0124, - "step": 1660 - }, - { - "epoch": 8.102439024390243, - "grad_norm": 0.2821386754512787, - "learning_rate": 4.3337590109433505e-07, - "loss": 0.002, - "step": 1661 - }, - { - "epoch": 8.107317073170732, - "grad_norm": 0.7883970141410828, - "learning_rate": 4.3122244153607914e-07, - "loss": 0.0013, - "step": 1662 - }, - { - "epoch": 8.112195121951219, - "grad_norm": 1.1907166242599487, - "learning_rate": 4.2907384051728754e-07, - "loss": 0.0201, - "step": 1663 - }, - { - "epoch": 8.117073170731707, - "grad_norm": 1.3646314144134521, - "learning_rate": 4.2693010308396566e-07, - "loss": 0.0039, - "step": 1664 - }, - { - "epoch": 8.121951219512194, - "grad_norm": 2.0689423084259033, - "learning_rate": 4.247912342706975e-07, - "loss": 0.0035, - "step": 1665 - }, - { - "epoch": 8.126829268292683, - "grad_norm": 0.4086499810218811, - "learning_rate": 4.22657239100632e-07, - "loss": 0.0009, - "step": 1666 - }, - { - "epoch": 8.13170731707317, - "grad_norm": 0.9431869387626648, - "learning_rate": 4.2052812258547265e-07, - "loss": 0.0018, - "step": 1667 - }, - { - "epoch": 8.136585365853659, - "grad_norm": 0.9063575863838196, - "learning_rate": 4.184038897254655e-07, - "loss": 0.0021, - "step": 1668 - }, - { - "epoch": 8.141463414634146, - "grad_norm": 2.707298517227173, - "learning_rate": 4.1628454550938697e-07, - "loss": 0.019, - "step": 1669 - }, - { - "epoch": 8.146341463414634, - "grad_norm": 1.687988042831421, - "learning_rate": 4.141700949145322e-07, - "loss": 0.0144, - "step": 1670 - }, - { - "epoch": 8.151219512195121, - "grad_norm": 0.8905831575393677, - "learning_rate": 4.1206054290670537e-07, - "loss": 0.0088, - "step": 1671 - }, - { - "epoch": 8.15609756097561, - "grad_norm": 1.418512225151062, - "learning_rate": 4.0995589444020433e-07, - "loss": 0.0083, - "step": 1672 - }, - { - "epoch": 8.160975609756097, - "grad_norm": 1.1676236391067505, - "learning_rate": 4.0785615445781106e-07, - "loss": 0.0027, - "step": 1673 - }, - { - "epoch": 8.165853658536586, - "grad_norm": 1.5615407228469849, - "learning_rate": 4.057613278907818e-07, - "loss": 0.0089, - "step": 1674 - }, - { - "epoch": 8.170731707317072, - "grad_norm": 1.0604172945022583, - "learning_rate": 4.036714196588318e-07, - "loss": 0.0034, - "step": 1675 - }, - { - "epoch": 8.175609756097561, - "grad_norm": 1.3175733089447021, - "learning_rate": 4.015864346701251e-07, - "loss": 0.0021, - "step": 1676 - }, - { - "epoch": 8.180487804878048, - "grad_norm": 0.2539370059967041, - "learning_rate": 3.99506377821266e-07, - "loss": 0.0005, - "step": 1677 - }, - { - "epoch": 8.185365853658537, - "grad_norm": 0.8106228113174438, - "learning_rate": 3.97431253997283e-07, - "loss": 0.003, - "step": 1678 - }, - { - "epoch": 8.190243902439024, - "grad_norm": 0.6703351140022278, - "learning_rate": 3.9536106807161857e-07, - "loss": 0.0028, - "step": 1679 - }, - { - "epoch": 8.195121951219512, - "grad_norm": 1.2921632528305054, - "learning_rate": 3.932958249061214e-07, - "loss": 0.0097, - "step": 1680 - }, - { - "epoch": 8.2, - "grad_norm": 0.7795253992080688, - "learning_rate": 3.9123552935102976e-07, - "loss": 0.004, - "step": 1681 - }, - { - "epoch": 8.204878048780488, - "grad_norm": 1.3402642011642456, - "learning_rate": 3.891801862449629e-07, - "loss": 0.0189, - "step": 1682 - }, - { - "epoch": 8.209756097560975, - "grad_norm": 0.6951391696929932, - "learning_rate": 3.8712980041490905e-07, - "loss": 0.0038, - "step": 1683 - }, - { - "epoch": 8.214634146341464, - "grad_norm": 0.8145114183425903, - "learning_rate": 3.850843766762155e-07, - "loss": 0.0038, - "step": 1684 - }, - { - "epoch": 8.21951219512195, - "grad_norm": 0.30702775716781616, - "learning_rate": 3.830439198325747e-07, - "loss": 0.0008, - "step": 1685 - }, - { - "epoch": 8.22439024390244, - "grad_norm": 0.45050138235092163, - "learning_rate": 3.81008434676014e-07, - "loss": 0.0013, - "step": 1686 - }, - { - "epoch": 8.229268292682926, - "grad_norm": 0.7875486612319946, - "learning_rate": 3.789779259868864e-07, - "loss": 0.0016, - "step": 1687 - }, - { - "epoch": 8.234146341463415, - "grad_norm": 0.9437265396118164, - "learning_rate": 3.769523985338566e-07, - "loss": 0.0045, - "step": 1688 - }, - { - "epoch": 8.239024390243902, - "grad_norm": 1.2928845882415771, - "learning_rate": 3.749318570738897e-07, - "loss": 0.0057, - "step": 1689 - }, - { - "epoch": 8.24390243902439, - "grad_norm": 0.9615103006362915, - "learning_rate": 3.7291630635224397e-07, - "loss": 0.0026, - "step": 1690 - }, - { - "epoch": 8.248780487804877, - "grad_norm": 0.8654932975769043, - "learning_rate": 3.709057511024541e-07, - "loss": 0.0056, - "step": 1691 - }, - { - "epoch": 8.253658536585366, - "grad_norm": 1.1101908683776855, - "learning_rate": 3.689001960463243e-07, - "loss": 0.0019, - "step": 1692 - }, - { - "epoch": 8.258536585365853, - "grad_norm": 0.9586653709411621, - "learning_rate": 3.668996458939156e-07, - "loss": 0.003, - "step": 1693 - }, - { - "epoch": 8.263414634146342, - "grad_norm": 1.1638360023498535, - "learning_rate": 3.649041053435354e-07, - "loss": 0.0031, - "step": 1694 - }, - { - "epoch": 8.268292682926829, - "grad_norm": 0.41364336013793945, - "learning_rate": 3.62913579081724e-07, - "loss": 0.0012, - "step": 1695 - }, - { - "epoch": 8.273170731707317, - "grad_norm": 1.1794198751449585, - "learning_rate": 3.609280717832489e-07, - "loss": 0.0067, - "step": 1696 - }, - { - "epoch": 8.278048780487804, - "grad_norm": 0.7281041741371155, - "learning_rate": 3.5894758811108795e-07, - "loss": 0.002, - "step": 1697 - }, - { - "epoch": 8.282926829268293, - "grad_norm": 0.42419376969337463, - "learning_rate": 3.5697213271642164e-07, - "loss": 0.0008, - "step": 1698 - }, - { - "epoch": 8.28780487804878, - "grad_norm": 0.6596572995185852, - "learning_rate": 3.5500171023862136e-07, - "loss": 0.0028, - "step": 1699 - }, - { - "epoch": 8.292682926829269, - "grad_norm": 1.236666202545166, - "learning_rate": 3.530363253052399e-07, - "loss": 0.0032, - "step": 1700 - }, - { - "epoch": 8.297560975609755, - "grad_norm": 0.977694571018219, - "learning_rate": 3.510759825319976e-07, - "loss": 0.0068, - "step": 1701 - }, - { - "epoch": 8.302439024390244, - "grad_norm": 1.0168365240097046, - "learning_rate": 3.491206865227739e-07, - "loss": 0.0017, - "step": 1702 - }, - { - "epoch": 8.307317073170731, - "grad_norm": 2.269639253616333, - "learning_rate": 3.4717044186959676e-07, - "loss": 0.0398, - "step": 1703 - }, - { - "epoch": 8.31219512195122, - "grad_norm": 1.0657192468643188, - "learning_rate": 3.452252531526301e-07, - "loss": 0.0049, - "step": 1704 - }, - { - "epoch": 8.317073170731707, - "grad_norm": 1.50715970993042, - "learning_rate": 3.432851249401628e-07, - "loss": 0.0164, - "step": 1705 - }, - { - "epoch": 8.321951219512195, - "grad_norm": 0.701214611530304, - "learning_rate": 3.413500617886023e-07, - "loss": 0.0038, - "step": 1706 - }, - { - "epoch": 8.326829268292682, - "grad_norm": 1.6810981035232544, - "learning_rate": 3.394200682424578e-07, - "loss": 0.0118, - "step": 1707 - }, - { - "epoch": 8.331707317073171, - "grad_norm": 1.4712997674942017, - "learning_rate": 3.374951488343328e-07, - "loss": 0.006, - "step": 1708 - }, - { - "epoch": 8.336585365853658, - "grad_norm": 0.6115317940711975, - "learning_rate": 3.355753080849164e-07, - "loss": 0.0011, - "step": 1709 - }, - { - "epoch": 8.341463414634147, - "grad_norm": 0.8171163201332092, - "learning_rate": 3.3366055050296776e-07, - "loss": 0.0024, - "step": 1710 - }, - { - "epoch": 8.346341463414634, - "grad_norm": 0.7722201943397522, - "learning_rate": 3.3175088058530925e-07, - "loss": 0.0028, - "step": 1711 - }, - { - "epoch": 8.351219512195122, - "grad_norm": 3.0709617137908936, - "learning_rate": 3.2984630281681556e-07, - "loss": 0.0109, - "step": 1712 - }, - { - "epoch": 8.35609756097561, - "grad_norm": 1.7634369134902954, - "learning_rate": 3.2794682167040125e-07, - "loss": 0.0031, - "step": 1713 - }, - { - "epoch": 8.360975609756098, - "grad_norm": 1.7657215595245361, - "learning_rate": 3.2605244160701155e-07, - "loss": 0.01, - "step": 1714 - }, - { - "epoch": 8.365853658536585, - "grad_norm": 1.432230830192566, - "learning_rate": 3.2416316707561316e-07, - "loss": 0.0042, - "step": 1715 - }, - { - "epoch": 8.370731707317074, - "grad_norm": 0.465900719165802, - "learning_rate": 3.2227900251318055e-07, - "loss": 0.0021, - "step": 1716 - }, - { - "epoch": 8.37560975609756, - "grad_norm": 1.3770387172698975, - "learning_rate": 3.2039995234468854e-07, - "loss": 0.0031, - "step": 1717 - }, - { - "epoch": 8.38048780487805, - "grad_norm": 0.4842236638069153, - "learning_rate": 3.1852602098309984e-07, - "loss": 0.0009, - "step": 1718 - }, - { - "epoch": 8.385365853658536, - "grad_norm": 0.6840565204620361, - "learning_rate": 3.1665721282935683e-07, - "loss": 0.0047, - "step": 1719 - }, - { - "epoch": 8.390243902439025, - "grad_norm": 0.5206313729286194, - "learning_rate": 3.147935322723694e-07, - "loss": 0.0026, - "step": 1720 - }, - { - "epoch": 8.395121951219512, - "grad_norm": 1.131412386894226, - "learning_rate": 3.1293498368900414e-07, - "loss": 0.0019, - "step": 1721 - }, - { - "epoch": 8.4, - "grad_norm": 0.5872076153755188, - "learning_rate": 3.1108157144407765e-07, - "loss": 0.0009, - "step": 1722 - }, - { - "epoch": 8.404878048780487, - "grad_norm": 1.1455132961273193, - "learning_rate": 3.092332998903416e-07, - "loss": 0.0047, - "step": 1723 - }, - { - "epoch": 8.409756097560976, - "grad_norm": 1.4331532716751099, - "learning_rate": 3.073901733684748e-07, - "loss": 0.0162, - "step": 1724 - }, - { - "epoch": 8.414634146341463, - "grad_norm": 0.8186633586883545, - "learning_rate": 3.055521962070751e-07, - "loss": 0.0078, - "step": 1725 - }, - { - "epoch": 8.419512195121952, - "grad_norm": 0.9004407525062561, - "learning_rate": 3.0371937272264454e-07, - "loss": 0.0035, - "step": 1726 - }, - { - "epoch": 8.424390243902439, - "grad_norm": 0.8009728789329529, - "learning_rate": 3.0189170721958234e-07, - "loss": 0.0011, - "step": 1727 - }, - { - "epoch": 8.429268292682927, - "grad_norm": 0.7846589088439941, - "learning_rate": 3.000692039901756e-07, - "loss": 0.0042, - "step": 1728 - }, - { - "epoch": 8.434146341463414, - "grad_norm": 1.2301117181777954, - "learning_rate": 2.982518673145862e-07, - "loss": 0.0159, - "step": 1729 - }, - { - "epoch": 8.439024390243903, - "grad_norm": 0.8503583073616028, - "learning_rate": 2.9643970146084193e-07, - "loss": 0.0021, - "step": 1730 - }, - { - "epoch": 8.44390243902439, - "grad_norm": 1.661842942237854, - "learning_rate": 2.9463271068482955e-07, - "loss": 0.0124, - "step": 1731 - }, - { - "epoch": 8.448780487804878, - "grad_norm": 0.7799263000488281, - "learning_rate": 2.928308992302792e-07, - "loss": 0.0038, - "step": 1732 - }, - { - "epoch": 8.453658536585365, - "grad_norm": 0.6021434664726257, - "learning_rate": 2.9103427132875785e-07, - "loss": 0.0013, - "step": 1733 - }, - { - "epoch": 8.458536585365854, - "grad_norm": 1.430431604385376, - "learning_rate": 2.892428311996609e-07, - "loss": 0.0151, - "step": 1734 - }, - { - "epoch": 8.463414634146341, - "grad_norm": 1.1589592695236206, - "learning_rate": 2.8745658305019824e-07, - "loss": 0.0037, - "step": 1735 - }, - { - "epoch": 8.46829268292683, - "grad_norm": 0.7232568860054016, - "learning_rate": 2.856755310753867e-07, - "loss": 0.0046, - "step": 1736 - }, - { - "epoch": 8.473170731707317, - "grad_norm": 0.6265125274658203, - "learning_rate": 2.8389967945803984e-07, - "loss": 0.0014, - "step": 1737 - }, - { - "epoch": 8.478048780487805, - "grad_norm": 0.7115193009376526, - "learning_rate": 2.821290323687592e-07, - "loss": 0.0036, - "step": 1738 - }, - { - "epoch": 8.482926829268292, - "grad_norm": 0.5157519578933716, - "learning_rate": 2.803635939659222e-07, - "loss": 0.0016, - "step": 1739 - }, - { - "epoch": 8.487804878048781, - "grad_norm": 0.9217156767845154, - "learning_rate": 2.786033683956732e-07, - "loss": 0.0052, - "step": 1740 - }, - { - "epoch": 8.492682926829268, - "grad_norm": 4.063957691192627, - "learning_rate": 2.7684835979191664e-07, - "loss": 0.0999, - "step": 1741 - }, - { - "epoch": 8.497560975609757, - "grad_norm": 0.38870275020599365, - "learning_rate": 2.7509857227630223e-07, - "loss": 0.0009, - "step": 1742 - }, - { - "epoch": 8.502439024390243, - "grad_norm": 0.8282430768013, - "learning_rate": 2.733540099582188e-07, - "loss": 0.0026, - "step": 1743 - }, - { - "epoch": 8.507317073170732, - "grad_norm": 1.7269257307052612, - "learning_rate": 2.7161467693478493e-07, - "loss": 0.0094, - "step": 1744 - }, - { - "epoch": 8.512195121951219, - "grad_norm": 1.4464598894119263, - "learning_rate": 2.6988057729083613e-07, - "loss": 0.006, - "step": 1745 - }, - { - "epoch": 8.517073170731708, - "grad_norm": 0.9648481011390686, - "learning_rate": 2.681517150989185e-07, - "loss": 0.0043, - "step": 1746 - }, - { - "epoch": 8.521951219512195, - "grad_norm": 0.7762707471847534, - "learning_rate": 2.664280944192782e-07, - "loss": 0.0026, - "step": 1747 - }, - { - "epoch": 8.526829268292683, - "grad_norm": 0.9751222133636475, - "learning_rate": 2.64709719299851e-07, - "loss": 0.0044, - "step": 1748 - }, - { - "epoch": 8.53170731707317, - "grad_norm": 0.5906254053115845, - "learning_rate": 2.6299659377625296e-07, - "loss": 0.0008, - "step": 1749 - }, - { - "epoch": 8.536585365853659, - "grad_norm": 1.9417753219604492, - "learning_rate": 2.612887218717733e-07, - "loss": 0.0324, - "step": 1750 - }, - { - "epoch": 8.541463414634146, - "grad_norm": 0.6434907913208008, - "learning_rate": 2.5958610759736133e-07, - "loss": 0.0028, - "step": 1751 - }, - { - "epoch": 8.546341463414635, - "grad_norm": 0.8546578884124756, - "learning_rate": 2.5788875495161846e-07, - "loss": 0.0019, - "step": 1752 - }, - { - "epoch": 8.551219512195122, - "grad_norm": 0.8363909721374512, - "learning_rate": 2.561966679207917e-07, - "loss": 0.0028, - "step": 1753 - }, - { - "epoch": 8.55609756097561, - "grad_norm": 1.4901739358901978, - "learning_rate": 2.545098504787588e-07, - "loss": 0.0266, - "step": 1754 - }, - { - "epoch": 8.560975609756097, - "grad_norm": 0.6730532646179199, - "learning_rate": 2.5282830658702323e-07, - "loss": 0.0009, - "step": 1755 - }, - { - "epoch": 8.565853658536586, - "grad_norm": 0.7190845608711243, - "learning_rate": 2.511520401947032e-07, - "loss": 0.0056, - "step": 1756 - }, - { - "epoch": 8.570731707317073, - "grad_norm": 0.441381573677063, - "learning_rate": 2.494810552385232e-07, - "loss": 0.0009, - "step": 1757 - }, - { - "epoch": 8.575609756097561, - "grad_norm": 1.103507399559021, - "learning_rate": 2.47815355642804e-07, - "loss": 0.0023, - "step": 1758 - }, - { - "epoch": 8.580487804878048, - "grad_norm": 1.994994878768921, - "learning_rate": 2.461549453194523e-07, - "loss": 0.0454, - "step": 1759 - }, - { - "epoch": 8.585365853658537, - "grad_norm": 2.3645970821380615, - "learning_rate": 2.444998281679553e-07, - "loss": 0.0204, - "step": 1760 - }, - { - "epoch": 8.590243902439024, - "grad_norm": 1.7933200597763062, - "learning_rate": 2.428500080753676e-07, - "loss": 0.0387, - "step": 1761 - }, - { - "epoch": 8.595121951219513, - "grad_norm": 1.6070597171783447, - "learning_rate": 2.412054889163035e-07, - "loss": 0.0014, - "step": 1762 - }, - { - "epoch": 8.6, - "grad_norm": 0.2842216193675995, - "learning_rate": 2.3956627455292924e-07, - "loss": 0.0011, - "step": 1763 - }, - { - "epoch": 8.604878048780488, - "grad_norm": 0.8213078379631042, - "learning_rate": 2.3793236883495164e-07, - "loss": 0.003, - "step": 1764 - }, - { - "epoch": 8.609756097560975, - "grad_norm": 0.9147091507911682, - "learning_rate": 2.363037755996095e-07, - "loss": 0.0032, - "step": 1765 - }, - { - "epoch": 8.614634146341464, - "grad_norm": 1.4246805906295776, - "learning_rate": 2.3468049867166747e-07, - "loss": 0.0037, - "step": 1766 - }, - { - "epoch": 8.61951219512195, - "grad_norm": 0.5553964376449585, - "learning_rate": 2.3306254186340305e-07, - "loss": 0.0014, - "step": 1767 - }, - { - "epoch": 8.62439024390244, - "grad_norm": 1.6941331624984741, - "learning_rate": 2.314499089745989e-07, - "loss": 0.0125, - "step": 1768 - }, - { - "epoch": 8.629268292682926, - "grad_norm": 2.965517520904541, - "learning_rate": 2.2984260379253693e-07, - "loss": 0.0855, - "step": 1769 - }, - { - "epoch": 8.634146341463415, - "grad_norm": 0.9295977354049683, - "learning_rate": 2.2824063009198428e-07, - "loss": 0.0031, - "step": 1770 - }, - { - "epoch": 8.639024390243902, - "grad_norm": 0.990189254283905, - "learning_rate": 2.2664399163518786e-07, - "loss": 0.0056, - "step": 1771 - }, - { - "epoch": 8.64390243902439, - "grad_norm": 1.7282871007919312, - "learning_rate": 2.25052692171866e-07, - "loss": 0.022, - "step": 1772 - }, - { - "epoch": 8.648780487804878, - "grad_norm": 1.2093932628631592, - "learning_rate": 2.2346673543919645e-07, - "loss": 0.0025, - "step": 1773 - }, - { - "epoch": 8.653658536585366, - "grad_norm": 0.9555385112762451, - "learning_rate": 2.2188612516181067e-07, - "loss": 0.0081, - "step": 1774 - }, - { - "epoch": 8.658536585365853, - "grad_norm": 0.7467104196548462, - "learning_rate": 2.203108650517835e-07, - "loss": 0.0015, - "step": 1775 - }, - { - "epoch": 8.663414634146342, - "grad_norm": 0.893450140953064, - "learning_rate": 2.1874095880862505e-07, - "loss": 0.0023, - "step": 1776 - }, - { - "epoch": 8.668292682926829, - "grad_norm": 1.0488923788070679, - "learning_rate": 2.171764101192722e-07, - "loss": 0.002, - "step": 1777 - }, - { - "epoch": 8.673170731707318, - "grad_norm": 1.1046003103256226, - "learning_rate": 2.1561722265807827e-07, - "loss": 0.002, - "step": 1778 - }, - { - "epoch": 8.678048780487805, - "grad_norm": 0.38860198855400085, - "learning_rate": 2.1406340008680748e-07, - "loss": 0.0015, - "step": 1779 - }, - { - "epoch": 8.682926829268293, - "grad_norm": 0.9970881938934326, - "learning_rate": 2.1251494605462358e-07, - "loss": 0.0028, - "step": 1780 - }, - { - "epoch": 8.68780487804878, - "grad_norm": 0.32808956503868103, - "learning_rate": 2.1097186419808151e-07, - "loss": 0.0008, - "step": 1781 - }, - { - "epoch": 8.692682926829269, - "grad_norm": 0.25458696484565735, - "learning_rate": 2.094341581411216e-07, - "loss": 0.0012, - "step": 1782 - }, - { - "epoch": 8.697560975609756, - "grad_norm": 0.3530316948890686, - "learning_rate": 2.0790183149505733e-07, - "loss": 0.0021, - "step": 1783 - }, - { - "epoch": 8.702439024390245, - "grad_norm": 0.6706930994987488, - "learning_rate": 2.063748878585689e-07, - "loss": 0.0028, - "step": 1784 - }, - { - "epoch": 8.707317073170731, - "grad_norm": 0.9568914175033569, - "learning_rate": 2.0485333081769588e-07, - "loss": 0.0018, - "step": 1785 - }, - { - "epoch": 8.71219512195122, - "grad_norm": 1.2713409662246704, - "learning_rate": 2.0333716394582536e-07, - "loss": 0.0142, - "step": 1786 - }, - { - "epoch": 8.717073170731707, - "grad_norm": 1.7427871227264404, - "learning_rate": 2.0182639080368634e-07, - "loss": 0.0135, - "step": 1787 - }, - { - "epoch": 8.721951219512196, - "grad_norm": 0.8939143419265747, - "learning_rate": 2.003210149393417e-07, - "loss": 0.0078, - "step": 1788 - }, - { - "epoch": 8.726829268292683, - "grad_norm": 1.1459598541259766, - "learning_rate": 1.9882103988817735e-07, - "loss": 0.0066, - "step": 1789 - }, - { - "epoch": 8.731707317073171, - "grad_norm": 0.875706672668457, - "learning_rate": 1.9732646917289545e-07, - "loss": 0.0051, - "step": 1790 - }, - { - "epoch": 8.736585365853658, - "grad_norm": 0.2884235084056854, - "learning_rate": 1.958373063035071e-07, - "loss": 0.001, - "step": 1791 - }, - { - "epoch": 8.741463414634147, - "grad_norm": 1.3679368495941162, - "learning_rate": 1.9435355477732205e-07, - "loss": 0.0057, - "step": 1792 - }, - { - "epoch": 8.746341463414634, - "grad_norm": 0.5913633108139038, - "learning_rate": 1.928752180789417e-07, - "loss": 0.0023, - "step": 1793 - }, - { - "epoch": 8.751219512195123, - "grad_norm": 1.565428376197815, - "learning_rate": 1.9140229968025058e-07, - "loss": 0.0191, - "step": 1794 - }, - { - "epoch": 8.75609756097561, - "grad_norm": 1.4710811376571655, - "learning_rate": 1.8993480304040912e-07, - "loss": 0.0114, - "step": 1795 - }, - { - "epoch": 8.760975609756098, - "grad_norm": 1.803842306137085, - "learning_rate": 1.8847273160584378e-07, - "loss": 0.0046, - "step": 1796 - }, - { - "epoch": 8.765853658536585, - "grad_norm": 0.694587230682373, - "learning_rate": 1.8701608881023957e-07, - "loss": 0.0014, - "step": 1797 - }, - { - "epoch": 8.770731707317074, - "grad_norm": 0.7563489675521851, - "learning_rate": 1.855648780745342e-07, - "loss": 0.0085, - "step": 1798 - }, - { - "epoch": 8.77560975609756, - "grad_norm": 1.1587045192718506, - "learning_rate": 1.8411910280690588e-07, - "loss": 0.0034, - "step": 1799 - }, - { - "epoch": 8.78048780487805, - "grad_norm": 1.7251181602478027, - "learning_rate": 1.826787664027685e-07, - "loss": 0.0119, - "step": 1800 - }, - { - "epoch": 8.785365853658536, - "grad_norm": 1.3170053958892822, - "learning_rate": 1.8124387224476347e-07, - "loss": 0.0059, - "step": 1801 - }, - { - "epoch": 8.790243902439025, - "grad_norm": 0.927018940448761, - "learning_rate": 1.7981442370274993e-07, - "loss": 0.0021, - "step": 1802 - }, - { - "epoch": 8.795121951219512, - "grad_norm": 2.3129045963287354, - "learning_rate": 1.783904241337983e-07, - "loss": 0.0085, - "step": 1803 - }, - { - "epoch": 8.8, - "grad_norm": 1.1010651588439941, - "learning_rate": 1.7697187688218291e-07, - "loss": 0.0037, - "step": 1804 - }, - { - "epoch": 8.804878048780488, - "grad_norm": 0.3990725576877594, - "learning_rate": 1.7555878527937164e-07, - "loss": 0.0008, - "step": 1805 - }, - { - "epoch": 8.809756097560976, - "grad_norm": 1.022905707359314, - "learning_rate": 1.7415115264402065e-07, - "loss": 0.0092, - "step": 1806 - }, - { - "epoch": 8.814634146341463, - "grad_norm": 0.7391730546951294, - "learning_rate": 1.727489822819664e-07, - "loss": 0.0016, - "step": 1807 - }, - { - "epoch": 8.819512195121952, - "grad_norm": 0.5859627723693848, - "learning_rate": 1.7135227748621585e-07, - "loss": 0.0012, - "step": 1808 - }, - { - "epoch": 8.824390243902439, - "grad_norm": 1.5222235918045044, - "learning_rate": 1.699610415369407e-07, - "loss": 0.0126, - "step": 1809 - }, - { - "epoch": 8.829268292682928, - "grad_norm": 0.8635048270225525, - "learning_rate": 1.6857527770146876e-07, - "loss": 0.0086, - "step": 1810 - }, - { - "epoch": 8.834146341463414, - "grad_norm": 0.8385710120201111, - "learning_rate": 1.6719498923427697e-07, - "loss": 0.0031, - "step": 1811 - }, - { - "epoch": 8.839024390243903, - "grad_norm": 1.0619077682495117, - "learning_rate": 1.6582017937698287e-07, - "loss": 0.0083, - "step": 1812 - }, - { - "epoch": 8.84390243902439, - "grad_norm": 0.6677606701850891, - "learning_rate": 1.6445085135833732e-07, - "loss": 0.002, - "step": 1813 - }, - { - "epoch": 8.848780487804879, - "grad_norm": 0.703705370426178, - "learning_rate": 1.6308700839421793e-07, - "loss": 0.0027, - "step": 1814 - }, - { - "epoch": 8.853658536585366, - "grad_norm": 0.7628077864646912, - "learning_rate": 1.6172865368762004e-07, - "loss": 0.0028, - "step": 1815 - }, - { - "epoch": 8.858536585365854, - "grad_norm": 0.7577258348464966, - "learning_rate": 1.6037579042864876e-07, - "loss": 0.0011, - "step": 1816 - }, - { - "epoch": 8.863414634146341, - "grad_norm": 1.2882269620895386, - "learning_rate": 1.5902842179451482e-07, - "loss": 0.0082, - "step": 1817 - }, - { - "epoch": 8.86829268292683, - "grad_norm": 1.030044436454773, - "learning_rate": 1.576865509495229e-07, - "loss": 0.0068, - "step": 1818 - }, - { - "epoch": 8.873170731707317, - "grad_norm": 1.9678841829299927, - "learning_rate": 1.5635018104506627e-07, - "loss": 0.0085, - "step": 1819 - }, - { - "epoch": 8.878048780487806, - "grad_norm": 0.756213366985321, - "learning_rate": 1.5501931521962055e-07, - "loss": 0.0062, - "step": 1820 - }, - { - "epoch": 8.882926829268293, - "grad_norm": 1.1753418445587158, - "learning_rate": 1.5369395659873305e-07, - "loss": 0.0043, - "step": 1821 - }, - { - "epoch": 8.887804878048781, - "grad_norm": 0.8144367933273315, - "learning_rate": 1.5237410829501864e-07, - "loss": 0.0042, - "step": 1822 - }, - { - "epoch": 8.892682926829268, - "grad_norm": 1.0879873037338257, - "learning_rate": 1.510597734081512e-07, - "loss": 0.0077, - "step": 1823 - }, - { - "epoch": 8.897560975609757, - "grad_norm": 1.7992119789123535, - "learning_rate": 1.497509550248555e-07, - "loss": 0.0013, - "step": 1824 - }, - { - "epoch": 8.902439024390244, - "grad_norm": 1.0460071563720703, - "learning_rate": 1.4844765621890135e-07, - "loss": 0.0091, - "step": 1825 - }, - { - "epoch": 8.907317073170733, - "grad_norm": 1.5372941493988037, - "learning_rate": 1.471498800510962e-07, - "loss": 0.005, - "step": 1826 - }, - { - "epoch": 8.91219512195122, - "grad_norm": 0.3672512173652649, - "learning_rate": 1.4585762956927624e-07, - "loss": 0.0014, - "step": 1827 - }, - { - "epoch": 8.917073170731708, - "grad_norm": 1.0456454753875732, - "learning_rate": 1.4457090780830185e-07, - "loss": 0.0063, - "step": 1828 - }, - { - "epoch": 8.921951219512195, - "grad_norm": 0.9190329909324646, - "learning_rate": 1.432897177900483e-07, - "loss": 0.0065, - "step": 1829 - }, - { - "epoch": 8.926829268292684, - "grad_norm": 1.8261685371398926, - "learning_rate": 1.4201406252340038e-07, - "loss": 0.0099, - "step": 1830 - }, - { - "epoch": 8.93170731707317, - "grad_norm": 1.1341190338134766, - "learning_rate": 1.407439450042433e-07, - "loss": 0.0042, - "step": 1831 - }, - { - "epoch": 8.93658536585366, - "grad_norm": 11.465933799743652, - "learning_rate": 1.3947936821545772e-07, - "loss": 0.004, - "step": 1832 - }, - { - "epoch": 8.941463414634146, - "grad_norm": 0.5747786164283752, - "learning_rate": 1.3822033512691209e-07, - "loss": 0.0009, - "step": 1833 - }, - { - "epoch": 8.946341463414633, - "grad_norm": 1.1908810138702393, - "learning_rate": 1.369668486954545e-07, - "loss": 0.0028, - "step": 1834 - }, - { - "epoch": 8.951219512195122, - "grad_norm": 0.2560107111930847, - "learning_rate": 1.3571891186490687e-07, - "loss": 0.001, - "step": 1835 - }, - { - "epoch": 8.95609756097561, - "grad_norm": 0.5070216059684753, - "learning_rate": 1.3447652756605894e-07, - "loss": 0.0024, - "step": 1836 - }, - { - "epoch": 8.960975609756098, - "grad_norm": 0.507199227809906, - "learning_rate": 1.3323969871665897e-07, - "loss": 0.0015, - "step": 1837 - }, - { - "epoch": 8.965853658536584, - "grad_norm": 0.29779553413391113, - "learning_rate": 1.3200842822140818e-07, - "loss": 0.0007, - "step": 1838 - }, - { - "epoch": 8.970731707317073, - "grad_norm": 0.4603523015975952, - "learning_rate": 1.3078271897195572e-07, - "loss": 0.0018, - "step": 1839 - }, - { - "epoch": 8.975609756097562, - "grad_norm": 1.0771223306655884, - "learning_rate": 1.2956257384688807e-07, - "loss": 0.0063, - "step": 1840 - }, - { - "epoch": 8.980487804878049, - "grad_norm": 0.798372745513916, - "learning_rate": 1.283479957117248e-07, - "loss": 0.002, - "step": 1841 - }, - { - "epoch": 8.985365853658536, - "grad_norm": 2.3283369541168213, - "learning_rate": 1.2713898741891244e-07, - "loss": 0.0398, - "step": 1842 - }, - { - "epoch": 8.990243902439024, - "grad_norm": 0.18683794140815735, - "learning_rate": 1.2593555180781591e-07, - "loss": 0.0004, - "step": 1843 - }, - { - "epoch": 8.995121951219513, - "grad_norm": 2.2289419174194336, - "learning_rate": 1.2473769170471188e-07, - "loss": 0.0713, - "step": 1844 - }, - { - "epoch": 9.0, - "grad_norm": 0.9360214471817017, - "learning_rate": 1.2354540992278452e-07, - "loss": 0.002, - "step": 1845 - }, - { - "epoch": 9.004878048780487, - "grad_norm": 0.11728485673666, - "learning_rate": 1.223587092621162e-07, - "loss": 0.0004, - "step": 1846 - }, - { - "epoch": 9.009756097560976, - "grad_norm": 2.8439087867736816, - "learning_rate": 1.2117759250968225e-07, - "loss": 0.0791, - "step": 1847 - }, - { - "epoch": 9.014634146341463, - "grad_norm": 0.3048456311225891, - "learning_rate": 1.2000206243934358e-07, - "loss": 0.0021, - "step": 1848 - }, - { - "epoch": 9.019512195121951, - "grad_norm": 0.35457128286361694, - "learning_rate": 1.1883212181184212e-07, - "loss": 0.0014, - "step": 1849 - }, - { - "epoch": 9.024390243902438, - "grad_norm": 0.4256647527217865, - "learning_rate": 1.176677733747919e-07, - "loss": 0.003, - "step": 1850 - }, - { - "epoch": 9.029268292682927, - "grad_norm": 0.14073246717453003, - "learning_rate": 1.1650901986267365e-07, - "loss": 0.0009, - "step": 1851 - }, - { - "epoch": 9.034146341463414, - "grad_norm": 0.2287226915359497, - "learning_rate": 1.1535586399682885e-07, - "loss": 0.001, - "step": 1852 - }, - { - "epoch": 9.039024390243902, - "grad_norm": 0.1520719975233078, - "learning_rate": 1.1420830848545256e-07, - "loss": 0.0008, - "step": 1853 - }, - { - "epoch": 9.04390243902439, - "grad_norm": 0.7066623568534851, - "learning_rate": 1.1306635602358673e-07, - "loss": 0.0086, - "step": 1854 - }, - { - "epoch": 9.048780487804878, - "grad_norm": 0.5992008447647095, - "learning_rate": 1.1193000929311638e-07, - "loss": 0.0023, - "step": 1855 - }, - { - "epoch": 9.053658536585365, - "grad_norm": 1.6487441062927246, - "learning_rate": 1.1079927096275978e-07, - "loss": 0.0235, - "step": 1856 - }, - { - "epoch": 9.058536585365854, - "grad_norm": 0.2044752836227417, - "learning_rate": 1.0967414368806384e-07, - "loss": 0.0013, - "step": 1857 - }, - { - "epoch": 9.06341463414634, - "grad_norm": 0.13774175941944122, - "learning_rate": 1.0855463011139905e-07, - "loss": 0.0005, - "step": 1858 - }, - { - "epoch": 9.06829268292683, - "grad_norm": 0.1757974475622177, - "learning_rate": 1.0744073286195089e-07, - "loss": 0.0011, - "step": 1859 - }, - { - "epoch": 9.073170731707316, - "grad_norm": 0.3755623698234558, - "learning_rate": 1.0633245455571511e-07, - "loss": 0.0015, - "step": 1860 - }, - { - "epoch": 9.078048780487805, - "grad_norm": 0.1744445413351059, - "learning_rate": 1.052297977954922e-07, - "loss": 0.001, - "step": 1861 - }, - { - "epoch": 9.082926829268292, - "grad_norm": 0.23634043335914612, - "learning_rate": 1.0413276517087956e-07, - "loss": 0.0005, - "step": 1862 - }, - { - "epoch": 9.08780487804878, - "grad_norm": 0.31559380888938904, - "learning_rate": 1.0304135925826603e-07, - "loss": 0.002, - "step": 1863 - }, - { - "epoch": 9.092682926829267, - "grad_norm": 0.44957175850868225, - "learning_rate": 1.0195558262082683e-07, - "loss": 0.0052, - "step": 1864 - }, - { - "epoch": 9.097560975609756, - "grad_norm": 0.23585057258605957, - "learning_rate": 1.0087543780851666e-07, - "loss": 0.0009, - "step": 1865 - }, - { - "epoch": 9.102439024390243, - "grad_norm": 0.26482903957366943, - "learning_rate": 9.98009273580633e-08, - "loss": 0.0017, - "step": 1866 - }, - { - "epoch": 9.107317073170732, - "grad_norm": 0.21670401096343994, - "learning_rate": 9.87320537929623e-08, - "loss": 0.0006, - "step": 1867 - }, - { - "epoch": 9.112195121951219, - "grad_norm": 0.11546074599027634, - "learning_rate": 9.766881962347208e-08, - "loss": 0.0006, - "step": 1868 - }, - { - "epoch": 9.117073170731707, - "grad_norm": 0.35039573907852173, - "learning_rate": 9.661122734660521e-08, - "loss": 0.0017, - "step": 1869 - }, - { - "epoch": 9.121951219512194, - "grad_norm": 0.25725650787353516, - "learning_rate": 9.555927944612492e-08, - "loss": 0.0015, - "step": 1870 - }, - { - "epoch": 9.126829268292683, - "grad_norm": 1.0865508317947388, - "learning_rate": 9.451297839253915e-08, - "loss": 0.0179, - "step": 1871 - }, - { - "epoch": 9.13170731707317, - "grad_norm": 0.29501980543136597, - "learning_rate": 9.34723266430937e-08, - "loss": 0.0013, - "step": 1872 - }, - { - "epoch": 9.136585365853659, - "grad_norm": 0.3127771019935608, - "learning_rate": 9.243732664176636e-08, - "loss": 0.0013, - "step": 1873 - }, - { - "epoch": 9.141463414634146, - "grad_norm": 0.47584185004234314, - "learning_rate": 9.140798081926277e-08, - "loss": 0.0042, - "step": 1874 - }, - { - "epoch": 9.146341463414634, - "grad_norm": 0.22509703040122986, - "learning_rate": 9.03842915930095e-08, - "loss": 0.0014, - "step": 1875 - }, - { - "epoch": 9.151219512195121, - "grad_norm": 0.2254130244255066, - "learning_rate": 8.936626136714754e-08, - "loss": 0.0014, - "step": 1876 - }, - { - "epoch": 9.15609756097561, - "grad_norm": 0.4184035360813141, - "learning_rate": 8.835389253252918e-08, - "loss": 0.0036, - "step": 1877 - }, - { - "epoch": 9.160975609756097, - "grad_norm": 0.8849661946296692, - "learning_rate": 8.734718746670978e-08, - "loss": 0.0195, - "step": 1878 - }, - { - "epoch": 9.165853658536586, - "grad_norm": 0.3465995192527771, - "learning_rate": 8.634614853394341e-08, - "loss": 0.0009, - "step": 1879 - }, - { - "epoch": 9.170731707317072, - "grad_norm": 0.5498316884040833, - "learning_rate": 8.53507780851781e-08, - "loss": 0.0028, - "step": 1880 - }, - { - "epoch": 9.175609756097561, - "grad_norm": 0.4553240239620209, - "learning_rate": 8.436107845804842e-08, - "loss": 0.0023, - "step": 1881 - }, - { - "epoch": 9.180487804878048, - "grad_norm": 0.3339614272117615, - "learning_rate": 8.33770519768709e-08, - "loss": 0.0008, - "step": 1882 - }, - { - "epoch": 9.185365853658537, - "grad_norm": 0.3212447762489319, - "learning_rate": 8.239870095263974e-08, - "loss": 0.0018, - "step": 1883 - }, - { - "epoch": 9.190243902439024, - "grad_norm": 0.2665475904941559, - "learning_rate": 8.142602768301921e-08, - "loss": 0.0016, - "step": 1884 - }, - { - "epoch": 9.195121951219512, - "grad_norm": 0.8188057541847229, - "learning_rate": 8.045903445233982e-08, - "loss": 0.0063, - "step": 1885 - }, - { - "epoch": 9.2, - "grad_norm": 0.458200603723526, - "learning_rate": 7.949772353159191e-08, - "loss": 0.0049, - "step": 1886 - }, - { - "epoch": 9.204878048780488, - "grad_norm": 0.50230872631073, - "learning_rate": 7.854209717842231e-08, - "loss": 0.0006, - "step": 1887 - }, - { - "epoch": 9.209756097560975, - "grad_norm": 0.12954330444335938, - "learning_rate": 7.759215763712579e-08, - "loss": 0.0007, - "step": 1888 - }, - { - "epoch": 9.214634146341464, - "grad_norm": 0.3889886438846588, - "learning_rate": 7.664790713864223e-08, - "loss": 0.0038, - "step": 1889 - }, - { - "epoch": 9.21951219512195, - "grad_norm": 0.8406491875648499, - "learning_rate": 7.57093479005519e-08, - "loss": 0.0059, - "step": 1890 - }, - { - "epoch": 9.22439024390244, - "grad_norm": 0.27930590510368347, - "learning_rate": 7.477648212706746e-08, - "loss": 0.0009, - "step": 1891 - }, - { - "epoch": 9.229268292682926, - "grad_norm": 0.2927345037460327, - "learning_rate": 7.384931200903084e-08, - "loss": 0.0019, - "step": 1892 - }, - { - "epoch": 9.234146341463415, - "grad_norm": 0.5030691027641296, - "learning_rate": 7.29278397239086e-08, - "loss": 0.0036, - "step": 1893 - }, - { - "epoch": 9.239024390243902, - "grad_norm": 0.14574876427650452, - "learning_rate": 7.20120674357852e-08, - "loss": 0.0005, - "step": 1894 - }, - { - "epoch": 9.24390243902439, - "grad_norm": 0.286927729845047, - "learning_rate": 7.110199729535805e-08, - "loss": 0.0009, - "step": 1895 - }, - { - "epoch": 9.248780487804877, - "grad_norm": 0.44844964146614075, - "learning_rate": 7.019763143993441e-08, - "loss": 0.0047, - "step": 1896 - }, - { - "epoch": 9.253658536585366, - "grad_norm": 0.16901901364326477, - "learning_rate": 6.929897199342395e-08, - "loss": 0.0006, - "step": 1897 - }, - { - "epoch": 9.258536585365853, - "grad_norm": 0.19660663604736328, - "learning_rate": 6.840602106633425e-08, - "loss": 0.0005, - "step": 1898 - }, - { - "epoch": 9.263414634146342, - "grad_norm": 0.2517840564250946, - "learning_rate": 6.751878075576867e-08, - "loss": 0.001, - "step": 1899 - }, - { - "epoch": 9.268292682926829, - "grad_norm": 0.6886439323425293, - "learning_rate": 6.663725314541652e-08, - "loss": 0.0046, - "step": 1900 - }, - { - "epoch": 9.273170731707317, - "grad_norm": 0.2044619917869568, - "learning_rate": 6.576144030555259e-08, - "loss": 0.0009, - "step": 1901 - }, - { - "epoch": 9.278048780487804, - "grad_norm": 0.5199993848800659, - "learning_rate": 6.489134429302906e-08, - "loss": 0.0038, - "step": 1902 - }, - { - "epoch": 9.282926829268293, - "grad_norm": 0.20676910877227783, - "learning_rate": 6.402696715127387e-08, - "loss": 0.0007, - "step": 1903 - }, - { - "epoch": 9.28780487804878, - "grad_norm": 0.13005101680755615, - "learning_rate": 6.316831091028237e-08, - "loss": 0.0005, - "step": 1904 - }, - { - "epoch": 9.292682926829269, - "grad_norm": 0.12870948016643524, - "learning_rate": 6.23153775866156e-08, - "loss": 0.0004, - "step": 1905 - }, - { - "epoch": 9.297560975609755, - "grad_norm": 0.4530372619628906, - "learning_rate": 6.14681691833935e-08, - "loss": 0.0027, - "step": 1906 - }, - { - "epoch": 9.302439024390244, - "grad_norm": 0.14936240017414093, - "learning_rate": 6.062668769029168e-08, - "loss": 0.0006, - "step": 1907 - }, - { - "epoch": 9.307317073170731, - "grad_norm": 1.0447592735290527, - "learning_rate": 5.979093508353489e-08, - "loss": 0.0033, - "step": 1908 - }, - { - "epoch": 9.31219512195122, - "grad_norm": 0.9839334487915039, - "learning_rate": 5.896091332589532e-08, - "loss": 0.0143, - "step": 1909 - }, - { - "epoch": 9.317073170731707, - "grad_norm": 0.13809092342853546, - "learning_rate": 5.813662436668477e-08, - "loss": 0.0006, - "step": 1910 - }, - { - "epoch": 9.321951219512195, - "grad_norm": 0.2679869830608368, - "learning_rate": 5.731807014175195e-08, - "loss": 0.0009, - "step": 1911 - }, - { - "epoch": 9.326829268292682, - "grad_norm": 0.09745966643095016, - "learning_rate": 5.650525257347744e-08, - "loss": 0.0004, - "step": 1912 - }, - { - "epoch": 9.331707317073171, - "grad_norm": 0.15892420709133148, - "learning_rate": 5.569817357076984e-08, - "loss": 0.0012, - "step": 1913 - }, - { - "epoch": 9.336585365853658, - "grad_norm": 0.9430788159370422, - "learning_rate": 5.489683502905935e-08, - "loss": 0.0039, - "step": 1914 - }, - { - "epoch": 9.341463414634147, - "grad_norm": 0.15283145010471344, - "learning_rate": 5.410123883029639e-08, - "loss": 0.001, - "step": 1915 - }, - { - "epoch": 9.346341463414634, - "grad_norm": 0.3713572025299072, - "learning_rate": 5.3311386842944125e-08, - "loss": 0.0017, - "step": 1916 - }, - { - "epoch": 9.351219512195122, - "grad_norm": 0.29313772916793823, - "learning_rate": 5.25272809219754e-08, - "loss": 0.0021, - "step": 1917 - }, - { - "epoch": 9.35609756097561, - "grad_norm": 0.219829261302948, - "learning_rate": 5.17489229088694e-08, - "loss": 0.001, - "step": 1918 - }, - { - "epoch": 9.360975609756098, - "grad_norm": 0.35704150795936584, - "learning_rate": 5.097631463160585e-08, - "loss": 0.002, - "step": 1919 - }, - { - "epoch": 9.365853658536585, - "grad_norm": 0.44924139976501465, - "learning_rate": 5.020945790466025e-08, - "loss": 0.0007, - "step": 1920 - }, - { - "epoch": 9.370731707317074, - "grad_norm": 0.10656553506851196, - "learning_rate": 4.944835452900199e-08, - "loss": 0.0005, - "step": 1921 - }, - { - "epoch": 9.37560975609756, - "grad_norm": 0.8902695178985596, - "learning_rate": 4.869300629208762e-08, - "loss": 0.0084, - "step": 1922 - }, - { - "epoch": 9.38048780487805, - "grad_norm": 0.3544962406158447, - "learning_rate": 4.7943414967858426e-08, - "loss": 0.0024, - "step": 1923 - }, - { - "epoch": 9.385365853658536, - "grad_norm": 0.3950733244419098, - "learning_rate": 4.7199582316734827e-08, - "loss": 0.0006, - "step": 1924 - }, - { - "epoch": 9.390243902439025, - "grad_norm": 0.45972177386283875, - "learning_rate": 4.6461510085613616e-08, - "loss": 0.0012, - "step": 1925 - }, - { - "epoch": 9.395121951219512, - "grad_norm": 0.12690195441246033, - "learning_rate": 4.5729200007862686e-08, - "loss": 0.0004, - "step": 1926 - }, - { - "epoch": 9.4, - "grad_norm": 0.2692466974258423, - "learning_rate": 4.5002653803317975e-08, - "loss": 0.0016, - "step": 1927 - }, - { - "epoch": 9.404878048780487, - "grad_norm": 0.5844394564628601, - "learning_rate": 4.428187317827848e-08, - "loss": 0.001, - "step": 1928 - }, - { - "epoch": 9.409756097560976, - "grad_norm": 0.5482091307640076, - "learning_rate": 4.356685982550263e-08, - "loss": 0.0016, - "step": 1929 - }, - { - "epoch": 9.414634146341463, - "grad_norm": 0.06951025128364563, - "learning_rate": 4.285761542420497e-08, - "loss": 0.0004, - "step": 1930 - }, - { - "epoch": 9.419512195121952, - "grad_norm": 0.1519978791475296, - "learning_rate": 4.215414164005116e-08, - "loss": 0.0007, - "step": 1931 - }, - { - "epoch": 9.424390243902439, - "grad_norm": 0.3855389654636383, - "learning_rate": 4.145644012515465e-08, - "loss": 0.0012, - "step": 1932 - }, - { - "epoch": 9.429268292682927, - "grad_norm": 0.28962311148643494, - "learning_rate": 4.076451251807223e-08, - "loss": 0.0024, - "step": 1933 - }, - { - "epoch": 9.434146341463414, - "grad_norm": 0.12305665761232376, - "learning_rate": 4.0078360443801535e-08, - "loss": 0.0005, - "step": 1934 - }, - { - "epoch": 9.439024390243903, - "grad_norm": 0.5113069415092468, - "learning_rate": 3.9397985513775495e-08, - "loss": 0.0007, - "step": 1935 - }, - { - "epoch": 9.44390243902439, - "grad_norm": 0.11020799726247787, - "learning_rate": 3.872338932585984e-08, - "loss": 0.0006, - "step": 1936 - }, - { - "epoch": 9.448780487804878, - "grad_norm": 0.24607239663600922, - "learning_rate": 3.8054573464348655e-08, - "loss": 0.0012, - "step": 1937 - }, - { - "epoch": 9.453658536585365, - "grad_norm": 0.09522794187068939, - "learning_rate": 3.739153949996105e-08, - "loss": 0.0004, - "step": 1938 - }, - { - "epoch": 9.458536585365854, - "grad_norm": 0.3217187523841858, - "learning_rate": 3.6734288989836994e-08, - "loss": 0.0018, - "step": 1939 - }, - { - "epoch": 9.463414634146341, - "grad_norm": 0.10770946741104126, - "learning_rate": 3.608282347753428e-08, - "loss": 0.0005, - "step": 1940 - }, - { - "epoch": 9.46829268292683, - "grad_norm": 0.18529640138149261, - "learning_rate": 3.543714449302488e-08, - "loss": 0.0014, - "step": 1941 - }, - { - "epoch": 9.473170731707317, - "grad_norm": 0.3584231436252594, - "learning_rate": 3.479725355268998e-08, - "loss": 0.0011, - "step": 1942 - }, - { - "epoch": 9.478048780487805, - "grad_norm": 0.3854292035102844, - "learning_rate": 3.4163152159318866e-08, - "loss": 0.0019, - "step": 1943 - }, - { - "epoch": 9.482926829268292, - "grad_norm": 0.08858831971883774, - "learning_rate": 3.353484180210337e-08, - "loss": 0.0003, - "step": 1944 - }, - { - "epoch": 9.487804878048781, - "grad_norm": 0.5076143741607666, - "learning_rate": 3.291232395663424e-08, - "loss": 0.0057, - "step": 1945 - }, - { - "epoch": 9.492682926829268, - "grad_norm": 0.38053473830223083, - "learning_rate": 3.229560008490007e-08, - "loss": 0.0021, - "step": 1946 - }, - { - "epoch": 9.497560975609757, - "grad_norm": 1.4997718334197998, - "learning_rate": 3.168467163528116e-08, - "loss": 0.0079, - "step": 1947 - }, - { - "epoch": 9.502439024390243, - "grad_norm": 0.7466314435005188, - "learning_rate": 3.1079540042547315e-08, - "loss": 0.0077, - "step": 1948 - }, - { - "epoch": 9.507317073170732, - "grad_norm": 0.11087851971387863, - "learning_rate": 3.0480206727855066e-08, - "loss": 0.0004, - "step": 1949 - }, - { - "epoch": 9.512195121951219, - "grad_norm": 0.2965907156467438, - "learning_rate": 2.988667309874294e-08, - "loss": 0.0015, - "step": 1950 - }, - { - "epoch": 9.517073170731708, - "grad_norm": 1.4327231645584106, - "learning_rate": 2.9298940549128962e-08, - "loss": 0.0132, - "step": 1951 - }, - { - "epoch": 9.521951219512195, - "grad_norm": 0.9336621165275574, - "learning_rate": 2.871701045930708e-08, - "loss": 0.0019, - "step": 1952 - }, - { - "epoch": 9.526829268292683, - "grad_norm": 1.2587624788284302, - "learning_rate": 2.8140884195945184e-08, - "loss": 0.0024, - "step": 1953 - }, - { - "epoch": 9.53170731707317, - "grad_norm": 0.13109427690505981, - "learning_rate": 2.7570563112079564e-08, - "loss": 0.001, - "step": 1954 - }, - { - "epoch": 9.536585365853659, - "grad_norm": 0.2514895796775818, - "learning_rate": 2.700604854711353e-08, - "loss": 0.0013, - "step": 1955 - }, - { - "epoch": 9.541463414634146, - "grad_norm": 0.3432636857032776, - "learning_rate": 2.6447341826814077e-08, - "loss": 0.0005, - "step": 1956 - }, - { - "epoch": 9.546341463414635, - "grad_norm": 0.4550987780094147, - "learning_rate": 2.5894444263307728e-08, - "loss": 0.001, - "step": 1957 - }, - { - "epoch": 9.551219512195122, - "grad_norm": 1.2675397396087646, - "learning_rate": 2.5347357155078577e-08, - "loss": 0.0103, - "step": 1958 - }, - { - "epoch": 9.55609756097561, - "grad_norm": 0.1289552003145218, - "learning_rate": 2.4806081786964974e-08, - "loss": 0.0006, - "step": 1959 - }, - { - "epoch": 9.560975609756097, - "grad_norm": 0.6298596858978271, - "learning_rate": 2.4270619430156183e-08, - "loss": 0.0019, - "step": 1960 - }, - { - "epoch": 9.565853658536586, - "grad_norm": 0.538487434387207, - "learning_rate": 2.3740971342189056e-08, - "loss": 0.0027, - "step": 1961 - }, - { - "epoch": 9.570731707317073, - "grad_norm": 0.8478948473930359, - "learning_rate": 2.321713876694637e-08, - "loss": 0.0133, - "step": 1962 - }, - { - "epoch": 9.575609756097561, - "grad_norm": 1.0609294176101685, - "learning_rate": 2.269912293465293e-08, - "loss": 0.008, - "step": 1963 - }, - { - "epoch": 9.580487804878048, - "grad_norm": 0.634739100933075, - "learning_rate": 2.2186925061872532e-08, - "loss": 0.0025, - "step": 1964 - }, - { - "epoch": 9.585365853658537, - "grad_norm": 0.43630343675613403, - "learning_rate": 2.1680546351506016e-08, - "loss": 0.003, - "step": 1965 - }, - { - "epoch": 9.590243902439024, - "grad_norm": 0.3712899684906006, - "learning_rate": 2.117998799278709e-08, - "loss": 0.0036, - "step": 1966 - }, - { - "epoch": 9.595121951219513, - "grad_norm": 0.13679739832878113, - "learning_rate": 2.068525116128095e-08, - "loss": 0.0006, - "step": 1967 - }, - { - "epoch": 9.6, - "grad_norm": 1.8157588243484497, - "learning_rate": 2.0196337018880962e-08, - "loss": 0.0659, - "step": 1968 - }, - { - "epoch": 9.604878048780488, - "grad_norm": 0.07176486402750015, - "learning_rate": 1.9713246713805588e-08, - "loss": 0.0003, - "step": 1969 - }, - { - "epoch": 9.609756097560975, - "grad_norm": 0.33367958664894104, - "learning_rate": 1.9235981380595625e-08, - "loss": 0.0013, - "step": 1970 - }, - { - "epoch": 9.614634146341464, - "grad_norm": 0.08895006775856018, - "learning_rate": 1.876454214011253e-08, - "loss": 0.0004, - "step": 1971 - }, - { - "epoch": 9.61951219512195, - "grad_norm": 0.2062547653913498, - "learning_rate": 1.8298930099534817e-08, - "loss": 0.001, - "step": 1972 - }, - { - "epoch": 9.62439024390244, - "grad_norm": 0.1351477950811386, - "learning_rate": 1.783914635235584e-08, - "loss": 0.0006, - "step": 1973 - }, - { - "epoch": 9.629268292682926, - "grad_norm": 0.5446783304214478, - "learning_rate": 1.738519197838101e-08, - "loss": 0.0038, - "step": 1974 - }, - { - "epoch": 9.634146341463415, - "grad_norm": 0.12655134499073029, - "learning_rate": 1.6937068043725856e-08, - "loss": 0.0006, - "step": 1975 - }, - { - "epoch": 9.639024390243902, - "grad_norm": 0.7479956150054932, - "learning_rate": 1.6494775600812418e-08, - "loss": 0.0026, - "step": 1976 - }, - { - "epoch": 9.64390243902439, - "grad_norm": 0.39983221888542175, - "learning_rate": 1.6058315688367852e-08, - "loss": 0.003, - "step": 1977 - }, - { - "epoch": 9.648780487804878, - "grad_norm": 0.2727876305580139, - "learning_rate": 1.5627689331421946e-08, - "loss": 0.0015, - "step": 1978 - }, - { - "epoch": 9.653658536585366, - "grad_norm": 0.17525868117809296, - "learning_rate": 1.520289754130322e-08, - "loss": 0.001, - "step": 1979 - }, - { - "epoch": 9.658536585365853, - "grad_norm": 0.2446790337562561, - "learning_rate": 1.478394131563865e-08, - "loss": 0.0011, - "step": 1980 - }, - { - "epoch": 9.663414634146342, - "grad_norm": 0.37458178400993347, - "learning_rate": 1.4370821638350353e-08, - "loss": 0.0022, - "step": 1981 - }, - { - "epoch": 9.668292682926829, - "grad_norm": 0.1664375215768814, - "learning_rate": 1.396353947965251e-08, - "loss": 0.0006, - "step": 1982 - }, - { - "epoch": 9.673170731707318, - "grad_norm": 0.08668441325426102, - "learning_rate": 1.3562095796050279e-08, - "loss": 0.0003, - "step": 1983 - }, - { - "epoch": 9.678048780487805, - "grad_norm": 0.2897089719772339, - "learning_rate": 1.3166491530337555e-08, - "loss": 0.001, - "step": 1984 - }, - { - "epoch": 9.682926829268293, - "grad_norm": 0.21582652628421783, - "learning_rate": 1.2776727611593653e-08, - "loss": 0.0007, - "step": 1985 - }, - { - "epoch": 9.68780487804878, - "grad_norm": 0.3643123507499695, - "learning_rate": 1.2392804955181915e-08, - "loss": 0.002, - "step": 1986 - }, - { - "epoch": 9.692682926829269, - "grad_norm": 0.5870813131332397, - "learning_rate": 1.2014724462747763e-08, - "loss": 0.0016, - "step": 1987 - }, - { - "epoch": 9.697560975609756, - "grad_norm": 0.19344697892665863, - "learning_rate": 1.1642487022215931e-08, - "loss": 0.0008, - "step": 1988 - }, - { - "epoch": 9.702439024390245, - "grad_norm": 0.15417703986167908, - "learning_rate": 1.1276093507788798e-08, - "loss": 0.001, - "step": 1989 - }, - { - "epoch": 9.707317073170731, - "grad_norm": 0.2714616358280182, - "learning_rate": 1.0915544779944164e-08, - "loss": 0.0022, - "step": 1990 - }, - { - "epoch": 9.71219512195122, - "grad_norm": 0.14375440776348114, - "learning_rate": 1.0560841685433864e-08, - "loss": 0.0008, - "step": 1991 - }, - { - "epoch": 9.717073170731707, - "grad_norm": 0.19977939128875732, - "learning_rate": 1.021198505728016e-08, - "loss": 0.0011, - "step": 1992 - }, - { - "epoch": 9.721951219512196, - "grad_norm": 0.20787867903709412, - "learning_rate": 9.868975714775741e-09, - "loss": 0.0012, - "step": 1993 - }, - { - "epoch": 9.726829268292683, - "grad_norm": 0.3878643810749054, - "learning_rate": 9.531814463480394e-09, - "loss": 0.0008, - "step": 1994 - }, - { - "epoch": 9.731707317073171, - "grad_norm": 0.5140596032142639, - "learning_rate": 9.200502095220166e-09, - "loss": 0.0034, - "step": 1995 - }, - { - "epoch": 9.736585365853658, - "grad_norm": 0.2106190174818039, - "learning_rate": 8.875039388084317e-09, - "loss": 0.0008, - "step": 1996 - }, - { - "epoch": 9.741463414634147, - "grad_norm": 0.09516038745641708, - "learning_rate": 8.555427106424485e-09, - "loss": 0.0005, - "step": 1997 - }, - { - "epoch": 9.746341463414634, - "grad_norm": 1.439642310142517, - "learning_rate": 8.241666000852466e-09, - "loss": 0.0314, - "step": 1998 - }, - { - "epoch": 9.751219512195123, - "grad_norm": 0.10020413249731064, - "learning_rate": 7.933756808238823e-09, - "loss": 0.0004, - "step": 1999 - }, - { - "epoch": 9.75609756097561, - "grad_norm": 0.4296906888484955, - "learning_rate": 7.631700251710116e-09, - "loss": 0.0022, - "step": 2000 - }, - { - "epoch": 9.760975609756098, - "grad_norm": 0.4867343604564667, - "learning_rate": 7.335497040648898e-09, - "loss": 0.0024, - "step": 2001 - }, - { - "epoch": 9.765853658536585, - "grad_norm": 3.0838112831115723, - "learning_rate": 7.045147870690105e-09, - "loss": 0.0796, - "step": 2002 - }, - { - "epoch": 9.770731707317074, - "grad_norm": 0.26949402689933777, - "learning_rate": 6.760653423721619e-09, - "loss": 0.0012, - "step": 2003 - }, - { - "epoch": 9.77560975609756, - "grad_norm": 0.854682445526123, - "learning_rate": 6.4820143678800964e-09, - "loss": 0.0059, - "step": 2004 - }, - { - "epoch": 9.78048780487805, - "grad_norm": 0.06472957879304886, - "learning_rate": 6.209231357551526e-09, - "loss": 0.0003, - "step": 2005 - }, - { - "epoch": 9.785365853658536, - "grad_norm": 0.9941632747650146, - "learning_rate": 5.942305033369289e-09, - "loss": 0.0113, - "step": 2006 - }, - { - "epoch": 9.790243902439025, - "grad_norm": 0.08150490373373032, - "learning_rate": 5.681236022211378e-09, - "loss": 0.0003, - "step": 2007 - }, - { - "epoch": 9.795121951219512, - "grad_norm": 0.37303054332733154, - "learning_rate": 5.426024937200402e-09, - "loss": 0.0021, - "step": 2008 - }, - { - "epoch": 9.8, - "grad_norm": 0.12861268222332, - "learning_rate": 5.176672377701364e-09, - "loss": 0.0004, - "step": 2009 - }, - { - "epoch": 9.804878048780488, - "grad_norm": 0.13954521715641022, - "learning_rate": 4.933178929321103e-09, - "loss": 0.0006, - "step": 2010 - }, - { - "epoch": 9.809756097560976, - "grad_norm": 0.8102789521217346, - "learning_rate": 4.695545163905524e-09, - "loss": 0.0047, - "step": 2011 - }, - { - "epoch": 9.814634146341463, - "grad_norm": 0.8437443971633911, - "learning_rate": 4.463771639539038e-09, - "loss": 0.0013, - "step": 2012 - }, - { - "epoch": 9.819512195121952, - "grad_norm": 0.3098134994506836, - "learning_rate": 4.237858900543734e-09, - "loss": 0.0025, - "step": 2013 - }, - { - "epoch": 9.824390243902439, - "grad_norm": 0.7686973214149475, - "learning_rate": 4.017807477477154e-09, - "loss": 0.0045, - "step": 2014 - }, - { - "epoch": 9.829268292682928, - "grad_norm": 0.45219677686691284, - "learning_rate": 3.803617887132016e-09, - "loss": 0.0017, - "step": 2015 - }, - { - "epoch": 9.834146341463414, - "grad_norm": 0.529446542263031, - "learning_rate": 3.5952906325339988e-09, - "loss": 0.0043, - "step": 2016 - }, - { - "epoch": 9.839024390243903, - "grad_norm": 0.35920700430870056, - "learning_rate": 3.3928262029411794e-09, - "loss": 0.0025, - "step": 2017 - }, - { - "epoch": 9.84390243902439, - "grad_norm": 0.3075787127017975, - "learning_rate": 3.196225073842929e-09, - "loss": 0.0025, - "step": 2018 - }, - { - "epoch": 9.848780487804879, - "grad_norm": 0.1374140977859497, - "learning_rate": 3.005487706958243e-09, - "loss": 0.0005, - "step": 2019 - }, - { - "epoch": 9.853658536585366, - "grad_norm": 0.5697541236877441, - "learning_rate": 2.8206145502354678e-09, - "loss": 0.0026, - "step": 2020 - }, - { - "epoch": 9.858536585365854, - "grad_norm": 1.0206952095031738, - "learning_rate": 2.641606037850353e-09, - "loss": 0.0101, - "step": 2021 - }, - { - "epoch": 9.863414634146341, - "grad_norm": 0.29209089279174805, - "learning_rate": 2.468462590205778e-09, - "loss": 0.0022, - "step": 2022 - }, - { - "epoch": 9.86829268292683, - "grad_norm": 0.13821417093276978, - "learning_rate": 2.3011846139306404e-09, - "loss": 0.0006, - "step": 2023 - }, - { - "epoch": 9.873170731707317, - "grad_norm": 0.4531463086605072, - "learning_rate": 2.13977250187819e-09, - "loss": 0.0018, - "step": 2024 - }, - { - "epoch": 9.878048780487806, - "grad_norm": 0.4380701184272766, - "learning_rate": 1.9842266331260296e-09, - "loss": 0.0031, - "step": 2025 - }, - { - "epoch": 9.882926829268293, - "grad_norm": 0.33851730823516846, - "learning_rate": 1.834547372975004e-09, - "loss": 0.0013, - "step": 2026 - }, - { - "epoch": 9.887804878048781, - "grad_norm": 0.4231720566749573, - "learning_rate": 1.6907350729478133e-09, - "loss": 0.0024, - "step": 2027 - }, - { - "epoch": 9.892682926829268, - "grad_norm": 0.4602144658565521, - "learning_rate": 1.5527900707887344e-09, - "loss": 0.004, - "step": 2028 - }, - { - "epoch": 9.897560975609757, - "grad_norm": 0.9638814330101013, - "learning_rate": 1.4207126904625114e-09, - "loss": 0.0097, - "step": 2029 - }, - { - "epoch": 9.902439024390244, - "grad_norm": 0.1374921053647995, - "learning_rate": 1.2945032421540771e-09, - "loss": 0.0005, - "step": 2030 - }, - { - "epoch": 9.907317073170733, - "grad_norm": 0.45432549715042114, - "learning_rate": 1.1741620222671667e-09, - "loss": 0.0041, - "step": 2031 - }, - { - "epoch": 9.91219512195122, - "grad_norm": 0.1905360370874405, - "learning_rate": 1.0596893134240394e-09, - "loss": 0.0007, - "step": 2032 - }, - { - "epoch": 9.917073170731708, - "grad_norm": 0.41532090306282043, - "learning_rate": 9.51085384464645e-10, - "loss": 0.007, - "step": 2033 - }, - { - "epoch": 9.921951219512195, - "grad_norm": 0.5167479515075684, - "learning_rate": 8.48350490446348e-10, - "loss": 0.0018, - "step": 2034 - }, - { - "epoch": 9.926829268292684, - "grad_norm": 0.4736052453517914, - "learning_rate": 7.514848726422608e-10, - "loss": 0.0028, - "step": 2035 - }, - { - "epoch": 9.93170731707317, - "grad_norm": 0.19710786640644073, - "learning_rate": 6.604887585426323e-10, - "loss": 0.0009, - "step": 2036 - }, - { - "epoch": 9.93658536585366, - "grad_norm": 0.4432890713214874, - "learning_rate": 5.753623618520721e-10, - "loss": 0.0049, - "step": 2037 - }, - { - "epoch": 9.941463414634146, - "grad_norm": 1.073500394821167, - "learning_rate": 4.961058824909382e-10, - "loss": 0.0044, - "step": 2038 - }, - { - "epoch": 9.946341463414633, - "grad_norm": 0.17539291083812714, - "learning_rate": 4.2271950659311665e-10, - "loss": 0.0005, - "step": 2039 - }, - { - "epoch": 9.951219512195122, - "grad_norm": 0.3674123287200928, - "learning_rate": 3.5520340650768705e-10, - "loss": 0.0012, - "step": 2040 - }, - { - "epoch": 9.95609756097561, - "grad_norm": 0.20704206824302673, - "learning_rate": 2.9355774079614653e-10, - "loss": 0.0007, - "step": 2041 - }, - { - "epoch": 9.960975609756098, - "grad_norm": 0.07566344738006592, - "learning_rate": 2.377826542343531e-10, - "loss": 0.0003, - "step": 2042 - }, - { - "epoch": 9.965853658536584, - "grad_norm": 0.1342095583677292, - "learning_rate": 1.8787827781002743e-10, - "loss": 0.0005, - "step": 2043 - }, - { - "epoch": 9.970731707317073, - "grad_norm": 0.7139898538589478, - "learning_rate": 1.4384472872414067e-10, - "loss": 0.0013, - "step": 2044 - }, - { - "epoch": 9.975609756097562, - "grad_norm": 0.11133516579866409, - "learning_rate": 1.056821103900818e-10, - "loss": 0.0006, - "step": 2045 - }, - { - "epoch": 9.980487804878049, - "grad_norm": 0.16971242427825928, - "learning_rate": 7.339051243254735e-11, - "loss": 0.0011, - "step": 2046 - }, - { - "epoch": 9.985365853658536, - "grad_norm": 1.3622301816940308, - "learning_rate": 4.697001068892926e-11, - "loss": 0.0202, - "step": 2047 - }, - { - "epoch": 9.990243902439024, - "grad_norm": 0.10895299166440964, - "learning_rate": 2.642066720792702e-11, - "loss": 0.0005, - "step": 2048 - }, - { - "epoch": 9.995121951219513, - "grad_norm": 0.32567188143730164, - "learning_rate": 1.1742530249547745e-11, - "loss": 0.0017, - "step": 2049 - }, - { - "epoch": 10.0, - "grad_norm": 0.06682642549276352, - "learning_rate": 2.9356342859387933e-12, - "loss": 0.0002, - "step": 2050 - } - ], - "logging_steps": 1, - "max_steps": 2050, - "num_input_tokens_seen": 0, - "num_train_epochs": 10, - "save_steps": 206, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": true - }, - "attributes": {} - } - }, - "total_flos": 5.892331269877924e+17, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/metallama3_8b/limo/checkpoint-410/chat_template.jinja b/metallama3_8b/limo/checkpoint-410/chat_template.jinja deleted file mode 100644 index 39bd0c9f7fe30aea14eda194fee17703da4a4dbf..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-410/chat_template.jinja +++ /dev/null @@ -1,5 +0,0 @@ -{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> - -'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> - -' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo/checkpoint-410/config.json b/metallama3_8b/limo/checkpoint-410/config.json deleted file mode 100644 index ec5612543540085e09eed37e81b17ae51d1a6973..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-410/config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 128000, - "eos_token_id": 128009, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 8192, - "mlp_bias": false, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 500000.0, - "tie_word_embeddings": false, - "torch_dtype": "float32", - "transformers_version": "4.55.0", - "use_cache": false, - "vocab_size": 128256 -} diff --git a/metallama3_8b/limo/checkpoint-410/generation_config.json b/metallama3_8b/limo/checkpoint-410/generation_config.json deleted file mode 100644 index f53ccb516e57388491adda6b9950bcfa872e93ae..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-410/generation_config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 128000, - "eos_token_id": 128009, - "transformers_version": "4.55.0", - "use_cache": false -} diff --git a/metallama3_8b/limo/checkpoint-410/model-00001-of-00007.safetensors b/metallama3_8b/limo/checkpoint-410/model-00001-of-00007.safetensors deleted file mode 100644 index f93681a4df0ad6cf57d4b940325c70057b32c3fa..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-410/model-00001-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:888055dfa5e00980b5ab1d819598aaca372fdfd3d925b167dd23afc907ab13ab -size 4886466168 diff --git a/metallama3_8b/limo/checkpoint-410/model-00002-of-00007.safetensors b/metallama3_8b/limo/checkpoint-410/model-00002-of-00007.safetensors deleted file mode 100644 index 19de92636e7e86f421a9c1afc34e7dc99e5219c7..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-410/model-00002-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:80fb26643a92ada1004c5c00c5c20f5cd1fd01d2b9e84334dc72232602f6846b -size 4832007448 diff --git a/metallama3_8b/limo/checkpoint-410/model-00003-of-00007.safetensors b/metallama3_8b/limo/checkpoint-410/model-00003-of-00007.safetensors deleted file mode 100644 index 2200c9bb2c3d6c5eed61bfa1b850487cf564c9d5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-410/model-00003-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e942615d49f459c7f355eace25d246fe252133b72ee2bb498af3e6017a10c8d1 -size 4999813112 diff --git a/metallama3_8b/limo/checkpoint-410/model-00004-of-00007.safetensors b/metallama3_8b/limo/checkpoint-410/model-00004-of-00007.safetensors deleted file mode 100644 index bc424672b6bac69c5026ae0a0d5748bf196d494e..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-410/model-00004-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8af086f5e9608aed8d264ebcd5ba574284d4829f93a8e83f820621d9b49948a1 -size 4999813128 diff --git a/metallama3_8b/limo/checkpoint-410/model-00005-of-00007.safetensors b/metallama3_8b/limo/checkpoint-410/model-00005-of-00007.safetensors deleted file mode 100644 index 40b95a183e238d79a55d26b1c6ed7dbafc0b02da..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-410/model-00005-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ad3ca596196e33417eeee1ac1ca6d76c136ca8b133b27f088c2ada87e9985f54 -size 4832007496 diff --git a/metallama3_8b/limo/checkpoint-410/model-00006-of-00007.safetensors b/metallama3_8b/limo/checkpoint-410/model-00006-of-00007.safetensors deleted file mode 100644 index 259054dfb342aa6f9bb2a443e23bfa23460e365f..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-410/model-00006-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e46f9babc8ff0b58903cc3f3c1b1b5f48d2b7aec89a83cc9c3f494e9df9021fb -size 4999813120 diff --git a/metallama3_8b/limo/checkpoint-410/model-00007-of-00007.safetensors b/metallama3_8b/limo/checkpoint-410/model-00007-of-00007.safetensors deleted file mode 100644 index 8722c80b18c4017a77ecf064ed4441dc19f151ef..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-410/model-00007-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:35051ac7f0233dffe9506767660407bdc57fcd45f4c531321639dc9c0da6f24b -size 2571158184 diff --git a/metallama3_8b/limo/checkpoint-410/model.safetensors.index.json b/metallama3_8b/limo/checkpoint-410/model.safetensors.index.json deleted file mode 100644 index 30d31d54f352f0c71ad48745af612a088822fa48..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-410/model.safetensors.index.json +++ /dev/null @@ -1,299 +0,0 @@ -{ - "metadata": { - "total_parameters": 2007565312, - "total_size": 32121044992 - }, - "weight_map": { - "lm_head.weight": "model-00007-of-00007.safetensors", - "model.embed_tokens.weight": "model-00001-of-00007.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.norm.weight": "model-00007-of-00007.safetensors" - } -} diff --git a/metallama3_8b/limo/checkpoint-410/rng_state_0.pth b/metallama3_8b/limo/checkpoint-410/rng_state_0.pth deleted file mode 100644 index 37ac50652a3badbfb1bdeaccb8b1934575b584eb..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-410/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bbe0d720c4c75a6a04213fa3b64bacbe794718a53e2b56ebb67a1a795014dfad -size 15024 diff --git a/metallama3_8b/limo/checkpoint-410/rng_state_1.pth b/metallama3_8b/limo/checkpoint-410/rng_state_1.pth deleted file mode 100644 index 0bc3650851dae439677613c9e23a5528de47b679..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-410/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:72452d3138d0ca2ff89429e3294a834ae7a68e8596fc757735ca56ae52509d57 -size 15024 diff --git a/metallama3_8b/limo/checkpoint-410/rng_state_2.pth b/metallama3_8b/limo/checkpoint-410/rng_state_2.pth deleted file mode 100644 index 0e00a6e8b4b743026f68d749a8cb3bdd4b746838..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-410/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f36e306fb8ebcf53a167bfd6c9af74db410a269ada1e619e3e816f5269543b9d -size 15024 diff --git a/metallama3_8b/limo/checkpoint-410/rng_state_3.pth b/metallama3_8b/limo/checkpoint-410/rng_state_3.pth deleted file mode 100644 index 5354141d42e077c356f9ca8c6b12bd7e5e41f2af..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-410/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bb47ce0c6f815a6f8302b0e3819b4c2315ca71dae3138d97fdceb765cdd0a039 -size 15024 diff --git a/metallama3_8b/limo/checkpoint-410/scheduler.pt b/metallama3_8b/limo/checkpoint-410/scheduler.pt deleted file mode 100644 index 0dc2324ef1e58baf86b20dc0f49e8f9d19569ac2..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-410/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:64b87c070356d49af38984aef4437a4edcf8b6880ba0aec9af0e093c40c21ff3 -size 1064 diff --git a/metallama3_8b/limo/checkpoint-410/special_tokens_map.json b/metallama3_8b/limo/checkpoint-410/special_tokens_map.json deleted file mode 100644 index 14daf4588e61b4e4983af0fccaba4d5500c0977c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-410/special_tokens_map.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "additional_special_tokens": [ - { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } - ], - "bos_token": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": "<|eot_id|>" -} diff --git a/metallama3_8b/limo/checkpoint-410/tokenizer.json b/metallama3_8b/limo/checkpoint-410/tokenizer.json deleted file mode 100644 index 172311123ab62378f1f6d90f3068a676b7d939ed..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-410/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c1dcab308e7cf5970ea38815e0a62887d705c5b436f869ca27a5dcdd40c36a6 -size 17210148 diff --git a/metallama3_8b/limo/checkpoint-410/tokenizer_config.json b/metallama3_8b/limo/checkpoint-410/tokenizer_config.json deleted file mode 100644 index 6739fcd129e717b71b64001dcb25a03c143d66f5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-410/tokenizer_config.json +++ /dev/null @@ -1,2076 +0,0 @@ -{ - "added_tokens_decoder": { - "128000": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128001": { - "content": "<|end_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128002": { - "content": "<|reserved_special_token_0|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128003": { - "content": "<|reserved_special_token_1|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128004": { - "content": "<|reserved_special_token_2|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128005": { - "content": "<|reserved_special_token_3|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128006": { - "content": "<|start_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128007": { - "content": "<|end_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128008": { - "content": "<|reserved_special_token_4|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128009": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128010": { - "content": "<|reserved_special_token_5|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128011": { - "content": "<|reserved_special_token_6|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128012": { - "content": "<|reserved_special_token_7|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128013": { - "content": "<|reserved_special_token_8|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128014": { - "content": "<|reserved_special_token_9|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128015": { - "content": "<|reserved_special_token_10|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128016": { - "content": "<|reserved_special_token_11|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128017": { - "content": "<|reserved_special_token_12|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128018": { - "content": "<|reserved_special_token_13|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128019": { - "content": "<|reserved_special_token_14|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128020": { - "content": "<|reserved_special_token_15|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128021": { - "content": "<|reserved_special_token_16|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128022": { - "content": "<|reserved_special_token_17|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128023": { - "content": "<|reserved_special_token_18|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128024": { - "content": "<|reserved_special_token_19|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128025": { - "content": "<|reserved_special_token_20|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128026": { - "content": "<|reserved_special_token_21|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128027": { - "content": "<|reserved_special_token_22|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128028": { - "content": "<|reserved_special_token_23|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128029": { - "content": "<|reserved_special_token_24|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128030": { - "content": "<|reserved_special_token_25|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128031": { - "content": "<|reserved_special_token_26|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128032": { - "content": "<|reserved_special_token_27|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128033": { - "content": "<|reserved_special_token_28|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128034": { - "content": "<|reserved_special_token_29|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128035": { - "content": "<|reserved_special_token_30|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128036": { - "content": "<|reserved_special_token_31|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128037": { - "content": "<|reserved_special_token_32|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128038": { - "content": "<|reserved_special_token_33|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128039": { - "content": "<|reserved_special_token_34|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128040": { - "content": "<|reserved_special_token_35|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128041": { - "content": "<|reserved_special_token_36|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128042": { - "content": "<|reserved_special_token_37|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128043": { - "content": "<|reserved_special_token_38|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128044": { - "content": "<|reserved_special_token_39|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128045": { - "content": "<|reserved_special_token_40|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128046": { - "content": "<|reserved_special_token_41|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128047": { - "content": "<|reserved_special_token_42|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128048": { - "content": "<|reserved_special_token_43|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128049": { - "content": "<|reserved_special_token_44|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128050": { - "content": "<|reserved_special_token_45|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128051": { - "content": "<|reserved_special_token_46|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128052": { - "content": "<|reserved_special_token_47|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128053": { - "content": "<|reserved_special_token_48|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128054": { - "content": "<|reserved_special_token_49|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128055": { - "content": "<|reserved_special_token_50|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128056": { - "content": "<|reserved_special_token_51|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128057": { - "content": "<|reserved_special_token_52|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128058": { - "content": "<|reserved_special_token_53|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128059": { - "content": "<|reserved_special_token_54|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128060": { - "content": "<|reserved_special_token_55|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128061": { - "content": "<|reserved_special_token_56|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128062": { - "content": "<|reserved_special_token_57|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128063": { - "content": "<|reserved_special_token_58|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128064": { - "content": "<|reserved_special_token_59|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128065": { - "content": "<|reserved_special_token_60|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128066": { - "content": "<|reserved_special_token_61|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128067": { - "content": "<|reserved_special_token_62|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128068": { - "content": "<|reserved_special_token_63|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128069": { - "content": "<|reserved_special_token_64|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128070": { - "content": "<|reserved_special_token_65|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128071": { - "content": "<|reserved_special_token_66|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128072": { - "content": "<|reserved_special_token_67|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128073": { - "content": "<|reserved_special_token_68|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128074": { - "content": "<|reserved_special_token_69|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128075": { - "content": "<|reserved_special_token_70|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128076": { - "content": "<|reserved_special_token_71|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128077": { - "content": "<|reserved_special_token_72|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128078": { - "content": "<|reserved_special_token_73|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128079": { - "content": "<|reserved_special_token_74|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128080": { - "content": "<|reserved_special_token_75|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128081": { - "content": "<|reserved_special_token_76|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128082": { - "content": "<|reserved_special_token_77|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128083": { - "content": "<|reserved_special_token_78|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128084": { - "content": "<|reserved_special_token_79|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128085": { - "content": "<|reserved_special_token_80|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128086": { - "content": "<|reserved_special_token_81|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128087": { - "content": "<|reserved_special_token_82|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128088": { - "content": "<|reserved_special_token_83|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128089": { - "content": "<|reserved_special_token_84|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128090": { - "content": "<|reserved_special_token_85|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128091": { - "content": "<|reserved_special_token_86|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128092": { - "content": "<|reserved_special_token_87|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128093": { - "content": "<|reserved_special_token_88|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128094": { - "content": "<|reserved_special_token_89|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128095": { - "content": "<|reserved_special_token_90|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128096": { - "content": "<|reserved_special_token_91|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128097": { - "content": "<|reserved_special_token_92|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128098": { - "content": "<|reserved_special_token_93|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128099": { - "content": "<|reserved_special_token_94|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128100": { - "content": "<|reserved_special_token_95|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128101": { - "content": "<|reserved_special_token_96|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128102": { - "content": "<|reserved_special_token_97|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128103": { - "content": "<|reserved_special_token_98|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128104": { - "content": "<|reserved_special_token_99|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128105": { - "content": "<|reserved_special_token_100|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128106": { - "content": "<|reserved_special_token_101|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128107": { - "content": "<|reserved_special_token_102|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128108": { - "content": "<|reserved_special_token_103|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128109": { - "content": "<|reserved_special_token_104|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128110": { - "content": "<|reserved_special_token_105|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128111": { - "content": "<|reserved_special_token_106|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128112": { - "content": "<|reserved_special_token_107|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128113": { - "content": "<|reserved_special_token_108|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128114": { - "content": "<|reserved_special_token_109|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128115": { - "content": "<|reserved_special_token_110|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128116": { - "content": "<|reserved_special_token_111|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128117": { - "content": "<|reserved_special_token_112|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128118": { - "content": "<|reserved_special_token_113|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128119": { - "content": "<|reserved_special_token_114|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128120": { - "content": "<|reserved_special_token_115|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128121": { - "content": "<|reserved_special_token_116|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128122": { - "content": "<|reserved_special_token_117|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128123": { - "content": "<|reserved_special_token_118|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128124": { - "content": "<|reserved_special_token_119|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128125": { - "content": "<|reserved_special_token_120|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128126": { - "content": "<|reserved_special_token_121|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128127": { - "content": "<|reserved_special_token_122|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128128": { - "content": "<|reserved_special_token_123|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128129": { - "content": "<|reserved_special_token_124|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128130": { - "content": "<|reserved_special_token_125|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128131": { - "content": "<|reserved_special_token_126|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128132": { - "content": "<|reserved_special_token_127|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128133": { - "content": "<|reserved_special_token_128|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128134": { - "content": "<|reserved_special_token_129|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128135": { - "content": "<|reserved_special_token_130|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128136": { - "content": "<|reserved_special_token_131|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128137": { - "content": "<|reserved_special_token_132|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128138": { - "content": "<|reserved_special_token_133|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128139": { - "content": "<|reserved_special_token_134|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128140": { - "content": "<|reserved_special_token_135|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128141": { - "content": "<|reserved_special_token_136|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128142": { - "content": "<|reserved_special_token_137|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128143": { - "content": "<|reserved_special_token_138|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128144": { - "content": "<|reserved_special_token_139|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128145": { - "content": "<|reserved_special_token_140|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128146": { - "content": "<|reserved_special_token_141|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128147": { - "content": "<|reserved_special_token_142|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128148": { - "content": "<|reserved_special_token_143|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128149": { - "content": "<|reserved_special_token_144|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128150": { - "content": "<|reserved_special_token_145|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128151": { - "content": "<|reserved_special_token_146|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128152": { - "content": "<|reserved_special_token_147|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128153": { - "content": "<|reserved_special_token_148|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128154": { - "content": "<|reserved_special_token_149|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128155": { - "content": "<|reserved_special_token_150|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128156": { - "content": "<|reserved_special_token_151|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128157": { - "content": "<|reserved_special_token_152|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128158": { - "content": "<|reserved_special_token_153|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128159": { - "content": "<|reserved_special_token_154|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128160": { - "content": "<|reserved_special_token_155|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128161": { - "content": "<|reserved_special_token_156|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128162": { - "content": "<|reserved_special_token_157|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128163": { - "content": "<|reserved_special_token_158|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128164": { - "content": "<|reserved_special_token_159|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128165": { - "content": "<|reserved_special_token_160|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128166": { - "content": "<|reserved_special_token_161|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128167": { - "content": "<|reserved_special_token_162|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128168": { - "content": "<|reserved_special_token_163|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128169": { - "content": "<|reserved_special_token_164|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128170": { - "content": "<|reserved_special_token_165|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128171": { - "content": "<|reserved_special_token_166|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128172": { - "content": "<|reserved_special_token_167|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128173": { - "content": "<|reserved_special_token_168|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128174": { - "content": "<|reserved_special_token_169|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128175": { - "content": "<|reserved_special_token_170|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128176": { - "content": "<|reserved_special_token_171|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128177": { - "content": "<|reserved_special_token_172|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128178": { - "content": "<|reserved_special_token_173|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128179": { - "content": "<|reserved_special_token_174|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128180": { - "content": "<|reserved_special_token_175|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128181": { - "content": "<|reserved_special_token_176|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128182": { - "content": "<|reserved_special_token_177|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128183": { - "content": "<|reserved_special_token_178|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128184": { - "content": "<|reserved_special_token_179|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128185": { - "content": "<|reserved_special_token_180|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128186": { - "content": "<|reserved_special_token_181|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128187": { - "content": "<|reserved_special_token_182|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128188": { - "content": "<|reserved_special_token_183|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128189": { - "content": "<|reserved_special_token_184|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128190": { - "content": "<|reserved_special_token_185|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128191": { - "content": "<|reserved_special_token_186|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128192": { - "content": "<|reserved_special_token_187|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128193": { - "content": "<|reserved_special_token_188|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128194": { - "content": "<|reserved_special_token_189|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128195": { - "content": "<|reserved_special_token_190|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128196": { - "content": "<|reserved_special_token_191|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128197": { - "content": "<|reserved_special_token_192|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128198": { - "content": "<|reserved_special_token_193|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128199": { - "content": "<|reserved_special_token_194|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128200": { - "content": "<|reserved_special_token_195|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128201": { - "content": "<|reserved_special_token_196|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128202": { - "content": "<|reserved_special_token_197|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128203": { - "content": "<|reserved_special_token_198|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128204": { - "content": "<|reserved_special_token_199|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128205": { - "content": "<|reserved_special_token_200|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128206": { - "content": "<|reserved_special_token_201|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128207": { - "content": "<|reserved_special_token_202|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128208": { - "content": "<|reserved_special_token_203|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128209": { - "content": "<|reserved_special_token_204|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128210": { - "content": "<|reserved_special_token_205|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128211": { - "content": "<|reserved_special_token_206|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128212": { - "content": "<|reserved_special_token_207|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128213": { - "content": "<|reserved_special_token_208|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128214": { - "content": "<|reserved_special_token_209|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128215": { - "content": "<|reserved_special_token_210|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128216": { - "content": "<|reserved_special_token_211|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128217": { - "content": "<|reserved_special_token_212|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128218": { - "content": "<|reserved_special_token_213|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128219": { - "content": "<|reserved_special_token_214|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128220": { - "content": "<|reserved_special_token_215|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128221": { - "content": "<|reserved_special_token_216|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128222": { - "content": "<|reserved_special_token_217|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128223": { - "content": "<|reserved_special_token_218|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128224": { - "content": "<|reserved_special_token_219|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128225": { - "content": "<|reserved_special_token_220|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128226": { - "content": "<|reserved_special_token_221|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128227": { - "content": "<|reserved_special_token_222|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128228": { - "content": "<|reserved_special_token_223|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128229": { - "content": "<|reserved_special_token_224|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128230": { - "content": "<|reserved_special_token_225|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128231": { - "content": "<|reserved_special_token_226|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128232": { - "content": "<|reserved_special_token_227|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128233": { - "content": "<|reserved_special_token_228|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128234": { - "content": "<|reserved_special_token_229|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128235": { - "content": "<|reserved_special_token_230|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128236": { - "content": "<|reserved_special_token_231|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128237": { - "content": "<|reserved_special_token_232|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128238": { - "content": "<|reserved_special_token_233|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128239": { - "content": "<|reserved_special_token_234|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128240": { - "content": "<|reserved_special_token_235|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128241": { - "content": "<|reserved_special_token_236|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128242": { - "content": "<|reserved_special_token_237|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128243": { - "content": "<|reserved_special_token_238|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128244": { - "content": "<|reserved_special_token_239|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128245": { - "content": "<|reserved_special_token_240|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128246": { - "content": "<|reserved_special_token_241|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128247": { - "content": "<|reserved_special_token_242|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128248": { - "content": "<|reserved_special_token_243|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128249": { - "content": "<|reserved_special_token_244|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128250": { - "content": "<|reserved_special_token_245|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128251": { - "content": "<|reserved_special_token_246|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128252": { - "content": "<|reserved_special_token_247|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128253": { - "content": "<|reserved_special_token_248|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128254": { - "content": "<|reserved_special_token_249|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128255": { - "content": "<|reserved_special_token_250|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128256": { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - } - }, - "additional_special_tokens": [ - "<|eom_id|>" - ], - "bos_token": "<|begin_of_text|>", - "clean_up_tokenization_spaces": true, - "eos_token": "<|eot_id|>", - "extra_special_tokens": {}, - "model_input_names": [ - "input_ids", - "attention_mask" - ], - "model_max_length": 1000000000000000019884624838656, - "pad_token": "<|eot_id|>", - "padding_side": "right", - "split_special_tokens": false, - "tokenizer_class": "PreTrainedTokenizerFast" -} diff --git a/metallama3_8b/limo/checkpoint-410/trainer_state.json b/metallama3_8b/limo/checkpoint-410/trainer_state.json deleted file mode 100644 index d6d6254a509c5deae7c8c6c0b2fe5b971d912cce..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-410/trainer_state.json +++ /dev/null @@ -1,2904 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 2.0, - "eval_steps": 500, - "global_step": 410, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.004878048780487805, - "grad_norm": 27.79998016357422, - "learning_rate": 5e-06, - "loss": 1.4179, - "step": 1 - }, - { - "epoch": 0.00975609756097561, - "grad_norm": 4.086409091949463, - "learning_rate": 4.999997064365715e-06, - "loss": 1.1405, - "step": 2 - }, - { - "epoch": 0.014634146341463415, - "grad_norm": 4.499151229858398, - "learning_rate": 4.999988257469751e-06, - "loss": 0.8682, - "step": 3 - }, - { - "epoch": 0.01951219512195122, - "grad_norm": 4.555822849273682, - "learning_rate": 4.999973579332793e-06, - "loss": 0.9961, - "step": 4 - }, - { - "epoch": 0.024390243902439025, - "grad_norm": 5.6235246658325195, - "learning_rate": 4.999953029989312e-06, - "loss": 1.0173, - "step": 5 - }, - { - "epoch": 0.02926829268292683, - "grad_norm": 3.9943182468414307, - "learning_rate": 4.999926609487568e-06, - "loss": 1.1083, - "step": 6 - }, - { - "epoch": 0.03414634146341464, - "grad_norm": 5.685941219329834, - "learning_rate": 4.9998943178896106e-06, - "loss": 1.1109, - "step": 7 - }, - { - "epoch": 0.03902439024390244, - "grad_norm": 15.914257049560547, - "learning_rate": 4.999856155271276e-06, - "loss": 1.821, - "step": 8 - }, - { - "epoch": 0.04390243902439024, - "grad_norm": 4.147185325622559, - "learning_rate": 4.999812121722191e-06, - "loss": 1.0417, - "step": 9 - }, - { - "epoch": 0.04878048780487805, - "grad_norm": 11.123332977294922, - "learning_rate": 4.999762217345766e-06, - "loss": 1.5672, - "step": 10 - }, - { - "epoch": 0.05365853658536585, - "grad_norm": 2.842331886291504, - "learning_rate": 4.999706442259205e-06, - "loss": 0.7297, - "step": 11 - }, - { - "epoch": 0.05853658536585366, - "grad_norm": 37.685062408447266, - "learning_rate": 4.999644796593492e-06, - "loss": 0.9112, - "step": 12 - }, - { - "epoch": 0.06341463414634146, - "grad_norm": 11.214252471923828, - "learning_rate": 4.999577280493407e-06, - "loss": 0.7854, - "step": 13 - }, - { - "epoch": 0.06829268292682927, - "grad_norm": 5.10387659072876, - "learning_rate": 4.99950389411751e-06, - "loss": 1.1317, - "step": 14 - }, - { - "epoch": 0.07317073170731707, - "grad_norm": 3.685403347015381, - "learning_rate": 4.999424637638148e-06, - "loss": 0.7864, - "step": 15 - }, - { - "epoch": 0.07804878048780488, - "grad_norm": 2.9567184448242188, - "learning_rate": 4.999339511241458e-06, - "loss": 0.8494, - "step": 16 - }, - { - "epoch": 0.08292682926829269, - "grad_norm": 11.396956443786621, - "learning_rate": 4.9992485151273584e-06, - "loss": 1.2189, - "step": 17 - }, - { - "epoch": 0.08780487804878048, - "grad_norm": 7.007385730743408, - "learning_rate": 4.999151649509554e-06, - "loss": 1.0532, - "step": 18 - }, - { - "epoch": 0.09268292682926829, - "grad_norm": 3.4347329139709473, - "learning_rate": 4.9990489146155356e-06, - "loss": 1.088, - "step": 19 - }, - { - "epoch": 0.0975609756097561, - "grad_norm": 3.1865031719207764, - "learning_rate": 4.9989403106865765e-06, - "loss": 1.0414, - "step": 20 - }, - { - "epoch": 0.1024390243902439, - "grad_norm": 3.4605791568756104, - "learning_rate": 4.9988258379777334e-06, - "loss": 0.8878, - "step": 21 - }, - { - "epoch": 0.1073170731707317, - "grad_norm": 2.860478639602661, - "learning_rate": 4.998705496757846e-06, - "loss": 0.9151, - "step": 22 - }, - { - "epoch": 0.11219512195121951, - "grad_norm": 9.101946830749512, - "learning_rate": 4.998579287309538e-06, - "loss": 1.4304, - "step": 23 - }, - { - "epoch": 0.11707317073170732, - "grad_norm": 24.21122169494629, - "learning_rate": 4.998447209929211e-06, - "loss": 1.0858, - "step": 24 - }, - { - "epoch": 0.12195121951219512, - "grad_norm": 3.286980152130127, - "learning_rate": 4.998309264927053e-06, - "loss": 0.6571, - "step": 25 - }, - { - "epoch": 0.12682926829268293, - "grad_norm": 4.0232062339782715, - "learning_rate": 4.998165452627025e-06, - "loss": 0.8493, - "step": 26 - }, - { - "epoch": 0.13170731707317074, - "grad_norm": 3.7688663005828857, - "learning_rate": 4.998015773366874e-06, - "loss": 0.9224, - "step": 27 - }, - { - "epoch": 0.13658536585365855, - "grad_norm": 2.9382026195526123, - "learning_rate": 4.997860227498122e-06, - "loss": 0.7588, - "step": 28 - }, - { - "epoch": 0.14146341463414633, - "grad_norm": 4.327457904815674, - "learning_rate": 4.99769881538607e-06, - "loss": 1.1817, - "step": 29 - }, - { - "epoch": 0.14634146341463414, - "grad_norm": 3.47487735748291, - "learning_rate": 4.997531537409794e-06, - "loss": 1.0737, - "step": 30 - }, - { - "epoch": 0.15121951219512195, - "grad_norm": 3.0616214275360107, - "learning_rate": 4.99735839396215e-06, - "loss": 0.7899, - "step": 31 - }, - { - "epoch": 0.15609756097560976, - "grad_norm": 3.065070152282715, - "learning_rate": 4.9971793854497655e-06, - "loss": 0.7745, - "step": 32 - }, - { - "epoch": 0.16097560975609757, - "grad_norm": 3.5202279090881348, - "learning_rate": 4.996994512293042e-06, - "loss": 0.984, - "step": 33 - }, - { - "epoch": 0.16585365853658537, - "grad_norm": 3.421769142150879, - "learning_rate": 4.996803774926157e-06, - "loss": 0.8235, - "step": 34 - }, - { - "epoch": 0.17073170731707318, - "grad_norm": 4.6582207679748535, - "learning_rate": 4.996607173797059e-06, - "loss": 1.3227, - "step": 35 - }, - { - "epoch": 0.17560975609756097, - "grad_norm": 2.9829282760620117, - "learning_rate": 4.996404709367466e-06, - "loss": 0.8854, - "step": 36 - }, - { - "epoch": 0.18048780487804877, - "grad_norm": 2.5982632637023926, - "learning_rate": 4.996196382112868e-06, - "loss": 0.6786, - "step": 37 - }, - { - "epoch": 0.18536585365853658, - "grad_norm": 2.9807393550872803, - "learning_rate": 4.9959821925225235e-06, - "loss": 0.9344, - "step": 38 - }, - { - "epoch": 0.1902439024390244, - "grad_norm": 2.7364351749420166, - "learning_rate": 4.995762141099456e-06, - "loss": 0.814, - "step": 39 - }, - { - "epoch": 0.1951219512195122, - "grad_norm": 3.4324638843536377, - "learning_rate": 4.995536228360461e-06, - "loss": 1.0276, - "step": 40 - }, - { - "epoch": 0.2, - "grad_norm": 2.911834716796875, - "learning_rate": 4.995304454836095e-06, - "loss": 0.9291, - "step": 41 - }, - { - "epoch": 0.2048780487804878, - "grad_norm": 3.0294723510742188, - "learning_rate": 4.9950668210706795e-06, - "loss": 0.8145, - "step": 42 - }, - { - "epoch": 0.2097560975609756, - "grad_norm": 4.681829452514648, - "learning_rate": 4.994823327622299e-06, - "loss": 0.8779, - "step": 43 - }, - { - "epoch": 0.2146341463414634, - "grad_norm": 3.643914222717285, - "learning_rate": 4.9945739750628e-06, - "loss": 0.8196, - "step": 44 - }, - { - "epoch": 0.21951219512195122, - "grad_norm": 2.7542076110839844, - "learning_rate": 4.994318763977789e-06, - "loss": 0.8443, - "step": 45 - }, - { - "epoch": 0.22439024390243903, - "grad_norm": 6.873605728149414, - "learning_rate": 4.994057694966632e-06, - "loss": 1.0328, - "step": 46 - }, - { - "epoch": 0.22926829268292684, - "grad_norm": 3.11810040473938, - "learning_rate": 4.993790768642449e-06, - "loss": 1.0673, - "step": 47 - }, - { - "epoch": 0.23414634146341465, - "grad_norm": 4.360548496246338, - "learning_rate": 4.99351798563212e-06, - "loss": 1.3198, - "step": 48 - }, - { - "epoch": 0.23902439024390243, - "grad_norm": 2.6894314289093018, - "learning_rate": 4.993239346576278e-06, - "loss": 0.8743, - "step": 49 - }, - { - "epoch": 0.24390243902439024, - "grad_norm": 3.2640421390533447, - "learning_rate": 4.99295485212931e-06, - "loss": 1.109, - "step": 50 - }, - { - "epoch": 0.24878048780487805, - "grad_norm": 3.1565866470336914, - "learning_rate": 4.992664502959351e-06, - "loss": 0.9291, - "step": 51 - }, - { - "epoch": 0.25365853658536586, - "grad_norm": 3.4829447269439697, - "learning_rate": 4.99236829974829e-06, - "loss": 0.8159, - "step": 52 - }, - { - "epoch": 0.25853658536585367, - "grad_norm": 2.7535626888275146, - "learning_rate": 4.992066243191762e-06, - "loss": 1.0359, - "step": 53 - }, - { - "epoch": 0.2634146341463415, - "grad_norm": 2.482935905456543, - "learning_rate": 4.991758333999148e-06, - "loss": 0.8091, - "step": 54 - }, - { - "epoch": 0.2682926829268293, - "grad_norm": 2.917445659637451, - "learning_rate": 4.991444572893575e-06, - "loss": 0.6925, - "step": 55 - }, - { - "epoch": 0.2731707317073171, - "grad_norm": 2.9802236557006836, - "learning_rate": 4.991124960611916e-06, - "loss": 0.6329, - "step": 56 - }, - { - "epoch": 0.2780487804878049, - "grad_norm": 2.9677224159240723, - "learning_rate": 4.99079949790478e-06, - "loss": 0.8069, - "step": 57 - }, - { - "epoch": 0.28292682926829266, - "grad_norm": 2.8304293155670166, - "learning_rate": 4.99046818553652e-06, - "loss": 0.8682, - "step": 58 - }, - { - "epoch": 0.28780487804878047, - "grad_norm": 5.253443717956543, - "learning_rate": 4.9901310242852246e-06, - "loss": 1.1069, - "step": 59 - }, - { - "epoch": 0.2926829268292683, - "grad_norm": 3.686016082763672, - "learning_rate": 4.9897880149427206e-06, - "loss": 0.9465, - "step": 60 - }, - { - "epoch": 0.2975609756097561, - "grad_norm": 3.6372263431549072, - "learning_rate": 4.989439158314566e-06, - "loss": 0.9738, - "step": 61 - }, - { - "epoch": 0.3024390243902439, - "grad_norm": 3.0756819248199463, - "learning_rate": 4.989084455220056e-06, - "loss": 0.6417, - "step": 62 - }, - { - "epoch": 0.3073170731707317, - "grad_norm": 3.379222869873047, - "learning_rate": 4.988723906492212e-06, - "loss": 1.0092, - "step": 63 - }, - { - "epoch": 0.3121951219512195, - "grad_norm": 3.4571032524108887, - "learning_rate": 4.988357512977785e-06, - "loss": 0.6691, - "step": 64 - }, - { - "epoch": 0.3170731707317073, - "grad_norm": 3.1982104778289795, - "learning_rate": 4.987985275537252e-06, - "loss": 0.6651, - "step": 65 - }, - { - "epoch": 0.32195121951219513, - "grad_norm": 2.9723124504089355, - "learning_rate": 4.9876071950448185e-06, - "loss": 0.9227, - "step": 66 - }, - { - "epoch": 0.32682926829268294, - "grad_norm": 2.5521399974823, - "learning_rate": 4.987223272388407e-06, - "loss": 0.6664, - "step": 67 - }, - { - "epoch": 0.33170731707317075, - "grad_norm": 2.8934121131896973, - "learning_rate": 4.986833508469663e-06, - "loss": 0.997, - "step": 68 - }, - { - "epoch": 0.33658536585365856, - "grad_norm": 4.7546586990356445, - "learning_rate": 4.98643790420395e-06, - "loss": 0.8551, - "step": 69 - }, - { - "epoch": 0.34146341463414637, - "grad_norm": 3.091616153717041, - "learning_rate": 4.986036460520348e-06, - "loss": 0.8874, - "step": 70 - }, - { - "epoch": 0.3463414634146341, - "grad_norm": 4.1724677085876465, - "learning_rate": 4.98562917836165e-06, - "loss": 1.1393, - "step": 71 - }, - { - "epoch": 0.35121951219512193, - "grad_norm": 2.6568572521209717, - "learning_rate": 4.985216058684362e-06, - "loss": 0.6379, - "step": 72 - }, - { - "epoch": 0.35609756097560974, - "grad_norm": 2.396416187286377, - "learning_rate": 4.984797102458697e-06, - "loss": 1.0292, - "step": 73 - }, - { - "epoch": 0.36097560975609755, - "grad_norm": 3.0667319297790527, - "learning_rate": 4.984372310668579e-06, - "loss": 0.7048, - "step": 74 - }, - { - "epoch": 0.36585365853658536, - "grad_norm": 2.4820518493652344, - "learning_rate": 4.983941684311633e-06, - "loss": 1.2353, - "step": 75 - }, - { - "epoch": 0.37073170731707317, - "grad_norm": 4.062836647033691, - "learning_rate": 4.983505224399188e-06, - "loss": 0.8933, - "step": 76 - }, - { - "epoch": 0.375609756097561, - "grad_norm": 2.4480767250061035, - "learning_rate": 4.983062931956275e-06, - "loss": 0.8221, - "step": 77 - }, - { - "epoch": 0.3804878048780488, - "grad_norm": 3.134138822555542, - "learning_rate": 4.9826148080216195e-06, - "loss": 0.8899, - "step": 78 - }, - { - "epoch": 0.3853658536585366, - "grad_norm": 2.8165836334228516, - "learning_rate": 4.9821608536476445e-06, - "loss": 1.2451, - "step": 79 - }, - { - "epoch": 0.3902439024390244, - "grad_norm": 3.734433650970459, - "learning_rate": 4.981701069900465e-06, - "loss": 0.8536, - "step": 80 - }, - { - "epoch": 0.3951219512195122, - "grad_norm": 2.853421449661255, - "learning_rate": 4.9812354578598876e-06, - "loss": 0.7857, - "step": 81 - }, - { - "epoch": 0.4, - "grad_norm": 2.541687250137329, - "learning_rate": 4.980764018619405e-06, - "loss": 0.8332, - "step": 82 - }, - { - "epoch": 0.40487804878048783, - "grad_norm": 4.405911445617676, - "learning_rate": 4.980286753286196e-06, - "loss": 0.9927, - "step": 83 - }, - { - "epoch": 0.4097560975609756, - "grad_norm": 3.3034985065460205, - "learning_rate": 4.97980366298112e-06, - "loss": 0.8161, - "step": 84 - }, - { - "epoch": 0.4146341463414634, - "grad_norm": 2.6678085327148438, - "learning_rate": 4.97931474883872e-06, - "loss": 0.8017, - "step": 85 - }, - { - "epoch": 0.4195121951219512, - "grad_norm": 2.58524227142334, - "learning_rate": 4.978820012007213e-06, - "loss": 0.8811, - "step": 86 - }, - { - "epoch": 0.424390243902439, - "grad_norm": 2.482597827911377, - "learning_rate": 4.978319453648495e-06, - "loss": 0.9461, - "step": 87 - }, - { - "epoch": 0.4292682926829268, - "grad_norm": 2.5731301307678223, - "learning_rate": 4.977813074938128e-06, - "loss": 0.8835, - "step": 88 - }, - { - "epoch": 0.43414634146341463, - "grad_norm": 2.7914488315582275, - "learning_rate": 4.977300877065347e-06, - "loss": 0.8466, - "step": 89 - }, - { - "epoch": 0.43902439024390244, - "grad_norm": 2.416043758392334, - "learning_rate": 4.976782861233053e-06, - "loss": 0.7132, - "step": 90 - }, - { - "epoch": 0.44390243902439025, - "grad_norm": 3.7616264820098877, - "learning_rate": 4.976259028657812e-06, - "loss": 0.7639, - "step": 91 - }, - { - "epoch": 0.44878048780487806, - "grad_norm": 2.6081621646881104, - "learning_rate": 4.975729380569845e-06, - "loss": 0.8055, - "step": 92 - }, - { - "epoch": 0.45365853658536587, - "grad_norm": 3.3343570232391357, - "learning_rate": 4.975193918213035e-06, - "loss": 0.6042, - "step": 93 - }, - { - "epoch": 0.4585365853658537, - "grad_norm": 2.517544746398926, - "learning_rate": 4.974652642844921e-06, - "loss": 0.7672, - "step": 94 - }, - { - "epoch": 0.4634146341463415, - "grad_norm": 4.173468589782715, - "learning_rate": 4.974105555736693e-06, - "loss": 1.0682, - "step": 95 - }, - { - "epoch": 0.4682926829268293, - "grad_norm": 2.8422317504882812, - "learning_rate": 4.973552658173186e-06, - "loss": 0.7841, - "step": 96 - }, - { - "epoch": 0.47317073170731705, - "grad_norm": 5.042182445526123, - "learning_rate": 4.972993951452887e-06, - "loss": 0.8851, - "step": 97 - }, - { - "epoch": 0.47804878048780486, - "grad_norm": 5.977590560913086, - "learning_rate": 4.9724294368879214e-06, - "loss": 0.9059, - "step": 98 - }, - { - "epoch": 0.48292682926829267, - "grad_norm": 4.227641582489014, - "learning_rate": 4.971859115804055e-06, - "loss": 1.0152, - "step": 99 - }, - { - "epoch": 0.4878048780487805, - "grad_norm": 3.180952548980713, - "learning_rate": 4.9712829895406935e-06, - "loss": 0.8092, - "step": 100 - }, - { - "epoch": 0.4926829268292683, - "grad_norm": 11.220394134521484, - "learning_rate": 4.970701059450872e-06, - "loss": 0.8239, - "step": 101 - }, - { - "epoch": 0.4975609756097561, - "grad_norm": 2.346975088119507, - "learning_rate": 4.970113326901258e-06, - "loss": 0.9283, - "step": 102 - }, - { - "epoch": 0.5024390243902439, - "grad_norm": 2.9470982551574707, - "learning_rate": 4.9695197932721455e-06, - "loss": 0.9429, - "step": 103 - }, - { - "epoch": 0.5073170731707317, - "grad_norm": 3.6048219203948975, - "learning_rate": 4.968920459957453e-06, - "loss": 0.9231, - "step": 104 - }, - { - "epoch": 0.5121951219512195, - "grad_norm": 2.8181886672973633, - "learning_rate": 4.968315328364719e-06, - "loss": 1.0005, - "step": 105 - }, - { - "epoch": 0.5170731707317073, - "grad_norm": 3.114147424697876, - "learning_rate": 4.9677043999151e-06, - "loss": 1.1326, - "step": 106 - }, - { - "epoch": 0.5219512195121951, - "grad_norm": 2.965885639190674, - "learning_rate": 4.967087676043366e-06, - "loss": 0.541, - "step": 107 - }, - { - "epoch": 0.526829268292683, - "grad_norm": 3.098677635192871, - "learning_rate": 4.966465158197897e-06, - "loss": 0.9473, - "step": 108 - }, - { - "epoch": 0.5317073170731708, - "grad_norm": 2.8640191555023193, - "learning_rate": 4.965836847840681e-06, - "loss": 0.6678, - "step": 109 - }, - { - "epoch": 0.5365853658536586, - "grad_norm": 3.0950934886932373, - "learning_rate": 4.96520274644731e-06, - "loss": 0.9251, - "step": 110 - }, - { - "epoch": 0.5414634146341464, - "grad_norm": 2.99444317817688, - "learning_rate": 4.964562855506976e-06, - "loss": 0.7807, - "step": 111 - }, - { - "epoch": 0.5463414634146342, - "grad_norm": 2.348639726638794, - "learning_rate": 4.963917176522466e-06, - "loss": 0.6395, - "step": 112 - }, - { - "epoch": 0.551219512195122, - "grad_norm": 3.5988354682922363, - "learning_rate": 4.963265711010164e-06, - "loss": 1.0658, - "step": 113 - }, - { - "epoch": 0.5560975609756098, - "grad_norm": 3.3423564434051514, - "learning_rate": 4.9626084605000395e-06, - "loss": 0.8974, - "step": 114 - }, - { - "epoch": 0.5609756097560976, - "grad_norm": 2.8353331089019775, - "learning_rate": 4.961945426535652e-06, - "loss": 0.6144, - "step": 115 - }, - { - "epoch": 0.5658536585365853, - "grad_norm": 2.752387046813965, - "learning_rate": 4.961276610674141e-06, - "loss": 0.9083, - "step": 116 - }, - { - "epoch": 0.5707317073170731, - "grad_norm": 2.2654404640197754, - "learning_rate": 4.960602014486225e-06, - "loss": 1.0101, - "step": 117 - }, - { - "epoch": 0.5756097560975609, - "grad_norm": 3.344377040863037, - "learning_rate": 4.959921639556199e-06, - "loss": 0.8391, - "step": 118 - }, - { - "epoch": 0.5804878048780487, - "grad_norm": 3.1620500087738037, - "learning_rate": 4.959235487481928e-06, - "loss": 1.0431, - "step": 119 - }, - { - "epoch": 0.5853658536585366, - "grad_norm": 2.857048273086548, - "learning_rate": 4.958543559874846e-06, - "loss": 0.5864, - "step": 120 - }, - { - "epoch": 0.5902439024390244, - "grad_norm": 3.1736063957214355, - "learning_rate": 4.9578458583599495e-06, - "loss": 0.7868, - "step": 121 - }, - { - "epoch": 0.5951219512195122, - "grad_norm": 3.5520827770233154, - "learning_rate": 4.957142384575795e-06, - "loss": 0.7901, - "step": 122 - }, - { - "epoch": 0.6, - "grad_norm": 3.265103578567505, - "learning_rate": 4.956433140174498e-06, - "loss": 0.9067, - "step": 123 - }, - { - "epoch": 0.6048780487804878, - "grad_norm": 3.1181187629699707, - "learning_rate": 4.9557181268217225e-06, - "loss": 0.8971, - "step": 124 - }, - { - "epoch": 0.6097560975609756, - "grad_norm": 2.4123694896698, - "learning_rate": 4.954997346196683e-06, - "loss": 1.2123, - "step": 125 - }, - { - "epoch": 0.6146341463414634, - "grad_norm": 2.9646875858306885, - "learning_rate": 4.954270799992138e-06, - "loss": 0.7696, - "step": 126 - }, - { - "epoch": 0.6195121951219512, - "grad_norm": 2.7457995414733887, - "learning_rate": 4.953538489914387e-06, - "loss": 0.7919, - "step": 127 - }, - { - "epoch": 0.624390243902439, - "grad_norm": 5.096850395202637, - "learning_rate": 4.9528004176832654e-06, - "loss": 0.6494, - "step": 128 - }, - { - "epoch": 0.6292682926829268, - "grad_norm": 3.124955177307129, - "learning_rate": 4.952056585032142e-06, - "loss": 1.0546, - "step": 129 - }, - { - "epoch": 0.6341463414634146, - "grad_norm": 2.4860167503356934, - "learning_rate": 4.951306993707913e-06, - "loss": 0.7907, - "step": 130 - }, - { - "epoch": 0.6390243902439025, - "grad_norm": 2.3380239009857178, - "learning_rate": 4.950551645470998e-06, - "loss": 0.7433, - "step": 131 - }, - { - "epoch": 0.6439024390243903, - "grad_norm": 2.8945236206054688, - "learning_rate": 4.9497905420953406e-06, - "loss": 0.7682, - "step": 132 - }, - { - "epoch": 0.6487804878048781, - "grad_norm": 3.429776430130005, - "learning_rate": 4.949023685368395e-06, - "loss": 0.8411, - "step": 133 - }, - { - "epoch": 0.6536585365853659, - "grad_norm": 2.8853516578674316, - "learning_rate": 4.948251077091131e-06, - "loss": 1.0792, - "step": 134 - }, - { - "epoch": 0.6585365853658537, - "grad_norm": 2.145598888397217, - "learning_rate": 4.947472719078025e-06, - "loss": 0.8033, - "step": 135 - }, - { - "epoch": 0.6634146341463415, - "grad_norm": 2.5064377784729004, - "learning_rate": 4.9466886131570565e-06, - "loss": 0.939, - "step": 136 - }, - { - "epoch": 0.6682926829268293, - "grad_norm": 2.5700225830078125, - "learning_rate": 4.945898761169704e-06, - "loss": 1.0418, - "step": 137 - }, - { - "epoch": 0.6731707317073171, - "grad_norm": 2.3390917778015137, - "learning_rate": 4.945103164970941e-06, - "loss": 0.6158, - "step": 138 - }, - { - "epoch": 0.6780487804878049, - "grad_norm": 2.1538751125335693, - "learning_rate": 4.9443018264292304e-06, - "loss": 0.6995, - "step": 139 - }, - { - "epoch": 0.6829268292682927, - "grad_norm": 5.255710601806641, - "learning_rate": 4.9434947474265225e-06, - "loss": 1.0382, - "step": 140 - }, - { - "epoch": 0.6878048780487804, - "grad_norm": 2.5547356605529785, - "learning_rate": 4.942681929858249e-06, - "loss": 1.037, - "step": 141 - }, - { - "epoch": 0.6926829268292682, - "grad_norm": 2.613280773162842, - "learning_rate": 4.941863375633315e-06, - "loss": 0.9071, - "step": 142 - }, - { - "epoch": 0.697560975609756, - "grad_norm": 2.9957327842712402, - "learning_rate": 4.9410390866741056e-06, - "loss": 0.7908, - "step": 143 - }, - { - "epoch": 0.7024390243902439, - "grad_norm": 2.410107374191284, - "learning_rate": 4.9402090649164655e-06, - "loss": 0.7739, - "step": 144 - }, - { - "epoch": 0.7073170731707317, - "grad_norm": 2.352013349533081, - "learning_rate": 4.9393733123097085e-06, - "loss": 0.939, - "step": 145 - }, - { - "epoch": 0.7121951219512195, - "grad_norm": 2.5164194107055664, - "learning_rate": 4.9385318308166065e-06, - "loss": 0.8729, - "step": 146 - }, - { - "epoch": 0.7170731707317073, - "grad_norm": 4.213881015777588, - "learning_rate": 4.937684622413385e-06, - "loss": 0.6124, - "step": 147 - }, - { - "epoch": 0.7219512195121951, - "grad_norm": 2.7950191497802734, - "learning_rate": 4.9368316890897185e-06, - "loss": 0.975, - "step": 148 - }, - { - "epoch": 0.7268292682926829, - "grad_norm": 2.8618874549865723, - "learning_rate": 4.9359730328487264e-06, - "loss": 0.5832, - "step": 149 - }, - { - "epoch": 0.7317073170731707, - "grad_norm": 2.6943812370300293, - "learning_rate": 4.935108655706972e-06, - "loss": 0.8124, - "step": 150 - }, - { - "epoch": 0.7365853658536585, - "grad_norm": 3.2164082527160645, - "learning_rate": 4.934238559694448e-06, - "loss": 1.1446, - "step": 151 - }, - { - "epoch": 0.7414634146341463, - "grad_norm": 3.05002498626709, - "learning_rate": 4.9333627468545845e-06, - "loss": 0.7884, - "step": 152 - }, - { - "epoch": 0.7463414634146341, - "grad_norm": 2.863351583480835, - "learning_rate": 4.932481219244231e-06, - "loss": 0.7918, - "step": 153 - }, - { - "epoch": 0.751219512195122, - "grad_norm": 2.4947102069854736, - "learning_rate": 4.931593978933666e-06, - "loss": 0.775, - "step": 154 - }, - { - "epoch": 0.7560975609756098, - "grad_norm": 2.918886184692383, - "learning_rate": 4.930701028006577e-06, - "loss": 0.993, - "step": 155 - }, - { - "epoch": 0.7609756097560976, - "grad_norm": 2.835956573486328, - "learning_rate": 4.929802368560066e-06, - "loss": 0.7911, - "step": 156 - }, - { - "epoch": 0.7658536585365854, - "grad_norm": 3.3073575496673584, - "learning_rate": 4.928898002704642e-06, - "loss": 0.9346, - "step": 157 - }, - { - "epoch": 0.7707317073170732, - "grad_norm": 3.086146354675293, - "learning_rate": 4.927987932564215e-06, - "loss": 0.817, - "step": 158 - }, - { - "epoch": 0.775609756097561, - "grad_norm": 2.5419743061065674, - "learning_rate": 4.927072160276092e-06, - "loss": 0.7918, - "step": 159 - }, - { - "epoch": 0.7804878048780488, - "grad_norm": 3.984297275543213, - "learning_rate": 4.926150687990969e-06, - "loss": 0.7153, - "step": 160 - }, - { - "epoch": 0.7853658536585366, - "grad_norm": 2.4703335762023926, - "learning_rate": 4.925223517872934e-06, - "loss": 0.8982, - "step": 161 - }, - { - "epoch": 0.7902439024390244, - "grad_norm": 2.81785249710083, - "learning_rate": 4.9242906520994484e-06, - "loss": 0.9839, - "step": 162 - }, - { - "epoch": 0.7951219512195122, - "grad_norm": 2.3304924964904785, - "learning_rate": 4.923352092861358e-06, - "loss": 0.8406, - "step": 163 - }, - { - "epoch": 0.8, - "grad_norm": 2.339498519897461, - "learning_rate": 4.922407842362875e-06, - "loss": 0.6602, - "step": 164 - }, - { - "epoch": 0.8048780487804879, - "grad_norm": 3.488255262374878, - "learning_rate": 4.921457902821578e-06, - "loss": 0.9779, - "step": 165 - }, - { - "epoch": 0.8097560975609757, - "grad_norm": 2.8528945446014404, - "learning_rate": 4.920502276468408e-06, - "loss": 0.8821, - "step": 166 - }, - { - "epoch": 0.8146341463414634, - "grad_norm": 3.4649784564971924, - "learning_rate": 4.9195409655476605e-06, - "loss": 0.7539, - "step": 167 - }, - { - "epoch": 0.8195121951219512, - "grad_norm": 2.3109042644500732, - "learning_rate": 4.918573972316982e-06, - "loss": 0.9807, - "step": 168 - }, - { - "epoch": 0.824390243902439, - "grad_norm": 2.678666353225708, - "learning_rate": 4.917601299047361e-06, - "loss": 0.8318, - "step": 169 - }, - { - "epoch": 0.8292682926829268, - "grad_norm": 2.730614185333252, - "learning_rate": 4.916622948023129e-06, - "loss": 0.7816, - "step": 170 - }, - { - "epoch": 0.8341463414634146, - "grad_norm": 2.9835665225982666, - "learning_rate": 4.915638921541952e-06, - "loss": 0.6633, - "step": 171 - }, - { - "epoch": 0.8390243902439024, - "grad_norm": 3.31217360496521, - "learning_rate": 4.914649221914822e-06, - "loss": 0.9296, - "step": 172 - }, - { - "epoch": 0.8439024390243902, - "grad_norm": 2.9021658897399902, - "learning_rate": 4.913653851466057e-06, - "loss": 0.6864, - "step": 173 - }, - { - "epoch": 0.848780487804878, - "grad_norm": 3.3672914505004883, - "learning_rate": 4.912652812533291e-06, - "loss": 0.8599, - "step": 174 - }, - { - "epoch": 0.8536585365853658, - "grad_norm": 2.4871644973754883, - "learning_rate": 4.911646107467472e-06, - "loss": 0.8949, - "step": 175 - }, - { - "epoch": 0.8585365853658536, - "grad_norm": 2.728022813796997, - "learning_rate": 4.9106337386328524e-06, - "loss": 0.9758, - "step": 176 - }, - { - "epoch": 0.8634146341463415, - "grad_norm": 2.704252243041992, - "learning_rate": 4.909615708406991e-06, - "loss": 0.8954, - "step": 177 - }, - { - "epoch": 0.8682926829268293, - "grad_norm": 2.4002223014831543, - "learning_rate": 4.908592019180738e-06, - "loss": 0.7157, - "step": 178 - }, - { - "epoch": 0.8731707317073171, - "grad_norm": 2.1927788257598877, - "learning_rate": 4.907562673358234e-06, - "loss": 0.6358, - "step": 179 - }, - { - "epoch": 0.8780487804878049, - "grad_norm": 2.458500623703003, - "learning_rate": 4.906527673356907e-06, - "loss": 0.6685, - "step": 180 - }, - { - "epoch": 0.8829268292682927, - "grad_norm": 2.5924787521362305, - "learning_rate": 4.905487021607462e-06, - "loss": 0.5686, - "step": 181 - }, - { - "epoch": 0.8878048780487805, - "grad_norm": 3.0923380851745605, - "learning_rate": 4.904440720553876e-06, - "loss": 0.8538, - "step": 182 - }, - { - "epoch": 0.8926829268292683, - "grad_norm": 2.8001527786254883, - "learning_rate": 4.903388772653396e-06, - "loss": 0.8292, - "step": 183 - }, - { - "epoch": 0.8975609756097561, - "grad_norm": 2.4344072341918945, - "learning_rate": 4.902331180376529e-06, - "loss": 0.7946, - "step": 184 - }, - { - "epoch": 0.9024390243902439, - "grad_norm": 2.6313226222991943, - "learning_rate": 4.901267946207038e-06, - "loss": 0.9269, - "step": 185 - }, - { - "epoch": 0.9073170731707317, - "grad_norm": 2.4776692390441895, - "learning_rate": 4.900199072641937e-06, - "loss": 0.7433, - "step": 186 - }, - { - "epoch": 0.9121951219512195, - "grad_norm": 2.339869260787964, - "learning_rate": 4.899124562191484e-06, - "loss": 0.6577, - "step": 187 - }, - { - "epoch": 0.9170731707317074, - "grad_norm": 3.076890468597412, - "learning_rate": 4.8980444173791735e-06, - "loss": 0.5989, - "step": 188 - }, - { - "epoch": 0.9219512195121952, - "grad_norm": 2.83957839012146, - "learning_rate": 4.896958640741735e-06, - "loss": 0.9364, - "step": 189 - }, - { - "epoch": 0.926829268292683, - "grad_norm": 2.770867347717285, - "learning_rate": 4.895867234829121e-06, - "loss": 1.0328, - "step": 190 - }, - { - "epoch": 0.9317073170731708, - "grad_norm": 2.7819619178771973, - "learning_rate": 4.894770202204509e-06, - "loss": 0.772, - "step": 191 - }, - { - "epoch": 0.9365853658536586, - "grad_norm": 3.925703763961792, - "learning_rate": 4.893667545444285e-06, - "loss": 0.8128, - "step": 192 - }, - { - "epoch": 0.9414634146341463, - "grad_norm": 3.034944534301758, - "learning_rate": 4.8925592671380495e-06, - "loss": 0.7418, - "step": 193 - }, - { - "epoch": 0.9463414634146341, - "grad_norm": 2.3350143432617188, - "learning_rate": 4.891445369888601e-06, - "loss": 0.5979, - "step": 194 - }, - { - "epoch": 0.9512195121951219, - "grad_norm": 2.6433160305023193, - "learning_rate": 4.890325856311936e-06, - "loss": 0.9664, - "step": 195 - }, - { - "epoch": 0.9560975609756097, - "grad_norm": 2.715142011642456, - "learning_rate": 4.889200729037241e-06, - "loss": 0.8482, - "step": 196 - }, - { - "epoch": 0.9609756097560975, - "grad_norm": 2.6157352924346924, - "learning_rate": 4.888069990706884e-06, - "loss": 0.7173, - "step": 197 - }, - { - "epoch": 0.9658536585365853, - "grad_norm": 3.7308952808380127, - "learning_rate": 4.886933643976414e-06, - "loss": 0.5433, - "step": 198 - }, - { - "epoch": 0.9707317073170731, - "grad_norm": 3.1134045124053955, - "learning_rate": 4.885791691514548e-06, - "loss": 0.5997, - "step": 199 - }, - { - "epoch": 0.975609756097561, - "grad_norm": 2.421365976333618, - "learning_rate": 4.884644136003172e-06, - "loss": 0.6477, - "step": 200 - }, - { - "epoch": 0.9804878048780488, - "grad_norm": 2.8676180839538574, - "learning_rate": 4.883490980137327e-06, - "loss": 1.3465, - "step": 201 - }, - { - "epoch": 0.9853658536585366, - "grad_norm": 2.236189603805542, - "learning_rate": 4.882332226625208e-06, - "loss": 0.7533, - "step": 202 - }, - { - "epoch": 0.9902439024390244, - "grad_norm": 2.2514970302581787, - "learning_rate": 4.881167878188158e-06, - "loss": 0.8555, - "step": 203 - }, - { - "epoch": 0.9951219512195122, - "grad_norm": 2.6856095790863037, - "learning_rate": 4.8799979375606565e-06, - "loss": 0.7634, - "step": 204 - }, - { - "epoch": 1.0, - "grad_norm": 2.5563852787017822, - "learning_rate": 4.878822407490319e-06, - "loss": 0.66, - "step": 205 - }, - { - "epoch": 1.0048780487804878, - "grad_norm": 4.7092814445495605, - "learning_rate": 4.8776412907378845e-06, - "loss": 0.7429, - "step": 206 - }, - { - "epoch": 1.0097560975609756, - "grad_norm": 2.9133448600769043, - "learning_rate": 4.876454590077216e-06, - "loss": 0.5735, - "step": 207 - }, - { - "epoch": 1.0146341463414634, - "grad_norm": 2.7012641429901123, - "learning_rate": 4.875262308295289e-06, - "loss": 0.8065, - "step": 208 - }, - { - "epoch": 1.0195121951219512, - "grad_norm": 3.703998327255249, - "learning_rate": 4.874064448192185e-06, - "loss": 0.7148, - "step": 209 - }, - { - "epoch": 1.024390243902439, - "grad_norm": 3.044930934906006, - "learning_rate": 4.872861012581088e-06, - "loss": 0.5606, - "step": 210 - }, - { - "epoch": 1.0292682926829269, - "grad_norm": 3.661381244659424, - "learning_rate": 4.871652004288275e-06, - "loss": 0.6492, - "step": 211 - }, - { - "epoch": 1.0341463414634147, - "grad_norm": 3.18344783782959, - "learning_rate": 4.870437426153113e-06, - "loss": 0.633, - "step": 212 - }, - { - "epoch": 1.0390243902439025, - "grad_norm": 4.596707820892334, - "learning_rate": 4.869217281028045e-06, - "loss": 0.842, - "step": 213 - }, - { - "epoch": 1.0439024390243903, - "grad_norm": 4.116331577301025, - "learning_rate": 4.867991571778592e-06, - "loss": 0.8371, - "step": 214 - }, - { - "epoch": 1.048780487804878, - "grad_norm": 3.152939558029175, - "learning_rate": 4.866760301283342e-06, - "loss": 0.4728, - "step": 215 - }, - { - "epoch": 1.053658536585366, - "grad_norm": 2.8732805252075195, - "learning_rate": 4.865523472433942e-06, - "loss": 0.651, - "step": 216 - }, - { - "epoch": 1.0585365853658537, - "grad_norm": 2.967480421066284, - "learning_rate": 4.8642810881350935e-06, - "loss": 0.6361, - "step": 217 - }, - { - "epoch": 1.0634146341463415, - "grad_norm": 2.816798210144043, - "learning_rate": 4.863033151304546e-06, - "loss": 0.6206, - "step": 218 - }, - { - "epoch": 1.0682926829268293, - "grad_norm": 3.168349027633667, - "learning_rate": 4.861779664873088e-06, - "loss": 0.7782, - "step": 219 - }, - { - "epoch": 1.0731707317073171, - "grad_norm": 3.7496471405029297, - "learning_rate": 4.8605206317845425e-06, - "loss": 0.8504, - "step": 220 - }, - { - "epoch": 1.078048780487805, - "grad_norm": 2.7087056636810303, - "learning_rate": 4.859256054995758e-06, - "loss": 0.7771, - "step": 221 - }, - { - "epoch": 1.0829268292682928, - "grad_norm": 2.803703546524048, - "learning_rate": 4.8579859374766e-06, - "loss": 0.4308, - "step": 222 - }, - { - "epoch": 1.0878048780487806, - "grad_norm": 2.4199142456054688, - "learning_rate": 4.856710282209952e-06, - "loss": 0.3739, - "step": 223 - }, - { - "epoch": 1.0926829268292684, - "grad_norm": 2.384037494659424, - "learning_rate": 4.855429092191698e-06, - "loss": 0.6548, - "step": 224 - }, - { - "epoch": 1.0975609756097562, - "grad_norm": 3.0230021476745605, - "learning_rate": 4.854142370430725e-06, - "loss": 0.6932, - "step": 225 - }, - { - "epoch": 1.102439024390244, - "grad_norm": 3.0248661041259766, - "learning_rate": 4.8528501199489045e-06, - "loss": 0.6491, - "step": 226 - }, - { - "epoch": 1.1073170731707318, - "grad_norm": 4.046666145324707, - "learning_rate": 4.851552343781099e-06, - "loss": 0.7946, - "step": 227 - }, - { - "epoch": 1.1121951219512196, - "grad_norm": 2.8751168251037598, - "learning_rate": 4.850249044975145e-06, - "loss": 0.7629, - "step": 228 - }, - { - "epoch": 1.1170731707317074, - "grad_norm": 2.8649816513061523, - "learning_rate": 4.848940226591849e-06, - "loss": 0.9114, - "step": 229 - }, - { - "epoch": 1.1219512195121952, - "grad_norm": 3.2590744495391846, - "learning_rate": 4.847625891704982e-06, - "loss": 0.535, - "step": 230 - }, - { - "epoch": 1.126829268292683, - "grad_norm": 3.230659008026123, - "learning_rate": 4.846306043401268e-06, - "loss": 0.7134, - "step": 231 - }, - { - "epoch": 1.1317073170731708, - "grad_norm": 3.5220088958740234, - "learning_rate": 4.844980684780381e-06, - "loss": 0.5375, - "step": 232 - }, - { - "epoch": 1.1365853658536587, - "grad_norm": 3.074052095413208, - "learning_rate": 4.8436498189549345e-06, - "loss": 0.5486, - "step": 233 - }, - { - "epoch": 1.1414634146341462, - "grad_norm": 2.511216163635254, - "learning_rate": 4.842313449050477e-06, - "loss": 0.5203, - "step": 234 - }, - { - "epoch": 1.146341463414634, - "grad_norm": 2.6082136631011963, - "learning_rate": 4.840971578205486e-06, - "loss": 0.4978, - "step": 235 - }, - { - "epoch": 1.1512195121951219, - "grad_norm": 2.4481778144836426, - "learning_rate": 4.839624209571352e-06, - "loss": 0.348, - "step": 236 - }, - { - "epoch": 1.1560975609756097, - "grad_norm": 2.7532148361206055, - "learning_rate": 4.838271346312381e-06, - "loss": 0.8068, - "step": 237 - }, - { - "epoch": 1.1609756097560975, - "grad_norm": 2.6562349796295166, - "learning_rate": 4.836912991605782e-06, - "loss": 0.8823, - "step": 238 - }, - { - "epoch": 1.1658536585365853, - "grad_norm": 3.032168388366699, - "learning_rate": 4.835549148641663e-06, - "loss": 0.501, - "step": 239 - }, - { - "epoch": 1.170731707317073, - "grad_norm": 3.4816956520080566, - "learning_rate": 4.834179820623018e-06, - "loss": 0.6406, - "step": 240 - }, - { - "epoch": 1.175609756097561, - "grad_norm": 2.480642318725586, - "learning_rate": 4.832805010765724e-06, - "loss": 0.537, - "step": 241 - }, - { - "epoch": 1.1804878048780487, - "grad_norm": 2.7662222385406494, - "learning_rate": 4.831424722298531e-06, - "loss": 0.6464, - "step": 242 - }, - { - "epoch": 1.1853658536585365, - "grad_norm": 3.2929866313934326, - "learning_rate": 4.830038958463061e-06, - "loss": 0.6888, - "step": 243 - }, - { - "epoch": 1.1902439024390243, - "grad_norm": 5.094089031219482, - "learning_rate": 4.828647722513785e-06, - "loss": 0.8342, - "step": 244 - }, - { - "epoch": 1.1951219512195121, - "grad_norm": 3.6679818630218506, - "learning_rate": 4.827251017718034e-06, - "loss": 0.7849, - "step": 245 - }, - { - "epoch": 1.2, - "grad_norm": 3.97290301322937, - "learning_rate": 4.8258488473559794e-06, - "loss": 0.7995, - "step": 246 - }, - { - "epoch": 1.2048780487804878, - "grad_norm": 3.3555023670196533, - "learning_rate": 4.824441214720629e-06, - "loss": 0.8718, - "step": 247 - }, - { - "epoch": 1.2097560975609756, - "grad_norm": 2.309361219406128, - "learning_rate": 4.823028123117818e-06, - "loss": 0.3731, - "step": 248 - }, - { - "epoch": 1.2146341463414634, - "grad_norm": 2.607269763946533, - "learning_rate": 4.8216095758662015e-06, - "loss": 0.7321, - "step": 249 - }, - { - "epoch": 1.2195121951219512, - "grad_norm": 2.5667428970336914, - "learning_rate": 4.82018557629725e-06, - "loss": 0.7561, - "step": 250 - }, - { - "epoch": 1.224390243902439, - "grad_norm": 2.7664871215820312, - "learning_rate": 4.8187561277552376e-06, - "loss": 0.638, - "step": 251 - }, - { - "epoch": 1.2292682926829268, - "grad_norm": 2.2880401611328125, - "learning_rate": 4.817321233597232e-06, - "loss": 0.6996, - "step": 252 - }, - { - "epoch": 1.2341463414634146, - "grad_norm": 2.7615559101104736, - "learning_rate": 4.815880897193095e-06, - "loss": 0.5432, - "step": 253 - }, - { - "epoch": 1.2390243902439024, - "grad_norm": 2.9052155017852783, - "learning_rate": 4.814435121925466e-06, - "loss": 0.781, - "step": 254 - }, - { - "epoch": 1.2439024390243902, - "grad_norm": 3.2035205364227295, - "learning_rate": 4.812983911189761e-06, - "loss": 0.6884, - "step": 255 - }, - { - "epoch": 1.248780487804878, - "grad_norm": 2.8139917850494385, - "learning_rate": 4.811527268394157e-06, - "loss": 0.4984, - "step": 256 - }, - { - "epoch": 1.2536585365853659, - "grad_norm": 2.849602699279785, - "learning_rate": 4.810065196959591e-06, - "loss": 0.553, - "step": 257 - }, - { - "epoch": 1.2585365853658537, - "grad_norm": 2.8745057582855225, - "learning_rate": 4.8085977003197496e-06, - "loss": 0.7955, - "step": 258 - }, - { - "epoch": 1.2634146341463415, - "grad_norm": 3.4053122997283936, - "learning_rate": 4.807124781921059e-06, - "loss": 0.9715, - "step": 259 - }, - { - "epoch": 1.2682926829268293, - "grad_norm": 3.1741702556610107, - "learning_rate": 4.805646445222679e-06, - "loss": 0.6306, - "step": 260 - }, - { - "epoch": 1.273170731707317, - "grad_norm": 2.5348331928253174, - "learning_rate": 4.804162693696494e-06, - "loss": 0.5192, - "step": 261 - }, - { - "epoch": 1.278048780487805, - "grad_norm": 3.2491304874420166, - "learning_rate": 4.802673530827105e-06, - "loss": 0.5369, - "step": 262 - }, - { - "epoch": 1.2829268292682927, - "grad_norm": 2.670273780822754, - "learning_rate": 4.801178960111823e-06, - "loss": 0.5864, - "step": 263 - }, - { - "epoch": 1.2878048780487805, - "grad_norm": 2.5655579566955566, - "learning_rate": 4.799678985060658e-06, - "loss": 0.7864, - "step": 264 - }, - { - "epoch": 1.2926829268292683, - "grad_norm": 2.6352531909942627, - "learning_rate": 4.798173609196314e-06, - "loss": 0.8198, - "step": 265 - }, - { - "epoch": 1.2975609756097561, - "grad_norm": 3.028343677520752, - "learning_rate": 4.796662836054176e-06, - "loss": 0.4621, - "step": 266 - }, - { - "epoch": 1.302439024390244, - "grad_norm": 2.757690191268921, - "learning_rate": 4.795146669182304e-06, - "loss": 0.6237, - "step": 267 - }, - { - "epoch": 1.3073170731707318, - "grad_norm": 2.564842462539673, - "learning_rate": 4.793625112141431e-06, - "loss": 0.4981, - "step": 268 - }, - { - "epoch": 1.3121951219512196, - "grad_norm": 2.69234299659729, - "learning_rate": 4.792098168504943e-06, - "loss": 0.5384, - "step": 269 - }, - { - "epoch": 1.3170731707317074, - "grad_norm": 2.794144868850708, - "learning_rate": 4.790565841858879e-06, - "loss": 0.5535, - "step": 270 - }, - { - "epoch": 1.3219512195121952, - "grad_norm": 2.850296974182129, - "learning_rate": 4.789028135801919e-06, - "loss": 0.7492, - "step": 271 - }, - { - "epoch": 1.326829268292683, - "grad_norm": 3.287806987762451, - "learning_rate": 4.787485053945377e-06, - "loss": 0.8367, - "step": 272 - }, - { - "epoch": 1.3317073170731708, - "grad_norm": 2.479343891143799, - "learning_rate": 4.785936599913193e-06, - "loss": 0.6875, - "step": 273 - }, - { - "epoch": 1.3365853658536586, - "grad_norm": 3.171198844909668, - "learning_rate": 4.784382777341922e-06, - "loss": 0.733, - "step": 274 - }, - { - "epoch": 1.3414634146341464, - "grad_norm": 2.866610050201416, - "learning_rate": 4.782823589880729e-06, - "loss": 0.9719, - "step": 275 - }, - { - "epoch": 1.346341463414634, - "grad_norm": 2.3714404106140137, - "learning_rate": 4.7812590411913755e-06, - "loss": 0.6979, - "step": 276 - }, - { - "epoch": 1.3512195121951218, - "grad_norm": 2.3838706016540527, - "learning_rate": 4.779689134948217e-06, - "loss": 0.9697, - "step": 277 - }, - { - "epoch": 1.3560975609756096, - "grad_norm": 3.2992005348205566, - "learning_rate": 4.77811387483819e-06, - "loss": 0.4799, - "step": 278 - }, - { - "epoch": 1.3609756097560974, - "grad_norm": 3.403024435043335, - "learning_rate": 4.776533264560804e-06, - "loss": 0.7478, - "step": 279 - }, - { - "epoch": 1.3658536585365852, - "grad_norm": 2.669820785522461, - "learning_rate": 4.774947307828134e-06, - "loss": 0.8622, - "step": 280 - }, - { - "epoch": 1.370731707317073, - "grad_norm": 2.4695041179656982, - "learning_rate": 4.773356008364812e-06, - "loss": 0.5792, - "step": 281 - }, - { - "epoch": 1.3756097560975609, - "grad_norm": 3.1744325160980225, - "learning_rate": 4.771759369908017e-06, - "loss": 0.4368, - "step": 282 - }, - { - "epoch": 1.3804878048780487, - "grad_norm": 2.8564929962158203, - "learning_rate": 4.7701573962074635e-06, - "loss": 0.6337, - "step": 283 - }, - { - "epoch": 1.3853658536585365, - "grad_norm": 2.4109890460968018, - "learning_rate": 4.7685500910254015e-06, - "loss": 0.5042, - "step": 284 - }, - { - "epoch": 1.3902439024390243, - "grad_norm": 2.389765977859497, - "learning_rate": 4.766937458136598e-06, - "loss": 0.7427, - "step": 285 - }, - { - "epoch": 1.395121951219512, - "grad_norm": 2.412153720855713, - "learning_rate": 4.765319501328332e-06, - "loss": 0.6956, - "step": 286 - }, - { - "epoch": 1.4, - "grad_norm": 2.6756227016448975, - "learning_rate": 4.763696224400391e-06, - "loss": 0.5152, - "step": 287 - }, - { - "epoch": 1.4048780487804877, - "grad_norm": 2.4644389152526855, - "learning_rate": 4.762067631165049e-06, - "loss": 0.5583, - "step": 288 - }, - { - "epoch": 1.4097560975609755, - "grad_norm": 2.6496896743774414, - "learning_rate": 4.760433725447071e-06, - "loss": 0.6824, - "step": 289 - }, - { - "epoch": 1.4146341463414633, - "grad_norm": 2.9843268394470215, - "learning_rate": 4.758794511083697e-06, - "loss": 0.7914, - "step": 290 - }, - { - "epoch": 1.4195121951219511, - "grad_norm": 3.639101266860962, - "learning_rate": 4.757149991924633e-06, - "loss": 0.6827, - "step": 291 - }, - { - "epoch": 1.424390243902439, - "grad_norm": 3.2047319412231445, - "learning_rate": 4.755500171832045e-06, - "loss": 0.5908, - "step": 292 - }, - { - "epoch": 1.4292682926829268, - "grad_norm": 2.463202953338623, - "learning_rate": 4.753845054680548e-06, - "loss": 0.6469, - "step": 293 - }, - { - "epoch": 1.4341463414634146, - "grad_norm": 2.711195945739746, - "learning_rate": 4.752184644357197e-06, - "loss": 0.5412, - "step": 294 - }, - { - "epoch": 1.4390243902439024, - "grad_norm": 2.239082098007202, - "learning_rate": 4.750518944761477e-06, - "loss": 0.5324, - "step": 295 - }, - { - "epoch": 1.4439024390243902, - "grad_norm": 2.711050271987915, - "learning_rate": 4.748847959805297e-06, - "loss": 0.5317, - "step": 296 - }, - { - "epoch": 1.448780487804878, - "grad_norm": 2.4389946460723877, - "learning_rate": 4.7471716934129774e-06, - "loss": 0.5199, - "step": 297 - }, - { - "epoch": 1.4536585365853658, - "grad_norm": 2.6532390117645264, - "learning_rate": 4.745490149521242e-06, - "loss": 0.4874, - "step": 298 - }, - { - "epoch": 1.4585365853658536, - "grad_norm": 2.2970616817474365, - "learning_rate": 4.743803332079209e-06, - "loss": 0.5416, - "step": 299 - }, - { - "epoch": 1.4634146341463414, - "grad_norm": 2.4206762313842773, - "learning_rate": 4.742111245048382e-06, - "loss": 0.5628, - "step": 300 - }, - { - "epoch": 1.4682926829268292, - "grad_norm": 2.7086844444274902, - "learning_rate": 4.740413892402639e-06, - "loss": 0.5847, - "step": 301 - }, - { - "epoch": 1.473170731707317, - "grad_norm": 2.848602771759033, - "learning_rate": 4.738711278128228e-06, - "loss": 0.5889, - "step": 302 - }, - { - "epoch": 1.4780487804878049, - "grad_norm": 3.5257909297943115, - "learning_rate": 4.7370034062237476e-06, - "loss": 0.3917, - "step": 303 - }, - { - "epoch": 1.4829268292682927, - "grad_norm": 6.47664213180542, - "learning_rate": 4.73529028070015e-06, - "loss": 0.5592, - "step": 304 - }, - { - "epoch": 1.4878048780487805, - "grad_norm": 2.8833930492401123, - "learning_rate": 4.733571905580723e-06, - "loss": 0.843, - "step": 305 - }, - { - "epoch": 1.4926829268292683, - "grad_norm": 2.9924156665802, - "learning_rate": 4.731848284901082e-06, - "loss": 0.7041, - "step": 306 - }, - { - "epoch": 1.497560975609756, - "grad_norm": 2.9858405590057373, - "learning_rate": 4.730119422709165e-06, - "loss": 0.4914, - "step": 307 - }, - { - "epoch": 1.502439024390244, - "grad_norm": 3.4032366275787354, - "learning_rate": 4.728385323065215e-06, - "loss": 0.644, - "step": 308 - }, - { - "epoch": 1.5073170731707317, - "grad_norm": 2.86360502243042, - "learning_rate": 4.7266459900417815e-06, - "loss": 0.5335, - "step": 309 - }, - { - "epoch": 1.5121951219512195, - "grad_norm": 3.183012008666992, - "learning_rate": 4.724901427723698e-06, - "loss": 0.8275, - "step": 310 - }, - { - "epoch": 1.5170731707317073, - "grad_norm": 3.4128706455230713, - "learning_rate": 4.723151640208084e-06, - "loss": 0.4091, - "step": 311 - }, - { - "epoch": 1.5219512195121951, - "grad_norm": 2.765897512435913, - "learning_rate": 4.721396631604327e-06, - "loss": 0.4414, - "step": 312 - }, - { - "epoch": 1.526829268292683, - "grad_norm": 3.2348268032073975, - "learning_rate": 4.7196364060340785e-06, - "loss": 0.5423, - "step": 313 - }, - { - "epoch": 1.5317073170731708, - "grad_norm": 2.7270045280456543, - "learning_rate": 4.7178709676312416e-06, - "loss": 0.8072, - "step": 314 - }, - { - "epoch": 1.5365853658536586, - "grad_norm": 2.525298833847046, - "learning_rate": 4.716100320541961e-06, - "loss": 1.0254, - "step": 315 - }, - { - "epoch": 1.5414634146341464, - "grad_norm": 2.371321678161621, - "learning_rate": 4.714324468924614e-06, - "loss": 0.6541, - "step": 316 - }, - { - "epoch": 1.5463414634146342, - "grad_norm": 3.0820438861846924, - "learning_rate": 4.712543416949803e-06, - "loss": 0.7519, - "step": 317 - }, - { - "epoch": 1.551219512195122, - "grad_norm": 2.710369348526001, - "learning_rate": 4.71075716880034e-06, - "loss": 0.7232, - "step": 318 - }, - { - "epoch": 1.5560975609756098, - "grad_norm": 2.4568352699279785, - "learning_rate": 4.708965728671243e-06, - "loss": 0.8059, - "step": 319 - }, - { - "epoch": 1.5609756097560976, - "grad_norm": 2.7511191368103027, - "learning_rate": 4.7071691007697214e-06, - "loss": 0.6579, - "step": 320 - }, - { - "epoch": 1.5658536585365854, - "grad_norm": 2.6519858837127686, - "learning_rate": 4.705367289315172e-06, - "loss": 0.6989, - "step": 321 - }, - { - "epoch": 1.5707317073170732, - "grad_norm": 2.763019323348999, - "learning_rate": 4.703560298539158e-06, - "loss": 0.4916, - "step": 322 - }, - { - "epoch": 1.575609756097561, - "grad_norm": 2.6480252742767334, - "learning_rate": 4.701748132685415e-06, - "loss": 0.5076, - "step": 323 - }, - { - "epoch": 1.5804878048780489, - "grad_norm": 2.4289543628692627, - "learning_rate": 4.699930796009825e-06, - "loss": 0.559, - "step": 324 - }, - { - "epoch": 1.5853658536585367, - "grad_norm": 4.0515899658203125, - "learning_rate": 4.698108292780418e-06, - "loss": 0.7388, - "step": 325 - }, - { - "epoch": 1.5902439024390245, - "grad_norm": 2.5959129333496094, - "learning_rate": 4.696280627277356e-06, - "loss": 0.5469, - "step": 326 - }, - { - "epoch": 1.5951219512195123, - "grad_norm": 2.3453526496887207, - "learning_rate": 4.6944478037929255e-06, - "loss": 0.5494, - "step": 327 - }, - { - "epoch": 1.6, - "grad_norm": 3.7527170181274414, - "learning_rate": 4.692609826631525e-06, - "loss": 0.7536, - "step": 328 - }, - { - "epoch": 1.604878048780488, - "grad_norm": 3.423588275909424, - "learning_rate": 4.690766700109659e-06, - "loss": 0.4586, - "step": 329 - }, - { - "epoch": 1.6097560975609757, - "grad_norm": 2.620429754257202, - "learning_rate": 4.6889184285559234e-06, - "loss": 0.4799, - "step": 330 - }, - { - "epoch": 1.6146341463414635, - "grad_norm": 6.416718006134033, - "learning_rate": 4.687065016310996e-06, - "loss": 0.7502, - "step": 331 - }, - { - "epoch": 1.6195121951219513, - "grad_norm": 2.7324717044830322, - "learning_rate": 4.685206467727631e-06, - "loss": 0.5923, - "step": 332 - }, - { - "epoch": 1.6243902439024391, - "grad_norm": 2.582935333251953, - "learning_rate": 4.683342787170644e-06, - "loss": 0.5619, - "step": 333 - }, - { - "epoch": 1.629268292682927, - "grad_norm": 2.8339877128601074, - "learning_rate": 4.6814739790169006e-06, - "loss": 0.55, - "step": 334 - }, - { - "epoch": 1.6341463414634148, - "grad_norm": 2.733982563018799, - "learning_rate": 4.679600047655313e-06, - "loss": 0.7243, - "step": 335 - }, - { - "epoch": 1.6390243902439026, - "grad_norm": 3.192747116088867, - "learning_rate": 4.6777209974868194e-06, - "loss": 1.132, - "step": 336 - }, - { - "epoch": 1.6439024390243904, - "grad_norm": 2.5185582637786865, - "learning_rate": 4.675836832924387e-06, - "loss": 0.55, - "step": 337 - }, - { - "epoch": 1.6487804878048782, - "grad_norm": 2.7306225299835205, - "learning_rate": 4.673947558392989e-06, - "loss": 0.4418, - "step": 338 - }, - { - "epoch": 1.653658536585366, - "grad_norm": 2.7026166915893555, - "learning_rate": 4.6720531783296e-06, - "loss": 0.5897, - "step": 339 - }, - { - "epoch": 1.6585365853658538, - "grad_norm": 2.5981674194335938, - "learning_rate": 4.670153697183185e-06, - "loss": 0.5889, - "step": 340 - }, - { - "epoch": 1.6634146341463416, - "grad_norm": 3.0985405445098877, - "learning_rate": 4.668249119414692e-06, - "loss": 0.5607, - "step": 341 - }, - { - "epoch": 1.6682926829268294, - "grad_norm": 2.7609124183654785, - "learning_rate": 4.666339449497033e-06, - "loss": 0.6284, - "step": 342 - }, - { - "epoch": 1.6731707317073172, - "grad_norm": 3.186077356338501, - "learning_rate": 4.664424691915084e-06, - "loss": 0.5751, - "step": 343 - }, - { - "epoch": 1.678048780487805, - "grad_norm": 3.644227981567383, - "learning_rate": 4.6625048511656675e-06, - "loss": 0.586, - "step": 344 - }, - { - "epoch": 1.6829268292682928, - "grad_norm": 3.196373462677002, - "learning_rate": 4.660579931757543e-06, - "loss": 0.5086, - "step": 345 - }, - { - "epoch": 1.6878048780487804, - "grad_norm": 2.7773900032043457, - "learning_rate": 4.6586499382113985e-06, - "loss": 0.5934, - "step": 346 - }, - { - "epoch": 1.6926829268292682, - "grad_norm": 2.3397631645202637, - "learning_rate": 4.6567148750598375e-06, - "loss": 0.7654, - "step": 347 - }, - { - "epoch": 1.697560975609756, - "grad_norm": 2.5567805767059326, - "learning_rate": 4.6547747468473705e-06, - "loss": 0.8908, - "step": 348 - }, - { - "epoch": 1.7024390243902439, - "grad_norm": 2.9218900203704834, - "learning_rate": 4.652829558130404e-06, - "loss": 0.4383, - "step": 349 - }, - { - "epoch": 1.7073170731707317, - "grad_norm": 2.962965250015259, - "learning_rate": 4.6508793134772265e-06, - "loss": 0.6031, - "step": 350 - }, - { - "epoch": 1.7121951219512195, - "grad_norm": 2.487739324569702, - "learning_rate": 4.648924017468003e-06, - "loss": 0.533, - "step": 351 - }, - { - "epoch": 1.7170731707317073, - "grad_norm": 2.769474506378174, - "learning_rate": 4.646963674694761e-06, - "loss": 0.8125, - "step": 352 - }, - { - "epoch": 1.721951219512195, - "grad_norm": 2.678243398666382, - "learning_rate": 4.64499828976138e-06, - "loss": 0.386, - "step": 353 - }, - { - "epoch": 1.726829268292683, - "grad_norm": 3.2764477729797363, - "learning_rate": 4.64302786728358e-06, - "loss": 0.4792, - "step": 354 - }, - { - "epoch": 1.7317073170731707, - "grad_norm": 2.6092708110809326, - "learning_rate": 4.641052411888913e-06, - "loss": 0.5031, - "step": 355 - }, - { - "epoch": 1.7365853658536585, - "grad_norm": 3.4002952575683594, - "learning_rate": 4.6390719282167515e-06, - "loss": 0.4726, - "step": 356 - }, - { - "epoch": 1.7414634146341463, - "grad_norm": 2.7558157444000244, - "learning_rate": 4.637086420918276e-06, - "loss": 0.7794, - "step": 357 - }, - { - "epoch": 1.7463414634146341, - "grad_norm": 2.239021062850952, - "learning_rate": 4.635095894656465e-06, - "loss": 0.6202, - "step": 358 - }, - { - "epoch": 1.751219512195122, - "grad_norm": 2.0502119064331055, - "learning_rate": 4.633100354106085e-06, - "loss": 0.3743, - "step": 359 - }, - { - "epoch": 1.7560975609756098, - "grad_norm": 2.842203140258789, - "learning_rate": 4.631099803953677e-06, - "loss": 0.8143, - "step": 360 - }, - { - "epoch": 1.7609756097560976, - "grad_norm": 2.8408772945404053, - "learning_rate": 4.629094248897546e-06, - "loss": 0.4986, - "step": 361 - }, - { - "epoch": 1.7658536585365854, - "grad_norm": 2.755530595779419, - "learning_rate": 4.627083693647757e-06, - "loss": 0.5833, - "step": 362 - }, - { - "epoch": 1.7707317073170732, - "grad_norm": 2.717116355895996, - "learning_rate": 4.625068142926111e-06, - "loss": 0.885, - "step": 363 - }, - { - "epoch": 1.775609756097561, - "grad_norm": 2.2784435749053955, - "learning_rate": 4.623047601466144e-06, - "loss": 0.7351, - "step": 364 - }, - { - "epoch": 1.7804878048780488, - "grad_norm": 2.3133914470672607, - "learning_rate": 4.621022074013114e-06, - "loss": 0.6426, - "step": 365 - }, - { - "epoch": 1.7853658536585366, - "grad_norm": 3.13562273979187, - "learning_rate": 4.618991565323987e-06, - "loss": 0.5588, - "step": 366 - }, - { - "epoch": 1.7902439024390244, - "grad_norm": 2.458186388015747, - "learning_rate": 4.616956080167426e-06, - "loss": 0.5424, - "step": 367 - }, - { - "epoch": 1.7951219512195122, - "grad_norm": 2.4780080318450928, - "learning_rate": 4.614915623323786e-06, - "loss": 0.8664, - "step": 368 - }, - { - "epoch": 1.8, - "grad_norm": 2.623966932296753, - "learning_rate": 4.612870199585092e-06, - "loss": 0.4495, - "step": 369 - }, - { - "epoch": 1.8048780487804879, - "grad_norm": 2.7326242923736572, - "learning_rate": 4.610819813755038e-06, - "loss": 0.5099, - "step": 370 - }, - { - "epoch": 1.8097560975609757, - "grad_norm": 2.951014757156372, - "learning_rate": 4.608764470648971e-06, - "loss": 0.4322, - "step": 371 - }, - { - "epoch": 1.8146341463414632, - "grad_norm": 2.869870185852051, - "learning_rate": 4.606704175093879e-06, - "loss": 0.4744, - "step": 372 - }, - { - "epoch": 1.819512195121951, - "grad_norm": 2.686054229736328, - "learning_rate": 4.604638931928383e-06, - "loss": 0.797, - "step": 373 - }, - { - "epoch": 1.8243902439024389, - "grad_norm": 2.6421749591827393, - "learning_rate": 4.602568746002718e-06, - "loss": 0.4904, - "step": 374 - }, - { - "epoch": 1.8292682926829267, - "grad_norm": 2.949144124984741, - "learning_rate": 4.600493622178734e-06, - "loss": 0.8682, - "step": 375 - }, - { - "epoch": 1.8341463414634145, - "grad_norm": 2.554733991622925, - "learning_rate": 4.598413565329876e-06, - "loss": 0.5426, - "step": 376 - }, - { - "epoch": 1.8390243902439023, - "grad_norm": 2.3334367275238037, - "learning_rate": 4.596328580341169e-06, - "loss": 0.5628, - "step": 377 - }, - { - "epoch": 1.84390243902439, - "grad_norm": 2.577664613723755, - "learning_rate": 4.5942386721092195e-06, - "loss": 0.7073, - "step": 378 - }, - { - "epoch": 1.848780487804878, - "grad_norm": 3.1247141361236572, - "learning_rate": 4.592143845542189e-06, - "loss": 0.6526, - "step": 379 - }, - { - "epoch": 1.8536585365853657, - "grad_norm": 2.7015256881713867, - "learning_rate": 4.590044105559797e-06, - "loss": 0.8377, - "step": 380 - }, - { - "epoch": 1.8585365853658535, - "grad_norm": 2.573819398880005, - "learning_rate": 4.587939457093296e-06, - "loss": 0.5485, - "step": 381 - }, - { - "epoch": 1.8634146341463413, - "grad_norm": 2.8607687950134277, - "learning_rate": 4.585829905085468e-06, - "loss": 0.6065, - "step": 382 - }, - { - "epoch": 1.8682926829268292, - "grad_norm": 2.526625394821167, - "learning_rate": 4.5837154544906135e-06, - "loss": 0.7812, - "step": 383 - }, - { - "epoch": 1.873170731707317, - "grad_norm": 2.4161314964294434, - "learning_rate": 4.581596110274535e-06, - "loss": 0.7061, - "step": 384 - }, - { - "epoch": 1.8780487804878048, - "grad_norm": 2.34195876121521, - "learning_rate": 4.579471877414527e-06, - "loss": 0.9446, - "step": 385 - }, - { - "epoch": 1.8829268292682926, - "grad_norm": 3.7710156440734863, - "learning_rate": 4.577342760899368e-06, - "loss": 0.78, - "step": 386 - }, - { - "epoch": 1.8878048780487804, - "grad_norm": 2.5192313194274902, - "learning_rate": 4.575208765729302e-06, - "loss": 0.5205, - "step": 387 - }, - { - "epoch": 1.8926829268292682, - "grad_norm": 2.467484951019287, - "learning_rate": 4.573069896916035e-06, - "loss": 0.7827, - "step": 388 - }, - { - "epoch": 1.897560975609756, - "grad_norm": 2.640676259994507, - "learning_rate": 4.5709261594827125e-06, - "loss": 0.6512, - "step": 389 - }, - { - "epoch": 1.9024390243902438, - "grad_norm": 2.976623296737671, - "learning_rate": 4.568777558463922e-06, - "loss": 0.5548, - "step": 390 - }, - { - "epoch": 1.9073170731707316, - "grad_norm": 2.289722442626953, - "learning_rate": 4.566624098905665e-06, - "loss": 0.7038, - "step": 391 - }, - { - "epoch": 1.9121951219512194, - "grad_norm": 2.9512040615081787, - "learning_rate": 4.564465785865359e-06, - "loss": 0.5416, - "step": 392 - }, - { - "epoch": 1.9170731707317072, - "grad_norm": 2.394874095916748, - "learning_rate": 4.56230262441182e-06, - "loss": 0.4068, - "step": 393 - }, - { - "epoch": 1.921951219512195, - "grad_norm": 6.885486602783203, - "learning_rate": 4.560134619625247e-06, - "loss": 0.6197, - "step": 394 - }, - { - "epoch": 1.9268292682926829, - "grad_norm": 2.311272144317627, - "learning_rate": 4.5579617765972155e-06, - "loss": 0.5692, - "step": 395 - }, - { - "epoch": 1.9317073170731707, - "grad_norm": 2.4662933349609375, - "learning_rate": 4.555784100430662e-06, - "loss": 0.4836, - "step": 396 - }, - { - "epoch": 1.9365853658536585, - "grad_norm": 2.602741241455078, - "learning_rate": 4.553601596239877e-06, - "loss": 0.4594, - "step": 397 - }, - { - "epoch": 1.9414634146341463, - "grad_norm": 3.443909168243408, - "learning_rate": 4.551414269150489e-06, - "loss": 0.6053, - "step": 398 - }, - { - "epoch": 1.946341463414634, - "grad_norm": 2.5391502380371094, - "learning_rate": 4.54922212429945e-06, - "loss": 0.5133, - "step": 399 - }, - { - "epoch": 1.951219512195122, - "grad_norm": 2.7105700969696045, - "learning_rate": 4.547025166835027e-06, - "loss": 0.6984, - "step": 400 - }, - { - "epoch": 1.9560975609756097, - "grad_norm": 2.6098098754882812, - "learning_rate": 4.544823401916794e-06, - "loss": 0.7944, - "step": 401 - }, - { - "epoch": 1.9609756097560975, - "grad_norm": 2.7527425289154053, - "learning_rate": 4.542616834715612e-06, - "loss": 0.639, - "step": 402 - }, - { - "epoch": 1.9658536585365853, - "grad_norm": 2.760303258895874, - "learning_rate": 4.540405470413618e-06, - "loss": 0.4229, - "step": 403 - }, - { - "epoch": 1.9707317073170731, - "grad_norm": 2.4989006519317627, - "learning_rate": 4.53818931420422e-06, - "loss": 0.7482, - "step": 404 - }, - { - "epoch": 1.975609756097561, - "grad_norm": 2.3687169551849365, - "learning_rate": 4.535968371292076e-06, - "loss": 0.6146, - "step": 405 - }, - { - "epoch": 1.9804878048780488, - "grad_norm": 2.4285244941711426, - "learning_rate": 4.533742646893086e-06, - "loss": 0.6964, - "step": 406 - }, - { - "epoch": 1.9853658536585366, - "grad_norm": 2.337266206741333, - "learning_rate": 4.531512146234383e-06, - "loss": 0.6248, - "step": 407 - }, - { - "epoch": 1.9902439024390244, - "grad_norm": 2.704972743988037, - "learning_rate": 4.529276874554312e-06, - "loss": 0.8715, - "step": 408 - }, - { - "epoch": 1.9951219512195122, - "grad_norm": 2.2151944637298584, - "learning_rate": 4.527036837102426e-06, - "loss": 0.4945, - "step": 409 - }, - { - "epoch": 2.0, - "grad_norm": 2.691330671310425, - "learning_rate": 4.524792039139471e-06, - "loss": 0.7085, - "step": 410 - } - ], - "logging_steps": 1, - "max_steps": 2050, - "num_input_tokens_seen": 0, - "num_train_epochs": 10, - "save_steps": 206, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 1.1816307447247667e+17, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/metallama3_8b/limo/checkpoint-615/chat_template.jinja b/metallama3_8b/limo/checkpoint-615/chat_template.jinja deleted file mode 100644 index 39bd0c9f7fe30aea14eda194fee17703da4a4dbf..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-615/chat_template.jinja +++ /dev/null @@ -1,5 +0,0 @@ -{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> - -'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> - -' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo/checkpoint-615/config.json b/metallama3_8b/limo/checkpoint-615/config.json deleted file mode 100644 index ec5612543540085e09eed37e81b17ae51d1a6973..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-615/config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 128000, - "eos_token_id": 128009, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 8192, - "mlp_bias": false, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 500000.0, - "tie_word_embeddings": false, - "torch_dtype": "float32", - "transformers_version": "4.55.0", - "use_cache": false, - "vocab_size": 128256 -} diff --git a/metallama3_8b/limo/checkpoint-615/generation_config.json b/metallama3_8b/limo/checkpoint-615/generation_config.json deleted file mode 100644 index f53ccb516e57388491adda6b9950bcfa872e93ae..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-615/generation_config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 128000, - "eos_token_id": 128009, - "transformers_version": "4.55.0", - "use_cache": false -} diff --git a/metallama3_8b/limo/checkpoint-615/model-00001-of-00007.safetensors b/metallama3_8b/limo/checkpoint-615/model-00001-of-00007.safetensors deleted file mode 100644 index ae71b6b4561464dd5fa4176637c97f469dae412f..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-615/model-00001-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:abf8ce47a84c67095d337e1d358d5c28c38ec4961f18c3023b2668ccb88e46b8 -size 4886466168 diff --git a/metallama3_8b/limo/checkpoint-615/model-00002-of-00007.safetensors b/metallama3_8b/limo/checkpoint-615/model-00002-of-00007.safetensors deleted file mode 100644 index 9668bb6f3f319749a3035ad96228c6b0ac63c1ee..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-615/model-00002-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eed74f859e9ddfa2a21db0a727c32be71e777c78e775f7e864db679e8fae33ed -size 4832007448 diff --git a/metallama3_8b/limo/checkpoint-615/model-00003-of-00007.safetensors b/metallama3_8b/limo/checkpoint-615/model-00003-of-00007.safetensors deleted file mode 100644 index ffa180374a7a815f1ecba9de946db070dab21381..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-615/model-00003-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8fcaa46121b6c58a6e18664f9baf7b4ab392d1a91ac00c0f25981ed97c2a8eb0 -size 4999813112 diff --git a/metallama3_8b/limo/checkpoint-615/model-00004-of-00007.safetensors b/metallama3_8b/limo/checkpoint-615/model-00004-of-00007.safetensors deleted file mode 100644 index b4a83f1d9c59b057ffb529fbf87205f7c3ae344d..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-615/model-00004-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:75a15f9e54c185395adfa34f5a4055841a4788452b52f035b5cb21f51585e81c -size 4999813128 diff --git a/metallama3_8b/limo/checkpoint-615/model-00005-of-00007.safetensors b/metallama3_8b/limo/checkpoint-615/model-00005-of-00007.safetensors deleted file mode 100644 index 3f5fb0ea1541f32d4e7a2988efa709dddb400310..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-615/model-00005-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c6e39f0324ac52e8b9a58a4bbfab36e1d70c23dac114c81d608bcba12ee8c62f -size 4832007496 diff --git a/metallama3_8b/limo/checkpoint-615/model-00006-of-00007.safetensors b/metallama3_8b/limo/checkpoint-615/model-00006-of-00007.safetensors deleted file mode 100644 index b0814ee28a5dc65dfafac7254ff961c9762b4ee5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-615/model-00006-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a5ffa038293adc70f6d24f9f072b110dd54fdce020b59090603b98d5f4d7c9bc -size 4999813120 diff --git a/metallama3_8b/limo/checkpoint-615/model-00007-of-00007.safetensors b/metallama3_8b/limo/checkpoint-615/model-00007-of-00007.safetensors deleted file mode 100644 index c7cd50447f4789a6c04f5e0d176d0a43f1545405..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-615/model-00007-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:797b4ff1d93def9f5b9c0816d8dd99f8218caba233bba66f56c3045fe5115af3 -size 2571158184 diff --git a/metallama3_8b/limo/checkpoint-615/model.safetensors.index.json b/metallama3_8b/limo/checkpoint-615/model.safetensors.index.json deleted file mode 100644 index 30d31d54f352f0c71ad48745af612a088822fa48..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-615/model.safetensors.index.json +++ /dev/null @@ -1,299 +0,0 @@ -{ - "metadata": { - "total_parameters": 2007565312, - "total_size": 32121044992 - }, - "weight_map": { - "lm_head.weight": "model-00007-of-00007.safetensors", - "model.embed_tokens.weight": "model-00001-of-00007.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.norm.weight": "model-00007-of-00007.safetensors" - } -} diff --git a/metallama3_8b/limo/checkpoint-615/rng_state_0.pth b/metallama3_8b/limo/checkpoint-615/rng_state_0.pth deleted file mode 100644 index 5a7c482c30381cd512ccc35fe322d8a34fbf5207..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-615/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:308f94f9a5c24e1bad5c393d56ae7af7782600f4e791d9c6ac35b22fff2105b6 -size 15024 diff --git a/metallama3_8b/limo/checkpoint-615/rng_state_1.pth b/metallama3_8b/limo/checkpoint-615/rng_state_1.pth deleted file mode 100644 index 7b862c21b28bbd89ce6b4fb681d41be05f175599..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-615/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b056f3c23cb32dc77a2ec9e7651e0b64e4440e21f0fdf969b86bfc56a1cbdf06 -size 15024 diff --git a/metallama3_8b/limo/checkpoint-615/rng_state_2.pth b/metallama3_8b/limo/checkpoint-615/rng_state_2.pth deleted file mode 100644 index d86ce886844e0298f058d67065e5eeb27ffe7e48..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-615/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f3f8a05714bc528f4885a2816181652f2303b3e8150f89b56aaee6bec56aa520 -size 15024 diff --git a/metallama3_8b/limo/checkpoint-615/rng_state_3.pth b/metallama3_8b/limo/checkpoint-615/rng_state_3.pth deleted file mode 100644 index 10733f5da657367adf3f67760028644c0839660f..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-615/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4f755bd3c330281961e5c03af9d10ce8c1e1678619d384f6f1fd5fd7dce2ff50 -size 15024 diff --git a/metallama3_8b/limo/checkpoint-615/scheduler.pt b/metallama3_8b/limo/checkpoint-615/scheduler.pt deleted file mode 100644 index cfac68a12ae3c4e6fc3485a272cb00faab2da6fe..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-615/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0798d27e28479e1e73a7711d8c42eea003960930f935328e8e1e4ee0e0f02f04 -size 1064 diff --git a/metallama3_8b/limo/checkpoint-615/special_tokens_map.json b/metallama3_8b/limo/checkpoint-615/special_tokens_map.json deleted file mode 100644 index 14daf4588e61b4e4983af0fccaba4d5500c0977c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-615/special_tokens_map.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "additional_special_tokens": [ - { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } - ], - "bos_token": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": "<|eot_id|>" -} diff --git a/metallama3_8b/limo/checkpoint-615/tokenizer.json b/metallama3_8b/limo/checkpoint-615/tokenizer.json deleted file mode 100644 index 172311123ab62378f1f6d90f3068a676b7d939ed..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-615/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c1dcab308e7cf5970ea38815e0a62887d705c5b436f869ca27a5dcdd40c36a6 -size 17210148 diff --git a/metallama3_8b/limo/checkpoint-615/tokenizer_config.json b/metallama3_8b/limo/checkpoint-615/tokenizer_config.json deleted file mode 100644 index 6739fcd129e717b71b64001dcb25a03c143d66f5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-615/tokenizer_config.json +++ /dev/null @@ -1,2076 +0,0 @@ -{ - "added_tokens_decoder": { - "128000": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128001": { - "content": "<|end_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128002": { - "content": "<|reserved_special_token_0|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128003": { - "content": "<|reserved_special_token_1|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128004": { - "content": "<|reserved_special_token_2|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128005": { - "content": "<|reserved_special_token_3|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128006": { - "content": "<|start_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128007": { - "content": "<|end_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128008": { - "content": "<|reserved_special_token_4|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128009": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128010": { - "content": "<|reserved_special_token_5|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128011": { - "content": "<|reserved_special_token_6|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128012": { - "content": "<|reserved_special_token_7|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128013": { - "content": "<|reserved_special_token_8|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128014": { - "content": "<|reserved_special_token_9|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128015": { - "content": "<|reserved_special_token_10|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128016": { - "content": "<|reserved_special_token_11|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128017": { - "content": "<|reserved_special_token_12|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128018": { - "content": "<|reserved_special_token_13|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128019": { - "content": "<|reserved_special_token_14|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128020": { - "content": "<|reserved_special_token_15|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128021": { - "content": "<|reserved_special_token_16|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128022": { - "content": "<|reserved_special_token_17|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128023": { - "content": "<|reserved_special_token_18|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128024": { - "content": "<|reserved_special_token_19|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128025": { - "content": "<|reserved_special_token_20|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128026": { - "content": "<|reserved_special_token_21|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128027": { - "content": "<|reserved_special_token_22|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128028": { - "content": "<|reserved_special_token_23|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128029": { - "content": "<|reserved_special_token_24|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128030": { - "content": "<|reserved_special_token_25|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128031": { - "content": "<|reserved_special_token_26|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128032": { - "content": "<|reserved_special_token_27|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128033": { - "content": "<|reserved_special_token_28|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128034": { - "content": "<|reserved_special_token_29|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128035": { - "content": "<|reserved_special_token_30|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128036": { - "content": "<|reserved_special_token_31|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128037": { - "content": "<|reserved_special_token_32|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128038": { - "content": "<|reserved_special_token_33|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128039": { - "content": "<|reserved_special_token_34|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128040": { - "content": "<|reserved_special_token_35|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128041": { - "content": "<|reserved_special_token_36|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128042": { - "content": "<|reserved_special_token_37|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128043": { - "content": "<|reserved_special_token_38|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128044": { - "content": "<|reserved_special_token_39|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128045": { - "content": "<|reserved_special_token_40|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128046": { - "content": "<|reserved_special_token_41|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128047": { - "content": "<|reserved_special_token_42|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128048": { - "content": "<|reserved_special_token_43|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128049": { - "content": "<|reserved_special_token_44|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128050": { - "content": "<|reserved_special_token_45|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128051": { - "content": "<|reserved_special_token_46|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128052": { - "content": "<|reserved_special_token_47|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128053": { - "content": "<|reserved_special_token_48|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128054": { - "content": "<|reserved_special_token_49|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128055": { - "content": "<|reserved_special_token_50|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128056": { - "content": "<|reserved_special_token_51|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128057": { - "content": "<|reserved_special_token_52|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128058": { - "content": "<|reserved_special_token_53|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128059": { - "content": "<|reserved_special_token_54|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128060": { - "content": "<|reserved_special_token_55|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128061": { - "content": "<|reserved_special_token_56|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128062": { - "content": "<|reserved_special_token_57|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128063": { - "content": "<|reserved_special_token_58|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128064": { - "content": "<|reserved_special_token_59|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128065": { - "content": "<|reserved_special_token_60|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128066": { - "content": "<|reserved_special_token_61|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128067": { - "content": "<|reserved_special_token_62|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128068": { - "content": "<|reserved_special_token_63|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128069": { - "content": "<|reserved_special_token_64|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128070": { - "content": "<|reserved_special_token_65|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128071": { - "content": "<|reserved_special_token_66|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128072": { - "content": "<|reserved_special_token_67|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128073": { - "content": "<|reserved_special_token_68|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128074": { - "content": "<|reserved_special_token_69|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128075": { - "content": "<|reserved_special_token_70|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128076": { - "content": "<|reserved_special_token_71|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128077": { - "content": "<|reserved_special_token_72|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128078": { - "content": "<|reserved_special_token_73|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128079": { - "content": "<|reserved_special_token_74|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128080": { - "content": "<|reserved_special_token_75|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128081": { - "content": "<|reserved_special_token_76|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128082": { - "content": "<|reserved_special_token_77|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128083": { - "content": "<|reserved_special_token_78|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128084": { - "content": "<|reserved_special_token_79|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128085": { - "content": "<|reserved_special_token_80|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128086": { - "content": "<|reserved_special_token_81|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128087": { - "content": "<|reserved_special_token_82|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128088": { - "content": "<|reserved_special_token_83|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128089": { - "content": "<|reserved_special_token_84|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128090": { - "content": "<|reserved_special_token_85|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128091": { - "content": "<|reserved_special_token_86|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128092": { - "content": "<|reserved_special_token_87|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128093": { - "content": "<|reserved_special_token_88|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128094": { - "content": "<|reserved_special_token_89|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128095": { - "content": "<|reserved_special_token_90|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128096": { - "content": "<|reserved_special_token_91|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128097": { - "content": "<|reserved_special_token_92|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128098": { - "content": "<|reserved_special_token_93|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128099": { - "content": "<|reserved_special_token_94|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128100": { - "content": "<|reserved_special_token_95|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128101": { - "content": "<|reserved_special_token_96|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128102": { - "content": "<|reserved_special_token_97|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128103": { - "content": "<|reserved_special_token_98|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128104": { - "content": "<|reserved_special_token_99|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128105": { - "content": "<|reserved_special_token_100|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128106": { - "content": "<|reserved_special_token_101|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128107": { - "content": "<|reserved_special_token_102|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128108": { - "content": "<|reserved_special_token_103|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128109": { - "content": "<|reserved_special_token_104|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128110": { - "content": "<|reserved_special_token_105|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128111": { - "content": "<|reserved_special_token_106|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128112": { - "content": "<|reserved_special_token_107|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128113": { - "content": "<|reserved_special_token_108|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128114": { - "content": "<|reserved_special_token_109|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128115": { - "content": "<|reserved_special_token_110|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128116": { - "content": "<|reserved_special_token_111|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128117": { - "content": "<|reserved_special_token_112|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128118": { - "content": "<|reserved_special_token_113|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128119": { - "content": "<|reserved_special_token_114|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128120": { - "content": "<|reserved_special_token_115|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128121": { - "content": "<|reserved_special_token_116|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128122": { - "content": "<|reserved_special_token_117|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128123": { - "content": "<|reserved_special_token_118|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128124": { - "content": "<|reserved_special_token_119|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128125": { - "content": "<|reserved_special_token_120|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128126": { - "content": "<|reserved_special_token_121|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128127": { - "content": "<|reserved_special_token_122|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128128": { - "content": "<|reserved_special_token_123|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128129": { - "content": "<|reserved_special_token_124|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128130": { - "content": "<|reserved_special_token_125|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128131": { - "content": "<|reserved_special_token_126|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128132": { - "content": "<|reserved_special_token_127|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128133": { - "content": "<|reserved_special_token_128|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128134": { - "content": "<|reserved_special_token_129|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128135": { - "content": "<|reserved_special_token_130|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128136": { - "content": "<|reserved_special_token_131|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128137": { - "content": "<|reserved_special_token_132|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128138": { - "content": "<|reserved_special_token_133|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128139": { - "content": "<|reserved_special_token_134|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128140": { - "content": "<|reserved_special_token_135|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128141": { - "content": "<|reserved_special_token_136|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128142": { - "content": "<|reserved_special_token_137|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128143": { - "content": "<|reserved_special_token_138|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128144": { - "content": "<|reserved_special_token_139|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128145": { - "content": "<|reserved_special_token_140|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128146": { - "content": "<|reserved_special_token_141|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128147": { - "content": "<|reserved_special_token_142|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128148": { - "content": "<|reserved_special_token_143|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128149": { - "content": "<|reserved_special_token_144|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128150": { - "content": "<|reserved_special_token_145|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128151": { - "content": "<|reserved_special_token_146|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128152": { - "content": "<|reserved_special_token_147|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128153": { - "content": "<|reserved_special_token_148|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128154": { - "content": "<|reserved_special_token_149|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128155": { - "content": "<|reserved_special_token_150|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128156": { - "content": "<|reserved_special_token_151|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128157": { - "content": "<|reserved_special_token_152|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128158": { - "content": "<|reserved_special_token_153|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128159": { - "content": "<|reserved_special_token_154|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128160": { - "content": "<|reserved_special_token_155|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128161": { - "content": "<|reserved_special_token_156|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128162": { - "content": "<|reserved_special_token_157|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128163": { - "content": "<|reserved_special_token_158|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128164": { - "content": "<|reserved_special_token_159|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128165": { - "content": "<|reserved_special_token_160|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128166": { - "content": "<|reserved_special_token_161|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128167": { - "content": "<|reserved_special_token_162|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128168": { - "content": "<|reserved_special_token_163|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128169": { - "content": "<|reserved_special_token_164|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128170": { - "content": "<|reserved_special_token_165|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128171": { - "content": "<|reserved_special_token_166|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128172": { - "content": "<|reserved_special_token_167|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128173": { - "content": "<|reserved_special_token_168|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128174": { - "content": "<|reserved_special_token_169|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128175": { - "content": "<|reserved_special_token_170|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128176": { - "content": "<|reserved_special_token_171|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128177": { - "content": "<|reserved_special_token_172|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128178": { - "content": "<|reserved_special_token_173|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128179": { - "content": "<|reserved_special_token_174|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128180": { - "content": "<|reserved_special_token_175|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128181": { - "content": "<|reserved_special_token_176|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128182": { - "content": "<|reserved_special_token_177|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128183": { - "content": "<|reserved_special_token_178|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128184": { - "content": "<|reserved_special_token_179|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128185": { - "content": "<|reserved_special_token_180|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128186": { - "content": "<|reserved_special_token_181|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128187": { - "content": "<|reserved_special_token_182|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128188": { - "content": "<|reserved_special_token_183|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128189": { - "content": "<|reserved_special_token_184|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128190": { - "content": "<|reserved_special_token_185|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128191": { - "content": "<|reserved_special_token_186|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128192": { - "content": "<|reserved_special_token_187|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128193": { - "content": "<|reserved_special_token_188|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128194": { - "content": "<|reserved_special_token_189|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128195": { - "content": "<|reserved_special_token_190|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128196": { - "content": "<|reserved_special_token_191|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128197": { - "content": "<|reserved_special_token_192|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128198": { - "content": "<|reserved_special_token_193|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128199": { - "content": "<|reserved_special_token_194|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128200": { - "content": "<|reserved_special_token_195|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128201": { - "content": "<|reserved_special_token_196|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128202": { - "content": "<|reserved_special_token_197|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128203": { - "content": "<|reserved_special_token_198|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128204": { - "content": "<|reserved_special_token_199|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128205": { - "content": "<|reserved_special_token_200|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128206": { - "content": "<|reserved_special_token_201|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128207": { - "content": "<|reserved_special_token_202|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128208": { - "content": "<|reserved_special_token_203|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128209": { - "content": "<|reserved_special_token_204|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128210": { - "content": "<|reserved_special_token_205|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128211": { - "content": "<|reserved_special_token_206|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128212": { - "content": "<|reserved_special_token_207|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128213": { - "content": "<|reserved_special_token_208|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128214": { - "content": "<|reserved_special_token_209|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128215": { - "content": "<|reserved_special_token_210|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128216": { - "content": "<|reserved_special_token_211|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128217": { - "content": "<|reserved_special_token_212|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128218": { - "content": "<|reserved_special_token_213|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128219": { - "content": "<|reserved_special_token_214|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128220": { - "content": "<|reserved_special_token_215|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128221": { - "content": "<|reserved_special_token_216|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128222": { - "content": "<|reserved_special_token_217|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128223": { - "content": "<|reserved_special_token_218|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128224": { - "content": "<|reserved_special_token_219|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128225": { - "content": "<|reserved_special_token_220|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128226": { - "content": "<|reserved_special_token_221|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128227": { - "content": "<|reserved_special_token_222|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128228": { - "content": "<|reserved_special_token_223|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128229": { - "content": "<|reserved_special_token_224|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128230": { - "content": "<|reserved_special_token_225|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128231": { - "content": "<|reserved_special_token_226|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128232": { - "content": "<|reserved_special_token_227|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128233": { - "content": "<|reserved_special_token_228|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128234": { - "content": "<|reserved_special_token_229|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128235": { - "content": "<|reserved_special_token_230|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128236": { - "content": "<|reserved_special_token_231|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128237": { - "content": "<|reserved_special_token_232|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128238": { - "content": "<|reserved_special_token_233|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128239": { - "content": "<|reserved_special_token_234|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128240": { - "content": "<|reserved_special_token_235|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128241": { - "content": "<|reserved_special_token_236|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128242": { - "content": "<|reserved_special_token_237|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128243": { - "content": "<|reserved_special_token_238|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128244": { - "content": "<|reserved_special_token_239|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128245": { - "content": "<|reserved_special_token_240|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128246": { - "content": "<|reserved_special_token_241|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128247": { - "content": "<|reserved_special_token_242|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128248": { - "content": "<|reserved_special_token_243|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128249": { - "content": "<|reserved_special_token_244|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128250": { - "content": "<|reserved_special_token_245|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128251": { - "content": "<|reserved_special_token_246|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128252": { - "content": "<|reserved_special_token_247|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128253": { - "content": "<|reserved_special_token_248|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128254": { - "content": "<|reserved_special_token_249|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128255": { - "content": "<|reserved_special_token_250|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128256": { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - } - }, - "additional_special_tokens": [ - "<|eom_id|>" - ], - "bos_token": "<|begin_of_text|>", - "clean_up_tokenization_spaces": true, - "eos_token": "<|eot_id|>", - "extra_special_tokens": {}, - "model_input_names": [ - "input_ids", - "attention_mask" - ], - "model_max_length": 1000000000000000019884624838656, - "pad_token": "<|eot_id|>", - "padding_side": "right", - "split_special_tokens": false, - "tokenizer_class": "PreTrainedTokenizerFast" -} diff --git a/metallama3_8b/limo/checkpoint-615/trainer_state.json b/metallama3_8b/limo/checkpoint-615/trainer_state.json deleted file mode 100644 index 4c8b05424744f0e0b62402235b5fbfd519961225..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-615/trainer_state.json +++ /dev/null @@ -1,4339 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 3.0, - "eval_steps": 500, - "global_step": 615, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.004878048780487805, - "grad_norm": 27.79998016357422, - "learning_rate": 5e-06, - "loss": 1.4179, - "step": 1 - }, - { - "epoch": 0.00975609756097561, - "grad_norm": 4.086409091949463, - "learning_rate": 4.999997064365715e-06, - "loss": 1.1405, - "step": 2 - }, - { - "epoch": 0.014634146341463415, - "grad_norm": 4.499151229858398, - "learning_rate": 4.999988257469751e-06, - "loss": 0.8682, - "step": 3 - }, - { - "epoch": 0.01951219512195122, - "grad_norm": 4.555822849273682, - "learning_rate": 4.999973579332793e-06, - "loss": 0.9961, - "step": 4 - }, - { - "epoch": 0.024390243902439025, - "grad_norm": 5.6235246658325195, - "learning_rate": 4.999953029989312e-06, - "loss": 1.0173, - "step": 5 - }, - { - "epoch": 0.02926829268292683, - "grad_norm": 3.9943182468414307, - "learning_rate": 4.999926609487568e-06, - "loss": 1.1083, - "step": 6 - }, - { - "epoch": 0.03414634146341464, - "grad_norm": 5.685941219329834, - "learning_rate": 4.9998943178896106e-06, - "loss": 1.1109, - "step": 7 - }, - { - "epoch": 0.03902439024390244, - "grad_norm": 15.914257049560547, - "learning_rate": 4.999856155271276e-06, - "loss": 1.821, - "step": 8 - }, - { - "epoch": 0.04390243902439024, - "grad_norm": 4.147185325622559, - "learning_rate": 4.999812121722191e-06, - "loss": 1.0417, - "step": 9 - }, - { - "epoch": 0.04878048780487805, - "grad_norm": 11.123332977294922, - "learning_rate": 4.999762217345766e-06, - "loss": 1.5672, - "step": 10 - }, - { - "epoch": 0.05365853658536585, - "grad_norm": 2.842331886291504, - "learning_rate": 4.999706442259205e-06, - "loss": 0.7297, - "step": 11 - }, - { - "epoch": 0.05853658536585366, - "grad_norm": 37.685062408447266, - "learning_rate": 4.999644796593492e-06, - "loss": 0.9112, - "step": 12 - }, - { - "epoch": 0.06341463414634146, - "grad_norm": 11.214252471923828, - "learning_rate": 4.999577280493407e-06, - "loss": 0.7854, - "step": 13 - }, - { - "epoch": 0.06829268292682927, - "grad_norm": 5.10387659072876, - "learning_rate": 4.99950389411751e-06, - "loss": 1.1317, - "step": 14 - }, - { - "epoch": 0.07317073170731707, - "grad_norm": 3.685403347015381, - "learning_rate": 4.999424637638148e-06, - "loss": 0.7864, - "step": 15 - }, - { - "epoch": 0.07804878048780488, - "grad_norm": 2.9567184448242188, - "learning_rate": 4.999339511241458e-06, - "loss": 0.8494, - "step": 16 - }, - { - "epoch": 0.08292682926829269, - "grad_norm": 11.396956443786621, - "learning_rate": 4.9992485151273584e-06, - "loss": 1.2189, - "step": 17 - }, - { - "epoch": 0.08780487804878048, - "grad_norm": 7.007385730743408, - "learning_rate": 4.999151649509554e-06, - "loss": 1.0532, - "step": 18 - }, - { - "epoch": 0.09268292682926829, - "grad_norm": 3.4347329139709473, - "learning_rate": 4.9990489146155356e-06, - "loss": 1.088, - "step": 19 - }, - { - "epoch": 0.0975609756097561, - "grad_norm": 3.1865031719207764, - "learning_rate": 4.9989403106865765e-06, - "loss": 1.0414, - "step": 20 - }, - { - "epoch": 0.1024390243902439, - "grad_norm": 3.4605791568756104, - "learning_rate": 4.9988258379777334e-06, - "loss": 0.8878, - "step": 21 - }, - { - "epoch": 0.1073170731707317, - "grad_norm": 2.860478639602661, - "learning_rate": 4.998705496757846e-06, - "loss": 0.9151, - "step": 22 - }, - { - "epoch": 0.11219512195121951, - "grad_norm": 9.101946830749512, - "learning_rate": 4.998579287309538e-06, - "loss": 1.4304, - "step": 23 - }, - { - "epoch": 0.11707317073170732, - "grad_norm": 24.21122169494629, - "learning_rate": 4.998447209929211e-06, - "loss": 1.0858, - "step": 24 - }, - { - "epoch": 0.12195121951219512, - "grad_norm": 3.286980152130127, - "learning_rate": 4.998309264927053e-06, - "loss": 0.6571, - "step": 25 - }, - { - "epoch": 0.12682926829268293, - "grad_norm": 4.0232062339782715, - "learning_rate": 4.998165452627025e-06, - "loss": 0.8493, - "step": 26 - }, - { - "epoch": 0.13170731707317074, - "grad_norm": 3.7688663005828857, - "learning_rate": 4.998015773366874e-06, - "loss": 0.9224, - "step": 27 - }, - { - "epoch": 0.13658536585365855, - "grad_norm": 2.9382026195526123, - "learning_rate": 4.997860227498122e-06, - "loss": 0.7588, - "step": 28 - }, - { - "epoch": 0.14146341463414633, - "grad_norm": 4.327457904815674, - "learning_rate": 4.99769881538607e-06, - "loss": 1.1817, - "step": 29 - }, - { - "epoch": 0.14634146341463414, - "grad_norm": 3.47487735748291, - "learning_rate": 4.997531537409794e-06, - "loss": 1.0737, - "step": 30 - }, - { - "epoch": 0.15121951219512195, - "grad_norm": 3.0616214275360107, - "learning_rate": 4.99735839396215e-06, - "loss": 0.7899, - "step": 31 - }, - { - "epoch": 0.15609756097560976, - "grad_norm": 3.065070152282715, - "learning_rate": 4.9971793854497655e-06, - "loss": 0.7745, - "step": 32 - }, - { - "epoch": 0.16097560975609757, - "grad_norm": 3.5202279090881348, - "learning_rate": 4.996994512293042e-06, - "loss": 0.984, - "step": 33 - }, - { - "epoch": 0.16585365853658537, - "grad_norm": 3.421769142150879, - "learning_rate": 4.996803774926157e-06, - "loss": 0.8235, - "step": 34 - }, - { - "epoch": 0.17073170731707318, - "grad_norm": 4.6582207679748535, - "learning_rate": 4.996607173797059e-06, - "loss": 1.3227, - "step": 35 - }, - { - "epoch": 0.17560975609756097, - "grad_norm": 2.9829282760620117, - "learning_rate": 4.996404709367466e-06, - "loss": 0.8854, - "step": 36 - }, - { - "epoch": 0.18048780487804877, - "grad_norm": 2.5982632637023926, - "learning_rate": 4.996196382112868e-06, - "loss": 0.6786, - "step": 37 - }, - { - "epoch": 0.18536585365853658, - "grad_norm": 2.9807393550872803, - "learning_rate": 4.9959821925225235e-06, - "loss": 0.9344, - "step": 38 - }, - { - "epoch": 0.1902439024390244, - "grad_norm": 2.7364351749420166, - "learning_rate": 4.995762141099456e-06, - "loss": 0.814, - "step": 39 - }, - { - "epoch": 0.1951219512195122, - "grad_norm": 3.4324638843536377, - "learning_rate": 4.995536228360461e-06, - "loss": 1.0276, - "step": 40 - }, - { - "epoch": 0.2, - "grad_norm": 2.911834716796875, - "learning_rate": 4.995304454836095e-06, - "loss": 0.9291, - "step": 41 - }, - { - "epoch": 0.2048780487804878, - "grad_norm": 3.0294723510742188, - "learning_rate": 4.9950668210706795e-06, - "loss": 0.8145, - "step": 42 - }, - { - "epoch": 0.2097560975609756, - "grad_norm": 4.681829452514648, - "learning_rate": 4.994823327622299e-06, - "loss": 0.8779, - "step": 43 - }, - { - "epoch": 0.2146341463414634, - "grad_norm": 3.643914222717285, - "learning_rate": 4.9945739750628e-06, - "loss": 0.8196, - "step": 44 - }, - { - "epoch": 0.21951219512195122, - "grad_norm": 2.7542076110839844, - "learning_rate": 4.994318763977789e-06, - "loss": 0.8443, - "step": 45 - }, - { - "epoch": 0.22439024390243903, - "grad_norm": 6.873605728149414, - "learning_rate": 4.994057694966632e-06, - "loss": 1.0328, - "step": 46 - }, - { - "epoch": 0.22926829268292684, - "grad_norm": 3.11810040473938, - "learning_rate": 4.993790768642449e-06, - "loss": 1.0673, - "step": 47 - }, - { - "epoch": 0.23414634146341465, - "grad_norm": 4.360548496246338, - "learning_rate": 4.99351798563212e-06, - "loss": 1.3198, - "step": 48 - }, - { - "epoch": 0.23902439024390243, - "grad_norm": 2.6894314289093018, - "learning_rate": 4.993239346576278e-06, - "loss": 0.8743, - "step": 49 - }, - { - "epoch": 0.24390243902439024, - "grad_norm": 3.2640421390533447, - "learning_rate": 4.99295485212931e-06, - "loss": 1.109, - "step": 50 - }, - { - "epoch": 0.24878048780487805, - "grad_norm": 3.1565866470336914, - "learning_rate": 4.992664502959351e-06, - "loss": 0.9291, - "step": 51 - }, - { - "epoch": 0.25365853658536586, - "grad_norm": 3.4829447269439697, - "learning_rate": 4.99236829974829e-06, - "loss": 0.8159, - "step": 52 - }, - { - "epoch": 0.25853658536585367, - "grad_norm": 2.7535626888275146, - "learning_rate": 4.992066243191762e-06, - "loss": 1.0359, - "step": 53 - }, - { - "epoch": 0.2634146341463415, - "grad_norm": 2.482935905456543, - "learning_rate": 4.991758333999148e-06, - "loss": 0.8091, - "step": 54 - }, - { - "epoch": 0.2682926829268293, - "grad_norm": 2.917445659637451, - "learning_rate": 4.991444572893575e-06, - "loss": 0.6925, - "step": 55 - }, - { - "epoch": 0.2731707317073171, - "grad_norm": 2.9802236557006836, - "learning_rate": 4.991124960611916e-06, - "loss": 0.6329, - "step": 56 - }, - { - "epoch": 0.2780487804878049, - "grad_norm": 2.9677224159240723, - "learning_rate": 4.99079949790478e-06, - "loss": 0.8069, - "step": 57 - }, - { - "epoch": 0.28292682926829266, - "grad_norm": 2.8304293155670166, - "learning_rate": 4.99046818553652e-06, - "loss": 0.8682, - "step": 58 - }, - { - "epoch": 0.28780487804878047, - "grad_norm": 5.253443717956543, - "learning_rate": 4.9901310242852246e-06, - "loss": 1.1069, - "step": 59 - }, - { - "epoch": 0.2926829268292683, - "grad_norm": 3.686016082763672, - "learning_rate": 4.9897880149427206e-06, - "loss": 0.9465, - "step": 60 - }, - { - "epoch": 0.2975609756097561, - "grad_norm": 3.6372263431549072, - "learning_rate": 4.989439158314566e-06, - "loss": 0.9738, - "step": 61 - }, - { - "epoch": 0.3024390243902439, - "grad_norm": 3.0756819248199463, - "learning_rate": 4.989084455220056e-06, - "loss": 0.6417, - "step": 62 - }, - { - "epoch": 0.3073170731707317, - "grad_norm": 3.379222869873047, - "learning_rate": 4.988723906492212e-06, - "loss": 1.0092, - "step": 63 - }, - { - "epoch": 0.3121951219512195, - "grad_norm": 3.4571032524108887, - "learning_rate": 4.988357512977785e-06, - "loss": 0.6691, - "step": 64 - }, - { - "epoch": 0.3170731707317073, - "grad_norm": 3.1982104778289795, - "learning_rate": 4.987985275537252e-06, - "loss": 0.6651, - "step": 65 - }, - { - "epoch": 0.32195121951219513, - "grad_norm": 2.9723124504089355, - "learning_rate": 4.9876071950448185e-06, - "loss": 0.9227, - "step": 66 - }, - { - "epoch": 0.32682926829268294, - "grad_norm": 2.5521399974823, - "learning_rate": 4.987223272388407e-06, - "loss": 0.6664, - "step": 67 - }, - { - "epoch": 0.33170731707317075, - "grad_norm": 2.8934121131896973, - "learning_rate": 4.986833508469663e-06, - "loss": 0.997, - "step": 68 - }, - { - "epoch": 0.33658536585365856, - "grad_norm": 4.7546586990356445, - "learning_rate": 4.98643790420395e-06, - "loss": 0.8551, - "step": 69 - }, - { - "epoch": 0.34146341463414637, - "grad_norm": 3.091616153717041, - "learning_rate": 4.986036460520348e-06, - "loss": 0.8874, - "step": 70 - }, - { - "epoch": 0.3463414634146341, - "grad_norm": 4.1724677085876465, - "learning_rate": 4.98562917836165e-06, - "loss": 1.1393, - "step": 71 - }, - { - "epoch": 0.35121951219512193, - "grad_norm": 2.6568572521209717, - "learning_rate": 4.985216058684362e-06, - "loss": 0.6379, - "step": 72 - }, - { - "epoch": 0.35609756097560974, - "grad_norm": 2.396416187286377, - "learning_rate": 4.984797102458697e-06, - "loss": 1.0292, - "step": 73 - }, - { - "epoch": 0.36097560975609755, - "grad_norm": 3.0667319297790527, - "learning_rate": 4.984372310668579e-06, - "loss": 0.7048, - "step": 74 - }, - { - "epoch": 0.36585365853658536, - "grad_norm": 2.4820518493652344, - "learning_rate": 4.983941684311633e-06, - "loss": 1.2353, - "step": 75 - }, - { - "epoch": 0.37073170731707317, - "grad_norm": 4.062836647033691, - "learning_rate": 4.983505224399188e-06, - "loss": 0.8933, - "step": 76 - }, - { - "epoch": 0.375609756097561, - "grad_norm": 2.4480767250061035, - "learning_rate": 4.983062931956275e-06, - "loss": 0.8221, - "step": 77 - }, - { - "epoch": 0.3804878048780488, - "grad_norm": 3.134138822555542, - "learning_rate": 4.9826148080216195e-06, - "loss": 0.8899, - "step": 78 - }, - { - "epoch": 0.3853658536585366, - "grad_norm": 2.8165836334228516, - "learning_rate": 4.9821608536476445e-06, - "loss": 1.2451, - "step": 79 - }, - { - "epoch": 0.3902439024390244, - "grad_norm": 3.734433650970459, - "learning_rate": 4.981701069900465e-06, - "loss": 0.8536, - "step": 80 - }, - { - "epoch": 0.3951219512195122, - "grad_norm": 2.853421449661255, - "learning_rate": 4.9812354578598876e-06, - "loss": 0.7857, - "step": 81 - }, - { - "epoch": 0.4, - "grad_norm": 2.541687250137329, - "learning_rate": 4.980764018619405e-06, - "loss": 0.8332, - "step": 82 - }, - { - "epoch": 0.40487804878048783, - "grad_norm": 4.405911445617676, - "learning_rate": 4.980286753286196e-06, - "loss": 0.9927, - "step": 83 - }, - { - "epoch": 0.4097560975609756, - "grad_norm": 3.3034985065460205, - "learning_rate": 4.97980366298112e-06, - "loss": 0.8161, - "step": 84 - }, - { - "epoch": 0.4146341463414634, - "grad_norm": 2.6678085327148438, - "learning_rate": 4.97931474883872e-06, - "loss": 0.8017, - "step": 85 - }, - { - "epoch": 0.4195121951219512, - "grad_norm": 2.58524227142334, - "learning_rate": 4.978820012007213e-06, - "loss": 0.8811, - "step": 86 - }, - { - "epoch": 0.424390243902439, - "grad_norm": 2.482597827911377, - "learning_rate": 4.978319453648495e-06, - "loss": 0.9461, - "step": 87 - }, - { - "epoch": 0.4292682926829268, - "grad_norm": 2.5731301307678223, - "learning_rate": 4.977813074938128e-06, - "loss": 0.8835, - "step": 88 - }, - { - "epoch": 0.43414634146341463, - "grad_norm": 2.7914488315582275, - "learning_rate": 4.977300877065347e-06, - "loss": 0.8466, - "step": 89 - }, - { - "epoch": 0.43902439024390244, - "grad_norm": 2.416043758392334, - "learning_rate": 4.976782861233053e-06, - "loss": 0.7132, - "step": 90 - }, - { - "epoch": 0.44390243902439025, - "grad_norm": 3.7616264820098877, - "learning_rate": 4.976259028657812e-06, - "loss": 0.7639, - "step": 91 - }, - { - "epoch": 0.44878048780487806, - "grad_norm": 2.6081621646881104, - "learning_rate": 4.975729380569845e-06, - "loss": 0.8055, - "step": 92 - }, - { - "epoch": 0.45365853658536587, - "grad_norm": 3.3343570232391357, - "learning_rate": 4.975193918213035e-06, - "loss": 0.6042, - "step": 93 - }, - { - "epoch": 0.4585365853658537, - "grad_norm": 2.517544746398926, - "learning_rate": 4.974652642844921e-06, - "loss": 0.7672, - "step": 94 - }, - { - "epoch": 0.4634146341463415, - "grad_norm": 4.173468589782715, - "learning_rate": 4.974105555736693e-06, - "loss": 1.0682, - "step": 95 - }, - { - "epoch": 0.4682926829268293, - "grad_norm": 2.8422317504882812, - "learning_rate": 4.973552658173186e-06, - "loss": 0.7841, - "step": 96 - }, - { - "epoch": 0.47317073170731705, - "grad_norm": 5.042182445526123, - "learning_rate": 4.972993951452887e-06, - "loss": 0.8851, - "step": 97 - }, - { - "epoch": 0.47804878048780486, - "grad_norm": 5.977590560913086, - "learning_rate": 4.9724294368879214e-06, - "loss": 0.9059, - "step": 98 - }, - { - "epoch": 0.48292682926829267, - "grad_norm": 4.227641582489014, - "learning_rate": 4.971859115804055e-06, - "loss": 1.0152, - "step": 99 - }, - { - "epoch": 0.4878048780487805, - "grad_norm": 3.180952548980713, - "learning_rate": 4.9712829895406935e-06, - "loss": 0.8092, - "step": 100 - }, - { - "epoch": 0.4926829268292683, - "grad_norm": 11.220394134521484, - "learning_rate": 4.970701059450872e-06, - "loss": 0.8239, - "step": 101 - }, - { - "epoch": 0.4975609756097561, - "grad_norm": 2.346975088119507, - "learning_rate": 4.970113326901258e-06, - "loss": 0.9283, - "step": 102 - }, - { - "epoch": 0.5024390243902439, - "grad_norm": 2.9470982551574707, - "learning_rate": 4.9695197932721455e-06, - "loss": 0.9429, - "step": 103 - }, - { - "epoch": 0.5073170731707317, - "grad_norm": 3.6048219203948975, - "learning_rate": 4.968920459957453e-06, - "loss": 0.9231, - "step": 104 - }, - { - "epoch": 0.5121951219512195, - "grad_norm": 2.8181886672973633, - "learning_rate": 4.968315328364719e-06, - "loss": 1.0005, - "step": 105 - }, - { - "epoch": 0.5170731707317073, - "grad_norm": 3.114147424697876, - "learning_rate": 4.9677043999151e-06, - "loss": 1.1326, - "step": 106 - }, - { - "epoch": 0.5219512195121951, - "grad_norm": 2.965885639190674, - "learning_rate": 4.967087676043366e-06, - "loss": 0.541, - "step": 107 - }, - { - "epoch": 0.526829268292683, - "grad_norm": 3.098677635192871, - "learning_rate": 4.966465158197897e-06, - "loss": 0.9473, - "step": 108 - }, - { - "epoch": 0.5317073170731708, - "grad_norm": 2.8640191555023193, - "learning_rate": 4.965836847840681e-06, - "loss": 0.6678, - "step": 109 - }, - { - "epoch": 0.5365853658536586, - "grad_norm": 3.0950934886932373, - "learning_rate": 4.96520274644731e-06, - "loss": 0.9251, - "step": 110 - }, - { - "epoch": 0.5414634146341464, - "grad_norm": 2.99444317817688, - "learning_rate": 4.964562855506976e-06, - "loss": 0.7807, - "step": 111 - }, - { - "epoch": 0.5463414634146342, - "grad_norm": 2.348639726638794, - "learning_rate": 4.963917176522466e-06, - "loss": 0.6395, - "step": 112 - }, - { - "epoch": 0.551219512195122, - "grad_norm": 3.5988354682922363, - "learning_rate": 4.963265711010164e-06, - "loss": 1.0658, - "step": 113 - }, - { - "epoch": 0.5560975609756098, - "grad_norm": 3.3423564434051514, - "learning_rate": 4.9626084605000395e-06, - "loss": 0.8974, - "step": 114 - }, - { - "epoch": 0.5609756097560976, - "grad_norm": 2.8353331089019775, - "learning_rate": 4.961945426535652e-06, - "loss": 0.6144, - "step": 115 - }, - { - "epoch": 0.5658536585365853, - "grad_norm": 2.752387046813965, - "learning_rate": 4.961276610674141e-06, - "loss": 0.9083, - "step": 116 - }, - { - "epoch": 0.5707317073170731, - "grad_norm": 2.2654404640197754, - "learning_rate": 4.960602014486225e-06, - "loss": 1.0101, - "step": 117 - }, - { - "epoch": 0.5756097560975609, - "grad_norm": 3.344377040863037, - "learning_rate": 4.959921639556199e-06, - "loss": 0.8391, - "step": 118 - }, - { - "epoch": 0.5804878048780487, - "grad_norm": 3.1620500087738037, - "learning_rate": 4.959235487481928e-06, - "loss": 1.0431, - "step": 119 - }, - { - "epoch": 0.5853658536585366, - "grad_norm": 2.857048273086548, - "learning_rate": 4.958543559874846e-06, - "loss": 0.5864, - "step": 120 - }, - { - "epoch": 0.5902439024390244, - "grad_norm": 3.1736063957214355, - "learning_rate": 4.9578458583599495e-06, - "loss": 0.7868, - "step": 121 - }, - { - "epoch": 0.5951219512195122, - "grad_norm": 3.5520827770233154, - "learning_rate": 4.957142384575795e-06, - "loss": 0.7901, - "step": 122 - }, - { - "epoch": 0.6, - "grad_norm": 3.265103578567505, - "learning_rate": 4.956433140174498e-06, - "loss": 0.9067, - "step": 123 - }, - { - "epoch": 0.6048780487804878, - "grad_norm": 3.1181187629699707, - "learning_rate": 4.9557181268217225e-06, - "loss": 0.8971, - "step": 124 - }, - { - "epoch": 0.6097560975609756, - "grad_norm": 2.4123694896698, - "learning_rate": 4.954997346196683e-06, - "loss": 1.2123, - "step": 125 - }, - { - "epoch": 0.6146341463414634, - "grad_norm": 2.9646875858306885, - "learning_rate": 4.954270799992138e-06, - "loss": 0.7696, - "step": 126 - }, - { - "epoch": 0.6195121951219512, - "grad_norm": 2.7457995414733887, - "learning_rate": 4.953538489914387e-06, - "loss": 0.7919, - "step": 127 - }, - { - "epoch": 0.624390243902439, - "grad_norm": 5.096850395202637, - "learning_rate": 4.9528004176832654e-06, - "loss": 0.6494, - "step": 128 - }, - { - "epoch": 0.6292682926829268, - "grad_norm": 3.124955177307129, - "learning_rate": 4.952056585032142e-06, - "loss": 1.0546, - "step": 129 - }, - { - "epoch": 0.6341463414634146, - "grad_norm": 2.4860167503356934, - "learning_rate": 4.951306993707913e-06, - "loss": 0.7907, - "step": 130 - }, - { - "epoch": 0.6390243902439025, - "grad_norm": 2.3380239009857178, - "learning_rate": 4.950551645470998e-06, - "loss": 0.7433, - "step": 131 - }, - { - "epoch": 0.6439024390243903, - "grad_norm": 2.8945236206054688, - "learning_rate": 4.9497905420953406e-06, - "loss": 0.7682, - "step": 132 - }, - { - "epoch": 0.6487804878048781, - "grad_norm": 3.429776430130005, - "learning_rate": 4.949023685368395e-06, - "loss": 0.8411, - "step": 133 - }, - { - "epoch": 0.6536585365853659, - "grad_norm": 2.8853516578674316, - "learning_rate": 4.948251077091131e-06, - "loss": 1.0792, - "step": 134 - }, - { - "epoch": 0.6585365853658537, - "grad_norm": 2.145598888397217, - "learning_rate": 4.947472719078025e-06, - "loss": 0.8033, - "step": 135 - }, - { - "epoch": 0.6634146341463415, - "grad_norm": 2.5064377784729004, - "learning_rate": 4.9466886131570565e-06, - "loss": 0.939, - "step": 136 - }, - { - "epoch": 0.6682926829268293, - "grad_norm": 2.5700225830078125, - "learning_rate": 4.945898761169704e-06, - "loss": 1.0418, - "step": 137 - }, - { - "epoch": 0.6731707317073171, - "grad_norm": 2.3390917778015137, - "learning_rate": 4.945103164970941e-06, - "loss": 0.6158, - "step": 138 - }, - { - "epoch": 0.6780487804878049, - "grad_norm": 2.1538751125335693, - "learning_rate": 4.9443018264292304e-06, - "loss": 0.6995, - "step": 139 - }, - { - "epoch": 0.6829268292682927, - "grad_norm": 5.255710601806641, - "learning_rate": 4.9434947474265225e-06, - "loss": 1.0382, - "step": 140 - }, - { - "epoch": 0.6878048780487804, - "grad_norm": 2.5547356605529785, - "learning_rate": 4.942681929858249e-06, - "loss": 1.037, - "step": 141 - }, - { - "epoch": 0.6926829268292682, - "grad_norm": 2.613280773162842, - "learning_rate": 4.941863375633315e-06, - "loss": 0.9071, - "step": 142 - }, - { - "epoch": 0.697560975609756, - "grad_norm": 2.9957327842712402, - "learning_rate": 4.9410390866741056e-06, - "loss": 0.7908, - "step": 143 - }, - { - "epoch": 0.7024390243902439, - "grad_norm": 2.410107374191284, - "learning_rate": 4.9402090649164655e-06, - "loss": 0.7739, - "step": 144 - }, - { - "epoch": 0.7073170731707317, - "grad_norm": 2.352013349533081, - "learning_rate": 4.9393733123097085e-06, - "loss": 0.939, - "step": 145 - }, - { - "epoch": 0.7121951219512195, - "grad_norm": 2.5164194107055664, - "learning_rate": 4.9385318308166065e-06, - "loss": 0.8729, - "step": 146 - }, - { - "epoch": 0.7170731707317073, - "grad_norm": 4.213881015777588, - "learning_rate": 4.937684622413385e-06, - "loss": 0.6124, - "step": 147 - }, - { - "epoch": 0.7219512195121951, - "grad_norm": 2.7950191497802734, - "learning_rate": 4.9368316890897185e-06, - "loss": 0.975, - "step": 148 - }, - { - "epoch": 0.7268292682926829, - "grad_norm": 2.8618874549865723, - "learning_rate": 4.9359730328487264e-06, - "loss": 0.5832, - "step": 149 - }, - { - "epoch": 0.7317073170731707, - "grad_norm": 2.6943812370300293, - "learning_rate": 4.935108655706972e-06, - "loss": 0.8124, - "step": 150 - }, - { - "epoch": 0.7365853658536585, - "grad_norm": 3.2164082527160645, - "learning_rate": 4.934238559694448e-06, - "loss": 1.1446, - "step": 151 - }, - { - "epoch": 0.7414634146341463, - "grad_norm": 3.05002498626709, - "learning_rate": 4.9333627468545845e-06, - "loss": 0.7884, - "step": 152 - }, - { - "epoch": 0.7463414634146341, - "grad_norm": 2.863351583480835, - "learning_rate": 4.932481219244231e-06, - "loss": 0.7918, - "step": 153 - }, - { - "epoch": 0.751219512195122, - "grad_norm": 2.4947102069854736, - "learning_rate": 4.931593978933666e-06, - "loss": 0.775, - "step": 154 - }, - { - "epoch": 0.7560975609756098, - "grad_norm": 2.918886184692383, - "learning_rate": 4.930701028006577e-06, - "loss": 0.993, - "step": 155 - }, - { - "epoch": 0.7609756097560976, - "grad_norm": 2.835956573486328, - "learning_rate": 4.929802368560066e-06, - "loss": 0.7911, - "step": 156 - }, - { - "epoch": 0.7658536585365854, - "grad_norm": 3.3073575496673584, - "learning_rate": 4.928898002704642e-06, - "loss": 0.9346, - "step": 157 - }, - { - "epoch": 0.7707317073170732, - "grad_norm": 3.086146354675293, - "learning_rate": 4.927987932564215e-06, - "loss": 0.817, - "step": 158 - }, - { - "epoch": 0.775609756097561, - "grad_norm": 2.5419743061065674, - "learning_rate": 4.927072160276092e-06, - "loss": 0.7918, - "step": 159 - }, - { - "epoch": 0.7804878048780488, - "grad_norm": 3.984297275543213, - "learning_rate": 4.926150687990969e-06, - "loss": 0.7153, - "step": 160 - }, - { - "epoch": 0.7853658536585366, - "grad_norm": 2.4703335762023926, - "learning_rate": 4.925223517872934e-06, - "loss": 0.8982, - "step": 161 - }, - { - "epoch": 0.7902439024390244, - "grad_norm": 2.81785249710083, - "learning_rate": 4.9242906520994484e-06, - "loss": 0.9839, - "step": 162 - }, - { - "epoch": 0.7951219512195122, - "grad_norm": 2.3304924964904785, - "learning_rate": 4.923352092861358e-06, - "loss": 0.8406, - "step": 163 - }, - { - "epoch": 0.8, - "grad_norm": 2.339498519897461, - "learning_rate": 4.922407842362875e-06, - "loss": 0.6602, - "step": 164 - }, - { - "epoch": 0.8048780487804879, - "grad_norm": 3.488255262374878, - "learning_rate": 4.921457902821578e-06, - "loss": 0.9779, - "step": 165 - }, - { - "epoch": 0.8097560975609757, - "grad_norm": 2.8528945446014404, - "learning_rate": 4.920502276468408e-06, - "loss": 0.8821, - "step": 166 - }, - { - "epoch": 0.8146341463414634, - "grad_norm": 3.4649784564971924, - "learning_rate": 4.9195409655476605e-06, - "loss": 0.7539, - "step": 167 - }, - { - "epoch": 0.8195121951219512, - "grad_norm": 2.3109042644500732, - "learning_rate": 4.918573972316982e-06, - "loss": 0.9807, - "step": 168 - }, - { - "epoch": 0.824390243902439, - "grad_norm": 2.678666353225708, - "learning_rate": 4.917601299047361e-06, - "loss": 0.8318, - "step": 169 - }, - { - "epoch": 0.8292682926829268, - "grad_norm": 2.730614185333252, - "learning_rate": 4.916622948023129e-06, - "loss": 0.7816, - "step": 170 - }, - { - "epoch": 0.8341463414634146, - "grad_norm": 2.9835665225982666, - "learning_rate": 4.915638921541952e-06, - "loss": 0.6633, - "step": 171 - }, - { - "epoch": 0.8390243902439024, - "grad_norm": 3.31217360496521, - "learning_rate": 4.914649221914822e-06, - "loss": 0.9296, - "step": 172 - }, - { - "epoch": 0.8439024390243902, - "grad_norm": 2.9021658897399902, - "learning_rate": 4.913653851466057e-06, - "loss": 0.6864, - "step": 173 - }, - { - "epoch": 0.848780487804878, - "grad_norm": 3.3672914505004883, - "learning_rate": 4.912652812533291e-06, - "loss": 0.8599, - "step": 174 - }, - { - "epoch": 0.8536585365853658, - "grad_norm": 2.4871644973754883, - "learning_rate": 4.911646107467472e-06, - "loss": 0.8949, - "step": 175 - }, - { - "epoch": 0.8585365853658536, - "grad_norm": 2.728022813796997, - "learning_rate": 4.9106337386328524e-06, - "loss": 0.9758, - "step": 176 - }, - { - "epoch": 0.8634146341463415, - "grad_norm": 2.704252243041992, - "learning_rate": 4.909615708406991e-06, - "loss": 0.8954, - "step": 177 - }, - { - "epoch": 0.8682926829268293, - "grad_norm": 2.4002223014831543, - "learning_rate": 4.908592019180738e-06, - "loss": 0.7157, - "step": 178 - }, - { - "epoch": 0.8731707317073171, - "grad_norm": 2.1927788257598877, - "learning_rate": 4.907562673358234e-06, - "loss": 0.6358, - "step": 179 - }, - { - "epoch": 0.8780487804878049, - "grad_norm": 2.458500623703003, - "learning_rate": 4.906527673356907e-06, - "loss": 0.6685, - "step": 180 - }, - { - "epoch": 0.8829268292682927, - "grad_norm": 2.5924787521362305, - "learning_rate": 4.905487021607462e-06, - "loss": 0.5686, - "step": 181 - }, - { - "epoch": 0.8878048780487805, - "grad_norm": 3.0923380851745605, - "learning_rate": 4.904440720553876e-06, - "loss": 0.8538, - "step": 182 - }, - { - "epoch": 0.8926829268292683, - "grad_norm": 2.8001527786254883, - "learning_rate": 4.903388772653396e-06, - "loss": 0.8292, - "step": 183 - }, - { - "epoch": 0.8975609756097561, - "grad_norm": 2.4344072341918945, - "learning_rate": 4.902331180376529e-06, - "loss": 0.7946, - "step": 184 - }, - { - "epoch": 0.9024390243902439, - "grad_norm": 2.6313226222991943, - "learning_rate": 4.901267946207038e-06, - "loss": 0.9269, - "step": 185 - }, - { - "epoch": 0.9073170731707317, - "grad_norm": 2.4776692390441895, - "learning_rate": 4.900199072641937e-06, - "loss": 0.7433, - "step": 186 - }, - { - "epoch": 0.9121951219512195, - "grad_norm": 2.339869260787964, - "learning_rate": 4.899124562191484e-06, - "loss": 0.6577, - "step": 187 - }, - { - "epoch": 0.9170731707317074, - "grad_norm": 3.076890468597412, - "learning_rate": 4.8980444173791735e-06, - "loss": 0.5989, - "step": 188 - }, - { - "epoch": 0.9219512195121952, - "grad_norm": 2.83957839012146, - "learning_rate": 4.896958640741735e-06, - "loss": 0.9364, - "step": 189 - }, - { - "epoch": 0.926829268292683, - "grad_norm": 2.770867347717285, - "learning_rate": 4.895867234829121e-06, - "loss": 1.0328, - "step": 190 - }, - { - "epoch": 0.9317073170731708, - "grad_norm": 2.7819619178771973, - "learning_rate": 4.894770202204509e-06, - "loss": 0.772, - "step": 191 - }, - { - "epoch": 0.9365853658536586, - "grad_norm": 3.925703763961792, - "learning_rate": 4.893667545444285e-06, - "loss": 0.8128, - "step": 192 - }, - { - "epoch": 0.9414634146341463, - "grad_norm": 3.034944534301758, - "learning_rate": 4.8925592671380495e-06, - "loss": 0.7418, - "step": 193 - }, - { - "epoch": 0.9463414634146341, - "grad_norm": 2.3350143432617188, - "learning_rate": 4.891445369888601e-06, - "loss": 0.5979, - "step": 194 - }, - { - "epoch": 0.9512195121951219, - "grad_norm": 2.6433160305023193, - "learning_rate": 4.890325856311936e-06, - "loss": 0.9664, - "step": 195 - }, - { - "epoch": 0.9560975609756097, - "grad_norm": 2.715142011642456, - "learning_rate": 4.889200729037241e-06, - "loss": 0.8482, - "step": 196 - }, - { - "epoch": 0.9609756097560975, - "grad_norm": 2.6157352924346924, - "learning_rate": 4.888069990706884e-06, - "loss": 0.7173, - "step": 197 - }, - { - "epoch": 0.9658536585365853, - "grad_norm": 3.7308952808380127, - "learning_rate": 4.886933643976414e-06, - "loss": 0.5433, - "step": 198 - }, - { - "epoch": 0.9707317073170731, - "grad_norm": 3.1134045124053955, - "learning_rate": 4.885791691514548e-06, - "loss": 0.5997, - "step": 199 - }, - { - "epoch": 0.975609756097561, - "grad_norm": 2.421365976333618, - "learning_rate": 4.884644136003172e-06, - "loss": 0.6477, - "step": 200 - }, - { - "epoch": 0.9804878048780488, - "grad_norm": 2.8676180839538574, - "learning_rate": 4.883490980137327e-06, - "loss": 1.3465, - "step": 201 - }, - { - "epoch": 0.9853658536585366, - "grad_norm": 2.236189603805542, - "learning_rate": 4.882332226625208e-06, - "loss": 0.7533, - "step": 202 - }, - { - "epoch": 0.9902439024390244, - "grad_norm": 2.2514970302581787, - "learning_rate": 4.881167878188158e-06, - "loss": 0.8555, - "step": 203 - }, - { - "epoch": 0.9951219512195122, - "grad_norm": 2.6856095790863037, - "learning_rate": 4.8799979375606565e-06, - "loss": 0.7634, - "step": 204 - }, - { - "epoch": 1.0, - "grad_norm": 2.5563852787017822, - "learning_rate": 4.878822407490319e-06, - "loss": 0.66, - "step": 205 - }, - { - "epoch": 1.0048780487804878, - "grad_norm": 4.7092814445495605, - "learning_rate": 4.8776412907378845e-06, - "loss": 0.7429, - "step": 206 - }, - { - "epoch": 1.0097560975609756, - "grad_norm": 2.9133448600769043, - "learning_rate": 4.876454590077216e-06, - "loss": 0.5735, - "step": 207 - }, - { - "epoch": 1.0146341463414634, - "grad_norm": 2.7012641429901123, - "learning_rate": 4.875262308295289e-06, - "loss": 0.8065, - "step": 208 - }, - { - "epoch": 1.0195121951219512, - "grad_norm": 3.703998327255249, - "learning_rate": 4.874064448192185e-06, - "loss": 0.7148, - "step": 209 - }, - { - "epoch": 1.024390243902439, - "grad_norm": 3.044930934906006, - "learning_rate": 4.872861012581088e-06, - "loss": 0.5606, - "step": 210 - }, - { - "epoch": 1.0292682926829269, - "grad_norm": 3.661381244659424, - "learning_rate": 4.871652004288275e-06, - "loss": 0.6492, - "step": 211 - }, - { - "epoch": 1.0341463414634147, - "grad_norm": 3.18344783782959, - "learning_rate": 4.870437426153113e-06, - "loss": 0.633, - "step": 212 - }, - { - "epoch": 1.0390243902439025, - "grad_norm": 4.596707820892334, - "learning_rate": 4.869217281028045e-06, - "loss": 0.842, - "step": 213 - }, - { - "epoch": 1.0439024390243903, - "grad_norm": 4.116331577301025, - "learning_rate": 4.867991571778592e-06, - "loss": 0.8371, - "step": 214 - }, - { - "epoch": 1.048780487804878, - "grad_norm": 3.152939558029175, - "learning_rate": 4.866760301283342e-06, - "loss": 0.4728, - "step": 215 - }, - { - "epoch": 1.053658536585366, - "grad_norm": 2.8732805252075195, - "learning_rate": 4.865523472433942e-06, - "loss": 0.651, - "step": 216 - }, - { - "epoch": 1.0585365853658537, - "grad_norm": 2.967480421066284, - "learning_rate": 4.8642810881350935e-06, - "loss": 0.6361, - "step": 217 - }, - { - "epoch": 1.0634146341463415, - "grad_norm": 2.816798210144043, - "learning_rate": 4.863033151304546e-06, - "loss": 0.6206, - "step": 218 - }, - { - "epoch": 1.0682926829268293, - "grad_norm": 3.168349027633667, - "learning_rate": 4.861779664873088e-06, - "loss": 0.7782, - "step": 219 - }, - { - "epoch": 1.0731707317073171, - "grad_norm": 3.7496471405029297, - "learning_rate": 4.8605206317845425e-06, - "loss": 0.8504, - "step": 220 - }, - { - "epoch": 1.078048780487805, - "grad_norm": 2.7087056636810303, - "learning_rate": 4.859256054995758e-06, - "loss": 0.7771, - "step": 221 - }, - { - "epoch": 1.0829268292682928, - "grad_norm": 2.803703546524048, - "learning_rate": 4.8579859374766e-06, - "loss": 0.4308, - "step": 222 - }, - { - "epoch": 1.0878048780487806, - "grad_norm": 2.4199142456054688, - "learning_rate": 4.856710282209952e-06, - "loss": 0.3739, - "step": 223 - }, - { - "epoch": 1.0926829268292684, - "grad_norm": 2.384037494659424, - "learning_rate": 4.855429092191698e-06, - "loss": 0.6548, - "step": 224 - }, - { - "epoch": 1.0975609756097562, - "grad_norm": 3.0230021476745605, - "learning_rate": 4.854142370430725e-06, - "loss": 0.6932, - "step": 225 - }, - { - "epoch": 1.102439024390244, - "grad_norm": 3.0248661041259766, - "learning_rate": 4.8528501199489045e-06, - "loss": 0.6491, - "step": 226 - }, - { - "epoch": 1.1073170731707318, - "grad_norm": 4.046666145324707, - "learning_rate": 4.851552343781099e-06, - "loss": 0.7946, - "step": 227 - }, - { - "epoch": 1.1121951219512196, - "grad_norm": 2.8751168251037598, - "learning_rate": 4.850249044975145e-06, - "loss": 0.7629, - "step": 228 - }, - { - "epoch": 1.1170731707317074, - "grad_norm": 2.8649816513061523, - "learning_rate": 4.848940226591849e-06, - "loss": 0.9114, - "step": 229 - }, - { - "epoch": 1.1219512195121952, - "grad_norm": 3.2590744495391846, - "learning_rate": 4.847625891704982e-06, - "loss": 0.535, - "step": 230 - }, - { - "epoch": 1.126829268292683, - "grad_norm": 3.230659008026123, - "learning_rate": 4.846306043401268e-06, - "loss": 0.7134, - "step": 231 - }, - { - "epoch": 1.1317073170731708, - "grad_norm": 3.5220088958740234, - "learning_rate": 4.844980684780381e-06, - "loss": 0.5375, - "step": 232 - }, - { - "epoch": 1.1365853658536587, - "grad_norm": 3.074052095413208, - "learning_rate": 4.8436498189549345e-06, - "loss": 0.5486, - "step": 233 - }, - { - "epoch": 1.1414634146341462, - "grad_norm": 2.511216163635254, - "learning_rate": 4.842313449050477e-06, - "loss": 0.5203, - "step": 234 - }, - { - "epoch": 1.146341463414634, - "grad_norm": 2.6082136631011963, - "learning_rate": 4.840971578205486e-06, - "loss": 0.4978, - "step": 235 - }, - { - "epoch": 1.1512195121951219, - "grad_norm": 2.4481778144836426, - "learning_rate": 4.839624209571352e-06, - "loss": 0.348, - "step": 236 - }, - { - "epoch": 1.1560975609756097, - "grad_norm": 2.7532148361206055, - "learning_rate": 4.838271346312381e-06, - "loss": 0.8068, - "step": 237 - }, - { - "epoch": 1.1609756097560975, - "grad_norm": 2.6562349796295166, - "learning_rate": 4.836912991605782e-06, - "loss": 0.8823, - "step": 238 - }, - { - "epoch": 1.1658536585365853, - "grad_norm": 3.032168388366699, - "learning_rate": 4.835549148641663e-06, - "loss": 0.501, - "step": 239 - }, - { - "epoch": 1.170731707317073, - "grad_norm": 3.4816956520080566, - "learning_rate": 4.834179820623018e-06, - "loss": 0.6406, - "step": 240 - }, - { - "epoch": 1.175609756097561, - "grad_norm": 2.480642318725586, - "learning_rate": 4.832805010765724e-06, - "loss": 0.537, - "step": 241 - }, - { - "epoch": 1.1804878048780487, - "grad_norm": 2.7662222385406494, - "learning_rate": 4.831424722298531e-06, - "loss": 0.6464, - "step": 242 - }, - { - "epoch": 1.1853658536585365, - "grad_norm": 3.2929866313934326, - "learning_rate": 4.830038958463061e-06, - "loss": 0.6888, - "step": 243 - }, - { - "epoch": 1.1902439024390243, - "grad_norm": 5.094089031219482, - "learning_rate": 4.828647722513785e-06, - "loss": 0.8342, - "step": 244 - }, - { - "epoch": 1.1951219512195121, - "grad_norm": 3.6679818630218506, - "learning_rate": 4.827251017718034e-06, - "loss": 0.7849, - "step": 245 - }, - { - "epoch": 1.2, - "grad_norm": 3.97290301322937, - "learning_rate": 4.8258488473559794e-06, - "loss": 0.7995, - "step": 246 - }, - { - "epoch": 1.2048780487804878, - "grad_norm": 3.3555023670196533, - "learning_rate": 4.824441214720629e-06, - "loss": 0.8718, - "step": 247 - }, - { - "epoch": 1.2097560975609756, - "grad_norm": 2.309361219406128, - "learning_rate": 4.823028123117818e-06, - "loss": 0.3731, - "step": 248 - }, - { - "epoch": 1.2146341463414634, - "grad_norm": 2.607269763946533, - "learning_rate": 4.8216095758662015e-06, - "loss": 0.7321, - "step": 249 - }, - { - "epoch": 1.2195121951219512, - "grad_norm": 2.5667428970336914, - "learning_rate": 4.82018557629725e-06, - "loss": 0.7561, - "step": 250 - }, - { - "epoch": 1.224390243902439, - "grad_norm": 2.7664871215820312, - "learning_rate": 4.8187561277552376e-06, - "loss": 0.638, - "step": 251 - }, - { - "epoch": 1.2292682926829268, - "grad_norm": 2.2880401611328125, - "learning_rate": 4.817321233597232e-06, - "loss": 0.6996, - "step": 252 - }, - { - "epoch": 1.2341463414634146, - "grad_norm": 2.7615559101104736, - "learning_rate": 4.815880897193095e-06, - "loss": 0.5432, - "step": 253 - }, - { - "epoch": 1.2390243902439024, - "grad_norm": 2.9052155017852783, - "learning_rate": 4.814435121925466e-06, - "loss": 0.781, - "step": 254 - }, - { - "epoch": 1.2439024390243902, - "grad_norm": 3.2035205364227295, - "learning_rate": 4.812983911189761e-06, - "loss": 0.6884, - "step": 255 - }, - { - "epoch": 1.248780487804878, - "grad_norm": 2.8139917850494385, - "learning_rate": 4.811527268394157e-06, - "loss": 0.4984, - "step": 256 - }, - { - "epoch": 1.2536585365853659, - "grad_norm": 2.849602699279785, - "learning_rate": 4.810065196959591e-06, - "loss": 0.553, - "step": 257 - }, - { - "epoch": 1.2585365853658537, - "grad_norm": 2.8745057582855225, - "learning_rate": 4.8085977003197496e-06, - "loss": 0.7955, - "step": 258 - }, - { - "epoch": 1.2634146341463415, - "grad_norm": 3.4053122997283936, - "learning_rate": 4.807124781921059e-06, - "loss": 0.9715, - "step": 259 - }, - { - "epoch": 1.2682926829268293, - "grad_norm": 3.1741702556610107, - "learning_rate": 4.805646445222679e-06, - "loss": 0.6306, - "step": 260 - }, - { - "epoch": 1.273170731707317, - "grad_norm": 2.5348331928253174, - "learning_rate": 4.804162693696494e-06, - "loss": 0.5192, - "step": 261 - }, - { - "epoch": 1.278048780487805, - "grad_norm": 3.2491304874420166, - "learning_rate": 4.802673530827105e-06, - "loss": 0.5369, - "step": 262 - }, - { - "epoch": 1.2829268292682927, - "grad_norm": 2.670273780822754, - "learning_rate": 4.801178960111823e-06, - "loss": 0.5864, - "step": 263 - }, - { - "epoch": 1.2878048780487805, - "grad_norm": 2.5655579566955566, - "learning_rate": 4.799678985060658e-06, - "loss": 0.7864, - "step": 264 - }, - { - "epoch": 1.2926829268292683, - "grad_norm": 2.6352531909942627, - "learning_rate": 4.798173609196314e-06, - "loss": 0.8198, - "step": 265 - }, - { - "epoch": 1.2975609756097561, - "grad_norm": 3.028343677520752, - "learning_rate": 4.796662836054176e-06, - "loss": 0.4621, - "step": 266 - }, - { - "epoch": 1.302439024390244, - "grad_norm": 2.757690191268921, - "learning_rate": 4.795146669182304e-06, - "loss": 0.6237, - "step": 267 - }, - { - "epoch": 1.3073170731707318, - "grad_norm": 2.564842462539673, - "learning_rate": 4.793625112141431e-06, - "loss": 0.4981, - "step": 268 - }, - { - "epoch": 1.3121951219512196, - "grad_norm": 2.69234299659729, - "learning_rate": 4.792098168504943e-06, - "loss": 0.5384, - "step": 269 - }, - { - "epoch": 1.3170731707317074, - "grad_norm": 2.794144868850708, - "learning_rate": 4.790565841858879e-06, - "loss": 0.5535, - "step": 270 - }, - { - "epoch": 1.3219512195121952, - "grad_norm": 2.850296974182129, - "learning_rate": 4.789028135801919e-06, - "loss": 0.7492, - "step": 271 - }, - { - "epoch": 1.326829268292683, - "grad_norm": 3.287806987762451, - "learning_rate": 4.787485053945377e-06, - "loss": 0.8367, - "step": 272 - }, - { - "epoch": 1.3317073170731708, - "grad_norm": 2.479343891143799, - "learning_rate": 4.785936599913193e-06, - "loss": 0.6875, - "step": 273 - }, - { - "epoch": 1.3365853658536586, - "grad_norm": 3.171198844909668, - "learning_rate": 4.784382777341922e-06, - "loss": 0.733, - "step": 274 - }, - { - "epoch": 1.3414634146341464, - "grad_norm": 2.866610050201416, - "learning_rate": 4.782823589880729e-06, - "loss": 0.9719, - "step": 275 - }, - { - "epoch": 1.346341463414634, - "grad_norm": 2.3714404106140137, - "learning_rate": 4.7812590411913755e-06, - "loss": 0.6979, - "step": 276 - }, - { - "epoch": 1.3512195121951218, - "grad_norm": 2.3838706016540527, - "learning_rate": 4.779689134948217e-06, - "loss": 0.9697, - "step": 277 - }, - { - "epoch": 1.3560975609756096, - "grad_norm": 3.2992005348205566, - "learning_rate": 4.77811387483819e-06, - "loss": 0.4799, - "step": 278 - }, - { - "epoch": 1.3609756097560974, - "grad_norm": 3.403024435043335, - "learning_rate": 4.776533264560804e-06, - "loss": 0.7478, - "step": 279 - }, - { - "epoch": 1.3658536585365852, - "grad_norm": 2.669820785522461, - "learning_rate": 4.774947307828134e-06, - "loss": 0.8622, - "step": 280 - }, - { - "epoch": 1.370731707317073, - "grad_norm": 2.4695041179656982, - "learning_rate": 4.773356008364812e-06, - "loss": 0.5792, - "step": 281 - }, - { - "epoch": 1.3756097560975609, - "grad_norm": 3.1744325160980225, - "learning_rate": 4.771759369908017e-06, - "loss": 0.4368, - "step": 282 - }, - { - "epoch": 1.3804878048780487, - "grad_norm": 2.8564929962158203, - "learning_rate": 4.7701573962074635e-06, - "loss": 0.6337, - "step": 283 - }, - { - "epoch": 1.3853658536585365, - "grad_norm": 2.4109890460968018, - "learning_rate": 4.7685500910254015e-06, - "loss": 0.5042, - "step": 284 - }, - { - "epoch": 1.3902439024390243, - "grad_norm": 2.389765977859497, - "learning_rate": 4.766937458136598e-06, - "loss": 0.7427, - "step": 285 - }, - { - "epoch": 1.395121951219512, - "grad_norm": 2.412153720855713, - "learning_rate": 4.765319501328332e-06, - "loss": 0.6956, - "step": 286 - }, - { - "epoch": 1.4, - "grad_norm": 2.6756227016448975, - "learning_rate": 4.763696224400391e-06, - "loss": 0.5152, - "step": 287 - }, - { - "epoch": 1.4048780487804877, - "grad_norm": 2.4644389152526855, - "learning_rate": 4.762067631165049e-06, - "loss": 0.5583, - "step": 288 - }, - { - "epoch": 1.4097560975609755, - "grad_norm": 2.6496896743774414, - "learning_rate": 4.760433725447071e-06, - "loss": 0.6824, - "step": 289 - }, - { - "epoch": 1.4146341463414633, - "grad_norm": 2.9843268394470215, - "learning_rate": 4.758794511083697e-06, - "loss": 0.7914, - "step": 290 - }, - { - "epoch": 1.4195121951219511, - "grad_norm": 3.639101266860962, - "learning_rate": 4.757149991924633e-06, - "loss": 0.6827, - "step": 291 - }, - { - "epoch": 1.424390243902439, - "grad_norm": 3.2047319412231445, - "learning_rate": 4.755500171832045e-06, - "loss": 0.5908, - "step": 292 - }, - { - "epoch": 1.4292682926829268, - "grad_norm": 2.463202953338623, - "learning_rate": 4.753845054680548e-06, - "loss": 0.6469, - "step": 293 - }, - { - "epoch": 1.4341463414634146, - "grad_norm": 2.711195945739746, - "learning_rate": 4.752184644357197e-06, - "loss": 0.5412, - "step": 294 - }, - { - "epoch": 1.4390243902439024, - "grad_norm": 2.239082098007202, - "learning_rate": 4.750518944761477e-06, - "loss": 0.5324, - "step": 295 - }, - { - "epoch": 1.4439024390243902, - "grad_norm": 2.711050271987915, - "learning_rate": 4.748847959805297e-06, - "loss": 0.5317, - "step": 296 - }, - { - "epoch": 1.448780487804878, - "grad_norm": 2.4389946460723877, - "learning_rate": 4.7471716934129774e-06, - "loss": 0.5199, - "step": 297 - }, - { - "epoch": 1.4536585365853658, - "grad_norm": 2.6532390117645264, - "learning_rate": 4.745490149521242e-06, - "loss": 0.4874, - "step": 298 - }, - { - "epoch": 1.4585365853658536, - "grad_norm": 2.2970616817474365, - "learning_rate": 4.743803332079209e-06, - "loss": 0.5416, - "step": 299 - }, - { - "epoch": 1.4634146341463414, - "grad_norm": 2.4206762313842773, - "learning_rate": 4.742111245048382e-06, - "loss": 0.5628, - "step": 300 - }, - { - "epoch": 1.4682926829268292, - "grad_norm": 2.7086844444274902, - "learning_rate": 4.740413892402639e-06, - "loss": 0.5847, - "step": 301 - }, - { - "epoch": 1.473170731707317, - "grad_norm": 2.848602771759033, - "learning_rate": 4.738711278128228e-06, - "loss": 0.5889, - "step": 302 - }, - { - "epoch": 1.4780487804878049, - "grad_norm": 3.5257909297943115, - "learning_rate": 4.7370034062237476e-06, - "loss": 0.3917, - "step": 303 - }, - { - "epoch": 1.4829268292682927, - "grad_norm": 6.47664213180542, - "learning_rate": 4.73529028070015e-06, - "loss": 0.5592, - "step": 304 - }, - { - "epoch": 1.4878048780487805, - "grad_norm": 2.8833930492401123, - "learning_rate": 4.733571905580723e-06, - "loss": 0.843, - "step": 305 - }, - { - "epoch": 1.4926829268292683, - "grad_norm": 2.9924156665802, - "learning_rate": 4.731848284901082e-06, - "loss": 0.7041, - "step": 306 - }, - { - "epoch": 1.497560975609756, - "grad_norm": 2.9858405590057373, - "learning_rate": 4.730119422709165e-06, - "loss": 0.4914, - "step": 307 - }, - { - "epoch": 1.502439024390244, - "grad_norm": 3.4032366275787354, - "learning_rate": 4.728385323065215e-06, - "loss": 0.644, - "step": 308 - }, - { - "epoch": 1.5073170731707317, - "grad_norm": 2.86360502243042, - "learning_rate": 4.7266459900417815e-06, - "loss": 0.5335, - "step": 309 - }, - { - "epoch": 1.5121951219512195, - "grad_norm": 3.183012008666992, - "learning_rate": 4.724901427723698e-06, - "loss": 0.8275, - "step": 310 - }, - { - "epoch": 1.5170731707317073, - "grad_norm": 3.4128706455230713, - "learning_rate": 4.723151640208084e-06, - "loss": 0.4091, - "step": 311 - }, - { - "epoch": 1.5219512195121951, - "grad_norm": 2.765897512435913, - "learning_rate": 4.721396631604327e-06, - "loss": 0.4414, - "step": 312 - }, - { - "epoch": 1.526829268292683, - "grad_norm": 3.2348268032073975, - "learning_rate": 4.7196364060340785e-06, - "loss": 0.5423, - "step": 313 - }, - { - "epoch": 1.5317073170731708, - "grad_norm": 2.7270045280456543, - "learning_rate": 4.7178709676312416e-06, - "loss": 0.8072, - "step": 314 - }, - { - "epoch": 1.5365853658536586, - "grad_norm": 2.525298833847046, - "learning_rate": 4.716100320541961e-06, - "loss": 1.0254, - "step": 315 - }, - { - "epoch": 1.5414634146341464, - "grad_norm": 2.371321678161621, - "learning_rate": 4.714324468924614e-06, - "loss": 0.6541, - "step": 316 - }, - { - "epoch": 1.5463414634146342, - "grad_norm": 3.0820438861846924, - "learning_rate": 4.712543416949803e-06, - "loss": 0.7519, - "step": 317 - }, - { - "epoch": 1.551219512195122, - "grad_norm": 2.710369348526001, - "learning_rate": 4.71075716880034e-06, - "loss": 0.7232, - "step": 318 - }, - { - "epoch": 1.5560975609756098, - "grad_norm": 2.4568352699279785, - "learning_rate": 4.708965728671243e-06, - "loss": 0.8059, - "step": 319 - }, - { - "epoch": 1.5609756097560976, - "grad_norm": 2.7511191368103027, - "learning_rate": 4.7071691007697214e-06, - "loss": 0.6579, - "step": 320 - }, - { - "epoch": 1.5658536585365854, - "grad_norm": 2.6519858837127686, - "learning_rate": 4.705367289315172e-06, - "loss": 0.6989, - "step": 321 - }, - { - "epoch": 1.5707317073170732, - "grad_norm": 2.763019323348999, - "learning_rate": 4.703560298539158e-06, - "loss": 0.4916, - "step": 322 - }, - { - "epoch": 1.575609756097561, - "grad_norm": 2.6480252742767334, - "learning_rate": 4.701748132685415e-06, - "loss": 0.5076, - "step": 323 - }, - { - "epoch": 1.5804878048780489, - "grad_norm": 2.4289543628692627, - "learning_rate": 4.699930796009825e-06, - "loss": 0.559, - "step": 324 - }, - { - "epoch": 1.5853658536585367, - "grad_norm": 4.0515899658203125, - "learning_rate": 4.698108292780418e-06, - "loss": 0.7388, - "step": 325 - }, - { - "epoch": 1.5902439024390245, - "grad_norm": 2.5959129333496094, - "learning_rate": 4.696280627277356e-06, - "loss": 0.5469, - "step": 326 - }, - { - "epoch": 1.5951219512195123, - "grad_norm": 2.3453526496887207, - "learning_rate": 4.6944478037929255e-06, - "loss": 0.5494, - "step": 327 - }, - { - "epoch": 1.6, - "grad_norm": 3.7527170181274414, - "learning_rate": 4.692609826631525e-06, - "loss": 0.7536, - "step": 328 - }, - { - "epoch": 1.604878048780488, - "grad_norm": 3.423588275909424, - "learning_rate": 4.690766700109659e-06, - "loss": 0.4586, - "step": 329 - }, - { - "epoch": 1.6097560975609757, - "grad_norm": 2.620429754257202, - "learning_rate": 4.6889184285559234e-06, - "loss": 0.4799, - "step": 330 - }, - { - "epoch": 1.6146341463414635, - "grad_norm": 6.416718006134033, - "learning_rate": 4.687065016310996e-06, - "loss": 0.7502, - "step": 331 - }, - { - "epoch": 1.6195121951219513, - "grad_norm": 2.7324717044830322, - "learning_rate": 4.685206467727631e-06, - "loss": 0.5923, - "step": 332 - }, - { - "epoch": 1.6243902439024391, - "grad_norm": 2.582935333251953, - "learning_rate": 4.683342787170644e-06, - "loss": 0.5619, - "step": 333 - }, - { - "epoch": 1.629268292682927, - "grad_norm": 2.8339877128601074, - "learning_rate": 4.6814739790169006e-06, - "loss": 0.55, - "step": 334 - }, - { - "epoch": 1.6341463414634148, - "grad_norm": 2.733982563018799, - "learning_rate": 4.679600047655313e-06, - "loss": 0.7243, - "step": 335 - }, - { - "epoch": 1.6390243902439026, - "grad_norm": 3.192747116088867, - "learning_rate": 4.6777209974868194e-06, - "loss": 1.132, - "step": 336 - }, - { - "epoch": 1.6439024390243904, - "grad_norm": 2.5185582637786865, - "learning_rate": 4.675836832924387e-06, - "loss": 0.55, - "step": 337 - }, - { - "epoch": 1.6487804878048782, - "grad_norm": 2.7306225299835205, - "learning_rate": 4.673947558392989e-06, - "loss": 0.4418, - "step": 338 - }, - { - "epoch": 1.653658536585366, - "grad_norm": 2.7026166915893555, - "learning_rate": 4.6720531783296e-06, - "loss": 0.5897, - "step": 339 - }, - { - "epoch": 1.6585365853658538, - "grad_norm": 2.5981674194335938, - "learning_rate": 4.670153697183185e-06, - "loss": 0.5889, - "step": 340 - }, - { - "epoch": 1.6634146341463416, - "grad_norm": 3.0985405445098877, - "learning_rate": 4.668249119414692e-06, - "loss": 0.5607, - "step": 341 - }, - { - "epoch": 1.6682926829268294, - "grad_norm": 2.7609124183654785, - "learning_rate": 4.666339449497033e-06, - "loss": 0.6284, - "step": 342 - }, - { - "epoch": 1.6731707317073172, - "grad_norm": 3.186077356338501, - "learning_rate": 4.664424691915084e-06, - "loss": 0.5751, - "step": 343 - }, - { - "epoch": 1.678048780487805, - "grad_norm": 3.644227981567383, - "learning_rate": 4.6625048511656675e-06, - "loss": 0.586, - "step": 344 - }, - { - "epoch": 1.6829268292682928, - "grad_norm": 3.196373462677002, - "learning_rate": 4.660579931757543e-06, - "loss": 0.5086, - "step": 345 - }, - { - "epoch": 1.6878048780487804, - "grad_norm": 2.7773900032043457, - "learning_rate": 4.6586499382113985e-06, - "loss": 0.5934, - "step": 346 - }, - { - "epoch": 1.6926829268292682, - "grad_norm": 2.3397631645202637, - "learning_rate": 4.6567148750598375e-06, - "loss": 0.7654, - "step": 347 - }, - { - "epoch": 1.697560975609756, - "grad_norm": 2.5567805767059326, - "learning_rate": 4.6547747468473705e-06, - "loss": 0.8908, - "step": 348 - }, - { - "epoch": 1.7024390243902439, - "grad_norm": 2.9218900203704834, - "learning_rate": 4.652829558130404e-06, - "loss": 0.4383, - "step": 349 - }, - { - "epoch": 1.7073170731707317, - "grad_norm": 2.962965250015259, - "learning_rate": 4.6508793134772265e-06, - "loss": 0.6031, - "step": 350 - }, - { - "epoch": 1.7121951219512195, - "grad_norm": 2.487739324569702, - "learning_rate": 4.648924017468003e-06, - "loss": 0.533, - "step": 351 - }, - { - "epoch": 1.7170731707317073, - "grad_norm": 2.769474506378174, - "learning_rate": 4.646963674694761e-06, - "loss": 0.8125, - "step": 352 - }, - { - "epoch": 1.721951219512195, - "grad_norm": 2.678243398666382, - "learning_rate": 4.64499828976138e-06, - "loss": 0.386, - "step": 353 - }, - { - "epoch": 1.726829268292683, - "grad_norm": 3.2764477729797363, - "learning_rate": 4.64302786728358e-06, - "loss": 0.4792, - "step": 354 - }, - { - "epoch": 1.7317073170731707, - "grad_norm": 2.6092708110809326, - "learning_rate": 4.641052411888913e-06, - "loss": 0.5031, - "step": 355 - }, - { - "epoch": 1.7365853658536585, - "grad_norm": 3.4002952575683594, - "learning_rate": 4.6390719282167515e-06, - "loss": 0.4726, - "step": 356 - }, - { - "epoch": 1.7414634146341463, - "grad_norm": 2.7558157444000244, - "learning_rate": 4.637086420918276e-06, - "loss": 0.7794, - "step": 357 - }, - { - "epoch": 1.7463414634146341, - "grad_norm": 2.239021062850952, - "learning_rate": 4.635095894656465e-06, - "loss": 0.6202, - "step": 358 - }, - { - "epoch": 1.751219512195122, - "grad_norm": 2.0502119064331055, - "learning_rate": 4.633100354106085e-06, - "loss": 0.3743, - "step": 359 - }, - { - "epoch": 1.7560975609756098, - "grad_norm": 2.842203140258789, - "learning_rate": 4.631099803953677e-06, - "loss": 0.8143, - "step": 360 - }, - { - "epoch": 1.7609756097560976, - "grad_norm": 2.8408772945404053, - "learning_rate": 4.629094248897546e-06, - "loss": 0.4986, - "step": 361 - }, - { - "epoch": 1.7658536585365854, - "grad_norm": 2.755530595779419, - "learning_rate": 4.627083693647757e-06, - "loss": 0.5833, - "step": 362 - }, - { - "epoch": 1.7707317073170732, - "grad_norm": 2.717116355895996, - "learning_rate": 4.625068142926111e-06, - "loss": 0.885, - "step": 363 - }, - { - "epoch": 1.775609756097561, - "grad_norm": 2.2784435749053955, - "learning_rate": 4.623047601466144e-06, - "loss": 0.7351, - "step": 364 - }, - { - "epoch": 1.7804878048780488, - "grad_norm": 2.3133914470672607, - "learning_rate": 4.621022074013114e-06, - "loss": 0.6426, - "step": 365 - }, - { - "epoch": 1.7853658536585366, - "grad_norm": 3.13562273979187, - "learning_rate": 4.618991565323987e-06, - "loss": 0.5588, - "step": 366 - }, - { - "epoch": 1.7902439024390244, - "grad_norm": 2.458186388015747, - "learning_rate": 4.616956080167426e-06, - "loss": 0.5424, - "step": 367 - }, - { - "epoch": 1.7951219512195122, - "grad_norm": 2.4780080318450928, - "learning_rate": 4.614915623323786e-06, - "loss": 0.8664, - "step": 368 - }, - { - "epoch": 1.8, - "grad_norm": 2.623966932296753, - "learning_rate": 4.612870199585092e-06, - "loss": 0.4495, - "step": 369 - }, - { - "epoch": 1.8048780487804879, - "grad_norm": 2.7326242923736572, - "learning_rate": 4.610819813755038e-06, - "loss": 0.5099, - "step": 370 - }, - { - "epoch": 1.8097560975609757, - "grad_norm": 2.951014757156372, - "learning_rate": 4.608764470648971e-06, - "loss": 0.4322, - "step": 371 - }, - { - "epoch": 1.8146341463414632, - "grad_norm": 2.869870185852051, - "learning_rate": 4.606704175093879e-06, - "loss": 0.4744, - "step": 372 - }, - { - "epoch": 1.819512195121951, - "grad_norm": 2.686054229736328, - "learning_rate": 4.604638931928383e-06, - "loss": 0.797, - "step": 373 - }, - { - "epoch": 1.8243902439024389, - "grad_norm": 2.6421749591827393, - "learning_rate": 4.602568746002718e-06, - "loss": 0.4904, - "step": 374 - }, - { - "epoch": 1.8292682926829267, - "grad_norm": 2.949144124984741, - "learning_rate": 4.600493622178734e-06, - "loss": 0.8682, - "step": 375 - }, - { - "epoch": 1.8341463414634145, - "grad_norm": 2.554733991622925, - "learning_rate": 4.598413565329876e-06, - "loss": 0.5426, - "step": 376 - }, - { - "epoch": 1.8390243902439023, - "grad_norm": 2.3334367275238037, - "learning_rate": 4.596328580341169e-06, - "loss": 0.5628, - "step": 377 - }, - { - "epoch": 1.84390243902439, - "grad_norm": 2.577664613723755, - "learning_rate": 4.5942386721092195e-06, - "loss": 0.7073, - "step": 378 - }, - { - "epoch": 1.848780487804878, - "grad_norm": 3.1247141361236572, - "learning_rate": 4.592143845542189e-06, - "loss": 0.6526, - "step": 379 - }, - { - "epoch": 1.8536585365853657, - "grad_norm": 2.7015256881713867, - "learning_rate": 4.590044105559797e-06, - "loss": 0.8377, - "step": 380 - }, - { - "epoch": 1.8585365853658535, - "grad_norm": 2.573819398880005, - "learning_rate": 4.587939457093296e-06, - "loss": 0.5485, - "step": 381 - }, - { - "epoch": 1.8634146341463413, - "grad_norm": 2.8607687950134277, - "learning_rate": 4.585829905085468e-06, - "loss": 0.6065, - "step": 382 - }, - { - "epoch": 1.8682926829268292, - "grad_norm": 2.526625394821167, - "learning_rate": 4.5837154544906135e-06, - "loss": 0.7812, - "step": 383 - }, - { - "epoch": 1.873170731707317, - "grad_norm": 2.4161314964294434, - "learning_rate": 4.581596110274535e-06, - "loss": 0.7061, - "step": 384 - }, - { - "epoch": 1.8780487804878048, - "grad_norm": 2.34195876121521, - "learning_rate": 4.579471877414527e-06, - "loss": 0.9446, - "step": 385 - }, - { - "epoch": 1.8829268292682926, - "grad_norm": 3.7710156440734863, - "learning_rate": 4.577342760899368e-06, - "loss": 0.78, - "step": 386 - }, - { - "epoch": 1.8878048780487804, - "grad_norm": 2.5192313194274902, - "learning_rate": 4.575208765729302e-06, - "loss": 0.5205, - "step": 387 - }, - { - "epoch": 1.8926829268292682, - "grad_norm": 2.467484951019287, - "learning_rate": 4.573069896916035e-06, - "loss": 0.7827, - "step": 388 - }, - { - "epoch": 1.897560975609756, - "grad_norm": 2.640676259994507, - "learning_rate": 4.5709261594827125e-06, - "loss": 0.6512, - "step": 389 - }, - { - "epoch": 1.9024390243902438, - "grad_norm": 2.976623296737671, - "learning_rate": 4.568777558463922e-06, - "loss": 0.5548, - "step": 390 - }, - { - "epoch": 1.9073170731707316, - "grad_norm": 2.289722442626953, - "learning_rate": 4.566624098905665e-06, - "loss": 0.7038, - "step": 391 - }, - { - "epoch": 1.9121951219512194, - "grad_norm": 2.9512040615081787, - "learning_rate": 4.564465785865359e-06, - "loss": 0.5416, - "step": 392 - }, - { - "epoch": 1.9170731707317072, - "grad_norm": 2.394874095916748, - "learning_rate": 4.56230262441182e-06, - "loss": 0.4068, - "step": 393 - }, - { - "epoch": 1.921951219512195, - "grad_norm": 6.885486602783203, - "learning_rate": 4.560134619625247e-06, - "loss": 0.6197, - "step": 394 - }, - { - "epoch": 1.9268292682926829, - "grad_norm": 2.311272144317627, - "learning_rate": 4.5579617765972155e-06, - "loss": 0.5692, - "step": 395 - }, - { - "epoch": 1.9317073170731707, - "grad_norm": 2.4662933349609375, - "learning_rate": 4.555784100430662e-06, - "loss": 0.4836, - "step": 396 - }, - { - "epoch": 1.9365853658536585, - "grad_norm": 2.602741241455078, - "learning_rate": 4.553601596239877e-06, - "loss": 0.4594, - "step": 397 - }, - { - "epoch": 1.9414634146341463, - "grad_norm": 3.443909168243408, - "learning_rate": 4.551414269150489e-06, - "loss": 0.6053, - "step": 398 - }, - { - "epoch": 1.946341463414634, - "grad_norm": 2.5391502380371094, - "learning_rate": 4.54922212429945e-06, - "loss": 0.5133, - "step": 399 - }, - { - "epoch": 1.951219512195122, - "grad_norm": 2.7105700969696045, - "learning_rate": 4.547025166835027e-06, - "loss": 0.6984, - "step": 400 - }, - { - "epoch": 1.9560975609756097, - "grad_norm": 2.6098098754882812, - "learning_rate": 4.544823401916794e-06, - "loss": 0.7944, - "step": 401 - }, - { - "epoch": 1.9609756097560975, - "grad_norm": 2.7527425289154053, - "learning_rate": 4.542616834715612e-06, - "loss": 0.639, - "step": 402 - }, - { - "epoch": 1.9658536585365853, - "grad_norm": 2.760303258895874, - "learning_rate": 4.540405470413618e-06, - "loss": 0.4229, - "step": 403 - }, - { - "epoch": 1.9707317073170731, - "grad_norm": 2.4989006519317627, - "learning_rate": 4.53818931420422e-06, - "loss": 0.7482, - "step": 404 - }, - { - "epoch": 1.975609756097561, - "grad_norm": 2.3687169551849365, - "learning_rate": 4.535968371292076e-06, - "loss": 0.6146, - "step": 405 - }, - { - "epoch": 1.9804878048780488, - "grad_norm": 2.4285244941711426, - "learning_rate": 4.533742646893086e-06, - "loss": 0.6964, - "step": 406 - }, - { - "epoch": 1.9853658536585366, - "grad_norm": 2.337266206741333, - "learning_rate": 4.531512146234383e-06, - "loss": 0.6248, - "step": 407 - }, - { - "epoch": 1.9902439024390244, - "grad_norm": 2.704972743988037, - "learning_rate": 4.529276874554312e-06, - "loss": 0.8715, - "step": 408 - }, - { - "epoch": 1.9951219512195122, - "grad_norm": 2.2151944637298584, - "learning_rate": 4.527036837102426e-06, - "loss": 0.4945, - "step": 409 - }, - { - "epoch": 2.0, - "grad_norm": 2.691330671310425, - "learning_rate": 4.524792039139471e-06, - "loss": 0.7085, - "step": 410 - }, - { - "epoch": 2.004878048780488, - "grad_norm": 2.9423086643218994, - "learning_rate": 4.522542485937369e-06, - "loss": 0.3178, - "step": 411 - }, - { - "epoch": 2.0097560975609756, - "grad_norm": 2.860677719116211, - "learning_rate": 4.520288182779214e-06, - "loss": 0.5092, - "step": 412 - }, - { - "epoch": 2.0146341463414634, - "grad_norm": 2.7503843307495117, - "learning_rate": 4.518029134959253e-06, - "loss": 0.314, - "step": 413 - }, - { - "epoch": 2.0195121951219512, - "grad_norm": 4.541809558868408, - "learning_rate": 4.515765347782878e-06, - "loss": 0.5287, - "step": 414 - }, - { - "epoch": 2.024390243902439, - "grad_norm": 9.126826286315918, - "learning_rate": 4.5134968265666085e-06, - "loss": 0.8221, - "step": 415 - }, - { - "epoch": 2.029268292682927, - "grad_norm": 4.4358229637146, - "learning_rate": 4.511223576638084e-06, - "loss": 0.5402, - "step": 416 - }, - { - "epoch": 2.0341463414634147, - "grad_norm": 3.1090731620788574, - "learning_rate": 4.508945603336049e-06, - "loss": 0.617, - "step": 417 - }, - { - "epoch": 2.0390243902439025, - "grad_norm": 2.6933369636535645, - "learning_rate": 4.50666291201034e-06, - "loss": 0.3541, - "step": 418 - }, - { - "epoch": 2.0439024390243903, - "grad_norm": 5.898099899291992, - "learning_rate": 4.504375508021876e-06, - "loss": 0.4842, - "step": 419 - }, - { - "epoch": 2.048780487804878, - "grad_norm": 2.950939178466797, - "learning_rate": 4.50208339674264e-06, - "loss": 0.6168, - "step": 420 - }, - { - "epoch": 2.053658536585366, - "grad_norm": 3.2513322830200195, - "learning_rate": 4.499786583555675e-06, - "loss": 0.6425, - "step": 421 - }, - { - "epoch": 2.0585365853658537, - "grad_norm": 2.911562442779541, - "learning_rate": 4.497485073855061e-06, - "loss": 0.364, - "step": 422 - }, - { - "epoch": 2.0634146341463415, - "grad_norm": 4.2179274559021, - "learning_rate": 4.495178873045913e-06, - "loss": 0.3687, - "step": 423 - }, - { - "epoch": 2.0682926829268293, - "grad_norm": 3.2010395526885986, - "learning_rate": 4.4928679865443605e-06, - "loss": 0.4068, - "step": 424 - }, - { - "epoch": 2.073170731707317, - "grad_norm": 3.2425589561462402, - "learning_rate": 4.4905524197775366e-06, - "loss": 0.4759, - "step": 425 - }, - { - "epoch": 2.078048780487805, - "grad_norm": 2.9252519607543945, - "learning_rate": 4.4882321781835666e-06, - "loss": 0.4197, - "step": 426 - }, - { - "epoch": 2.0829268292682928, - "grad_norm": 2.7859911918640137, - "learning_rate": 4.4859072672115565e-06, - "loss": 0.2294, - "step": 427 - }, - { - "epoch": 2.0878048780487806, - "grad_norm": 3.138796091079712, - "learning_rate": 4.483577692321577e-06, - "loss": 0.7572, - "step": 428 - }, - { - "epoch": 2.0926829268292684, - "grad_norm": 3.1447339057922363, - "learning_rate": 4.481243458984651e-06, - "loss": 0.4035, - "step": 429 - }, - { - "epoch": 2.097560975609756, - "grad_norm": 3.1876862049102783, - "learning_rate": 4.478904572682743e-06, - "loss": 0.5776, - "step": 430 - }, - { - "epoch": 2.102439024390244, - "grad_norm": 2.934257745742798, - "learning_rate": 4.476561038908745e-06, - "loss": 0.4005, - "step": 431 - }, - { - "epoch": 2.107317073170732, - "grad_norm": 2.904954433441162, - "learning_rate": 4.474212863166464e-06, - "loss": 0.5689, - "step": 432 - }, - { - "epoch": 2.1121951219512196, - "grad_norm": 3.6023731231689453, - "learning_rate": 4.471860050970608e-06, - "loss": 0.5068, - "step": 433 - }, - { - "epoch": 2.1170731707317074, - "grad_norm": 4.073422431945801, - "learning_rate": 4.469502607846774e-06, - "loss": 0.8349, - "step": 434 - }, - { - "epoch": 2.1219512195121952, - "grad_norm": 2.813789129257202, - "learning_rate": 4.467140539331434e-06, - "loss": 0.3641, - "step": 435 - }, - { - "epoch": 2.126829268292683, - "grad_norm": 3.874516248703003, - "learning_rate": 4.464773850971924e-06, - "loss": 0.222, - "step": 436 - }, - { - "epoch": 2.131707317073171, - "grad_norm": 3.1221084594726562, - "learning_rate": 4.46240254832643e-06, - "loss": 0.3799, - "step": 437 - }, - { - "epoch": 2.1365853658536587, - "grad_norm": 3.298933267593384, - "learning_rate": 4.460026636963971e-06, - "loss": 0.4759, - "step": 438 - }, - { - "epoch": 2.1414634146341465, - "grad_norm": 2.456233024597168, - "learning_rate": 4.4576461224643965e-06, - "loss": 0.384, - "step": 439 - }, - { - "epoch": 2.1463414634146343, - "grad_norm": 2.8427460193634033, - "learning_rate": 4.455261010418359e-06, - "loss": 0.391, - "step": 440 - }, - { - "epoch": 2.151219512195122, - "grad_norm": 3.0267624855041504, - "learning_rate": 4.452871306427314e-06, - "loss": 0.6177, - "step": 441 - }, - { - "epoch": 2.15609756097561, - "grad_norm": 3.437302827835083, - "learning_rate": 4.450477016103498e-06, - "loss": 0.5143, - "step": 442 - }, - { - "epoch": 2.1609756097560977, - "grad_norm": 3.152210235595703, - "learning_rate": 4.4480781450699205e-06, - "loss": 0.3783, - "step": 443 - }, - { - "epoch": 2.1658536585365855, - "grad_norm": 3.507753372192383, - "learning_rate": 4.4456746989603464e-06, - "loss": 0.3574, - "step": 444 - }, - { - "epoch": 2.1707317073170733, - "grad_norm": 2.8855366706848145, - "learning_rate": 4.443266683419289e-06, - "loss": 0.5088, - "step": 445 - }, - { - "epoch": 2.175609756097561, - "grad_norm": 2.7776072025299072, - "learning_rate": 4.440854104101988e-06, - "loss": 0.3773, - "step": 446 - }, - { - "epoch": 2.180487804878049, - "grad_norm": 3.019484281539917, - "learning_rate": 4.438436966674406e-06, - "loss": 0.5002, - "step": 447 - }, - { - "epoch": 2.1853658536585368, - "grad_norm": 3.6962451934814453, - "learning_rate": 4.436015276813208e-06, - "loss": 0.4601, - "step": 448 - }, - { - "epoch": 2.1902439024390246, - "grad_norm": 3.1288888454437256, - "learning_rate": 4.4335890402057505e-06, - "loss": 0.5422, - "step": 449 - }, - { - "epoch": 2.1951219512195124, - "grad_norm": 3.7083234786987305, - "learning_rate": 4.431158262550067e-06, - "loss": 0.4684, - "step": 450 - }, - { - "epoch": 2.2, - "grad_norm": 3.1714789867401123, - "learning_rate": 4.428722949554858e-06, - "loss": 0.2528, - "step": 451 - }, - { - "epoch": 2.204878048780488, - "grad_norm": 3.0773637294769287, - "learning_rate": 4.426283106939474e-06, - "loss": 0.4061, - "step": 452 - }, - { - "epoch": 2.209756097560976, - "grad_norm": 2.604093551635742, - "learning_rate": 4.423838740433903e-06, - "loss": 0.4779, - "step": 453 - }, - { - "epoch": 2.2146341463414636, - "grad_norm": 2.9293880462646484, - "learning_rate": 4.4213898557787586e-06, - "loss": 0.233, - "step": 454 - }, - { - "epoch": 2.2195121951219514, - "grad_norm": 2.9195125102996826, - "learning_rate": 4.4189364587252636e-06, - "loss": 0.7756, - "step": 455 - }, - { - "epoch": 2.2243902439024392, - "grad_norm": 3.2263920307159424, - "learning_rate": 4.416478555035241e-06, - "loss": 0.2806, - "step": 456 - }, - { - "epoch": 2.229268292682927, - "grad_norm": 2.8109211921691895, - "learning_rate": 4.4140161504810935e-06, - "loss": 0.3923, - "step": 457 - }, - { - "epoch": 2.234146341463415, - "grad_norm": 2.645853281021118, - "learning_rate": 4.4115492508457986e-06, - "loss": 0.289, - "step": 458 - }, - { - "epoch": 2.2390243902439027, - "grad_norm": 3.3712451457977295, - "learning_rate": 4.409077861922887e-06, - "loss": 0.5053, - "step": 459 - }, - { - "epoch": 2.2439024390243905, - "grad_norm": 2.6892387866973877, - "learning_rate": 4.406601989516435e-06, - "loss": 0.3363, - "step": 460 - }, - { - "epoch": 2.2487804878048783, - "grad_norm": 2.3195693492889404, - "learning_rate": 4.404121639441047e-06, - "loss": 0.2367, - "step": 461 - }, - { - "epoch": 2.253658536585366, - "grad_norm": 3.0115339756011963, - "learning_rate": 4.401636817521843e-06, - "loss": 0.4942, - "step": 462 - }, - { - "epoch": 2.258536585365854, - "grad_norm": 2.9528865814208984, - "learning_rate": 4.399147529594447e-06, - "loss": 0.3328, - "step": 463 - }, - { - "epoch": 2.2634146341463417, - "grad_norm": 3.110799551010132, - "learning_rate": 4.3966537815049686e-06, - "loss": 0.3917, - "step": 464 - }, - { - "epoch": 2.2682926829268295, - "grad_norm": 3.2973792552948, - "learning_rate": 4.394155579109994e-06, - "loss": 0.5203, - "step": 465 - }, - { - "epoch": 2.2731707317073173, - "grad_norm": 4.7184038162231445, - "learning_rate": 4.391652928276572e-06, - "loss": 0.729, - "step": 466 - }, - { - "epoch": 2.278048780487805, - "grad_norm": 3.1992053985595703, - "learning_rate": 4.389145834882195e-06, - "loss": 0.4822, - "step": 467 - }, - { - "epoch": 2.2829268292682925, - "grad_norm": 4.320055961608887, - "learning_rate": 4.386634304814789e-06, - "loss": 0.3962, - "step": 468 - }, - { - "epoch": 2.2878048780487803, - "grad_norm": 3.704524517059326, - "learning_rate": 4.384118343972704e-06, - "loss": 0.5996, - "step": 469 - }, - { - "epoch": 2.292682926829268, - "grad_norm": 2.8172974586486816, - "learning_rate": 4.381597958264692e-06, - "loss": 0.6328, - "step": 470 - }, - { - "epoch": 2.297560975609756, - "grad_norm": 2.7418763637542725, - "learning_rate": 4.379073153609896e-06, - "loss": 0.6254, - "step": 471 - }, - { - "epoch": 2.3024390243902437, - "grad_norm": 5.364504337310791, - "learning_rate": 4.37654393593784e-06, - "loss": 0.6793, - "step": 472 - }, - { - "epoch": 2.3073170731707315, - "grad_norm": 2.935291290283203, - "learning_rate": 4.3740103111884096e-06, - "loss": 0.4161, - "step": 473 - }, - { - "epoch": 2.3121951219512193, - "grad_norm": 3.085155963897705, - "learning_rate": 4.371472285311842e-06, - "loss": 0.3329, - "step": 474 - }, - { - "epoch": 2.317073170731707, - "grad_norm": 2.2218778133392334, - "learning_rate": 4.368929864268709e-06, - "loss": 0.2687, - "step": 475 - }, - { - "epoch": 2.321951219512195, - "grad_norm": 3.3985276222229004, - "learning_rate": 4.366383054029907e-06, - "loss": 0.5934, - "step": 476 - }, - { - "epoch": 2.3268292682926828, - "grad_norm": 3.0726048946380615, - "learning_rate": 4.363831860576638e-06, - "loss": 0.5033, - "step": 477 - }, - { - "epoch": 2.3317073170731706, - "grad_norm": 2.728628635406494, - "learning_rate": 4.361276289900396e-06, - "loss": 0.4492, - "step": 478 - }, - { - "epoch": 2.3365853658536584, - "grad_norm": 3.1294424533843994, - "learning_rate": 4.358716348002962e-06, - "loss": 0.619, - "step": 479 - }, - { - "epoch": 2.341463414634146, - "grad_norm": 3.5564961433410645, - "learning_rate": 4.356152040896376e-06, - "loss": 0.4018, - "step": 480 - }, - { - "epoch": 2.346341463414634, - "grad_norm": 2.9329910278320312, - "learning_rate": 4.3535833746029335e-06, - "loss": 0.3062, - "step": 481 - }, - { - "epoch": 2.351219512195122, - "grad_norm": 3.744480848312378, - "learning_rate": 4.351010355155165e-06, - "loss": 0.3387, - "step": 482 - }, - { - "epoch": 2.3560975609756096, - "grad_norm": 2.537912130355835, - "learning_rate": 4.348432988595828e-06, - "loss": 0.3103, - "step": 483 - }, - { - "epoch": 2.3609756097560974, - "grad_norm": 3.232128858566284, - "learning_rate": 4.345851280977885e-06, - "loss": 0.6782, - "step": 484 - }, - { - "epoch": 2.3658536585365852, - "grad_norm": 3.601463794708252, - "learning_rate": 4.343265238364496e-06, - "loss": 0.3195, - "step": 485 - }, - { - "epoch": 2.370731707317073, - "grad_norm": 4.05529260635376, - "learning_rate": 4.340674866829001e-06, - "loss": 0.4639, - "step": 486 - }, - { - "epoch": 2.375609756097561, - "grad_norm": 4.128161430358887, - "learning_rate": 4.338080172454908e-06, - "loss": 0.7229, - "step": 487 - }, - { - "epoch": 2.3804878048780487, - "grad_norm": 2.665430784225464, - "learning_rate": 4.335481161335875e-06, - "loss": 0.4334, - "step": 488 - }, - { - "epoch": 2.3853658536585365, - "grad_norm": 3.777899742126465, - "learning_rate": 4.332877839575699e-06, - "loss": 0.3409, - "step": 489 - }, - { - "epoch": 2.3902439024390243, - "grad_norm": 2.9942116737365723, - "learning_rate": 4.330270213288301e-06, - "loss": 0.5221, - "step": 490 - }, - { - "epoch": 2.395121951219512, - "grad_norm": 3.518601417541504, - "learning_rate": 4.32765828859771e-06, - "loss": 0.7078, - "step": 491 - }, - { - "epoch": 2.4, - "grad_norm": 3.452350378036499, - "learning_rate": 4.325042071638051e-06, - "loss": 0.5902, - "step": 492 - }, - { - "epoch": 2.4048780487804877, - "grad_norm": 3.072655200958252, - "learning_rate": 4.322421568553529e-06, - "loss": 0.3746, - "step": 493 - }, - { - "epoch": 2.4097560975609755, - "grad_norm": 2.8621394634246826, - "learning_rate": 4.319796785498416e-06, - "loss": 0.3474, - "step": 494 - }, - { - "epoch": 2.4146341463414633, - "grad_norm": 3.3891537189483643, - "learning_rate": 4.317167728637032e-06, - "loss": 0.5171, - "step": 495 - }, - { - "epoch": 2.419512195121951, - "grad_norm": 2.505720376968384, - "learning_rate": 4.314534404143738e-06, - "loss": 0.4263, - "step": 496 - }, - { - "epoch": 2.424390243902439, - "grad_norm": 2.6280455589294434, - "learning_rate": 4.3118968182029155e-06, - "loss": 0.5072, - "step": 497 - }, - { - "epoch": 2.4292682926829268, - "grad_norm": 2.703711748123169, - "learning_rate": 4.3092549770089566e-06, - "loss": 0.2742, - "step": 498 - }, - { - "epoch": 2.4341463414634146, - "grad_norm": 3.0358169078826904, - "learning_rate": 4.306608886766243e-06, - "loss": 0.4814, - "step": 499 - }, - { - "epoch": 2.4390243902439024, - "grad_norm": 3.263326406478882, - "learning_rate": 4.303958553689137e-06, - "loss": 0.4188, - "step": 500 - }, - { - "epoch": 2.44390243902439, - "grad_norm": 2.833951950073242, - "learning_rate": 4.3013039840019675e-06, - "loss": 0.6436, - "step": 501 - }, - { - "epoch": 2.448780487804878, - "grad_norm": 3.6790921688079834, - "learning_rate": 4.2986451839390105e-06, - "loss": 0.2862, - "step": 502 - }, - { - "epoch": 2.453658536585366, - "grad_norm": 2.7376418113708496, - "learning_rate": 4.295982159744476e-06, - "loss": 0.4926, - "step": 503 - }, - { - "epoch": 2.4585365853658536, - "grad_norm": 3.575244665145874, - "learning_rate": 4.293314917672498e-06, - "loss": 0.5717, - "step": 504 - }, - { - "epoch": 2.4634146341463414, - "grad_norm": 2.8722269535064697, - "learning_rate": 4.290643463987114e-06, - "loss": 0.2707, - "step": 505 - }, - { - "epoch": 2.4682926829268292, - "grad_norm": 2.8118090629577637, - "learning_rate": 4.287967804962252e-06, - "loss": 0.347, - "step": 506 - }, - { - "epoch": 2.473170731707317, - "grad_norm": 3.345698356628418, - "learning_rate": 4.285287946881718e-06, - "loss": 0.2103, - "step": 507 - }, - { - "epoch": 2.478048780487805, - "grad_norm": 3.0156590938568115, - "learning_rate": 4.282603896039178e-06, - "loss": 0.6405, - "step": 508 - }, - { - "epoch": 2.4829268292682927, - "grad_norm": 3.102205753326416, - "learning_rate": 4.279915658738145e-06, - "loss": 0.4027, - "step": 509 - }, - { - "epoch": 2.4878048780487805, - "grad_norm": 2.8665261268615723, - "learning_rate": 4.277223241291966e-06, - "loss": 0.6503, - "step": 510 - }, - { - "epoch": 2.4926829268292683, - "grad_norm": 2.5396728515625, - "learning_rate": 4.274526650023801e-06, - "loss": 0.5006, - "step": 511 - }, - { - "epoch": 2.497560975609756, - "grad_norm": 3.4846577644348145, - "learning_rate": 4.271825891266617e-06, - "loss": 0.479, - "step": 512 - }, - { - "epoch": 2.502439024390244, - "grad_norm": 4.5995612144470215, - "learning_rate": 4.269120971363164e-06, - "loss": 0.6667, - "step": 513 - }, - { - "epoch": 2.5073170731707317, - "grad_norm": 3.2117559909820557, - "learning_rate": 4.266411896665967e-06, - "loss": 0.2977, - "step": 514 - }, - { - "epoch": 2.5121951219512195, - "grad_norm": 2.798161268234253, - "learning_rate": 4.263698673537309e-06, - "loss": 0.3912, - "step": 515 - }, - { - "epoch": 2.5170731707317073, - "grad_norm": 3.593287944793701, - "learning_rate": 4.260981308349214e-06, - "loss": 0.615, - "step": 516 - }, - { - "epoch": 2.521951219512195, - "grad_norm": 3.06075119972229, - "learning_rate": 4.258259807483434e-06, - "loss": 0.4559, - "step": 517 - }, - { - "epoch": 2.526829268292683, - "grad_norm": 2.893202543258667, - "learning_rate": 4.255534177331435e-06, - "loss": 0.4993, - "step": 518 - }, - { - "epoch": 2.5317073170731708, - "grad_norm": 3.613308906555176, - "learning_rate": 4.252804424294378e-06, - "loss": 0.4581, - "step": 519 - }, - { - "epoch": 2.5365853658536586, - "grad_norm": 3.1191842555999756, - "learning_rate": 4.25007055478311e-06, - "loss": 0.5403, - "step": 520 - }, - { - "epoch": 2.5414634146341464, - "grad_norm": 3.653355836868286, - "learning_rate": 4.247332575218144e-06, - "loss": 0.3658, - "step": 521 - }, - { - "epoch": 2.546341463414634, - "grad_norm": 3.1386306285858154, - "learning_rate": 4.244590492029643e-06, - "loss": 0.6342, - "step": 522 - }, - { - "epoch": 2.551219512195122, - "grad_norm": 3.0894742012023926, - "learning_rate": 4.241844311657411e-06, - "loss": 0.3411, - "step": 523 - }, - { - "epoch": 2.55609756097561, - "grad_norm": 3.205916404724121, - "learning_rate": 4.239094040550875e-06, - "loss": 0.2829, - "step": 524 - }, - { - "epoch": 2.5609756097560976, - "grad_norm": 2.378857374191284, - "learning_rate": 4.236339685169065e-06, - "loss": 0.4749, - "step": 525 - }, - { - "epoch": 2.5658536585365854, - "grad_norm": 3.8657875061035156, - "learning_rate": 4.233581251980604e-06, - "loss": 0.2485, - "step": 526 - }, - { - "epoch": 2.5707317073170732, - "grad_norm": 3.565807580947876, - "learning_rate": 4.230818747463696e-06, - "loss": 0.4488, - "step": 527 - }, - { - "epoch": 2.575609756097561, - "grad_norm": 2.6909685134887695, - "learning_rate": 4.228052178106101e-06, - "loss": 0.4495, - "step": 528 - }, - { - "epoch": 2.580487804878049, - "grad_norm": 2.937680244445801, - "learning_rate": 4.2252815504051285e-06, - "loss": 0.2396, - "step": 529 - }, - { - "epoch": 2.5853658536585367, - "grad_norm": 5.55731201171875, - "learning_rate": 4.222506870867618e-06, - "loss": 0.6784, - "step": 530 - }, - { - "epoch": 2.5902439024390245, - "grad_norm": 2.7388782501220703, - "learning_rate": 4.2197281460099245e-06, - "loss": 0.5543, - "step": 531 - }, - { - "epoch": 2.5951219512195123, - "grad_norm": 3.311134099960327, - "learning_rate": 4.216945382357905e-06, - "loss": 0.5281, - "step": 532 - }, - { - "epoch": 2.6, - "grad_norm": 3.511232376098633, - "learning_rate": 4.214158586446901e-06, - "loss": 0.8019, - "step": 533 - }, - { - "epoch": 2.604878048780488, - "grad_norm": 4.416641712188721, - "learning_rate": 4.211367764821722e-06, - "loss": 0.7769, - "step": 534 - }, - { - "epoch": 2.6097560975609757, - "grad_norm": 2.9849908351898193, - "learning_rate": 4.208572924036634e-06, - "loss": 0.4077, - "step": 535 - }, - { - "epoch": 2.6146341463414635, - "grad_norm": 2.8512160778045654, - "learning_rate": 4.2057740706553415e-06, - "loss": 0.433, - "step": 536 - }, - { - "epoch": 2.6195121951219513, - "grad_norm": 2.6729629039764404, - "learning_rate": 4.202971211250971e-06, - "loss": 0.5957, - "step": 537 - }, - { - "epoch": 2.624390243902439, - "grad_norm": 2.4570281505584717, - "learning_rate": 4.200164352406061e-06, - "loss": 0.3013, - "step": 538 - }, - { - "epoch": 2.629268292682927, - "grad_norm": 3.3771679401397705, - "learning_rate": 4.197353500712539e-06, - "loss": 0.5646, - "step": 539 - }, - { - "epoch": 2.6341463414634148, - "grad_norm": 3.163496494293213, - "learning_rate": 4.1945386627717115e-06, - "loss": 0.4529, - "step": 540 - }, - { - "epoch": 2.6390243902439026, - "grad_norm": 8.32056713104248, - "learning_rate": 4.191719845194246e-06, - "loss": 0.6076, - "step": 541 - }, - { - "epoch": 2.6439024390243904, - "grad_norm": 2.7657363414764404, - "learning_rate": 4.188897054600156e-06, - "loss": 0.4855, - "step": 542 - }, - { - "epoch": 2.648780487804878, - "grad_norm": 3.299283504486084, - "learning_rate": 4.186070297618787e-06, - "loss": 0.5836, - "step": 543 - }, - { - "epoch": 2.653658536585366, - "grad_norm": 2.3928205966949463, - "learning_rate": 4.183239580888799e-06, - "loss": 0.6266, - "step": 544 - }, - { - "epoch": 2.658536585365854, - "grad_norm": 3.395251750946045, - "learning_rate": 4.18040491105815e-06, - "loss": 0.429, - "step": 545 - }, - { - "epoch": 2.6634146341463416, - "grad_norm": 2.690936803817749, - "learning_rate": 4.177566294784085e-06, - "loss": 0.391, - "step": 546 - }, - { - "epoch": 2.6682926829268294, - "grad_norm": 3.7687628269195557, - "learning_rate": 4.174723738733114e-06, - "loss": 0.6548, - "step": 547 - }, - { - "epoch": 2.6731707317073172, - "grad_norm": 2.7884976863861084, - "learning_rate": 4.171877249581001e-06, - "loss": 0.5188, - "step": 548 - }, - { - "epoch": 2.678048780487805, - "grad_norm": 3.0811641216278076, - "learning_rate": 4.169026834012748e-06, - "loss": 0.3494, - "step": 549 - }, - { - "epoch": 2.682926829268293, - "grad_norm": 3.090078592300415, - "learning_rate": 4.166172498722577e-06, - "loss": 0.3621, - "step": 550 - }, - { - "epoch": 2.68780487804878, - "grad_norm": 3.925424098968506, - "learning_rate": 4.163314250413913e-06, - "loss": 0.7187, - "step": 551 - }, - { - "epoch": 2.692682926829268, - "grad_norm": 3.3590312004089355, - "learning_rate": 4.160452095799378e-06, - "loss": 0.428, - "step": 552 - }, - { - "epoch": 2.697560975609756, - "grad_norm": 3.08093523979187, - "learning_rate": 4.157586041600759e-06, - "loss": 0.202, - "step": 553 - }, - { - "epoch": 2.7024390243902436, - "grad_norm": 2.9391448497772217, - "learning_rate": 4.154716094549008e-06, - "loss": 0.5238, - "step": 554 - }, - { - "epoch": 2.7073170731707314, - "grad_norm": 2.9869461059570312, - "learning_rate": 4.151842261384217e-06, - "loss": 0.3073, - "step": 555 - }, - { - "epoch": 2.7121951219512193, - "grad_norm": 3.8973608016967773, - "learning_rate": 4.148964548855603e-06, - "loss": 0.8435, - "step": 556 - }, - { - "epoch": 2.717073170731707, - "grad_norm": 2.3596479892730713, - "learning_rate": 4.146082963721496e-06, - "loss": 0.2562, - "step": 557 - }, - { - "epoch": 2.721951219512195, - "grad_norm": 3.4964873790740967, - "learning_rate": 4.143197512749322e-06, - "loss": 1.0144, - "step": 558 - }, - { - "epoch": 2.7268292682926827, - "grad_norm": 2.8925280570983887, - "learning_rate": 4.140308202715581e-06, - "loss": 0.7581, - "step": 559 - }, - { - "epoch": 2.7317073170731705, - "grad_norm": 2.622724771499634, - "learning_rate": 4.13741504040584e-06, - "loss": 0.3114, - "step": 560 - }, - { - "epoch": 2.7365853658536583, - "grad_norm": 3.775834321975708, - "learning_rate": 4.134518032614713e-06, - "loss": 0.4384, - "step": 561 - }, - { - "epoch": 2.741463414634146, - "grad_norm": 2.691236972808838, - "learning_rate": 4.1316171861458445e-06, - "loss": 0.3141, - "step": 562 - }, - { - "epoch": 2.746341463414634, - "grad_norm": 3.059152841567993, - "learning_rate": 4.128712507811893e-06, - "loss": 0.5777, - "step": 563 - }, - { - "epoch": 2.7512195121951217, - "grad_norm": 2.867432117462158, - "learning_rate": 4.125804004434517e-06, - "loss": 0.5542, - "step": 564 - }, - { - "epoch": 2.7560975609756095, - "grad_norm": 2.796438694000244, - "learning_rate": 4.12289168284436e-06, - "loss": 0.3442, - "step": 565 - }, - { - "epoch": 2.7609756097560973, - "grad_norm": 3.052199125289917, - "learning_rate": 4.119975549881029e-06, - "loss": 0.4754, - "step": 566 - }, - { - "epoch": 2.765853658536585, - "grad_norm": 2.5463602542877197, - "learning_rate": 4.1170556123930846e-06, - "loss": 0.2988, - "step": 567 - }, - { - "epoch": 2.770731707317073, - "grad_norm": 3.003124475479126, - "learning_rate": 4.114131877238021e-06, - "loss": 0.4642, - "step": 568 - }, - { - "epoch": 2.7756097560975608, - "grad_norm": 2.4988298416137695, - "learning_rate": 4.111204351282254e-06, - "loss": 0.3493, - "step": 569 - }, - { - "epoch": 2.7804878048780486, - "grad_norm": 2.7403693199157715, - "learning_rate": 4.108273041401098e-06, - "loss": 0.4007, - "step": 570 - }, - { - "epoch": 2.7853658536585364, - "grad_norm": 4.101940155029297, - "learning_rate": 4.105337954478756e-06, - "loss": 0.7815, - "step": 571 - }, - { - "epoch": 2.790243902439024, - "grad_norm": 3.229969024658203, - "learning_rate": 4.102399097408304e-06, - "loss": 0.6099, - "step": 572 - }, - { - "epoch": 2.795121951219512, - "grad_norm": 3.234693765640259, - "learning_rate": 4.099456477091667e-06, - "loss": 0.2478, - "step": 573 - }, - { - "epoch": 2.8, - "grad_norm": 2.9824702739715576, - "learning_rate": 4.096510100439611e-06, - "loss": 0.6403, - "step": 574 - }, - { - "epoch": 2.8048780487804876, - "grad_norm": 2.8012478351593018, - "learning_rate": 4.093559974371725e-06, - "loss": 0.2509, - "step": 575 - }, - { - "epoch": 2.8097560975609754, - "grad_norm": 2.915400743484497, - "learning_rate": 4.0906061058164e-06, - "loss": 0.7552, - "step": 576 - }, - { - "epoch": 2.8146341463414632, - "grad_norm": 3.467665672302246, - "learning_rate": 4.087648501710819e-06, - "loss": 0.3146, - "step": 577 - }, - { - "epoch": 2.819512195121951, - "grad_norm": 3.1628401279449463, - "learning_rate": 4.084687169000938e-06, - "loss": 0.507, - "step": 578 - }, - { - "epoch": 2.824390243902439, - "grad_norm": 2.4069066047668457, - "learning_rate": 4.081722114641469e-06, - "loss": 0.4116, - "step": 579 - }, - { - "epoch": 2.8292682926829267, - "grad_norm": 3.698174238204956, - "learning_rate": 4.0787533455958626e-06, - "loss": 0.2264, - "step": 580 - }, - { - "epoch": 2.8341463414634145, - "grad_norm": 3.0896191596984863, - "learning_rate": 4.075780868836296e-06, - "loss": 0.3197, - "step": 581 - }, - { - "epoch": 2.8390243902439023, - "grad_norm": 3.098562240600586, - "learning_rate": 4.072804691343653e-06, - "loss": 0.4045, - "step": 582 - }, - { - "epoch": 2.84390243902439, - "grad_norm": 3.9232118129730225, - "learning_rate": 4.069824820107507e-06, - "loss": 0.9564, - "step": 583 - }, - { - "epoch": 2.848780487804878, - "grad_norm": 2.7176268100738525, - "learning_rate": 4.06684126212611e-06, - "loss": 0.2703, - "step": 584 - }, - { - "epoch": 2.8536585365853657, - "grad_norm": 2.4905827045440674, - "learning_rate": 4.063854024406369e-06, - "loss": 0.4828, - "step": 585 - }, - { - "epoch": 2.8585365853658535, - "grad_norm": 2.848784923553467, - "learning_rate": 4.060863113963835e-06, - "loss": 0.4131, - "step": 586 - }, - { - "epoch": 2.8634146341463413, - "grad_norm": 2.599665403366089, - "learning_rate": 4.057868537822683e-06, - "loss": 0.4464, - "step": 587 - }, - { - "epoch": 2.868292682926829, - "grad_norm": 3.1770827770233154, - "learning_rate": 4.054870303015695e-06, - "loss": 0.2825, - "step": 588 - }, - { - "epoch": 2.873170731707317, - "grad_norm": 3.18332839012146, - "learning_rate": 4.05186841658425e-06, - "loss": 0.4438, - "step": 589 - }, - { - "epoch": 2.8780487804878048, - "grad_norm": 2.7485718727111816, - "learning_rate": 4.048862885578301e-06, - "loss": 0.4817, - "step": 590 - }, - { - "epoch": 2.8829268292682926, - "grad_norm": 2.9712934494018555, - "learning_rate": 4.045853717056358e-06, - "loss": 0.5157, - "step": 591 - }, - { - "epoch": 2.8878048780487804, - "grad_norm": 2.246858835220337, - "learning_rate": 4.0428409180854775e-06, - "loss": 0.4029, - "step": 592 - }, - { - "epoch": 2.892682926829268, - "grad_norm": 2.683434247970581, - "learning_rate": 4.039824495741238e-06, - "loss": 0.3796, - "step": 593 - }, - { - "epoch": 2.897560975609756, - "grad_norm": 2.6297569274902344, - "learning_rate": 4.036804457107733e-06, - "loss": 0.4467, - "step": 594 - }, - { - "epoch": 2.902439024390244, - "grad_norm": 5.318776607513428, - "learning_rate": 4.0337808092775435e-06, - "loss": 0.7007, - "step": 595 - }, - { - "epoch": 2.9073170731707316, - "grad_norm": 3.069889783859253, - "learning_rate": 4.030753559351728e-06, - "loss": 0.3219, - "step": 596 - }, - { - "epoch": 2.9121951219512194, - "grad_norm": 1.9730123281478882, - "learning_rate": 4.027722714439808e-06, - "loss": 0.3038, - "step": 597 - }, - { - "epoch": 2.9170731707317072, - "grad_norm": 3.7959916591644287, - "learning_rate": 4.024688281659743e-06, - "loss": 0.7768, - "step": 598 - }, - { - "epoch": 2.921951219512195, - "grad_norm": 3.900886297225952, - "learning_rate": 4.021650268137924e-06, - "loss": 0.4667, - "step": 599 - }, - { - "epoch": 2.926829268292683, - "grad_norm": 2.6155691146850586, - "learning_rate": 4.018608681009143e-06, - "loss": 0.3852, - "step": 600 - }, - { - "epoch": 2.9317073170731707, - "grad_norm": 3.2715704441070557, - "learning_rate": 4.015563527416596e-06, - "loss": 0.4804, - "step": 601 - }, - { - "epoch": 2.9365853658536585, - "grad_norm": 3.001425266265869, - "learning_rate": 4.012514814511844e-06, - "loss": 0.4152, - "step": 602 - }, - { - "epoch": 2.9414634146341463, - "grad_norm": 2.685360908508301, - "learning_rate": 4.009462549454816e-06, - "loss": 0.5029, - "step": 603 - }, - { - "epoch": 2.946341463414634, - "grad_norm": 3.4670183658599854, - "learning_rate": 4.006406739413775e-06, - "loss": 0.4857, - "step": 604 - }, - { - "epoch": 2.951219512195122, - "grad_norm": 3.0613298416137695, - "learning_rate": 4.003347391565317e-06, - "loss": 0.4449, - "step": 605 - }, - { - "epoch": 2.9560975609756097, - "grad_norm": 3.207186698913574, - "learning_rate": 4.000284513094342e-06, - "loss": 0.4808, - "step": 606 - }, - { - "epoch": 2.9609756097560975, - "grad_norm": 2.910578727722168, - "learning_rate": 3.997218111194042e-06, - "loss": 0.4395, - "step": 607 - }, - { - "epoch": 2.9658536585365853, - "grad_norm": 2.581918954849243, - "learning_rate": 3.994148193065886e-06, - "loss": 0.3264, - "step": 608 - }, - { - "epoch": 2.970731707317073, - "grad_norm": 2.6517748832702637, - "learning_rate": 3.991074765919598e-06, - "loss": 0.3285, - "step": 609 - }, - { - "epoch": 2.975609756097561, - "grad_norm": 3.509756088256836, - "learning_rate": 3.987997836973147e-06, - "loss": 0.3638, - "step": 610 - }, - { - "epoch": 2.9804878048780488, - "grad_norm": 2.7382352352142334, - "learning_rate": 3.984917413452721e-06, - "loss": 0.3853, - "step": 611 - }, - { - "epoch": 2.9853658536585366, - "grad_norm": 3.998974323272705, - "learning_rate": 3.981833502592717e-06, - "loss": 0.6411, - "step": 612 - }, - { - "epoch": 2.9902439024390244, - "grad_norm": 3.305126428604126, - "learning_rate": 3.978746111635725e-06, - "loss": 0.2759, - "step": 613 - }, - { - "epoch": 2.995121951219512, - "grad_norm": 3.137300968170166, - "learning_rate": 3.9756552478325045e-06, - "loss": 0.4566, - "step": 614 - }, - { - "epoch": 3.0, - "grad_norm": 2.617291212081909, - "learning_rate": 3.972560918441972e-06, - "loss": 0.2221, - "step": 615 - } - ], - "logging_steps": 1, - "max_steps": 2050, - "num_input_tokens_seen": 0, - "num_train_epochs": 10, - "save_steps": 206, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 1.7702898878539366e+17, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/metallama3_8b/limo/checkpoint-820/chat_template.jinja b/metallama3_8b/limo/checkpoint-820/chat_template.jinja deleted file mode 100644 index 39bd0c9f7fe30aea14eda194fee17703da4a4dbf..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-820/chat_template.jinja +++ /dev/null @@ -1,5 +0,0 @@ -{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> - -'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> - -' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo/checkpoint-820/config.json b/metallama3_8b/limo/checkpoint-820/config.json deleted file mode 100644 index ec5612543540085e09eed37e81b17ae51d1a6973..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-820/config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 128000, - "eos_token_id": 128009, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 8192, - "mlp_bias": false, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 500000.0, - "tie_word_embeddings": false, - "torch_dtype": "float32", - "transformers_version": "4.55.0", - "use_cache": false, - "vocab_size": 128256 -} diff --git a/metallama3_8b/limo/checkpoint-820/generation_config.json b/metallama3_8b/limo/checkpoint-820/generation_config.json deleted file mode 100644 index f53ccb516e57388491adda6b9950bcfa872e93ae..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-820/generation_config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 128000, - "eos_token_id": 128009, - "transformers_version": "4.55.0", - "use_cache": false -} diff --git a/metallama3_8b/limo/checkpoint-820/model-00001-of-00007.safetensors b/metallama3_8b/limo/checkpoint-820/model-00001-of-00007.safetensors deleted file mode 100644 index 53d5fbd892090b80b174cc3b76977d28c694184e..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-820/model-00001-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:aa1c4c74f9bac680450c720bdd9683ce6cf52b29aecdc80f1b0f66dccddf83f8 -size 4886466168 diff --git a/metallama3_8b/limo/checkpoint-820/model-00002-of-00007.safetensors b/metallama3_8b/limo/checkpoint-820/model-00002-of-00007.safetensors deleted file mode 100644 index 847779061bea50a2e4a6273a2fbcd5e0308c6022..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-820/model-00002-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1e34dcb37b9660239d6f618d9c8bd05fdd23a957b3a28e44a6f9683885069077 -size 4832007448 diff --git a/metallama3_8b/limo/checkpoint-820/model-00003-of-00007.safetensors b/metallama3_8b/limo/checkpoint-820/model-00003-of-00007.safetensors deleted file mode 100644 index 3c78ff462c3d2d4920dc05bd82e84105a5cffb09..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-820/model-00003-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7a38e196666618f9089797166c0ce95006a1615871c1396cd78d6f818ffa5dc6 -size 4999813112 diff --git a/metallama3_8b/limo/checkpoint-820/model-00004-of-00007.safetensors b/metallama3_8b/limo/checkpoint-820/model-00004-of-00007.safetensors deleted file mode 100644 index 2148dba882e67b70c572bed4c2b91c190ca40a17..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-820/model-00004-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:de557139e9a80e23fff19b8ed4a8eb493d7cc8ea46b91aaaff2894d5937854c2 -size 4999813128 diff --git a/metallama3_8b/limo/checkpoint-820/model-00005-of-00007.safetensors b/metallama3_8b/limo/checkpoint-820/model-00005-of-00007.safetensors deleted file mode 100644 index edfce315c67c2c47b12ba90287fc467c5a8d7f8d..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-820/model-00005-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:08a1adc91aa97f772858d723e2708558628e56d2e8320a12065d03396e542d80 -size 4832007496 diff --git a/metallama3_8b/limo/checkpoint-820/model-00006-of-00007.safetensors b/metallama3_8b/limo/checkpoint-820/model-00006-of-00007.safetensors deleted file mode 100644 index 1c14c7c6b6939da1cb7e8ac301ee46ea75751fc1..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-820/model-00006-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7ed887f03286cb2d3092e7a3c490d1ffff7a32cb6cb1f934c4c91a54168f44d4 -size 4999813120 diff --git a/metallama3_8b/limo/checkpoint-820/model-00007-of-00007.safetensors b/metallama3_8b/limo/checkpoint-820/model-00007-of-00007.safetensors deleted file mode 100644 index 308a768314d09bf37b920f14562ce76db552abfb..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-820/model-00007-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:016dfd3b2af2941b84fb38d0132e96436f6cccd11b30db547f287748b3dfab17 -size 2571158184 diff --git a/metallama3_8b/limo/checkpoint-820/model.safetensors.index.json b/metallama3_8b/limo/checkpoint-820/model.safetensors.index.json deleted file mode 100644 index 30d31d54f352f0c71ad48745af612a088822fa48..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-820/model.safetensors.index.json +++ /dev/null @@ -1,299 +0,0 @@ -{ - "metadata": { - "total_parameters": 2007565312, - "total_size": 32121044992 - }, - "weight_map": { - "lm_head.weight": "model-00007-of-00007.safetensors", - "model.embed_tokens.weight": "model-00001-of-00007.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.norm.weight": "model-00007-of-00007.safetensors" - } -} diff --git a/metallama3_8b/limo/checkpoint-820/rng_state_0.pth b/metallama3_8b/limo/checkpoint-820/rng_state_0.pth deleted file mode 100644 index f8799407442db08820f995bcf1b9158f696af19f..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-820/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:70cc56408014c410353d4dd58ae9b03f4be043f5f800324f66fd8e20e99b840e -size 15024 diff --git a/metallama3_8b/limo/checkpoint-820/rng_state_1.pth b/metallama3_8b/limo/checkpoint-820/rng_state_1.pth deleted file mode 100644 index aa0c3c6aeaabc038c714a3fcc9b78d186a4cab59..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-820/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:49d1438e98cc9c53a6852464635ce62e9788e61eb3646b73e33813f487c4b6ae -size 15024 diff --git a/metallama3_8b/limo/checkpoint-820/rng_state_2.pth b/metallama3_8b/limo/checkpoint-820/rng_state_2.pth deleted file mode 100644 index 0f39416636e7990907141a415603582d33812fc9..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-820/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4388add9cec90932f8ff0100d27a0574d98e1bad52ff89d44e31967d2b4fbfde -size 15024 diff --git a/metallama3_8b/limo/checkpoint-820/rng_state_3.pth b/metallama3_8b/limo/checkpoint-820/rng_state_3.pth deleted file mode 100644 index d3775bcd497f8ad74ece6675e0bbda89fb7ee6f4..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-820/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a705d6dfaae4f2c1b4b2be6b25a6eb521ffae6fcba21cc1531e97b60037ed079 -size 15024 diff --git a/metallama3_8b/limo/checkpoint-820/scheduler.pt b/metallama3_8b/limo/checkpoint-820/scheduler.pt deleted file mode 100644 index 7f14a1364eb462170a7912373ae13ef8661b3efe..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-820/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:75dfbfd6ef3bbd63df7dc3a93c3a4aff67a923e03d810df00d3977b92bd90070 -size 1064 diff --git a/metallama3_8b/limo/checkpoint-820/special_tokens_map.json b/metallama3_8b/limo/checkpoint-820/special_tokens_map.json deleted file mode 100644 index 14daf4588e61b4e4983af0fccaba4d5500c0977c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-820/special_tokens_map.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "additional_special_tokens": [ - { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } - ], - "bos_token": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": "<|eot_id|>" -} diff --git a/metallama3_8b/limo/checkpoint-820/tokenizer.json b/metallama3_8b/limo/checkpoint-820/tokenizer.json deleted file mode 100644 index 172311123ab62378f1f6d90f3068a676b7d939ed..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-820/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c1dcab308e7cf5970ea38815e0a62887d705c5b436f869ca27a5dcdd40c36a6 -size 17210148 diff --git a/metallama3_8b/limo/checkpoint-820/tokenizer_config.json b/metallama3_8b/limo/checkpoint-820/tokenizer_config.json deleted file mode 100644 index 6739fcd129e717b71b64001dcb25a03c143d66f5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-820/tokenizer_config.json +++ /dev/null @@ -1,2076 +0,0 @@ -{ - "added_tokens_decoder": { - "128000": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128001": { - "content": "<|end_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128002": { - "content": "<|reserved_special_token_0|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128003": { - "content": "<|reserved_special_token_1|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128004": { - "content": "<|reserved_special_token_2|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128005": { - "content": "<|reserved_special_token_3|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128006": { - "content": "<|start_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128007": { - "content": "<|end_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128008": { - "content": "<|reserved_special_token_4|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128009": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128010": { - "content": "<|reserved_special_token_5|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128011": { - "content": "<|reserved_special_token_6|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128012": { - "content": "<|reserved_special_token_7|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128013": { - "content": "<|reserved_special_token_8|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128014": { - "content": "<|reserved_special_token_9|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128015": { - "content": "<|reserved_special_token_10|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128016": { - "content": "<|reserved_special_token_11|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128017": { - "content": "<|reserved_special_token_12|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128018": { - "content": "<|reserved_special_token_13|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128019": { - "content": "<|reserved_special_token_14|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128020": { - "content": "<|reserved_special_token_15|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128021": { - "content": "<|reserved_special_token_16|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128022": { - "content": "<|reserved_special_token_17|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128023": { - "content": "<|reserved_special_token_18|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128024": { - "content": "<|reserved_special_token_19|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128025": { - "content": "<|reserved_special_token_20|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128026": { - "content": "<|reserved_special_token_21|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128027": { - "content": "<|reserved_special_token_22|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128028": { - "content": "<|reserved_special_token_23|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128029": { - "content": "<|reserved_special_token_24|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128030": { - "content": "<|reserved_special_token_25|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128031": { - "content": "<|reserved_special_token_26|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128032": { - "content": "<|reserved_special_token_27|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128033": { - "content": "<|reserved_special_token_28|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128034": { - "content": "<|reserved_special_token_29|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128035": { - "content": "<|reserved_special_token_30|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128036": { - "content": "<|reserved_special_token_31|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128037": { - "content": "<|reserved_special_token_32|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128038": { - "content": "<|reserved_special_token_33|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128039": { - "content": "<|reserved_special_token_34|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128040": { - "content": "<|reserved_special_token_35|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128041": { - "content": "<|reserved_special_token_36|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128042": { - "content": "<|reserved_special_token_37|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128043": { - "content": "<|reserved_special_token_38|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128044": { - "content": "<|reserved_special_token_39|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128045": { - "content": "<|reserved_special_token_40|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128046": { - "content": "<|reserved_special_token_41|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128047": { - "content": "<|reserved_special_token_42|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128048": { - "content": "<|reserved_special_token_43|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128049": { - "content": "<|reserved_special_token_44|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128050": { - "content": "<|reserved_special_token_45|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128051": { - "content": "<|reserved_special_token_46|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128052": { - "content": "<|reserved_special_token_47|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128053": { - "content": "<|reserved_special_token_48|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128054": { - "content": "<|reserved_special_token_49|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128055": { - "content": "<|reserved_special_token_50|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128056": { - "content": "<|reserved_special_token_51|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128057": { - "content": "<|reserved_special_token_52|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128058": { - "content": "<|reserved_special_token_53|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128059": { - "content": "<|reserved_special_token_54|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128060": { - "content": "<|reserved_special_token_55|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128061": { - "content": "<|reserved_special_token_56|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128062": { - "content": "<|reserved_special_token_57|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128063": { - "content": "<|reserved_special_token_58|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128064": { - "content": "<|reserved_special_token_59|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128065": { - "content": "<|reserved_special_token_60|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128066": { - "content": "<|reserved_special_token_61|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128067": { - "content": "<|reserved_special_token_62|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128068": { - "content": "<|reserved_special_token_63|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128069": { - "content": "<|reserved_special_token_64|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128070": { - "content": "<|reserved_special_token_65|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128071": { - "content": "<|reserved_special_token_66|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128072": { - "content": "<|reserved_special_token_67|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128073": { - "content": "<|reserved_special_token_68|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128074": { - "content": "<|reserved_special_token_69|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128075": { - "content": "<|reserved_special_token_70|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128076": { - "content": "<|reserved_special_token_71|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128077": { - "content": "<|reserved_special_token_72|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128078": { - "content": "<|reserved_special_token_73|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128079": { - "content": "<|reserved_special_token_74|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128080": { - "content": "<|reserved_special_token_75|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128081": { - "content": "<|reserved_special_token_76|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128082": { - "content": "<|reserved_special_token_77|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128083": { - "content": "<|reserved_special_token_78|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128084": { - "content": "<|reserved_special_token_79|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128085": { - "content": "<|reserved_special_token_80|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128086": { - "content": "<|reserved_special_token_81|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128087": { - "content": "<|reserved_special_token_82|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128088": { - "content": "<|reserved_special_token_83|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128089": { - "content": "<|reserved_special_token_84|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128090": { - "content": "<|reserved_special_token_85|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128091": { - "content": "<|reserved_special_token_86|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128092": { - "content": "<|reserved_special_token_87|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128093": { - "content": "<|reserved_special_token_88|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128094": { - "content": "<|reserved_special_token_89|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128095": { - "content": "<|reserved_special_token_90|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128096": { - "content": "<|reserved_special_token_91|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128097": { - "content": "<|reserved_special_token_92|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128098": { - "content": "<|reserved_special_token_93|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128099": { - "content": "<|reserved_special_token_94|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128100": { - "content": "<|reserved_special_token_95|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128101": { - "content": "<|reserved_special_token_96|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128102": { - "content": "<|reserved_special_token_97|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128103": { - "content": "<|reserved_special_token_98|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128104": { - "content": "<|reserved_special_token_99|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128105": { - "content": "<|reserved_special_token_100|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128106": { - "content": "<|reserved_special_token_101|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128107": { - "content": "<|reserved_special_token_102|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128108": { - "content": "<|reserved_special_token_103|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128109": { - "content": "<|reserved_special_token_104|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128110": { - "content": "<|reserved_special_token_105|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128111": { - "content": "<|reserved_special_token_106|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128112": { - "content": "<|reserved_special_token_107|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128113": { - "content": "<|reserved_special_token_108|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128114": { - "content": "<|reserved_special_token_109|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128115": { - "content": "<|reserved_special_token_110|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128116": { - "content": "<|reserved_special_token_111|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128117": { - "content": "<|reserved_special_token_112|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128118": { - "content": "<|reserved_special_token_113|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128119": { - "content": "<|reserved_special_token_114|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128120": { - "content": "<|reserved_special_token_115|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128121": { - "content": "<|reserved_special_token_116|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128122": { - "content": "<|reserved_special_token_117|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128123": { - "content": "<|reserved_special_token_118|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128124": { - "content": "<|reserved_special_token_119|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128125": { - "content": "<|reserved_special_token_120|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128126": { - "content": "<|reserved_special_token_121|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128127": { - "content": "<|reserved_special_token_122|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128128": { - "content": "<|reserved_special_token_123|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128129": { - "content": "<|reserved_special_token_124|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128130": { - "content": "<|reserved_special_token_125|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128131": { - "content": "<|reserved_special_token_126|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128132": { - "content": "<|reserved_special_token_127|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128133": { - "content": "<|reserved_special_token_128|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128134": { - "content": "<|reserved_special_token_129|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128135": { - "content": "<|reserved_special_token_130|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128136": { - "content": "<|reserved_special_token_131|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128137": { - "content": "<|reserved_special_token_132|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128138": { - "content": "<|reserved_special_token_133|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128139": { - "content": "<|reserved_special_token_134|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128140": { - "content": "<|reserved_special_token_135|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128141": { - "content": "<|reserved_special_token_136|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128142": { - "content": "<|reserved_special_token_137|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128143": { - "content": "<|reserved_special_token_138|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128144": { - "content": "<|reserved_special_token_139|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128145": { - "content": "<|reserved_special_token_140|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128146": { - "content": "<|reserved_special_token_141|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128147": { - "content": "<|reserved_special_token_142|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128148": { - "content": "<|reserved_special_token_143|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128149": { - "content": "<|reserved_special_token_144|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128150": { - "content": "<|reserved_special_token_145|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128151": { - "content": "<|reserved_special_token_146|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128152": { - "content": "<|reserved_special_token_147|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128153": { - "content": "<|reserved_special_token_148|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128154": { - "content": "<|reserved_special_token_149|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128155": { - "content": "<|reserved_special_token_150|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128156": { - "content": "<|reserved_special_token_151|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128157": { - "content": "<|reserved_special_token_152|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128158": { - "content": "<|reserved_special_token_153|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128159": { - "content": "<|reserved_special_token_154|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128160": { - "content": "<|reserved_special_token_155|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128161": { - "content": "<|reserved_special_token_156|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128162": { - "content": "<|reserved_special_token_157|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128163": { - "content": "<|reserved_special_token_158|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128164": { - "content": "<|reserved_special_token_159|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128165": { - "content": "<|reserved_special_token_160|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128166": { - "content": "<|reserved_special_token_161|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128167": { - "content": "<|reserved_special_token_162|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128168": { - "content": "<|reserved_special_token_163|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128169": { - "content": "<|reserved_special_token_164|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128170": { - "content": "<|reserved_special_token_165|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128171": { - "content": "<|reserved_special_token_166|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128172": { - "content": "<|reserved_special_token_167|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128173": { - "content": "<|reserved_special_token_168|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128174": { - "content": "<|reserved_special_token_169|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128175": { - "content": "<|reserved_special_token_170|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128176": { - "content": "<|reserved_special_token_171|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128177": { - "content": "<|reserved_special_token_172|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128178": { - "content": "<|reserved_special_token_173|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128179": { - "content": "<|reserved_special_token_174|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128180": { - "content": "<|reserved_special_token_175|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128181": { - "content": "<|reserved_special_token_176|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128182": { - "content": "<|reserved_special_token_177|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128183": { - "content": "<|reserved_special_token_178|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128184": { - "content": "<|reserved_special_token_179|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128185": { - "content": "<|reserved_special_token_180|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128186": { - "content": "<|reserved_special_token_181|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128187": { - "content": "<|reserved_special_token_182|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128188": { - "content": "<|reserved_special_token_183|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128189": { - "content": "<|reserved_special_token_184|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128190": { - "content": "<|reserved_special_token_185|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128191": { - "content": "<|reserved_special_token_186|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128192": { - "content": "<|reserved_special_token_187|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128193": { - "content": "<|reserved_special_token_188|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128194": { - "content": "<|reserved_special_token_189|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128195": { - "content": "<|reserved_special_token_190|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128196": { - "content": "<|reserved_special_token_191|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128197": { - "content": "<|reserved_special_token_192|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128198": { - "content": "<|reserved_special_token_193|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128199": { - "content": "<|reserved_special_token_194|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128200": { - "content": "<|reserved_special_token_195|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128201": { - "content": "<|reserved_special_token_196|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128202": { - "content": "<|reserved_special_token_197|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128203": { - "content": "<|reserved_special_token_198|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128204": { - "content": "<|reserved_special_token_199|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128205": { - "content": "<|reserved_special_token_200|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128206": { - "content": "<|reserved_special_token_201|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128207": { - "content": "<|reserved_special_token_202|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128208": { - "content": "<|reserved_special_token_203|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128209": { - "content": "<|reserved_special_token_204|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128210": { - "content": "<|reserved_special_token_205|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128211": { - "content": "<|reserved_special_token_206|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128212": { - "content": "<|reserved_special_token_207|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128213": { - "content": "<|reserved_special_token_208|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128214": { - "content": "<|reserved_special_token_209|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128215": { - "content": "<|reserved_special_token_210|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128216": { - "content": "<|reserved_special_token_211|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128217": { - "content": "<|reserved_special_token_212|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128218": { - "content": "<|reserved_special_token_213|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128219": { - "content": "<|reserved_special_token_214|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128220": { - "content": "<|reserved_special_token_215|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128221": { - "content": "<|reserved_special_token_216|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128222": { - "content": "<|reserved_special_token_217|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128223": { - "content": "<|reserved_special_token_218|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128224": { - "content": "<|reserved_special_token_219|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128225": { - "content": "<|reserved_special_token_220|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128226": { - "content": "<|reserved_special_token_221|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128227": { - "content": "<|reserved_special_token_222|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128228": { - "content": "<|reserved_special_token_223|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128229": { - "content": "<|reserved_special_token_224|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128230": { - "content": "<|reserved_special_token_225|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128231": { - "content": "<|reserved_special_token_226|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128232": { - "content": "<|reserved_special_token_227|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128233": { - "content": "<|reserved_special_token_228|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128234": { - "content": "<|reserved_special_token_229|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128235": { - "content": "<|reserved_special_token_230|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128236": { - "content": "<|reserved_special_token_231|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128237": { - "content": "<|reserved_special_token_232|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128238": { - "content": "<|reserved_special_token_233|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128239": { - "content": "<|reserved_special_token_234|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128240": { - "content": "<|reserved_special_token_235|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128241": { - "content": "<|reserved_special_token_236|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128242": { - "content": "<|reserved_special_token_237|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128243": { - "content": "<|reserved_special_token_238|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128244": { - "content": "<|reserved_special_token_239|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128245": { - "content": "<|reserved_special_token_240|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128246": { - "content": "<|reserved_special_token_241|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128247": { - "content": "<|reserved_special_token_242|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128248": { - "content": "<|reserved_special_token_243|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128249": { - "content": "<|reserved_special_token_244|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128250": { - "content": "<|reserved_special_token_245|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128251": { - "content": "<|reserved_special_token_246|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128252": { - "content": "<|reserved_special_token_247|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128253": { - "content": "<|reserved_special_token_248|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128254": { - "content": "<|reserved_special_token_249|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128255": { - "content": "<|reserved_special_token_250|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128256": { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - } - }, - "additional_special_tokens": [ - "<|eom_id|>" - ], - "bos_token": "<|begin_of_text|>", - "clean_up_tokenization_spaces": true, - "eos_token": "<|eot_id|>", - "extra_special_tokens": {}, - "model_input_names": [ - "input_ids", - "attention_mask" - ], - "model_max_length": 1000000000000000019884624838656, - "pad_token": "<|eot_id|>", - "padding_side": "right", - "split_special_tokens": false, - "tokenizer_class": "PreTrainedTokenizerFast" -} diff --git a/metallama3_8b/limo/checkpoint-820/trainer_state.json b/metallama3_8b/limo/checkpoint-820/trainer_state.json deleted file mode 100644 index 394efb16d1dcbcc8ecb765a424f47595b9e583d6..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/checkpoint-820/trainer_state.json +++ /dev/null @@ -1,5774 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 4.0, - "eval_steps": 500, - "global_step": 820, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.004878048780487805, - "grad_norm": 27.79998016357422, - "learning_rate": 5e-06, - "loss": 1.4179, - "step": 1 - }, - { - "epoch": 0.00975609756097561, - "grad_norm": 4.086409091949463, - "learning_rate": 4.999997064365715e-06, - "loss": 1.1405, - "step": 2 - }, - { - "epoch": 0.014634146341463415, - "grad_norm": 4.499151229858398, - "learning_rate": 4.999988257469751e-06, - "loss": 0.8682, - "step": 3 - }, - { - "epoch": 0.01951219512195122, - "grad_norm": 4.555822849273682, - "learning_rate": 4.999973579332793e-06, - "loss": 0.9961, - "step": 4 - }, - { - "epoch": 0.024390243902439025, - "grad_norm": 5.6235246658325195, - "learning_rate": 4.999953029989312e-06, - "loss": 1.0173, - "step": 5 - }, - { - "epoch": 0.02926829268292683, - "grad_norm": 3.9943182468414307, - "learning_rate": 4.999926609487568e-06, - "loss": 1.1083, - "step": 6 - }, - { - "epoch": 0.03414634146341464, - "grad_norm": 5.685941219329834, - "learning_rate": 4.9998943178896106e-06, - "loss": 1.1109, - "step": 7 - }, - { - "epoch": 0.03902439024390244, - "grad_norm": 15.914257049560547, - "learning_rate": 4.999856155271276e-06, - "loss": 1.821, - "step": 8 - }, - { - "epoch": 0.04390243902439024, - "grad_norm": 4.147185325622559, - "learning_rate": 4.999812121722191e-06, - "loss": 1.0417, - "step": 9 - }, - { - "epoch": 0.04878048780487805, - "grad_norm": 11.123332977294922, - "learning_rate": 4.999762217345766e-06, - "loss": 1.5672, - "step": 10 - }, - { - "epoch": 0.05365853658536585, - "grad_norm": 2.842331886291504, - "learning_rate": 4.999706442259205e-06, - "loss": 0.7297, - "step": 11 - }, - { - "epoch": 0.05853658536585366, - "grad_norm": 37.685062408447266, - "learning_rate": 4.999644796593492e-06, - "loss": 0.9112, - "step": 12 - }, - { - "epoch": 0.06341463414634146, - "grad_norm": 11.214252471923828, - "learning_rate": 4.999577280493407e-06, - "loss": 0.7854, - "step": 13 - }, - { - "epoch": 0.06829268292682927, - "grad_norm": 5.10387659072876, - "learning_rate": 4.99950389411751e-06, - "loss": 1.1317, - "step": 14 - }, - { - "epoch": 0.07317073170731707, - "grad_norm": 3.685403347015381, - "learning_rate": 4.999424637638148e-06, - "loss": 0.7864, - "step": 15 - }, - { - "epoch": 0.07804878048780488, - "grad_norm": 2.9567184448242188, - "learning_rate": 4.999339511241458e-06, - "loss": 0.8494, - "step": 16 - }, - { - "epoch": 0.08292682926829269, - "grad_norm": 11.396956443786621, - "learning_rate": 4.9992485151273584e-06, - "loss": 1.2189, - "step": 17 - }, - { - "epoch": 0.08780487804878048, - "grad_norm": 7.007385730743408, - "learning_rate": 4.999151649509554e-06, - "loss": 1.0532, - "step": 18 - }, - { - "epoch": 0.09268292682926829, - "grad_norm": 3.4347329139709473, - "learning_rate": 4.9990489146155356e-06, - "loss": 1.088, - "step": 19 - }, - { - "epoch": 0.0975609756097561, - "grad_norm": 3.1865031719207764, - "learning_rate": 4.9989403106865765e-06, - "loss": 1.0414, - "step": 20 - }, - { - "epoch": 0.1024390243902439, - "grad_norm": 3.4605791568756104, - "learning_rate": 4.9988258379777334e-06, - "loss": 0.8878, - "step": 21 - }, - { - "epoch": 0.1073170731707317, - "grad_norm": 2.860478639602661, - "learning_rate": 4.998705496757846e-06, - "loss": 0.9151, - "step": 22 - }, - { - "epoch": 0.11219512195121951, - "grad_norm": 9.101946830749512, - "learning_rate": 4.998579287309538e-06, - "loss": 1.4304, - "step": 23 - }, - { - "epoch": 0.11707317073170732, - "grad_norm": 24.21122169494629, - "learning_rate": 4.998447209929211e-06, - "loss": 1.0858, - "step": 24 - }, - { - "epoch": 0.12195121951219512, - "grad_norm": 3.286980152130127, - "learning_rate": 4.998309264927053e-06, - "loss": 0.6571, - "step": 25 - }, - { - "epoch": 0.12682926829268293, - "grad_norm": 4.0232062339782715, - "learning_rate": 4.998165452627025e-06, - "loss": 0.8493, - "step": 26 - }, - { - "epoch": 0.13170731707317074, - "grad_norm": 3.7688663005828857, - "learning_rate": 4.998015773366874e-06, - "loss": 0.9224, - "step": 27 - }, - { - "epoch": 0.13658536585365855, - "grad_norm": 2.9382026195526123, - "learning_rate": 4.997860227498122e-06, - "loss": 0.7588, - "step": 28 - }, - { - "epoch": 0.14146341463414633, - "grad_norm": 4.327457904815674, - "learning_rate": 4.99769881538607e-06, - "loss": 1.1817, - "step": 29 - }, - { - "epoch": 0.14634146341463414, - "grad_norm": 3.47487735748291, - "learning_rate": 4.997531537409794e-06, - "loss": 1.0737, - "step": 30 - }, - { - "epoch": 0.15121951219512195, - "grad_norm": 3.0616214275360107, - "learning_rate": 4.99735839396215e-06, - "loss": 0.7899, - "step": 31 - }, - { - "epoch": 0.15609756097560976, - "grad_norm": 3.065070152282715, - "learning_rate": 4.9971793854497655e-06, - "loss": 0.7745, - "step": 32 - }, - { - "epoch": 0.16097560975609757, - "grad_norm": 3.5202279090881348, - "learning_rate": 4.996994512293042e-06, - "loss": 0.984, - "step": 33 - }, - { - "epoch": 0.16585365853658537, - "grad_norm": 3.421769142150879, - "learning_rate": 4.996803774926157e-06, - "loss": 0.8235, - "step": 34 - }, - { - "epoch": 0.17073170731707318, - "grad_norm": 4.6582207679748535, - "learning_rate": 4.996607173797059e-06, - "loss": 1.3227, - "step": 35 - }, - { - "epoch": 0.17560975609756097, - "grad_norm": 2.9829282760620117, - "learning_rate": 4.996404709367466e-06, - "loss": 0.8854, - "step": 36 - }, - { - "epoch": 0.18048780487804877, - "grad_norm": 2.5982632637023926, - "learning_rate": 4.996196382112868e-06, - "loss": 0.6786, - "step": 37 - }, - { - "epoch": 0.18536585365853658, - "grad_norm": 2.9807393550872803, - "learning_rate": 4.9959821925225235e-06, - "loss": 0.9344, - "step": 38 - }, - { - "epoch": 0.1902439024390244, - "grad_norm": 2.7364351749420166, - "learning_rate": 4.995762141099456e-06, - "loss": 0.814, - "step": 39 - }, - { - "epoch": 0.1951219512195122, - "grad_norm": 3.4324638843536377, - "learning_rate": 4.995536228360461e-06, - "loss": 1.0276, - "step": 40 - }, - { - "epoch": 0.2, - "grad_norm": 2.911834716796875, - "learning_rate": 4.995304454836095e-06, - "loss": 0.9291, - "step": 41 - }, - { - "epoch": 0.2048780487804878, - "grad_norm": 3.0294723510742188, - "learning_rate": 4.9950668210706795e-06, - "loss": 0.8145, - "step": 42 - }, - { - "epoch": 0.2097560975609756, - "grad_norm": 4.681829452514648, - "learning_rate": 4.994823327622299e-06, - "loss": 0.8779, - "step": 43 - }, - { - "epoch": 0.2146341463414634, - "grad_norm": 3.643914222717285, - "learning_rate": 4.9945739750628e-06, - "loss": 0.8196, - "step": 44 - }, - { - "epoch": 0.21951219512195122, - "grad_norm": 2.7542076110839844, - "learning_rate": 4.994318763977789e-06, - "loss": 0.8443, - "step": 45 - }, - { - "epoch": 0.22439024390243903, - "grad_norm": 6.873605728149414, - "learning_rate": 4.994057694966632e-06, - "loss": 1.0328, - "step": 46 - }, - { - "epoch": 0.22926829268292684, - "grad_norm": 3.11810040473938, - "learning_rate": 4.993790768642449e-06, - "loss": 1.0673, - "step": 47 - }, - { - "epoch": 0.23414634146341465, - "grad_norm": 4.360548496246338, - "learning_rate": 4.99351798563212e-06, - "loss": 1.3198, - "step": 48 - }, - { - "epoch": 0.23902439024390243, - "grad_norm": 2.6894314289093018, - "learning_rate": 4.993239346576278e-06, - "loss": 0.8743, - "step": 49 - }, - { - "epoch": 0.24390243902439024, - "grad_norm": 3.2640421390533447, - "learning_rate": 4.99295485212931e-06, - "loss": 1.109, - "step": 50 - }, - { - "epoch": 0.24878048780487805, - "grad_norm": 3.1565866470336914, - "learning_rate": 4.992664502959351e-06, - "loss": 0.9291, - "step": 51 - }, - { - "epoch": 0.25365853658536586, - "grad_norm": 3.4829447269439697, - "learning_rate": 4.99236829974829e-06, - "loss": 0.8159, - "step": 52 - }, - { - "epoch": 0.25853658536585367, - "grad_norm": 2.7535626888275146, - "learning_rate": 4.992066243191762e-06, - "loss": 1.0359, - "step": 53 - }, - { - "epoch": 0.2634146341463415, - "grad_norm": 2.482935905456543, - "learning_rate": 4.991758333999148e-06, - "loss": 0.8091, - "step": 54 - }, - { - "epoch": 0.2682926829268293, - "grad_norm": 2.917445659637451, - "learning_rate": 4.991444572893575e-06, - "loss": 0.6925, - "step": 55 - }, - { - "epoch": 0.2731707317073171, - "grad_norm": 2.9802236557006836, - "learning_rate": 4.991124960611916e-06, - "loss": 0.6329, - "step": 56 - }, - { - "epoch": 0.2780487804878049, - "grad_norm": 2.9677224159240723, - "learning_rate": 4.99079949790478e-06, - "loss": 0.8069, - "step": 57 - }, - { - "epoch": 0.28292682926829266, - "grad_norm": 2.8304293155670166, - "learning_rate": 4.99046818553652e-06, - "loss": 0.8682, - "step": 58 - }, - { - "epoch": 0.28780487804878047, - "grad_norm": 5.253443717956543, - "learning_rate": 4.9901310242852246e-06, - "loss": 1.1069, - "step": 59 - }, - { - "epoch": 0.2926829268292683, - "grad_norm": 3.686016082763672, - "learning_rate": 4.9897880149427206e-06, - "loss": 0.9465, - "step": 60 - }, - { - "epoch": 0.2975609756097561, - "grad_norm": 3.6372263431549072, - "learning_rate": 4.989439158314566e-06, - "loss": 0.9738, - "step": 61 - }, - { - "epoch": 0.3024390243902439, - "grad_norm": 3.0756819248199463, - "learning_rate": 4.989084455220056e-06, - "loss": 0.6417, - "step": 62 - }, - { - "epoch": 0.3073170731707317, - "grad_norm": 3.379222869873047, - "learning_rate": 4.988723906492212e-06, - "loss": 1.0092, - "step": 63 - }, - { - "epoch": 0.3121951219512195, - "grad_norm": 3.4571032524108887, - "learning_rate": 4.988357512977785e-06, - "loss": 0.6691, - "step": 64 - }, - { - "epoch": 0.3170731707317073, - "grad_norm": 3.1982104778289795, - "learning_rate": 4.987985275537252e-06, - "loss": 0.6651, - "step": 65 - }, - { - "epoch": 0.32195121951219513, - "grad_norm": 2.9723124504089355, - "learning_rate": 4.9876071950448185e-06, - "loss": 0.9227, - "step": 66 - }, - { - "epoch": 0.32682926829268294, - "grad_norm": 2.5521399974823, - "learning_rate": 4.987223272388407e-06, - "loss": 0.6664, - "step": 67 - }, - { - "epoch": 0.33170731707317075, - "grad_norm": 2.8934121131896973, - "learning_rate": 4.986833508469663e-06, - "loss": 0.997, - "step": 68 - }, - { - "epoch": 0.33658536585365856, - "grad_norm": 4.7546586990356445, - "learning_rate": 4.98643790420395e-06, - "loss": 0.8551, - "step": 69 - }, - { - "epoch": 0.34146341463414637, - "grad_norm": 3.091616153717041, - "learning_rate": 4.986036460520348e-06, - "loss": 0.8874, - "step": 70 - }, - { - "epoch": 0.3463414634146341, - "grad_norm": 4.1724677085876465, - "learning_rate": 4.98562917836165e-06, - "loss": 1.1393, - "step": 71 - }, - { - "epoch": 0.35121951219512193, - "grad_norm": 2.6568572521209717, - "learning_rate": 4.985216058684362e-06, - "loss": 0.6379, - "step": 72 - }, - { - "epoch": 0.35609756097560974, - "grad_norm": 2.396416187286377, - "learning_rate": 4.984797102458697e-06, - "loss": 1.0292, - "step": 73 - }, - { - "epoch": 0.36097560975609755, - "grad_norm": 3.0667319297790527, - "learning_rate": 4.984372310668579e-06, - "loss": 0.7048, - "step": 74 - }, - { - "epoch": 0.36585365853658536, - "grad_norm": 2.4820518493652344, - "learning_rate": 4.983941684311633e-06, - "loss": 1.2353, - "step": 75 - }, - { - "epoch": 0.37073170731707317, - "grad_norm": 4.062836647033691, - "learning_rate": 4.983505224399188e-06, - "loss": 0.8933, - "step": 76 - }, - { - "epoch": 0.375609756097561, - "grad_norm": 2.4480767250061035, - "learning_rate": 4.983062931956275e-06, - "loss": 0.8221, - "step": 77 - }, - { - "epoch": 0.3804878048780488, - "grad_norm": 3.134138822555542, - "learning_rate": 4.9826148080216195e-06, - "loss": 0.8899, - "step": 78 - }, - { - "epoch": 0.3853658536585366, - "grad_norm": 2.8165836334228516, - "learning_rate": 4.9821608536476445e-06, - "loss": 1.2451, - "step": 79 - }, - { - "epoch": 0.3902439024390244, - "grad_norm": 3.734433650970459, - "learning_rate": 4.981701069900465e-06, - "loss": 0.8536, - "step": 80 - }, - { - "epoch": 0.3951219512195122, - "grad_norm": 2.853421449661255, - "learning_rate": 4.9812354578598876e-06, - "loss": 0.7857, - "step": 81 - }, - { - "epoch": 0.4, - "grad_norm": 2.541687250137329, - "learning_rate": 4.980764018619405e-06, - "loss": 0.8332, - "step": 82 - }, - { - "epoch": 0.40487804878048783, - "grad_norm": 4.405911445617676, - "learning_rate": 4.980286753286196e-06, - "loss": 0.9927, - "step": 83 - }, - { - "epoch": 0.4097560975609756, - "grad_norm": 3.3034985065460205, - "learning_rate": 4.97980366298112e-06, - "loss": 0.8161, - "step": 84 - }, - { - "epoch": 0.4146341463414634, - "grad_norm": 2.6678085327148438, - "learning_rate": 4.97931474883872e-06, - "loss": 0.8017, - "step": 85 - }, - { - "epoch": 0.4195121951219512, - "grad_norm": 2.58524227142334, - "learning_rate": 4.978820012007213e-06, - "loss": 0.8811, - "step": 86 - }, - { - "epoch": 0.424390243902439, - "grad_norm": 2.482597827911377, - "learning_rate": 4.978319453648495e-06, - "loss": 0.9461, - "step": 87 - }, - { - "epoch": 0.4292682926829268, - "grad_norm": 2.5731301307678223, - "learning_rate": 4.977813074938128e-06, - "loss": 0.8835, - "step": 88 - }, - { - "epoch": 0.43414634146341463, - "grad_norm": 2.7914488315582275, - "learning_rate": 4.977300877065347e-06, - "loss": 0.8466, - "step": 89 - }, - { - "epoch": 0.43902439024390244, - "grad_norm": 2.416043758392334, - "learning_rate": 4.976782861233053e-06, - "loss": 0.7132, - "step": 90 - }, - { - "epoch": 0.44390243902439025, - "grad_norm": 3.7616264820098877, - "learning_rate": 4.976259028657812e-06, - "loss": 0.7639, - "step": 91 - }, - { - "epoch": 0.44878048780487806, - "grad_norm": 2.6081621646881104, - "learning_rate": 4.975729380569845e-06, - "loss": 0.8055, - "step": 92 - }, - { - "epoch": 0.45365853658536587, - "grad_norm": 3.3343570232391357, - "learning_rate": 4.975193918213035e-06, - "loss": 0.6042, - "step": 93 - }, - { - "epoch": 0.4585365853658537, - "grad_norm": 2.517544746398926, - "learning_rate": 4.974652642844921e-06, - "loss": 0.7672, - "step": 94 - }, - { - "epoch": 0.4634146341463415, - "grad_norm": 4.173468589782715, - "learning_rate": 4.974105555736693e-06, - "loss": 1.0682, - "step": 95 - }, - { - "epoch": 0.4682926829268293, - "grad_norm": 2.8422317504882812, - "learning_rate": 4.973552658173186e-06, - "loss": 0.7841, - "step": 96 - }, - { - "epoch": 0.47317073170731705, - "grad_norm": 5.042182445526123, - "learning_rate": 4.972993951452887e-06, - "loss": 0.8851, - "step": 97 - }, - { - "epoch": 0.47804878048780486, - "grad_norm": 5.977590560913086, - "learning_rate": 4.9724294368879214e-06, - "loss": 0.9059, - "step": 98 - }, - { - "epoch": 0.48292682926829267, - "grad_norm": 4.227641582489014, - "learning_rate": 4.971859115804055e-06, - "loss": 1.0152, - "step": 99 - }, - { - "epoch": 0.4878048780487805, - "grad_norm": 3.180952548980713, - "learning_rate": 4.9712829895406935e-06, - "loss": 0.8092, - "step": 100 - }, - { - "epoch": 0.4926829268292683, - "grad_norm": 11.220394134521484, - "learning_rate": 4.970701059450872e-06, - "loss": 0.8239, - "step": 101 - }, - { - "epoch": 0.4975609756097561, - "grad_norm": 2.346975088119507, - "learning_rate": 4.970113326901258e-06, - "loss": 0.9283, - "step": 102 - }, - { - "epoch": 0.5024390243902439, - "grad_norm": 2.9470982551574707, - "learning_rate": 4.9695197932721455e-06, - "loss": 0.9429, - "step": 103 - }, - { - "epoch": 0.5073170731707317, - "grad_norm": 3.6048219203948975, - "learning_rate": 4.968920459957453e-06, - "loss": 0.9231, - "step": 104 - }, - { - "epoch": 0.5121951219512195, - "grad_norm": 2.8181886672973633, - "learning_rate": 4.968315328364719e-06, - "loss": 1.0005, - "step": 105 - }, - { - "epoch": 0.5170731707317073, - "grad_norm": 3.114147424697876, - "learning_rate": 4.9677043999151e-06, - "loss": 1.1326, - "step": 106 - }, - { - "epoch": 0.5219512195121951, - "grad_norm": 2.965885639190674, - "learning_rate": 4.967087676043366e-06, - "loss": 0.541, - "step": 107 - }, - { - "epoch": 0.526829268292683, - "grad_norm": 3.098677635192871, - "learning_rate": 4.966465158197897e-06, - "loss": 0.9473, - "step": 108 - }, - { - "epoch": 0.5317073170731708, - "grad_norm": 2.8640191555023193, - "learning_rate": 4.965836847840681e-06, - "loss": 0.6678, - "step": 109 - }, - { - "epoch": 0.5365853658536586, - "grad_norm": 3.0950934886932373, - "learning_rate": 4.96520274644731e-06, - "loss": 0.9251, - "step": 110 - }, - { - "epoch": 0.5414634146341464, - "grad_norm": 2.99444317817688, - "learning_rate": 4.964562855506976e-06, - "loss": 0.7807, - "step": 111 - }, - { - "epoch": 0.5463414634146342, - "grad_norm": 2.348639726638794, - "learning_rate": 4.963917176522466e-06, - "loss": 0.6395, - "step": 112 - }, - { - "epoch": 0.551219512195122, - "grad_norm": 3.5988354682922363, - "learning_rate": 4.963265711010164e-06, - "loss": 1.0658, - "step": 113 - }, - { - "epoch": 0.5560975609756098, - "grad_norm": 3.3423564434051514, - "learning_rate": 4.9626084605000395e-06, - "loss": 0.8974, - "step": 114 - }, - { - "epoch": 0.5609756097560976, - "grad_norm": 2.8353331089019775, - "learning_rate": 4.961945426535652e-06, - "loss": 0.6144, - "step": 115 - }, - { - "epoch": 0.5658536585365853, - "grad_norm": 2.752387046813965, - "learning_rate": 4.961276610674141e-06, - "loss": 0.9083, - "step": 116 - }, - { - "epoch": 0.5707317073170731, - "grad_norm": 2.2654404640197754, - "learning_rate": 4.960602014486225e-06, - "loss": 1.0101, - "step": 117 - }, - { - "epoch": 0.5756097560975609, - "grad_norm": 3.344377040863037, - "learning_rate": 4.959921639556199e-06, - "loss": 0.8391, - "step": 118 - }, - { - "epoch": 0.5804878048780487, - "grad_norm": 3.1620500087738037, - "learning_rate": 4.959235487481928e-06, - "loss": 1.0431, - "step": 119 - }, - { - "epoch": 0.5853658536585366, - "grad_norm": 2.857048273086548, - "learning_rate": 4.958543559874846e-06, - "loss": 0.5864, - "step": 120 - }, - { - "epoch": 0.5902439024390244, - "grad_norm": 3.1736063957214355, - "learning_rate": 4.9578458583599495e-06, - "loss": 0.7868, - "step": 121 - }, - { - "epoch": 0.5951219512195122, - "grad_norm": 3.5520827770233154, - "learning_rate": 4.957142384575795e-06, - "loss": 0.7901, - "step": 122 - }, - { - "epoch": 0.6, - "grad_norm": 3.265103578567505, - "learning_rate": 4.956433140174498e-06, - "loss": 0.9067, - "step": 123 - }, - { - "epoch": 0.6048780487804878, - "grad_norm": 3.1181187629699707, - "learning_rate": 4.9557181268217225e-06, - "loss": 0.8971, - "step": 124 - }, - { - "epoch": 0.6097560975609756, - "grad_norm": 2.4123694896698, - "learning_rate": 4.954997346196683e-06, - "loss": 1.2123, - "step": 125 - }, - { - "epoch": 0.6146341463414634, - "grad_norm": 2.9646875858306885, - "learning_rate": 4.954270799992138e-06, - "loss": 0.7696, - "step": 126 - }, - { - "epoch": 0.6195121951219512, - "grad_norm": 2.7457995414733887, - "learning_rate": 4.953538489914387e-06, - "loss": 0.7919, - "step": 127 - }, - { - "epoch": 0.624390243902439, - "grad_norm": 5.096850395202637, - "learning_rate": 4.9528004176832654e-06, - "loss": 0.6494, - "step": 128 - }, - { - "epoch": 0.6292682926829268, - "grad_norm": 3.124955177307129, - "learning_rate": 4.952056585032142e-06, - "loss": 1.0546, - "step": 129 - }, - { - "epoch": 0.6341463414634146, - "grad_norm": 2.4860167503356934, - "learning_rate": 4.951306993707913e-06, - "loss": 0.7907, - "step": 130 - }, - { - "epoch": 0.6390243902439025, - "grad_norm": 2.3380239009857178, - "learning_rate": 4.950551645470998e-06, - "loss": 0.7433, - "step": 131 - }, - { - "epoch": 0.6439024390243903, - "grad_norm": 2.8945236206054688, - "learning_rate": 4.9497905420953406e-06, - "loss": 0.7682, - "step": 132 - }, - { - "epoch": 0.6487804878048781, - "grad_norm": 3.429776430130005, - "learning_rate": 4.949023685368395e-06, - "loss": 0.8411, - "step": 133 - }, - { - "epoch": 0.6536585365853659, - "grad_norm": 2.8853516578674316, - "learning_rate": 4.948251077091131e-06, - "loss": 1.0792, - "step": 134 - }, - { - "epoch": 0.6585365853658537, - "grad_norm": 2.145598888397217, - "learning_rate": 4.947472719078025e-06, - "loss": 0.8033, - "step": 135 - }, - { - "epoch": 0.6634146341463415, - "grad_norm": 2.5064377784729004, - "learning_rate": 4.9466886131570565e-06, - "loss": 0.939, - "step": 136 - }, - { - "epoch": 0.6682926829268293, - "grad_norm": 2.5700225830078125, - "learning_rate": 4.945898761169704e-06, - "loss": 1.0418, - "step": 137 - }, - { - "epoch": 0.6731707317073171, - "grad_norm": 2.3390917778015137, - "learning_rate": 4.945103164970941e-06, - "loss": 0.6158, - "step": 138 - }, - { - "epoch": 0.6780487804878049, - "grad_norm": 2.1538751125335693, - "learning_rate": 4.9443018264292304e-06, - "loss": 0.6995, - "step": 139 - }, - { - "epoch": 0.6829268292682927, - "grad_norm": 5.255710601806641, - "learning_rate": 4.9434947474265225e-06, - "loss": 1.0382, - "step": 140 - }, - { - "epoch": 0.6878048780487804, - "grad_norm": 2.5547356605529785, - "learning_rate": 4.942681929858249e-06, - "loss": 1.037, - "step": 141 - }, - { - "epoch": 0.6926829268292682, - "grad_norm": 2.613280773162842, - "learning_rate": 4.941863375633315e-06, - "loss": 0.9071, - "step": 142 - }, - { - "epoch": 0.697560975609756, - "grad_norm": 2.9957327842712402, - "learning_rate": 4.9410390866741056e-06, - "loss": 0.7908, - "step": 143 - }, - { - "epoch": 0.7024390243902439, - "grad_norm": 2.410107374191284, - "learning_rate": 4.9402090649164655e-06, - "loss": 0.7739, - "step": 144 - }, - { - "epoch": 0.7073170731707317, - "grad_norm": 2.352013349533081, - "learning_rate": 4.9393733123097085e-06, - "loss": 0.939, - "step": 145 - }, - { - "epoch": 0.7121951219512195, - "grad_norm": 2.5164194107055664, - "learning_rate": 4.9385318308166065e-06, - "loss": 0.8729, - "step": 146 - }, - { - "epoch": 0.7170731707317073, - "grad_norm": 4.213881015777588, - "learning_rate": 4.937684622413385e-06, - "loss": 0.6124, - "step": 147 - }, - { - "epoch": 0.7219512195121951, - "grad_norm": 2.7950191497802734, - "learning_rate": 4.9368316890897185e-06, - "loss": 0.975, - "step": 148 - }, - { - "epoch": 0.7268292682926829, - "grad_norm": 2.8618874549865723, - "learning_rate": 4.9359730328487264e-06, - "loss": 0.5832, - "step": 149 - }, - { - "epoch": 0.7317073170731707, - "grad_norm": 2.6943812370300293, - "learning_rate": 4.935108655706972e-06, - "loss": 0.8124, - "step": 150 - }, - { - "epoch": 0.7365853658536585, - "grad_norm": 3.2164082527160645, - "learning_rate": 4.934238559694448e-06, - "loss": 1.1446, - "step": 151 - }, - { - "epoch": 0.7414634146341463, - "grad_norm": 3.05002498626709, - "learning_rate": 4.9333627468545845e-06, - "loss": 0.7884, - "step": 152 - }, - { - "epoch": 0.7463414634146341, - "grad_norm": 2.863351583480835, - "learning_rate": 4.932481219244231e-06, - "loss": 0.7918, - "step": 153 - }, - { - "epoch": 0.751219512195122, - "grad_norm": 2.4947102069854736, - "learning_rate": 4.931593978933666e-06, - "loss": 0.775, - "step": 154 - }, - { - "epoch": 0.7560975609756098, - "grad_norm": 2.918886184692383, - "learning_rate": 4.930701028006577e-06, - "loss": 0.993, - "step": 155 - }, - { - "epoch": 0.7609756097560976, - "grad_norm": 2.835956573486328, - "learning_rate": 4.929802368560066e-06, - "loss": 0.7911, - "step": 156 - }, - { - "epoch": 0.7658536585365854, - "grad_norm": 3.3073575496673584, - "learning_rate": 4.928898002704642e-06, - "loss": 0.9346, - "step": 157 - }, - { - "epoch": 0.7707317073170732, - "grad_norm": 3.086146354675293, - "learning_rate": 4.927987932564215e-06, - "loss": 0.817, - "step": 158 - }, - { - "epoch": 0.775609756097561, - "grad_norm": 2.5419743061065674, - "learning_rate": 4.927072160276092e-06, - "loss": 0.7918, - "step": 159 - }, - { - "epoch": 0.7804878048780488, - "grad_norm": 3.984297275543213, - "learning_rate": 4.926150687990969e-06, - "loss": 0.7153, - "step": 160 - }, - { - "epoch": 0.7853658536585366, - "grad_norm": 2.4703335762023926, - "learning_rate": 4.925223517872934e-06, - "loss": 0.8982, - "step": 161 - }, - { - "epoch": 0.7902439024390244, - "grad_norm": 2.81785249710083, - "learning_rate": 4.9242906520994484e-06, - "loss": 0.9839, - "step": 162 - }, - { - "epoch": 0.7951219512195122, - "grad_norm": 2.3304924964904785, - "learning_rate": 4.923352092861358e-06, - "loss": 0.8406, - "step": 163 - }, - { - "epoch": 0.8, - "grad_norm": 2.339498519897461, - "learning_rate": 4.922407842362875e-06, - "loss": 0.6602, - "step": 164 - }, - { - "epoch": 0.8048780487804879, - "grad_norm": 3.488255262374878, - "learning_rate": 4.921457902821578e-06, - "loss": 0.9779, - "step": 165 - }, - { - "epoch": 0.8097560975609757, - "grad_norm": 2.8528945446014404, - "learning_rate": 4.920502276468408e-06, - "loss": 0.8821, - "step": 166 - }, - { - "epoch": 0.8146341463414634, - "grad_norm": 3.4649784564971924, - "learning_rate": 4.9195409655476605e-06, - "loss": 0.7539, - "step": 167 - }, - { - "epoch": 0.8195121951219512, - "grad_norm": 2.3109042644500732, - "learning_rate": 4.918573972316982e-06, - "loss": 0.9807, - "step": 168 - }, - { - "epoch": 0.824390243902439, - "grad_norm": 2.678666353225708, - "learning_rate": 4.917601299047361e-06, - "loss": 0.8318, - "step": 169 - }, - { - "epoch": 0.8292682926829268, - "grad_norm": 2.730614185333252, - "learning_rate": 4.916622948023129e-06, - "loss": 0.7816, - "step": 170 - }, - { - "epoch": 0.8341463414634146, - "grad_norm": 2.9835665225982666, - "learning_rate": 4.915638921541952e-06, - "loss": 0.6633, - "step": 171 - }, - { - "epoch": 0.8390243902439024, - "grad_norm": 3.31217360496521, - "learning_rate": 4.914649221914822e-06, - "loss": 0.9296, - "step": 172 - }, - { - "epoch": 0.8439024390243902, - "grad_norm": 2.9021658897399902, - "learning_rate": 4.913653851466057e-06, - "loss": 0.6864, - "step": 173 - }, - { - "epoch": 0.848780487804878, - "grad_norm": 3.3672914505004883, - "learning_rate": 4.912652812533291e-06, - "loss": 0.8599, - "step": 174 - }, - { - "epoch": 0.8536585365853658, - "grad_norm": 2.4871644973754883, - "learning_rate": 4.911646107467472e-06, - "loss": 0.8949, - "step": 175 - }, - { - "epoch": 0.8585365853658536, - "grad_norm": 2.728022813796997, - "learning_rate": 4.9106337386328524e-06, - "loss": 0.9758, - "step": 176 - }, - { - "epoch": 0.8634146341463415, - "grad_norm": 2.704252243041992, - "learning_rate": 4.909615708406991e-06, - "loss": 0.8954, - "step": 177 - }, - { - "epoch": 0.8682926829268293, - "grad_norm": 2.4002223014831543, - "learning_rate": 4.908592019180738e-06, - "loss": 0.7157, - "step": 178 - }, - { - "epoch": 0.8731707317073171, - "grad_norm": 2.1927788257598877, - "learning_rate": 4.907562673358234e-06, - "loss": 0.6358, - "step": 179 - }, - { - "epoch": 0.8780487804878049, - "grad_norm": 2.458500623703003, - "learning_rate": 4.906527673356907e-06, - "loss": 0.6685, - "step": 180 - }, - { - "epoch": 0.8829268292682927, - "grad_norm": 2.5924787521362305, - "learning_rate": 4.905487021607462e-06, - "loss": 0.5686, - "step": 181 - }, - { - "epoch": 0.8878048780487805, - "grad_norm": 3.0923380851745605, - "learning_rate": 4.904440720553876e-06, - "loss": 0.8538, - "step": 182 - }, - { - "epoch": 0.8926829268292683, - "grad_norm": 2.8001527786254883, - "learning_rate": 4.903388772653396e-06, - "loss": 0.8292, - "step": 183 - }, - { - "epoch": 0.8975609756097561, - "grad_norm": 2.4344072341918945, - "learning_rate": 4.902331180376529e-06, - "loss": 0.7946, - "step": 184 - }, - { - "epoch": 0.9024390243902439, - "grad_norm": 2.6313226222991943, - "learning_rate": 4.901267946207038e-06, - "loss": 0.9269, - "step": 185 - }, - { - "epoch": 0.9073170731707317, - "grad_norm": 2.4776692390441895, - "learning_rate": 4.900199072641937e-06, - "loss": 0.7433, - "step": 186 - }, - { - "epoch": 0.9121951219512195, - "grad_norm": 2.339869260787964, - "learning_rate": 4.899124562191484e-06, - "loss": 0.6577, - "step": 187 - }, - { - "epoch": 0.9170731707317074, - "grad_norm": 3.076890468597412, - "learning_rate": 4.8980444173791735e-06, - "loss": 0.5989, - "step": 188 - }, - { - "epoch": 0.9219512195121952, - "grad_norm": 2.83957839012146, - "learning_rate": 4.896958640741735e-06, - "loss": 0.9364, - "step": 189 - }, - { - "epoch": 0.926829268292683, - "grad_norm": 2.770867347717285, - "learning_rate": 4.895867234829121e-06, - "loss": 1.0328, - "step": 190 - }, - { - "epoch": 0.9317073170731708, - "grad_norm": 2.7819619178771973, - "learning_rate": 4.894770202204509e-06, - "loss": 0.772, - "step": 191 - }, - { - "epoch": 0.9365853658536586, - "grad_norm": 3.925703763961792, - "learning_rate": 4.893667545444285e-06, - "loss": 0.8128, - "step": 192 - }, - { - "epoch": 0.9414634146341463, - "grad_norm": 3.034944534301758, - "learning_rate": 4.8925592671380495e-06, - "loss": 0.7418, - "step": 193 - }, - { - "epoch": 0.9463414634146341, - "grad_norm": 2.3350143432617188, - "learning_rate": 4.891445369888601e-06, - "loss": 0.5979, - "step": 194 - }, - { - "epoch": 0.9512195121951219, - "grad_norm": 2.6433160305023193, - "learning_rate": 4.890325856311936e-06, - "loss": 0.9664, - "step": 195 - }, - { - "epoch": 0.9560975609756097, - "grad_norm": 2.715142011642456, - "learning_rate": 4.889200729037241e-06, - "loss": 0.8482, - "step": 196 - }, - { - "epoch": 0.9609756097560975, - "grad_norm": 2.6157352924346924, - "learning_rate": 4.888069990706884e-06, - "loss": 0.7173, - "step": 197 - }, - { - "epoch": 0.9658536585365853, - "grad_norm": 3.7308952808380127, - "learning_rate": 4.886933643976414e-06, - "loss": 0.5433, - "step": 198 - }, - { - "epoch": 0.9707317073170731, - "grad_norm": 3.1134045124053955, - "learning_rate": 4.885791691514548e-06, - "loss": 0.5997, - "step": 199 - }, - { - "epoch": 0.975609756097561, - "grad_norm": 2.421365976333618, - "learning_rate": 4.884644136003172e-06, - "loss": 0.6477, - "step": 200 - }, - { - "epoch": 0.9804878048780488, - "grad_norm": 2.8676180839538574, - "learning_rate": 4.883490980137327e-06, - "loss": 1.3465, - "step": 201 - }, - { - "epoch": 0.9853658536585366, - "grad_norm": 2.236189603805542, - "learning_rate": 4.882332226625208e-06, - "loss": 0.7533, - "step": 202 - }, - { - "epoch": 0.9902439024390244, - "grad_norm": 2.2514970302581787, - "learning_rate": 4.881167878188158e-06, - "loss": 0.8555, - "step": 203 - }, - { - "epoch": 0.9951219512195122, - "grad_norm": 2.6856095790863037, - "learning_rate": 4.8799979375606565e-06, - "loss": 0.7634, - "step": 204 - }, - { - "epoch": 1.0, - "grad_norm": 2.5563852787017822, - "learning_rate": 4.878822407490319e-06, - "loss": 0.66, - "step": 205 - }, - { - "epoch": 1.0048780487804878, - "grad_norm": 4.7092814445495605, - "learning_rate": 4.8776412907378845e-06, - "loss": 0.7429, - "step": 206 - }, - { - "epoch": 1.0097560975609756, - "grad_norm": 2.9133448600769043, - "learning_rate": 4.876454590077216e-06, - "loss": 0.5735, - "step": 207 - }, - { - "epoch": 1.0146341463414634, - "grad_norm": 2.7012641429901123, - "learning_rate": 4.875262308295289e-06, - "loss": 0.8065, - "step": 208 - }, - { - "epoch": 1.0195121951219512, - "grad_norm": 3.703998327255249, - "learning_rate": 4.874064448192185e-06, - "loss": 0.7148, - "step": 209 - }, - { - "epoch": 1.024390243902439, - "grad_norm": 3.044930934906006, - "learning_rate": 4.872861012581088e-06, - "loss": 0.5606, - "step": 210 - }, - { - "epoch": 1.0292682926829269, - "grad_norm": 3.661381244659424, - "learning_rate": 4.871652004288275e-06, - "loss": 0.6492, - "step": 211 - }, - { - "epoch": 1.0341463414634147, - "grad_norm": 3.18344783782959, - "learning_rate": 4.870437426153113e-06, - "loss": 0.633, - "step": 212 - }, - { - "epoch": 1.0390243902439025, - "grad_norm": 4.596707820892334, - "learning_rate": 4.869217281028045e-06, - "loss": 0.842, - "step": 213 - }, - { - "epoch": 1.0439024390243903, - "grad_norm": 4.116331577301025, - "learning_rate": 4.867991571778592e-06, - "loss": 0.8371, - "step": 214 - }, - { - "epoch": 1.048780487804878, - "grad_norm": 3.152939558029175, - "learning_rate": 4.866760301283342e-06, - "loss": 0.4728, - "step": 215 - }, - { - "epoch": 1.053658536585366, - "grad_norm": 2.8732805252075195, - "learning_rate": 4.865523472433942e-06, - "loss": 0.651, - "step": 216 - }, - { - "epoch": 1.0585365853658537, - "grad_norm": 2.967480421066284, - "learning_rate": 4.8642810881350935e-06, - "loss": 0.6361, - "step": 217 - }, - { - "epoch": 1.0634146341463415, - "grad_norm": 2.816798210144043, - "learning_rate": 4.863033151304546e-06, - "loss": 0.6206, - "step": 218 - }, - { - "epoch": 1.0682926829268293, - "grad_norm": 3.168349027633667, - "learning_rate": 4.861779664873088e-06, - "loss": 0.7782, - "step": 219 - }, - { - "epoch": 1.0731707317073171, - "grad_norm": 3.7496471405029297, - "learning_rate": 4.8605206317845425e-06, - "loss": 0.8504, - "step": 220 - }, - { - "epoch": 1.078048780487805, - "grad_norm": 2.7087056636810303, - "learning_rate": 4.859256054995758e-06, - "loss": 0.7771, - "step": 221 - }, - { - "epoch": 1.0829268292682928, - "grad_norm": 2.803703546524048, - "learning_rate": 4.8579859374766e-06, - "loss": 0.4308, - "step": 222 - }, - { - "epoch": 1.0878048780487806, - "grad_norm": 2.4199142456054688, - "learning_rate": 4.856710282209952e-06, - "loss": 0.3739, - "step": 223 - }, - { - "epoch": 1.0926829268292684, - "grad_norm": 2.384037494659424, - "learning_rate": 4.855429092191698e-06, - "loss": 0.6548, - "step": 224 - }, - { - "epoch": 1.0975609756097562, - "grad_norm": 3.0230021476745605, - "learning_rate": 4.854142370430725e-06, - "loss": 0.6932, - "step": 225 - }, - { - "epoch": 1.102439024390244, - "grad_norm": 3.0248661041259766, - "learning_rate": 4.8528501199489045e-06, - "loss": 0.6491, - "step": 226 - }, - { - "epoch": 1.1073170731707318, - "grad_norm": 4.046666145324707, - "learning_rate": 4.851552343781099e-06, - "loss": 0.7946, - "step": 227 - }, - { - "epoch": 1.1121951219512196, - "grad_norm": 2.8751168251037598, - "learning_rate": 4.850249044975145e-06, - "loss": 0.7629, - "step": 228 - }, - { - "epoch": 1.1170731707317074, - "grad_norm": 2.8649816513061523, - "learning_rate": 4.848940226591849e-06, - "loss": 0.9114, - "step": 229 - }, - { - "epoch": 1.1219512195121952, - "grad_norm": 3.2590744495391846, - "learning_rate": 4.847625891704982e-06, - "loss": 0.535, - "step": 230 - }, - { - "epoch": 1.126829268292683, - "grad_norm": 3.230659008026123, - "learning_rate": 4.846306043401268e-06, - "loss": 0.7134, - "step": 231 - }, - { - "epoch": 1.1317073170731708, - "grad_norm": 3.5220088958740234, - "learning_rate": 4.844980684780381e-06, - "loss": 0.5375, - "step": 232 - }, - { - "epoch": 1.1365853658536587, - "grad_norm": 3.074052095413208, - "learning_rate": 4.8436498189549345e-06, - "loss": 0.5486, - "step": 233 - }, - { - "epoch": 1.1414634146341462, - "grad_norm": 2.511216163635254, - "learning_rate": 4.842313449050477e-06, - "loss": 0.5203, - "step": 234 - }, - { - "epoch": 1.146341463414634, - "grad_norm": 2.6082136631011963, - "learning_rate": 4.840971578205486e-06, - "loss": 0.4978, - "step": 235 - }, - { - "epoch": 1.1512195121951219, - "grad_norm": 2.4481778144836426, - "learning_rate": 4.839624209571352e-06, - "loss": 0.348, - "step": 236 - }, - { - "epoch": 1.1560975609756097, - "grad_norm": 2.7532148361206055, - "learning_rate": 4.838271346312381e-06, - "loss": 0.8068, - "step": 237 - }, - { - "epoch": 1.1609756097560975, - "grad_norm": 2.6562349796295166, - "learning_rate": 4.836912991605782e-06, - "loss": 0.8823, - "step": 238 - }, - { - "epoch": 1.1658536585365853, - "grad_norm": 3.032168388366699, - "learning_rate": 4.835549148641663e-06, - "loss": 0.501, - "step": 239 - }, - { - "epoch": 1.170731707317073, - "grad_norm": 3.4816956520080566, - "learning_rate": 4.834179820623018e-06, - "loss": 0.6406, - "step": 240 - }, - { - "epoch": 1.175609756097561, - "grad_norm": 2.480642318725586, - "learning_rate": 4.832805010765724e-06, - "loss": 0.537, - "step": 241 - }, - { - "epoch": 1.1804878048780487, - "grad_norm": 2.7662222385406494, - "learning_rate": 4.831424722298531e-06, - "loss": 0.6464, - "step": 242 - }, - { - "epoch": 1.1853658536585365, - "grad_norm": 3.2929866313934326, - "learning_rate": 4.830038958463061e-06, - "loss": 0.6888, - "step": 243 - }, - { - "epoch": 1.1902439024390243, - "grad_norm": 5.094089031219482, - "learning_rate": 4.828647722513785e-06, - "loss": 0.8342, - "step": 244 - }, - { - "epoch": 1.1951219512195121, - "grad_norm": 3.6679818630218506, - "learning_rate": 4.827251017718034e-06, - "loss": 0.7849, - "step": 245 - }, - { - "epoch": 1.2, - "grad_norm": 3.97290301322937, - "learning_rate": 4.8258488473559794e-06, - "loss": 0.7995, - "step": 246 - }, - { - "epoch": 1.2048780487804878, - "grad_norm": 3.3555023670196533, - "learning_rate": 4.824441214720629e-06, - "loss": 0.8718, - "step": 247 - }, - { - "epoch": 1.2097560975609756, - "grad_norm": 2.309361219406128, - "learning_rate": 4.823028123117818e-06, - "loss": 0.3731, - "step": 248 - }, - { - "epoch": 1.2146341463414634, - "grad_norm": 2.607269763946533, - "learning_rate": 4.8216095758662015e-06, - "loss": 0.7321, - "step": 249 - }, - { - "epoch": 1.2195121951219512, - "grad_norm": 2.5667428970336914, - "learning_rate": 4.82018557629725e-06, - "loss": 0.7561, - "step": 250 - }, - { - "epoch": 1.224390243902439, - "grad_norm": 2.7664871215820312, - "learning_rate": 4.8187561277552376e-06, - "loss": 0.638, - "step": 251 - }, - { - "epoch": 1.2292682926829268, - "grad_norm": 2.2880401611328125, - "learning_rate": 4.817321233597232e-06, - "loss": 0.6996, - "step": 252 - }, - { - "epoch": 1.2341463414634146, - "grad_norm": 2.7615559101104736, - "learning_rate": 4.815880897193095e-06, - "loss": 0.5432, - "step": 253 - }, - { - "epoch": 1.2390243902439024, - "grad_norm": 2.9052155017852783, - "learning_rate": 4.814435121925466e-06, - "loss": 0.781, - "step": 254 - }, - { - "epoch": 1.2439024390243902, - "grad_norm": 3.2035205364227295, - "learning_rate": 4.812983911189761e-06, - "loss": 0.6884, - "step": 255 - }, - { - "epoch": 1.248780487804878, - "grad_norm": 2.8139917850494385, - "learning_rate": 4.811527268394157e-06, - "loss": 0.4984, - "step": 256 - }, - { - "epoch": 1.2536585365853659, - "grad_norm": 2.849602699279785, - "learning_rate": 4.810065196959591e-06, - "loss": 0.553, - "step": 257 - }, - { - "epoch": 1.2585365853658537, - "grad_norm": 2.8745057582855225, - "learning_rate": 4.8085977003197496e-06, - "loss": 0.7955, - "step": 258 - }, - { - "epoch": 1.2634146341463415, - "grad_norm": 3.4053122997283936, - "learning_rate": 4.807124781921059e-06, - "loss": 0.9715, - "step": 259 - }, - { - "epoch": 1.2682926829268293, - "grad_norm": 3.1741702556610107, - "learning_rate": 4.805646445222679e-06, - "loss": 0.6306, - "step": 260 - }, - { - "epoch": 1.273170731707317, - "grad_norm": 2.5348331928253174, - "learning_rate": 4.804162693696494e-06, - "loss": 0.5192, - "step": 261 - }, - { - "epoch": 1.278048780487805, - "grad_norm": 3.2491304874420166, - "learning_rate": 4.802673530827105e-06, - "loss": 0.5369, - "step": 262 - }, - { - "epoch": 1.2829268292682927, - "grad_norm": 2.670273780822754, - "learning_rate": 4.801178960111823e-06, - "loss": 0.5864, - "step": 263 - }, - { - "epoch": 1.2878048780487805, - "grad_norm": 2.5655579566955566, - "learning_rate": 4.799678985060658e-06, - "loss": 0.7864, - "step": 264 - }, - { - "epoch": 1.2926829268292683, - "grad_norm": 2.6352531909942627, - "learning_rate": 4.798173609196314e-06, - "loss": 0.8198, - "step": 265 - }, - { - "epoch": 1.2975609756097561, - "grad_norm": 3.028343677520752, - "learning_rate": 4.796662836054176e-06, - "loss": 0.4621, - "step": 266 - }, - { - "epoch": 1.302439024390244, - "grad_norm": 2.757690191268921, - "learning_rate": 4.795146669182304e-06, - "loss": 0.6237, - "step": 267 - }, - { - "epoch": 1.3073170731707318, - "grad_norm": 2.564842462539673, - "learning_rate": 4.793625112141431e-06, - "loss": 0.4981, - "step": 268 - }, - { - "epoch": 1.3121951219512196, - "grad_norm": 2.69234299659729, - "learning_rate": 4.792098168504943e-06, - "loss": 0.5384, - "step": 269 - }, - { - "epoch": 1.3170731707317074, - "grad_norm": 2.794144868850708, - "learning_rate": 4.790565841858879e-06, - "loss": 0.5535, - "step": 270 - }, - { - "epoch": 1.3219512195121952, - "grad_norm": 2.850296974182129, - "learning_rate": 4.789028135801919e-06, - "loss": 0.7492, - "step": 271 - }, - { - "epoch": 1.326829268292683, - "grad_norm": 3.287806987762451, - "learning_rate": 4.787485053945377e-06, - "loss": 0.8367, - "step": 272 - }, - { - "epoch": 1.3317073170731708, - "grad_norm": 2.479343891143799, - "learning_rate": 4.785936599913193e-06, - "loss": 0.6875, - "step": 273 - }, - { - "epoch": 1.3365853658536586, - "grad_norm": 3.171198844909668, - "learning_rate": 4.784382777341922e-06, - "loss": 0.733, - "step": 274 - }, - { - "epoch": 1.3414634146341464, - "grad_norm": 2.866610050201416, - "learning_rate": 4.782823589880729e-06, - "loss": 0.9719, - "step": 275 - }, - { - "epoch": 1.346341463414634, - "grad_norm": 2.3714404106140137, - "learning_rate": 4.7812590411913755e-06, - "loss": 0.6979, - "step": 276 - }, - { - "epoch": 1.3512195121951218, - "grad_norm": 2.3838706016540527, - "learning_rate": 4.779689134948217e-06, - "loss": 0.9697, - "step": 277 - }, - { - "epoch": 1.3560975609756096, - "grad_norm": 3.2992005348205566, - "learning_rate": 4.77811387483819e-06, - "loss": 0.4799, - "step": 278 - }, - { - "epoch": 1.3609756097560974, - "grad_norm": 3.403024435043335, - "learning_rate": 4.776533264560804e-06, - "loss": 0.7478, - "step": 279 - }, - { - "epoch": 1.3658536585365852, - "grad_norm": 2.669820785522461, - "learning_rate": 4.774947307828134e-06, - "loss": 0.8622, - "step": 280 - }, - { - "epoch": 1.370731707317073, - "grad_norm": 2.4695041179656982, - "learning_rate": 4.773356008364812e-06, - "loss": 0.5792, - "step": 281 - }, - { - "epoch": 1.3756097560975609, - "grad_norm": 3.1744325160980225, - "learning_rate": 4.771759369908017e-06, - "loss": 0.4368, - "step": 282 - }, - { - "epoch": 1.3804878048780487, - "grad_norm": 2.8564929962158203, - "learning_rate": 4.7701573962074635e-06, - "loss": 0.6337, - "step": 283 - }, - { - "epoch": 1.3853658536585365, - "grad_norm": 2.4109890460968018, - "learning_rate": 4.7685500910254015e-06, - "loss": 0.5042, - "step": 284 - }, - { - "epoch": 1.3902439024390243, - "grad_norm": 2.389765977859497, - "learning_rate": 4.766937458136598e-06, - "loss": 0.7427, - "step": 285 - }, - { - "epoch": 1.395121951219512, - "grad_norm": 2.412153720855713, - "learning_rate": 4.765319501328332e-06, - "loss": 0.6956, - "step": 286 - }, - { - "epoch": 1.4, - "grad_norm": 2.6756227016448975, - "learning_rate": 4.763696224400391e-06, - "loss": 0.5152, - "step": 287 - }, - { - "epoch": 1.4048780487804877, - "grad_norm": 2.4644389152526855, - "learning_rate": 4.762067631165049e-06, - "loss": 0.5583, - "step": 288 - }, - { - "epoch": 1.4097560975609755, - "grad_norm": 2.6496896743774414, - "learning_rate": 4.760433725447071e-06, - "loss": 0.6824, - "step": 289 - }, - { - "epoch": 1.4146341463414633, - "grad_norm": 2.9843268394470215, - "learning_rate": 4.758794511083697e-06, - "loss": 0.7914, - "step": 290 - }, - { - "epoch": 1.4195121951219511, - "grad_norm": 3.639101266860962, - "learning_rate": 4.757149991924633e-06, - "loss": 0.6827, - "step": 291 - }, - { - "epoch": 1.424390243902439, - "grad_norm": 3.2047319412231445, - "learning_rate": 4.755500171832045e-06, - "loss": 0.5908, - "step": 292 - }, - { - "epoch": 1.4292682926829268, - "grad_norm": 2.463202953338623, - "learning_rate": 4.753845054680548e-06, - "loss": 0.6469, - "step": 293 - }, - { - "epoch": 1.4341463414634146, - "grad_norm": 2.711195945739746, - "learning_rate": 4.752184644357197e-06, - "loss": 0.5412, - "step": 294 - }, - { - "epoch": 1.4390243902439024, - "grad_norm": 2.239082098007202, - "learning_rate": 4.750518944761477e-06, - "loss": 0.5324, - "step": 295 - }, - { - "epoch": 1.4439024390243902, - "grad_norm": 2.711050271987915, - "learning_rate": 4.748847959805297e-06, - "loss": 0.5317, - "step": 296 - }, - { - "epoch": 1.448780487804878, - "grad_norm": 2.4389946460723877, - "learning_rate": 4.7471716934129774e-06, - "loss": 0.5199, - "step": 297 - }, - { - "epoch": 1.4536585365853658, - "grad_norm": 2.6532390117645264, - "learning_rate": 4.745490149521242e-06, - "loss": 0.4874, - "step": 298 - }, - { - "epoch": 1.4585365853658536, - "grad_norm": 2.2970616817474365, - "learning_rate": 4.743803332079209e-06, - "loss": 0.5416, - "step": 299 - }, - { - "epoch": 1.4634146341463414, - "grad_norm": 2.4206762313842773, - "learning_rate": 4.742111245048382e-06, - "loss": 0.5628, - "step": 300 - }, - { - "epoch": 1.4682926829268292, - "grad_norm": 2.7086844444274902, - "learning_rate": 4.740413892402639e-06, - "loss": 0.5847, - "step": 301 - }, - { - "epoch": 1.473170731707317, - "grad_norm": 2.848602771759033, - "learning_rate": 4.738711278128228e-06, - "loss": 0.5889, - "step": 302 - }, - { - "epoch": 1.4780487804878049, - "grad_norm": 3.5257909297943115, - "learning_rate": 4.7370034062237476e-06, - "loss": 0.3917, - "step": 303 - }, - { - "epoch": 1.4829268292682927, - "grad_norm": 6.47664213180542, - "learning_rate": 4.73529028070015e-06, - "loss": 0.5592, - "step": 304 - }, - { - "epoch": 1.4878048780487805, - "grad_norm": 2.8833930492401123, - "learning_rate": 4.733571905580723e-06, - "loss": 0.843, - "step": 305 - }, - { - "epoch": 1.4926829268292683, - "grad_norm": 2.9924156665802, - "learning_rate": 4.731848284901082e-06, - "loss": 0.7041, - "step": 306 - }, - { - "epoch": 1.497560975609756, - "grad_norm": 2.9858405590057373, - "learning_rate": 4.730119422709165e-06, - "loss": 0.4914, - "step": 307 - }, - { - "epoch": 1.502439024390244, - "grad_norm": 3.4032366275787354, - "learning_rate": 4.728385323065215e-06, - "loss": 0.644, - "step": 308 - }, - { - "epoch": 1.5073170731707317, - "grad_norm": 2.86360502243042, - "learning_rate": 4.7266459900417815e-06, - "loss": 0.5335, - "step": 309 - }, - { - "epoch": 1.5121951219512195, - "grad_norm": 3.183012008666992, - "learning_rate": 4.724901427723698e-06, - "loss": 0.8275, - "step": 310 - }, - { - "epoch": 1.5170731707317073, - "grad_norm": 3.4128706455230713, - "learning_rate": 4.723151640208084e-06, - "loss": 0.4091, - "step": 311 - }, - { - "epoch": 1.5219512195121951, - "grad_norm": 2.765897512435913, - "learning_rate": 4.721396631604327e-06, - "loss": 0.4414, - "step": 312 - }, - { - "epoch": 1.526829268292683, - "grad_norm": 3.2348268032073975, - "learning_rate": 4.7196364060340785e-06, - "loss": 0.5423, - "step": 313 - }, - { - "epoch": 1.5317073170731708, - "grad_norm": 2.7270045280456543, - "learning_rate": 4.7178709676312416e-06, - "loss": 0.8072, - "step": 314 - }, - { - "epoch": 1.5365853658536586, - "grad_norm": 2.525298833847046, - "learning_rate": 4.716100320541961e-06, - "loss": 1.0254, - "step": 315 - }, - { - "epoch": 1.5414634146341464, - "grad_norm": 2.371321678161621, - "learning_rate": 4.714324468924614e-06, - "loss": 0.6541, - "step": 316 - }, - { - "epoch": 1.5463414634146342, - "grad_norm": 3.0820438861846924, - "learning_rate": 4.712543416949803e-06, - "loss": 0.7519, - "step": 317 - }, - { - "epoch": 1.551219512195122, - "grad_norm": 2.710369348526001, - "learning_rate": 4.71075716880034e-06, - "loss": 0.7232, - "step": 318 - }, - { - "epoch": 1.5560975609756098, - "grad_norm": 2.4568352699279785, - "learning_rate": 4.708965728671243e-06, - "loss": 0.8059, - "step": 319 - }, - { - "epoch": 1.5609756097560976, - "grad_norm": 2.7511191368103027, - "learning_rate": 4.7071691007697214e-06, - "loss": 0.6579, - "step": 320 - }, - { - "epoch": 1.5658536585365854, - "grad_norm": 2.6519858837127686, - "learning_rate": 4.705367289315172e-06, - "loss": 0.6989, - "step": 321 - }, - { - "epoch": 1.5707317073170732, - "grad_norm": 2.763019323348999, - "learning_rate": 4.703560298539158e-06, - "loss": 0.4916, - "step": 322 - }, - { - "epoch": 1.575609756097561, - "grad_norm": 2.6480252742767334, - "learning_rate": 4.701748132685415e-06, - "loss": 0.5076, - "step": 323 - }, - { - "epoch": 1.5804878048780489, - "grad_norm": 2.4289543628692627, - "learning_rate": 4.699930796009825e-06, - "loss": 0.559, - "step": 324 - }, - { - "epoch": 1.5853658536585367, - "grad_norm": 4.0515899658203125, - "learning_rate": 4.698108292780418e-06, - "loss": 0.7388, - "step": 325 - }, - { - "epoch": 1.5902439024390245, - "grad_norm": 2.5959129333496094, - "learning_rate": 4.696280627277356e-06, - "loss": 0.5469, - "step": 326 - }, - { - "epoch": 1.5951219512195123, - "grad_norm": 2.3453526496887207, - "learning_rate": 4.6944478037929255e-06, - "loss": 0.5494, - "step": 327 - }, - { - "epoch": 1.6, - "grad_norm": 3.7527170181274414, - "learning_rate": 4.692609826631525e-06, - "loss": 0.7536, - "step": 328 - }, - { - "epoch": 1.604878048780488, - "grad_norm": 3.423588275909424, - "learning_rate": 4.690766700109659e-06, - "loss": 0.4586, - "step": 329 - }, - { - "epoch": 1.6097560975609757, - "grad_norm": 2.620429754257202, - "learning_rate": 4.6889184285559234e-06, - "loss": 0.4799, - "step": 330 - }, - { - "epoch": 1.6146341463414635, - "grad_norm": 6.416718006134033, - "learning_rate": 4.687065016310996e-06, - "loss": 0.7502, - "step": 331 - }, - { - "epoch": 1.6195121951219513, - "grad_norm": 2.7324717044830322, - "learning_rate": 4.685206467727631e-06, - "loss": 0.5923, - "step": 332 - }, - { - "epoch": 1.6243902439024391, - "grad_norm": 2.582935333251953, - "learning_rate": 4.683342787170644e-06, - "loss": 0.5619, - "step": 333 - }, - { - "epoch": 1.629268292682927, - "grad_norm": 2.8339877128601074, - "learning_rate": 4.6814739790169006e-06, - "loss": 0.55, - "step": 334 - }, - { - "epoch": 1.6341463414634148, - "grad_norm": 2.733982563018799, - "learning_rate": 4.679600047655313e-06, - "loss": 0.7243, - "step": 335 - }, - { - "epoch": 1.6390243902439026, - "grad_norm": 3.192747116088867, - "learning_rate": 4.6777209974868194e-06, - "loss": 1.132, - "step": 336 - }, - { - "epoch": 1.6439024390243904, - "grad_norm": 2.5185582637786865, - "learning_rate": 4.675836832924387e-06, - "loss": 0.55, - "step": 337 - }, - { - "epoch": 1.6487804878048782, - "grad_norm": 2.7306225299835205, - "learning_rate": 4.673947558392989e-06, - "loss": 0.4418, - "step": 338 - }, - { - "epoch": 1.653658536585366, - "grad_norm": 2.7026166915893555, - "learning_rate": 4.6720531783296e-06, - "loss": 0.5897, - "step": 339 - }, - { - "epoch": 1.6585365853658538, - "grad_norm": 2.5981674194335938, - "learning_rate": 4.670153697183185e-06, - "loss": 0.5889, - "step": 340 - }, - { - "epoch": 1.6634146341463416, - "grad_norm": 3.0985405445098877, - "learning_rate": 4.668249119414692e-06, - "loss": 0.5607, - "step": 341 - }, - { - "epoch": 1.6682926829268294, - "grad_norm": 2.7609124183654785, - "learning_rate": 4.666339449497033e-06, - "loss": 0.6284, - "step": 342 - }, - { - "epoch": 1.6731707317073172, - "grad_norm": 3.186077356338501, - "learning_rate": 4.664424691915084e-06, - "loss": 0.5751, - "step": 343 - }, - { - "epoch": 1.678048780487805, - "grad_norm": 3.644227981567383, - "learning_rate": 4.6625048511656675e-06, - "loss": 0.586, - "step": 344 - }, - { - "epoch": 1.6829268292682928, - "grad_norm": 3.196373462677002, - "learning_rate": 4.660579931757543e-06, - "loss": 0.5086, - "step": 345 - }, - { - "epoch": 1.6878048780487804, - "grad_norm": 2.7773900032043457, - "learning_rate": 4.6586499382113985e-06, - "loss": 0.5934, - "step": 346 - }, - { - "epoch": 1.6926829268292682, - "grad_norm": 2.3397631645202637, - "learning_rate": 4.6567148750598375e-06, - "loss": 0.7654, - "step": 347 - }, - { - "epoch": 1.697560975609756, - "grad_norm": 2.5567805767059326, - "learning_rate": 4.6547747468473705e-06, - "loss": 0.8908, - "step": 348 - }, - { - "epoch": 1.7024390243902439, - "grad_norm": 2.9218900203704834, - "learning_rate": 4.652829558130404e-06, - "loss": 0.4383, - "step": 349 - }, - { - "epoch": 1.7073170731707317, - "grad_norm": 2.962965250015259, - "learning_rate": 4.6508793134772265e-06, - "loss": 0.6031, - "step": 350 - }, - { - "epoch": 1.7121951219512195, - "grad_norm": 2.487739324569702, - "learning_rate": 4.648924017468003e-06, - "loss": 0.533, - "step": 351 - }, - { - "epoch": 1.7170731707317073, - "grad_norm": 2.769474506378174, - "learning_rate": 4.646963674694761e-06, - "loss": 0.8125, - "step": 352 - }, - { - "epoch": 1.721951219512195, - "grad_norm": 2.678243398666382, - "learning_rate": 4.64499828976138e-06, - "loss": 0.386, - "step": 353 - }, - { - "epoch": 1.726829268292683, - "grad_norm": 3.2764477729797363, - "learning_rate": 4.64302786728358e-06, - "loss": 0.4792, - "step": 354 - }, - { - "epoch": 1.7317073170731707, - "grad_norm": 2.6092708110809326, - "learning_rate": 4.641052411888913e-06, - "loss": 0.5031, - "step": 355 - }, - { - "epoch": 1.7365853658536585, - "grad_norm": 3.4002952575683594, - "learning_rate": 4.6390719282167515e-06, - "loss": 0.4726, - "step": 356 - }, - { - "epoch": 1.7414634146341463, - "grad_norm": 2.7558157444000244, - "learning_rate": 4.637086420918276e-06, - "loss": 0.7794, - "step": 357 - }, - { - "epoch": 1.7463414634146341, - "grad_norm": 2.239021062850952, - "learning_rate": 4.635095894656465e-06, - "loss": 0.6202, - "step": 358 - }, - { - "epoch": 1.751219512195122, - "grad_norm": 2.0502119064331055, - "learning_rate": 4.633100354106085e-06, - "loss": 0.3743, - "step": 359 - }, - { - "epoch": 1.7560975609756098, - "grad_norm": 2.842203140258789, - "learning_rate": 4.631099803953677e-06, - "loss": 0.8143, - "step": 360 - }, - { - "epoch": 1.7609756097560976, - "grad_norm": 2.8408772945404053, - "learning_rate": 4.629094248897546e-06, - "loss": 0.4986, - "step": 361 - }, - { - "epoch": 1.7658536585365854, - "grad_norm": 2.755530595779419, - "learning_rate": 4.627083693647757e-06, - "loss": 0.5833, - "step": 362 - }, - { - "epoch": 1.7707317073170732, - "grad_norm": 2.717116355895996, - "learning_rate": 4.625068142926111e-06, - "loss": 0.885, - "step": 363 - }, - { - "epoch": 1.775609756097561, - "grad_norm": 2.2784435749053955, - "learning_rate": 4.623047601466144e-06, - "loss": 0.7351, - "step": 364 - }, - { - "epoch": 1.7804878048780488, - "grad_norm": 2.3133914470672607, - "learning_rate": 4.621022074013114e-06, - "loss": 0.6426, - "step": 365 - }, - { - "epoch": 1.7853658536585366, - "grad_norm": 3.13562273979187, - "learning_rate": 4.618991565323987e-06, - "loss": 0.5588, - "step": 366 - }, - { - "epoch": 1.7902439024390244, - "grad_norm": 2.458186388015747, - "learning_rate": 4.616956080167426e-06, - "loss": 0.5424, - "step": 367 - }, - { - "epoch": 1.7951219512195122, - "grad_norm": 2.4780080318450928, - "learning_rate": 4.614915623323786e-06, - "loss": 0.8664, - "step": 368 - }, - { - "epoch": 1.8, - "grad_norm": 2.623966932296753, - "learning_rate": 4.612870199585092e-06, - "loss": 0.4495, - "step": 369 - }, - { - "epoch": 1.8048780487804879, - "grad_norm": 2.7326242923736572, - "learning_rate": 4.610819813755038e-06, - "loss": 0.5099, - "step": 370 - }, - { - "epoch": 1.8097560975609757, - "grad_norm": 2.951014757156372, - "learning_rate": 4.608764470648971e-06, - "loss": 0.4322, - "step": 371 - }, - { - "epoch": 1.8146341463414632, - "grad_norm": 2.869870185852051, - "learning_rate": 4.606704175093879e-06, - "loss": 0.4744, - "step": 372 - }, - { - "epoch": 1.819512195121951, - "grad_norm": 2.686054229736328, - "learning_rate": 4.604638931928383e-06, - "loss": 0.797, - "step": 373 - }, - { - "epoch": 1.8243902439024389, - "grad_norm": 2.6421749591827393, - "learning_rate": 4.602568746002718e-06, - "loss": 0.4904, - "step": 374 - }, - { - "epoch": 1.8292682926829267, - "grad_norm": 2.949144124984741, - "learning_rate": 4.600493622178734e-06, - "loss": 0.8682, - "step": 375 - }, - { - "epoch": 1.8341463414634145, - "grad_norm": 2.554733991622925, - "learning_rate": 4.598413565329876e-06, - "loss": 0.5426, - "step": 376 - }, - { - "epoch": 1.8390243902439023, - "grad_norm": 2.3334367275238037, - "learning_rate": 4.596328580341169e-06, - "loss": 0.5628, - "step": 377 - }, - { - "epoch": 1.84390243902439, - "grad_norm": 2.577664613723755, - "learning_rate": 4.5942386721092195e-06, - "loss": 0.7073, - "step": 378 - }, - { - "epoch": 1.848780487804878, - "grad_norm": 3.1247141361236572, - "learning_rate": 4.592143845542189e-06, - "loss": 0.6526, - "step": 379 - }, - { - "epoch": 1.8536585365853657, - "grad_norm": 2.7015256881713867, - "learning_rate": 4.590044105559797e-06, - "loss": 0.8377, - "step": 380 - }, - { - "epoch": 1.8585365853658535, - "grad_norm": 2.573819398880005, - "learning_rate": 4.587939457093296e-06, - "loss": 0.5485, - "step": 381 - }, - { - "epoch": 1.8634146341463413, - "grad_norm": 2.8607687950134277, - "learning_rate": 4.585829905085468e-06, - "loss": 0.6065, - "step": 382 - }, - { - "epoch": 1.8682926829268292, - "grad_norm": 2.526625394821167, - "learning_rate": 4.5837154544906135e-06, - "loss": 0.7812, - "step": 383 - }, - { - "epoch": 1.873170731707317, - "grad_norm": 2.4161314964294434, - "learning_rate": 4.581596110274535e-06, - "loss": 0.7061, - "step": 384 - }, - { - "epoch": 1.8780487804878048, - "grad_norm": 2.34195876121521, - "learning_rate": 4.579471877414527e-06, - "loss": 0.9446, - "step": 385 - }, - { - "epoch": 1.8829268292682926, - "grad_norm": 3.7710156440734863, - "learning_rate": 4.577342760899368e-06, - "loss": 0.78, - "step": 386 - }, - { - "epoch": 1.8878048780487804, - "grad_norm": 2.5192313194274902, - "learning_rate": 4.575208765729302e-06, - "loss": 0.5205, - "step": 387 - }, - { - "epoch": 1.8926829268292682, - "grad_norm": 2.467484951019287, - "learning_rate": 4.573069896916035e-06, - "loss": 0.7827, - "step": 388 - }, - { - "epoch": 1.897560975609756, - "grad_norm": 2.640676259994507, - "learning_rate": 4.5709261594827125e-06, - "loss": 0.6512, - "step": 389 - }, - { - "epoch": 1.9024390243902438, - "grad_norm": 2.976623296737671, - "learning_rate": 4.568777558463922e-06, - "loss": 0.5548, - "step": 390 - }, - { - "epoch": 1.9073170731707316, - "grad_norm": 2.289722442626953, - "learning_rate": 4.566624098905665e-06, - "loss": 0.7038, - "step": 391 - }, - { - "epoch": 1.9121951219512194, - "grad_norm": 2.9512040615081787, - "learning_rate": 4.564465785865359e-06, - "loss": 0.5416, - "step": 392 - }, - { - "epoch": 1.9170731707317072, - "grad_norm": 2.394874095916748, - "learning_rate": 4.56230262441182e-06, - "loss": 0.4068, - "step": 393 - }, - { - "epoch": 1.921951219512195, - "grad_norm": 6.885486602783203, - "learning_rate": 4.560134619625247e-06, - "loss": 0.6197, - "step": 394 - }, - { - "epoch": 1.9268292682926829, - "grad_norm": 2.311272144317627, - "learning_rate": 4.5579617765972155e-06, - "loss": 0.5692, - "step": 395 - }, - { - "epoch": 1.9317073170731707, - "grad_norm": 2.4662933349609375, - "learning_rate": 4.555784100430662e-06, - "loss": 0.4836, - "step": 396 - }, - { - "epoch": 1.9365853658536585, - "grad_norm": 2.602741241455078, - "learning_rate": 4.553601596239877e-06, - "loss": 0.4594, - "step": 397 - }, - { - "epoch": 1.9414634146341463, - "grad_norm": 3.443909168243408, - "learning_rate": 4.551414269150489e-06, - "loss": 0.6053, - "step": 398 - }, - { - "epoch": 1.946341463414634, - "grad_norm": 2.5391502380371094, - "learning_rate": 4.54922212429945e-06, - "loss": 0.5133, - "step": 399 - }, - { - "epoch": 1.951219512195122, - "grad_norm": 2.7105700969696045, - "learning_rate": 4.547025166835027e-06, - "loss": 0.6984, - "step": 400 - }, - { - "epoch": 1.9560975609756097, - "grad_norm": 2.6098098754882812, - "learning_rate": 4.544823401916794e-06, - "loss": 0.7944, - "step": 401 - }, - { - "epoch": 1.9609756097560975, - "grad_norm": 2.7527425289154053, - "learning_rate": 4.542616834715612e-06, - "loss": 0.639, - "step": 402 - }, - { - "epoch": 1.9658536585365853, - "grad_norm": 2.760303258895874, - "learning_rate": 4.540405470413618e-06, - "loss": 0.4229, - "step": 403 - }, - { - "epoch": 1.9707317073170731, - "grad_norm": 2.4989006519317627, - "learning_rate": 4.53818931420422e-06, - "loss": 0.7482, - "step": 404 - }, - { - "epoch": 1.975609756097561, - "grad_norm": 2.3687169551849365, - "learning_rate": 4.535968371292076e-06, - "loss": 0.6146, - "step": 405 - }, - { - "epoch": 1.9804878048780488, - "grad_norm": 2.4285244941711426, - "learning_rate": 4.533742646893086e-06, - "loss": 0.6964, - "step": 406 - }, - { - "epoch": 1.9853658536585366, - "grad_norm": 2.337266206741333, - "learning_rate": 4.531512146234383e-06, - "loss": 0.6248, - "step": 407 - }, - { - "epoch": 1.9902439024390244, - "grad_norm": 2.704972743988037, - "learning_rate": 4.529276874554312e-06, - "loss": 0.8715, - "step": 408 - }, - { - "epoch": 1.9951219512195122, - "grad_norm": 2.2151944637298584, - "learning_rate": 4.527036837102426e-06, - "loss": 0.4945, - "step": 409 - }, - { - "epoch": 2.0, - "grad_norm": 2.691330671310425, - "learning_rate": 4.524792039139471e-06, - "loss": 0.7085, - "step": 410 - }, - { - "epoch": 2.004878048780488, - "grad_norm": 2.9423086643218994, - "learning_rate": 4.522542485937369e-06, - "loss": 0.3178, - "step": 411 - }, - { - "epoch": 2.0097560975609756, - "grad_norm": 2.860677719116211, - "learning_rate": 4.520288182779214e-06, - "loss": 0.5092, - "step": 412 - }, - { - "epoch": 2.0146341463414634, - "grad_norm": 2.7503843307495117, - "learning_rate": 4.518029134959253e-06, - "loss": 0.314, - "step": 413 - }, - { - "epoch": 2.0195121951219512, - "grad_norm": 4.541809558868408, - "learning_rate": 4.515765347782878e-06, - "loss": 0.5287, - "step": 414 - }, - { - "epoch": 2.024390243902439, - "grad_norm": 9.126826286315918, - "learning_rate": 4.5134968265666085e-06, - "loss": 0.8221, - "step": 415 - }, - { - "epoch": 2.029268292682927, - "grad_norm": 4.4358229637146, - "learning_rate": 4.511223576638084e-06, - "loss": 0.5402, - "step": 416 - }, - { - "epoch": 2.0341463414634147, - "grad_norm": 3.1090731620788574, - "learning_rate": 4.508945603336049e-06, - "loss": 0.617, - "step": 417 - }, - { - "epoch": 2.0390243902439025, - "grad_norm": 2.6933369636535645, - "learning_rate": 4.50666291201034e-06, - "loss": 0.3541, - "step": 418 - }, - { - "epoch": 2.0439024390243903, - "grad_norm": 5.898099899291992, - "learning_rate": 4.504375508021876e-06, - "loss": 0.4842, - "step": 419 - }, - { - "epoch": 2.048780487804878, - "grad_norm": 2.950939178466797, - "learning_rate": 4.50208339674264e-06, - "loss": 0.6168, - "step": 420 - }, - { - "epoch": 2.053658536585366, - "grad_norm": 3.2513322830200195, - "learning_rate": 4.499786583555675e-06, - "loss": 0.6425, - "step": 421 - }, - { - "epoch": 2.0585365853658537, - "grad_norm": 2.911562442779541, - "learning_rate": 4.497485073855061e-06, - "loss": 0.364, - "step": 422 - }, - { - "epoch": 2.0634146341463415, - "grad_norm": 4.2179274559021, - "learning_rate": 4.495178873045913e-06, - "loss": 0.3687, - "step": 423 - }, - { - "epoch": 2.0682926829268293, - "grad_norm": 3.2010395526885986, - "learning_rate": 4.4928679865443605e-06, - "loss": 0.4068, - "step": 424 - }, - { - "epoch": 2.073170731707317, - "grad_norm": 3.2425589561462402, - "learning_rate": 4.4905524197775366e-06, - "loss": 0.4759, - "step": 425 - }, - { - "epoch": 2.078048780487805, - "grad_norm": 2.9252519607543945, - "learning_rate": 4.4882321781835666e-06, - "loss": 0.4197, - "step": 426 - }, - { - "epoch": 2.0829268292682928, - "grad_norm": 2.7859911918640137, - "learning_rate": 4.4859072672115565e-06, - "loss": 0.2294, - "step": 427 - }, - { - "epoch": 2.0878048780487806, - "grad_norm": 3.138796091079712, - "learning_rate": 4.483577692321577e-06, - "loss": 0.7572, - "step": 428 - }, - { - "epoch": 2.0926829268292684, - "grad_norm": 3.1447339057922363, - "learning_rate": 4.481243458984651e-06, - "loss": 0.4035, - "step": 429 - }, - { - "epoch": 2.097560975609756, - "grad_norm": 3.1876862049102783, - "learning_rate": 4.478904572682743e-06, - "loss": 0.5776, - "step": 430 - }, - { - "epoch": 2.102439024390244, - "grad_norm": 2.934257745742798, - "learning_rate": 4.476561038908745e-06, - "loss": 0.4005, - "step": 431 - }, - { - "epoch": 2.107317073170732, - "grad_norm": 2.904954433441162, - "learning_rate": 4.474212863166464e-06, - "loss": 0.5689, - "step": 432 - }, - { - "epoch": 2.1121951219512196, - "grad_norm": 3.6023731231689453, - "learning_rate": 4.471860050970608e-06, - "loss": 0.5068, - "step": 433 - }, - { - "epoch": 2.1170731707317074, - "grad_norm": 4.073422431945801, - "learning_rate": 4.469502607846774e-06, - "loss": 0.8349, - "step": 434 - }, - { - "epoch": 2.1219512195121952, - "grad_norm": 2.813789129257202, - "learning_rate": 4.467140539331434e-06, - "loss": 0.3641, - "step": 435 - }, - { - "epoch": 2.126829268292683, - "grad_norm": 3.874516248703003, - "learning_rate": 4.464773850971924e-06, - "loss": 0.222, - "step": 436 - }, - { - "epoch": 2.131707317073171, - "grad_norm": 3.1221084594726562, - "learning_rate": 4.46240254832643e-06, - "loss": 0.3799, - "step": 437 - }, - { - "epoch": 2.1365853658536587, - "grad_norm": 3.298933267593384, - "learning_rate": 4.460026636963971e-06, - "loss": 0.4759, - "step": 438 - }, - { - "epoch": 2.1414634146341465, - "grad_norm": 2.456233024597168, - "learning_rate": 4.4576461224643965e-06, - "loss": 0.384, - "step": 439 - }, - { - "epoch": 2.1463414634146343, - "grad_norm": 2.8427460193634033, - "learning_rate": 4.455261010418359e-06, - "loss": 0.391, - "step": 440 - }, - { - "epoch": 2.151219512195122, - "grad_norm": 3.0267624855041504, - "learning_rate": 4.452871306427314e-06, - "loss": 0.6177, - "step": 441 - }, - { - "epoch": 2.15609756097561, - "grad_norm": 3.437302827835083, - "learning_rate": 4.450477016103498e-06, - "loss": 0.5143, - "step": 442 - }, - { - "epoch": 2.1609756097560977, - "grad_norm": 3.152210235595703, - "learning_rate": 4.4480781450699205e-06, - "loss": 0.3783, - "step": 443 - }, - { - "epoch": 2.1658536585365855, - "grad_norm": 3.507753372192383, - "learning_rate": 4.4456746989603464e-06, - "loss": 0.3574, - "step": 444 - }, - { - "epoch": 2.1707317073170733, - "grad_norm": 2.8855366706848145, - "learning_rate": 4.443266683419289e-06, - "loss": 0.5088, - "step": 445 - }, - { - "epoch": 2.175609756097561, - "grad_norm": 2.7776072025299072, - "learning_rate": 4.440854104101988e-06, - "loss": 0.3773, - "step": 446 - }, - { - "epoch": 2.180487804878049, - "grad_norm": 3.019484281539917, - "learning_rate": 4.438436966674406e-06, - "loss": 0.5002, - "step": 447 - }, - { - "epoch": 2.1853658536585368, - "grad_norm": 3.6962451934814453, - "learning_rate": 4.436015276813208e-06, - "loss": 0.4601, - "step": 448 - }, - { - "epoch": 2.1902439024390246, - "grad_norm": 3.1288888454437256, - "learning_rate": 4.4335890402057505e-06, - "loss": 0.5422, - "step": 449 - }, - { - "epoch": 2.1951219512195124, - "grad_norm": 3.7083234786987305, - "learning_rate": 4.431158262550067e-06, - "loss": 0.4684, - "step": 450 - }, - { - "epoch": 2.2, - "grad_norm": 3.1714789867401123, - "learning_rate": 4.428722949554858e-06, - "loss": 0.2528, - "step": 451 - }, - { - "epoch": 2.204878048780488, - "grad_norm": 3.0773637294769287, - "learning_rate": 4.426283106939474e-06, - "loss": 0.4061, - "step": 452 - }, - { - "epoch": 2.209756097560976, - "grad_norm": 2.604093551635742, - "learning_rate": 4.423838740433903e-06, - "loss": 0.4779, - "step": 453 - }, - { - "epoch": 2.2146341463414636, - "grad_norm": 2.9293880462646484, - "learning_rate": 4.4213898557787586e-06, - "loss": 0.233, - "step": 454 - }, - { - "epoch": 2.2195121951219514, - "grad_norm": 2.9195125102996826, - "learning_rate": 4.4189364587252636e-06, - "loss": 0.7756, - "step": 455 - }, - { - "epoch": 2.2243902439024392, - "grad_norm": 3.2263920307159424, - "learning_rate": 4.416478555035241e-06, - "loss": 0.2806, - "step": 456 - }, - { - "epoch": 2.229268292682927, - "grad_norm": 2.8109211921691895, - "learning_rate": 4.4140161504810935e-06, - "loss": 0.3923, - "step": 457 - }, - { - "epoch": 2.234146341463415, - "grad_norm": 2.645853281021118, - "learning_rate": 4.4115492508457986e-06, - "loss": 0.289, - "step": 458 - }, - { - "epoch": 2.2390243902439027, - "grad_norm": 3.3712451457977295, - "learning_rate": 4.409077861922887e-06, - "loss": 0.5053, - "step": 459 - }, - { - "epoch": 2.2439024390243905, - "grad_norm": 2.6892387866973877, - "learning_rate": 4.406601989516435e-06, - "loss": 0.3363, - "step": 460 - }, - { - "epoch": 2.2487804878048783, - "grad_norm": 2.3195693492889404, - "learning_rate": 4.404121639441047e-06, - "loss": 0.2367, - "step": 461 - }, - { - "epoch": 2.253658536585366, - "grad_norm": 3.0115339756011963, - "learning_rate": 4.401636817521843e-06, - "loss": 0.4942, - "step": 462 - }, - { - "epoch": 2.258536585365854, - "grad_norm": 2.9528865814208984, - "learning_rate": 4.399147529594447e-06, - "loss": 0.3328, - "step": 463 - }, - { - "epoch": 2.2634146341463417, - "grad_norm": 3.110799551010132, - "learning_rate": 4.3966537815049686e-06, - "loss": 0.3917, - "step": 464 - }, - { - "epoch": 2.2682926829268295, - "grad_norm": 3.2973792552948, - "learning_rate": 4.394155579109994e-06, - "loss": 0.5203, - "step": 465 - }, - { - "epoch": 2.2731707317073173, - "grad_norm": 4.7184038162231445, - "learning_rate": 4.391652928276572e-06, - "loss": 0.729, - "step": 466 - }, - { - "epoch": 2.278048780487805, - "grad_norm": 3.1992053985595703, - "learning_rate": 4.389145834882195e-06, - "loss": 0.4822, - "step": 467 - }, - { - "epoch": 2.2829268292682925, - "grad_norm": 4.320055961608887, - "learning_rate": 4.386634304814789e-06, - "loss": 0.3962, - "step": 468 - }, - { - "epoch": 2.2878048780487803, - "grad_norm": 3.704524517059326, - "learning_rate": 4.384118343972704e-06, - "loss": 0.5996, - "step": 469 - }, - { - "epoch": 2.292682926829268, - "grad_norm": 2.8172974586486816, - "learning_rate": 4.381597958264692e-06, - "loss": 0.6328, - "step": 470 - }, - { - "epoch": 2.297560975609756, - "grad_norm": 2.7418763637542725, - "learning_rate": 4.379073153609896e-06, - "loss": 0.6254, - "step": 471 - }, - { - "epoch": 2.3024390243902437, - "grad_norm": 5.364504337310791, - "learning_rate": 4.37654393593784e-06, - "loss": 0.6793, - "step": 472 - }, - { - "epoch": 2.3073170731707315, - "grad_norm": 2.935291290283203, - "learning_rate": 4.3740103111884096e-06, - "loss": 0.4161, - "step": 473 - }, - { - "epoch": 2.3121951219512193, - "grad_norm": 3.085155963897705, - "learning_rate": 4.371472285311842e-06, - "loss": 0.3329, - "step": 474 - }, - { - "epoch": 2.317073170731707, - "grad_norm": 2.2218778133392334, - "learning_rate": 4.368929864268709e-06, - "loss": 0.2687, - "step": 475 - }, - { - "epoch": 2.321951219512195, - "grad_norm": 3.3985276222229004, - "learning_rate": 4.366383054029907e-06, - "loss": 0.5934, - "step": 476 - }, - { - "epoch": 2.3268292682926828, - "grad_norm": 3.0726048946380615, - "learning_rate": 4.363831860576638e-06, - "loss": 0.5033, - "step": 477 - }, - { - "epoch": 2.3317073170731706, - "grad_norm": 2.728628635406494, - "learning_rate": 4.361276289900396e-06, - "loss": 0.4492, - "step": 478 - }, - { - "epoch": 2.3365853658536584, - "grad_norm": 3.1294424533843994, - "learning_rate": 4.358716348002962e-06, - "loss": 0.619, - "step": 479 - }, - { - "epoch": 2.341463414634146, - "grad_norm": 3.5564961433410645, - "learning_rate": 4.356152040896376e-06, - "loss": 0.4018, - "step": 480 - }, - { - "epoch": 2.346341463414634, - "grad_norm": 2.9329910278320312, - "learning_rate": 4.3535833746029335e-06, - "loss": 0.3062, - "step": 481 - }, - { - "epoch": 2.351219512195122, - "grad_norm": 3.744480848312378, - "learning_rate": 4.351010355155165e-06, - "loss": 0.3387, - "step": 482 - }, - { - "epoch": 2.3560975609756096, - "grad_norm": 2.537912130355835, - "learning_rate": 4.348432988595828e-06, - "loss": 0.3103, - "step": 483 - }, - { - "epoch": 2.3609756097560974, - "grad_norm": 3.232128858566284, - "learning_rate": 4.345851280977885e-06, - "loss": 0.6782, - "step": 484 - }, - { - "epoch": 2.3658536585365852, - "grad_norm": 3.601463794708252, - "learning_rate": 4.343265238364496e-06, - "loss": 0.3195, - "step": 485 - }, - { - "epoch": 2.370731707317073, - "grad_norm": 4.05529260635376, - "learning_rate": 4.340674866829001e-06, - "loss": 0.4639, - "step": 486 - }, - { - "epoch": 2.375609756097561, - "grad_norm": 4.128161430358887, - "learning_rate": 4.338080172454908e-06, - "loss": 0.7229, - "step": 487 - }, - { - "epoch": 2.3804878048780487, - "grad_norm": 2.665430784225464, - "learning_rate": 4.335481161335875e-06, - "loss": 0.4334, - "step": 488 - }, - { - "epoch": 2.3853658536585365, - "grad_norm": 3.777899742126465, - "learning_rate": 4.332877839575699e-06, - "loss": 0.3409, - "step": 489 - }, - { - "epoch": 2.3902439024390243, - "grad_norm": 2.9942116737365723, - "learning_rate": 4.330270213288301e-06, - "loss": 0.5221, - "step": 490 - }, - { - "epoch": 2.395121951219512, - "grad_norm": 3.518601417541504, - "learning_rate": 4.32765828859771e-06, - "loss": 0.7078, - "step": 491 - }, - { - "epoch": 2.4, - "grad_norm": 3.452350378036499, - "learning_rate": 4.325042071638051e-06, - "loss": 0.5902, - "step": 492 - }, - { - "epoch": 2.4048780487804877, - "grad_norm": 3.072655200958252, - "learning_rate": 4.322421568553529e-06, - "loss": 0.3746, - "step": 493 - }, - { - "epoch": 2.4097560975609755, - "grad_norm": 2.8621394634246826, - "learning_rate": 4.319796785498416e-06, - "loss": 0.3474, - "step": 494 - }, - { - "epoch": 2.4146341463414633, - "grad_norm": 3.3891537189483643, - "learning_rate": 4.317167728637032e-06, - "loss": 0.5171, - "step": 495 - }, - { - "epoch": 2.419512195121951, - "grad_norm": 2.505720376968384, - "learning_rate": 4.314534404143738e-06, - "loss": 0.4263, - "step": 496 - }, - { - "epoch": 2.424390243902439, - "grad_norm": 2.6280455589294434, - "learning_rate": 4.3118968182029155e-06, - "loss": 0.5072, - "step": 497 - }, - { - "epoch": 2.4292682926829268, - "grad_norm": 2.703711748123169, - "learning_rate": 4.3092549770089566e-06, - "loss": 0.2742, - "step": 498 - }, - { - "epoch": 2.4341463414634146, - "grad_norm": 3.0358169078826904, - "learning_rate": 4.306608886766243e-06, - "loss": 0.4814, - "step": 499 - }, - { - "epoch": 2.4390243902439024, - "grad_norm": 3.263326406478882, - "learning_rate": 4.303958553689137e-06, - "loss": 0.4188, - "step": 500 - }, - { - "epoch": 2.44390243902439, - "grad_norm": 2.833951950073242, - "learning_rate": 4.3013039840019675e-06, - "loss": 0.6436, - "step": 501 - }, - { - "epoch": 2.448780487804878, - "grad_norm": 3.6790921688079834, - "learning_rate": 4.2986451839390105e-06, - "loss": 0.2862, - "step": 502 - }, - { - "epoch": 2.453658536585366, - "grad_norm": 2.7376418113708496, - "learning_rate": 4.295982159744476e-06, - "loss": 0.4926, - "step": 503 - }, - { - "epoch": 2.4585365853658536, - "grad_norm": 3.575244665145874, - "learning_rate": 4.293314917672498e-06, - "loss": 0.5717, - "step": 504 - }, - { - "epoch": 2.4634146341463414, - "grad_norm": 2.8722269535064697, - "learning_rate": 4.290643463987114e-06, - "loss": 0.2707, - "step": 505 - }, - { - "epoch": 2.4682926829268292, - "grad_norm": 2.8118090629577637, - "learning_rate": 4.287967804962252e-06, - "loss": 0.347, - "step": 506 - }, - { - "epoch": 2.473170731707317, - "grad_norm": 3.345698356628418, - "learning_rate": 4.285287946881718e-06, - "loss": 0.2103, - "step": 507 - }, - { - "epoch": 2.478048780487805, - "grad_norm": 3.0156590938568115, - "learning_rate": 4.282603896039178e-06, - "loss": 0.6405, - "step": 508 - }, - { - "epoch": 2.4829268292682927, - "grad_norm": 3.102205753326416, - "learning_rate": 4.279915658738145e-06, - "loss": 0.4027, - "step": 509 - }, - { - "epoch": 2.4878048780487805, - "grad_norm": 2.8665261268615723, - "learning_rate": 4.277223241291966e-06, - "loss": 0.6503, - "step": 510 - }, - { - "epoch": 2.4926829268292683, - "grad_norm": 2.5396728515625, - "learning_rate": 4.274526650023801e-06, - "loss": 0.5006, - "step": 511 - }, - { - "epoch": 2.497560975609756, - "grad_norm": 3.4846577644348145, - "learning_rate": 4.271825891266617e-06, - "loss": 0.479, - "step": 512 - }, - { - "epoch": 2.502439024390244, - "grad_norm": 4.5995612144470215, - "learning_rate": 4.269120971363164e-06, - "loss": 0.6667, - "step": 513 - }, - { - "epoch": 2.5073170731707317, - "grad_norm": 3.2117559909820557, - "learning_rate": 4.266411896665967e-06, - "loss": 0.2977, - "step": 514 - }, - { - "epoch": 2.5121951219512195, - "grad_norm": 2.798161268234253, - "learning_rate": 4.263698673537309e-06, - "loss": 0.3912, - "step": 515 - }, - { - "epoch": 2.5170731707317073, - "grad_norm": 3.593287944793701, - "learning_rate": 4.260981308349214e-06, - "loss": 0.615, - "step": 516 - }, - { - "epoch": 2.521951219512195, - "grad_norm": 3.06075119972229, - "learning_rate": 4.258259807483434e-06, - "loss": 0.4559, - "step": 517 - }, - { - "epoch": 2.526829268292683, - "grad_norm": 2.893202543258667, - "learning_rate": 4.255534177331435e-06, - "loss": 0.4993, - "step": 518 - }, - { - "epoch": 2.5317073170731708, - "grad_norm": 3.613308906555176, - "learning_rate": 4.252804424294378e-06, - "loss": 0.4581, - "step": 519 - }, - { - "epoch": 2.5365853658536586, - "grad_norm": 3.1191842555999756, - "learning_rate": 4.25007055478311e-06, - "loss": 0.5403, - "step": 520 - }, - { - "epoch": 2.5414634146341464, - "grad_norm": 3.653355836868286, - "learning_rate": 4.247332575218144e-06, - "loss": 0.3658, - "step": 521 - }, - { - "epoch": 2.546341463414634, - "grad_norm": 3.1386306285858154, - "learning_rate": 4.244590492029643e-06, - "loss": 0.6342, - "step": 522 - }, - { - "epoch": 2.551219512195122, - "grad_norm": 3.0894742012023926, - "learning_rate": 4.241844311657411e-06, - "loss": 0.3411, - "step": 523 - }, - { - "epoch": 2.55609756097561, - "grad_norm": 3.205916404724121, - "learning_rate": 4.239094040550875e-06, - "loss": 0.2829, - "step": 524 - }, - { - "epoch": 2.5609756097560976, - "grad_norm": 2.378857374191284, - "learning_rate": 4.236339685169065e-06, - "loss": 0.4749, - "step": 525 - }, - { - "epoch": 2.5658536585365854, - "grad_norm": 3.8657875061035156, - "learning_rate": 4.233581251980604e-06, - "loss": 0.2485, - "step": 526 - }, - { - "epoch": 2.5707317073170732, - "grad_norm": 3.565807580947876, - "learning_rate": 4.230818747463696e-06, - "loss": 0.4488, - "step": 527 - }, - { - "epoch": 2.575609756097561, - "grad_norm": 2.6909685134887695, - "learning_rate": 4.228052178106101e-06, - "loss": 0.4495, - "step": 528 - }, - { - "epoch": 2.580487804878049, - "grad_norm": 2.937680244445801, - "learning_rate": 4.2252815504051285e-06, - "loss": 0.2396, - "step": 529 - }, - { - "epoch": 2.5853658536585367, - "grad_norm": 5.55731201171875, - "learning_rate": 4.222506870867618e-06, - "loss": 0.6784, - "step": 530 - }, - { - "epoch": 2.5902439024390245, - "grad_norm": 2.7388782501220703, - "learning_rate": 4.2197281460099245e-06, - "loss": 0.5543, - "step": 531 - }, - { - "epoch": 2.5951219512195123, - "grad_norm": 3.311134099960327, - "learning_rate": 4.216945382357905e-06, - "loss": 0.5281, - "step": 532 - }, - { - "epoch": 2.6, - "grad_norm": 3.511232376098633, - "learning_rate": 4.214158586446901e-06, - "loss": 0.8019, - "step": 533 - }, - { - "epoch": 2.604878048780488, - "grad_norm": 4.416641712188721, - "learning_rate": 4.211367764821722e-06, - "loss": 0.7769, - "step": 534 - }, - { - "epoch": 2.6097560975609757, - "grad_norm": 2.9849908351898193, - "learning_rate": 4.208572924036634e-06, - "loss": 0.4077, - "step": 535 - }, - { - "epoch": 2.6146341463414635, - "grad_norm": 2.8512160778045654, - "learning_rate": 4.2057740706553415e-06, - "loss": 0.433, - "step": 536 - }, - { - "epoch": 2.6195121951219513, - "grad_norm": 2.6729629039764404, - "learning_rate": 4.202971211250971e-06, - "loss": 0.5957, - "step": 537 - }, - { - "epoch": 2.624390243902439, - "grad_norm": 2.4570281505584717, - "learning_rate": 4.200164352406061e-06, - "loss": 0.3013, - "step": 538 - }, - { - "epoch": 2.629268292682927, - "grad_norm": 3.3771679401397705, - "learning_rate": 4.197353500712539e-06, - "loss": 0.5646, - "step": 539 - }, - { - "epoch": 2.6341463414634148, - "grad_norm": 3.163496494293213, - "learning_rate": 4.1945386627717115e-06, - "loss": 0.4529, - "step": 540 - }, - { - "epoch": 2.6390243902439026, - "grad_norm": 8.32056713104248, - "learning_rate": 4.191719845194246e-06, - "loss": 0.6076, - "step": 541 - }, - { - "epoch": 2.6439024390243904, - "grad_norm": 2.7657363414764404, - "learning_rate": 4.188897054600156e-06, - "loss": 0.4855, - "step": 542 - }, - { - "epoch": 2.648780487804878, - "grad_norm": 3.299283504486084, - "learning_rate": 4.186070297618787e-06, - "loss": 0.5836, - "step": 543 - }, - { - "epoch": 2.653658536585366, - "grad_norm": 2.3928205966949463, - "learning_rate": 4.183239580888799e-06, - "loss": 0.6266, - "step": 544 - }, - { - "epoch": 2.658536585365854, - "grad_norm": 3.395251750946045, - "learning_rate": 4.18040491105815e-06, - "loss": 0.429, - "step": 545 - }, - { - "epoch": 2.6634146341463416, - "grad_norm": 2.690936803817749, - "learning_rate": 4.177566294784085e-06, - "loss": 0.391, - "step": 546 - }, - { - "epoch": 2.6682926829268294, - "grad_norm": 3.7687628269195557, - "learning_rate": 4.174723738733114e-06, - "loss": 0.6548, - "step": 547 - }, - { - "epoch": 2.6731707317073172, - "grad_norm": 2.7884976863861084, - "learning_rate": 4.171877249581001e-06, - "loss": 0.5188, - "step": 548 - }, - { - "epoch": 2.678048780487805, - "grad_norm": 3.0811641216278076, - "learning_rate": 4.169026834012748e-06, - "loss": 0.3494, - "step": 549 - }, - { - "epoch": 2.682926829268293, - "grad_norm": 3.090078592300415, - "learning_rate": 4.166172498722577e-06, - "loss": 0.3621, - "step": 550 - }, - { - "epoch": 2.68780487804878, - "grad_norm": 3.925424098968506, - "learning_rate": 4.163314250413913e-06, - "loss": 0.7187, - "step": 551 - }, - { - "epoch": 2.692682926829268, - "grad_norm": 3.3590312004089355, - "learning_rate": 4.160452095799378e-06, - "loss": 0.428, - "step": 552 - }, - { - "epoch": 2.697560975609756, - "grad_norm": 3.08093523979187, - "learning_rate": 4.157586041600759e-06, - "loss": 0.202, - "step": 553 - }, - { - "epoch": 2.7024390243902436, - "grad_norm": 2.9391448497772217, - "learning_rate": 4.154716094549008e-06, - "loss": 0.5238, - "step": 554 - }, - { - "epoch": 2.7073170731707314, - "grad_norm": 2.9869461059570312, - "learning_rate": 4.151842261384217e-06, - "loss": 0.3073, - "step": 555 - }, - { - "epoch": 2.7121951219512193, - "grad_norm": 3.8973608016967773, - "learning_rate": 4.148964548855603e-06, - "loss": 0.8435, - "step": 556 - }, - { - "epoch": 2.717073170731707, - "grad_norm": 2.3596479892730713, - "learning_rate": 4.146082963721496e-06, - "loss": 0.2562, - "step": 557 - }, - { - "epoch": 2.721951219512195, - "grad_norm": 3.4964873790740967, - "learning_rate": 4.143197512749322e-06, - "loss": 1.0144, - "step": 558 - }, - { - "epoch": 2.7268292682926827, - "grad_norm": 2.8925280570983887, - "learning_rate": 4.140308202715581e-06, - "loss": 0.7581, - "step": 559 - }, - { - "epoch": 2.7317073170731705, - "grad_norm": 2.622724771499634, - "learning_rate": 4.13741504040584e-06, - "loss": 0.3114, - "step": 560 - }, - { - "epoch": 2.7365853658536583, - "grad_norm": 3.775834321975708, - "learning_rate": 4.134518032614713e-06, - "loss": 0.4384, - "step": 561 - }, - { - "epoch": 2.741463414634146, - "grad_norm": 2.691236972808838, - "learning_rate": 4.1316171861458445e-06, - "loss": 0.3141, - "step": 562 - }, - { - "epoch": 2.746341463414634, - "grad_norm": 3.059152841567993, - "learning_rate": 4.128712507811893e-06, - "loss": 0.5777, - "step": 563 - }, - { - "epoch": 2.7512195121951217, - "grad_norm": 2.867432117462158, - "learning_rate": 4.125804004434517e-06, - "loss": 0.5542, - "step": 564 - }, - { - "epoch": 2.7560975609756095, - "grad_norm": 2.796438694000244, - "learning_rate": 4.12289168284436e-06, - "loss": 0.3442, - "step": 565 - }, - { - "epoch": 2.7609756097560973, - "grad_norm": 3.052199125289917, - "learning_rate": 4.119975549881029e-06, - "loss": 0.4754, - "step": 566 - }, - { - "epoch": 2.765853658536585, - "grad_norm": 2.5463602542877197, - "learning_rate": 4.1170556123930846e-06, - "loss": 0.2988, - "step": 567 - }, - { - "epoch": 2.770731707317073, - "grad_norm": 3.003124475479126, - "learning_rate": 4.114131877238021e-06, - "loss": 0.4642, - "step": 568 - }, - { - "epoch": 2.7756097560975608, - "grad_norm": 2.4988298416137695, - "learning_rate": 4.111204351282254e-06, - "loss": 0.3493, - "step": 569 - }, - { - "epoch": 2.7804878048780486, - "grad_norm": 2.7403693199157715, - "learning_rate": 4.108273041401098e-06, - "loss": 0.4007, - "step": 570 - }, - { - "epoch": 2.7853658536585364, - "grad_norm": 4.101940155029297, - "learning_rate": 4.105337954478756e-06, - "loss": 0.7815, - "step": 571 - }, - { - "epoch": 2.790243902439024, - "grad_norm": 3.229969024658203, - "learning_rate": 4.102399097408304e-06, - "loss": 0.6099, - "step": 572 - }, - { - "epoch": 2.795121951219512, - "grad_norm": 3.234693765640259, - "learning_rate": 4.099456477091667e-06, - "loss": 0.2478, - "step": 573 - }, - { - "epoch": 2.8, - "grad_norm": 2.9824702739715576, - "learning_rate": 4.096510100439611e-06, - "loss": 0.6403, - "step": 574 - }, - { - "epoch": 2.8048780487804876, - "grad_norm": 2.8012478351593018, - "learning_rate": 4.093559974371725e-06, - "loss": 0.2509, - "step": 575 - }, - { - "epoch": 2.8097560975609754, - "grad_norm": 2.915400743484497, - "learning_rate": 4.0906061058164e-06, - "loss": 0.7552, - "step": 576 - }, - { - "epoch": 2.8146341463414632, - "grad_norm": 3.467665672302246, - "learning_rate": 4.087648501710819e-06, - "loss": 0.3146, - "step": 577 - }, - { - "epoch": 2.819512195121951, - "grad_norm": 3.1628401279449463, - "learning_rate": 4.084687169000938e-06, - "loss": 0.507, - "step": 578 - }, - { - "epoch": 2.824390243902439, - "grad_norm": 2.4069066047668457, - "learning_rate": 4.081722114641469e-06, - "loss": 0.4116, - "step": 579 - }, - { - "epoch": 2.8292682926829267, - "grad_norm": 3.698174238204956, - "learning_rate": 4.0787533455958626e-06, - "loss": 0.2264, - "step": 580 - }, - { - "epoch": 2.8341463414634145, - "grad_norm": 3.0896191596984863, - "learning_rate": 4.075780868836296e-06, - "loss": 0.3197, - "step": 581 - }, - { - "epoch": 2.8390243902439023, - "grad_norm": 3.098562240600586, - "learning_rate": 4.072804691343653e-06, - "loss": 0.4045, - "step": 582 - }, - { - "epoch": 2.84390243902439, - "grad_norm": 3.9232118129730225, - "learning_rate": 4.069824820107507e-06, - "loss": 0.9564, - "step": 583 - }, - { - "epoch": 2.848780487804878, - "grad_norm": 2.7176268100738525, - "learning_rate": 4.06684126212611e-06, - "loss": 0.2703, - "step": 584 - }, - { - "epoch": 2.8536585365853657, - "grad_norm": 2.4905827045440674, - "learning_rate": 4.063854024406369e-06, - "loss": 0.4828, - "step": 585 - }, - { - "epoch": 2.8585365853658535, - "grad_norm": 2.848784923553467, - "learning_rate": 4.060863113963835e-06, - "loss": 0.4131, - "step": 586 - }, - { - "epoch": 2.8634146341463413, - "grad_norm": 2.599665403366089, - "learning_rate": 4.057868537822683e-06, - "loss": 0.4464, - "step": 587 - }, - { - "epoch": 2.868292682926829, - "grad_norm": 3.1770827770233154, - "learning_rate": 4.054870303015695e-06, - "loss": 0.2825, - "step": 588 - }, - { - "epoch": 2.873170731707317, - "grad_norm": 3.18332839012146, - "learning_rate": 4.05186841658425e-06, - "loss": 0.4438, - "step": 589 - }, - { - "epoch": 2.8780487804878048, - "grad_norm": 2.7485718727111816, - "learning_rate": 4.048862885578301e-06, - "loss": 0.4817, - "step": 590 - }, - { - "epoch": 2.8829268292682926, - "grad_norm": 2.9712934494018555, - "learning_rate": 4.045853717056358e-06, - "loss": 0.5157, - "step": 591 - }, - { - "epoch": 2.8878048780487804, - "grad_norm": 2.246858835220337, - "learning_rate": 4.0428409180854775e-06, - "loss": 0.4029, - "step": 592 - }, - { - "epoch": 2.892682926829268, - "grad_norm": 2.683434247970581, - "learning_rate": 4.039824495741238e-06, - "loss": 0.3796, - "step": 593 - }, - { - "epoch": 2.897560975609756, - "grad_norm": 2.6297569274902344, - "learning_rate": 4.036804457107733e-06, - "loss": 0.4467, - "step": 594 - }, - { - "epoch": 2.902439024390244, - "grad_norm": 5.318776607513428, - "learning_rate": 4.0337808092775435e-06, - "loss": 0.7007, - "step": 595 - }, - { - "epoch": 2.9073170731707316, - "grad_norm": 3.069889783859253, - "learning_rate": 4.030753559351728e-06, - "loss": 0.3219, - "step": 596 - }, - { - "epoch": 2.9121951219512194, - "grad_norm": 1.9730123281478882, - "learning_rate": 4.027722714439808e-06, - "loss": 0.3038, - "step": 597 - }, - { - "epoch": 2.9170731707317072, - "grad_norm": 3.7959916591644287, - "learning_rate": 4.024688281659743e-06, - "loss": 0.7768, - "step": 598 - }, - { - "epoch": 2.921951219512195, - "grad_norm": 3.900886297225952, - "learning_rate": 4.021650268137924e-06, - "loss": 0.4667, - "step": 599 - }, - { - "epoch": 2.926829268292683, - "grad_norm": 2.6155691146850586, - "learning_rate": 4.018608681009143e-06, - "loss": 0.3852, - "step": 600 - }, - { - "epoch": 2.9317073170731707, - "grad_norm": 3.2715704441070557, - "learning_rate": 4.015563527416596e-06, - "loss": 0.4804, - "step": 601 - }, - { - "epoch": 2.9365853658536585, - "grad_norm": 3.001425266265869, - "learning_rate": 4.012514814511844e-06, - "loss": 0.4152, - "step": 602 - }, - { - "epoch": 2.9414634146341463, - "grad_norm": 2.685360908508301, - "learning_rate": 4.009462549454816e-06, - "loss": 0.5029, - "step": 603 - }, - { - "epoch": 2.946341463414634, - "grad_norm": 3.4670183658599854, - "learning_rate": 4.006406739413775e-06, - "loss": 0.4857, - "step": 604 - }, - { - "epoch": 2.951219512195122, - "grad_norm": 3.0613298416137695, - "learning_rate": 4.003347391565317e-06, - "loss": 0.4449, - "step": 605 - }, - { - "epoch": 2.9560975609756097, - "grad_norm": 3.207186698913574, - "learning_rate": 4.000284513094342e-06, - "loss": 0.4808, - "step": 606 - }, - { - "epoch": 2.9609756097560975, - "grad_norm": 2.910578727722168, - "learning_rate": 3.997218111194042e-06, - "loss": 0.4395, - "step": 607 - }, - { - "epoch": 2.9658536585365853, - "grad_norm": 2.581918954849243, - "learning_rate": 3.994148193065886e-06, - "loss": 0.3264, - "step": 608 - }, - { - "epoch": 2.970731707317073, - "grad_norm": 2.6517748832702637, - "learning_rate": 3.991074765919598e-06, - "loss": 0.3285, - "step": 609 - }, - { - "epoch": 2.975609756097561, - "grad_norm": 3.509756088256836, - "learning_rate": 3.987997836973147e-06, - "loss": 0.3638, - "step": 610 - }, - { - "epoch": 2.9804878048780488, - "grad_norm": 2.7382352352142334, - "learning_rate": 3.984917413452721e-06, - "loss": 0.3853, - "step": 611 - }, - { - "epoch": 2.9853658536585366, - "grad_norm": 3.998974323272705, - "learning_rate": 3.981833502592717e-06, - "loss": 0.6411, - "step": 612 - }, - { - "epoch": 2.9902439024390244, - "grad_norm": 3.305126428604126, - "learning_rate": 3.978746111635725e-06, - "loss": 0.2759, - "step": 613 - }, - { - "epoch": 2.995121951219512, - "grad_norm": 3.137300968170166, - "learning_rate": 3.9756552478325045e-06, - "loss": 0.4566, - "step": 614 - }, - { - "epoch": 3.0, - "grad_norm": 2.617291212081909, - "learning_rate": 3.972560918441972e-06, - "loss": 0.2221, - "step": 615 - }, - { - "epoch": 3.004878048780488, - "grad_norm": 2.787429094314575, - "learning_rate": 3.969463130731183e-06, - "loss": 0.2403, - "step": 616 - }, - { - "epoch": 3.0097560975609756, - "grad_norm": 3.0412075519561768, - "learning_rate": 3.966361891975316e-06, - "loss": 0.2635, - "step": 617 - }, - { - "epoch": 3.0146341463414634, - "grad_norm": 2.9949851036071777, - "learning_rate": 3.963257209457652e-06, - "loss": 0.3294, - "step": 618 - }, - { - "epoch": 3.0195121951219512, - "grad_norm": 3.0510809421539307, - "learning_rate": 3.960149090469561e-06, - "loss": 0.1338, - "step": 619 - }, - { - "epoch": 3.024390243902439, - "grad_norm": 3.669482707977295, - "learning_rate": 3.957037542310484e-06, - "loss": 0.1469, - "step": 620 - }, - { - "epoch": 3.029268292682927, - "grad_norm": 4.677116870880127, - "learning_rate": 3.953922572287915e-06, - "loss": 0.2788, - "step": 621 - }, - { - "epoch": 3.0341463414634147, - "grad_norm": 4.33144474029541, - "learning_rate": 3.950804187717384e-06, - "loss": 0.4521, - "step": 622 - }, - { - "epoch": 3.0390243902439025, - "grad_norm": 3.466639757156372, - "learning_rate": 3.947682395922439e-06, - "loss": 0.5113, - "step": 623 - }, - { - "epoch": 3.0439024390243903, - "grad_norm": 3.2332122325897217, - "learning_rate": 3.9445572042346346e-06, - "loss": 0.0968, - "step": 624 - }, - { - "epoch": 3.048780487804878, - "grad_norm": 2.6108055114746094, - "learning_rate": 3.941428619993505e-06, - "loss": 0.2462, - "step": 625 - }, - { - "epoch": 3.053658536585366, - "grad_norm": 3.2512595653533936, - "learning_rate": 3.938296650546552e-06, - "loss": 0.1782, - "step": 626 - }, - { - "epoch": 3.0585365853658537, - "grad_norm": 3.4350366592407227, - "learning_rate": 3.935161303249231e-06, - "loss": 0.2955, - "step": 627 - }, - { - "epoch": 3.0634146341463415, - "grad_norm": 3.42012619972229, - "learning_rate": 3.932022585464928e-06, - "loss": 0.3259, - "step": 628 - }, - { - "epoch": 3.0682926829268293, - "grad_norm": 3.458043336868286, - "learning_rate": 3.928880504564943e-06, - "loss": 0.2306, - "step": 629 - }, - { - "epoch": 3.073170731707317, - "grad_norm": 2.646616220474243, - "learning_rate": 3.92573506792848e-06, - "loss": 0.2197, - "step": 630 - }, - { - "epoch": 3.078048780487805, - "grad_norm": 3.5558857917785645, - "learning_rate": 3.9225862829426184e-06, - "loss": 0.1607, - "step": 631 - }, - { - "epoch": 3.0829268292682928, - "grad_norm": 3.6011338233947754, - "learning_rate": 3.919434157002303e-06, - "loss": 0.3087, - "step": 632 - }, - { - "epoch": 3.0878048780487806, - "grad_norm": 2.339879035949707, - "learning_rate": 3.916278697510325e-06, - "loss": 0.2213, - "step": 633 - }, - { - "epoch": 3.0926829268292684, - "grad_norm": 3.268162488937378, - "learning_rate": 3.913119911877305e-06, - "loss": 0.318, - "step": 634 - }, - { - "epoch": 3.097560975609756, - "grad_norm": 4.062571048736572, - "learning_rate": 3.909957807521674e-06, - "loss": 0.1757, - "step": 635 - }, - { - "epoch": 3.102439024390244, - "grad_norm": 2.997659683227539, - "learning_rate": 3.906792391869657e-06, - "loss": 0.2391, - "step": 636 - }, - { - "epoch": 3.107317073170732, - "grad_norm": 3.7037394046783447, - "learning_rate": 3.903623672355258e-06, - "loss": 0.2548, - "step": 637 - }, - { - "epoch": 3.1121951219512196, - "grad_norm": 3.110579252243042, - "learning_rate": 3.900451656420237e-06, - "loss": 0.2389, - "step": 638 - }, - { - "epoch": 3.1170731707317074, - "grad_norm": 3.3332321643829346, - "learning_rate": 3.897276351514097e-06, - "loss": 0.1371, - "step": 639 - }, - { - "epoch": 3.1219512195121952, - "grad_norm": 3.8275935649871826, - "learning_rate": 3.894097765094065e-06, - "loss": 0.3363, - "step": 640 - }, - { - "epoch": 3.126829268292683, - "grad_norm": 2.3731374740600586, - "learning_rate": 3.890915904625075e-06, - "loss": 0.1314, - "step": 641 - }, - { - "epoch": 3.131707317073171, - "grad_norm": 3.1511282920837402, - "learning_rate": 3.887730777579751e-06, - "loss": 0.3563, - "step": 642 - }, - { - "epoch": 3.1365853658536587, - "grad_norm": 4.2254862785339355, - "learning_rate": 3.884542391438387e-06, - "loss": 0.5053, - "step": 643 - }, - { - "epoch": 3.1414634146341465, - "grad_norm": 4.579670429229736, - "learning_rate": 3.88135075368893e-06, - "loss": 0.6259, - "step": 644 - }, - { - "epoch": 3.1463414634146343, - "grad_norm": 3.2102746963500977, - "learning_rate": 3.878155871826968e-06, - "loss": 0.2599, - "step": 645 - }, - { - "epoch": 3.151219512195122, - "grad_norm": 2.5569686889648438, - "learning_rate": 3.874957753355701e-06, - "loss": 0.2075, - "step": 646 - }, - { - "epoch": 3.15609756097561, - "grad_norm": 3.588925838470459, - "learning_rate": 3.8717564057859365e-06, - "loss": 0.4577, - "step": 647 - }, - { - "epoch": 3.1609756097560977, - "grad_norm": 3.6163878440856934, - "learning_rate": 3.868551836636063e-06, - "loss": 0.4023, - "step": 648 - }, - { - "epoch": 3.1658536585365855, - "grad_norm": 3.8688390254974365, - "learning_rate": 3.865344053432035e-06, - "loss": 0.1669, - "step": 649 - }, - { - "epoch": 3.1707317073170733, - "grad_norm": 3.419734001159668, - "learning_rate": 3.862133063707353e-06, - "loss": 0.2766, - "step": 650 - }, - { - "epoch": 3.175609756097561, - "grad_norm": 2.9860243797302246, - "learning_rate": 3.858918875003053e-06, - "loss": 0.1788, - "step": 651 - }, - { - "epoch": 3.180487804878049, - "grad_norm": 3.0619022846221924, - "learning_rate": 3.855701494867679e-06, - "loss": 0.224, - "step": 652 - }, - { - "epoch": 3.1853658536585368, - "grad_norm": 3.3668978214263916, - "learning_rate": 3.852480930857275e-06, - "loss": 0.4029, - "step": 653 - }, - { - "epoch": 3.1902439024390246, - "grad_norm": 3.543147563934326, - "learning_rate": 3.849257190535356e-06, - "loss": 0.2096, - "step": 654 - }, - { - "epoch": 3.1951219512195124, - "grad_norm": 3.793619155883789, - "learning_rate": 3.846030281472902e-06, - "loss": 0.5574, - "step": 655 - }, - { - "epoch": 3.2, - "grad_norm": 3.021289110183716, - "learning_rate": 3.842800211248333e-06, - "loss": 0.2233, - "step": 656 - }, - { - "epoch": 3.204878048780488, - "grad_norm": 4.582934856414795, - "learning_rate": 3.839566987447492e-06, - "loss": 0.3871, - "step": 657 - }, - { - "epoch": 3.209756097560976, - "grad_norm": 2.996340274810791, - "learning_rate": 3.8363306176636296e-06, - "loss": 0.4325, - "step": 658 - }, - { - "epoch": 3.2146341463414636, - "grad_norm": 3.3190877437591553, - "learning_rate": 3.833091109497384e-06, - "loss": 0.5321, - "step": 659 - }, - { - "epoch": 3.2195121951219514, - "grad_norm": 3.2532856464385986, - "learning_rate": 3.829848470556765e-06, - "loss": 0.1359, - "step": 660 - }, - { - "epoch": 3.2243902439024392, - "grad_norm": 2.7875044345855713, - "learning_rate": 3.8266027084571335e-06, - "loss": 0.3145, - "step": 661 - }, - { - "epoch": 3.229268292682927, - "grad_norm": 3.748253583908081, - "learning_rate": 3.823353830821187e-06, - "loss": 0.1252, - "step": 662 - }, - { - "epoch": 3.234146341463415, - "grad_norm": 2.858293294906616, - "learning_rate": 3.820101845278937e-06, - "loss": 0.2589, - "step": 663 - }, - { - "epoch": 3.2390243902439027, - "grad_norm": 3.7470967769622803, - "learning_rate": 3.816846759467696e-06, - "loss": 0.2594, - "step": 664 - }, - { - "epoch": 3.2439024390243905, - "grad_norm": 3.676196813583374, - "learning_rate": 3.8135885810320587e-06, - "loss": 0.2998, - "step": 665 - }, - { - "epoch": 3.2487804878048783, - "grad_norm": 3.0943140983581543, - "learning_rate": 3.810327317623881e-06, - "loss": 0.2238, - "step": 666 - }, - { - "epoch": 3.253658536585366, - "grad_norm": 3.5907349586486816, - "learning_rate": 3.8070629769022628e-06, - "loss": 0.3381, - "step": 667 - }, - { - "epoch": 3.258536585365854, - "grad_norm": 3.1195285320281982, - "learning_rate": 3.8037955665335335e-06, - "loss": 0.2407, - "step": 668 - }, - { - "epoch": 3.2634146341463417, - "grad_norm": 3.422292947769165, - "learning_rate": 3.800525094191231e-06, - "loss": 0.2957, - "step": 669 - }, - { - "epoch": 3.2682926829268295, - "grad_norm": 2.5264663696289062, - "learning_rate": 3.797251567556083e-06, - "loss": 0.2493, - "step": 670 - }, - { - "epoch": 3.2731707317073173, - "grad_norm": 3.350219964981079, - "learning_rate": 3.793974994315991e-06, - "loss": 0.1186, - "step": 671 - }, - { - "epoch": 3.278048780487805, - "grad_norm": 4.175906181335449, - "learning_rate": 3.790695382166013e-06, - "loss": 0.3453, - "step": 672 - }, - { - "epoch": 3.2829268292682925, - "grad_norm": 3.006072521209717, - "learning_rate": 3.7874127388083415e-06, - "loss": 0.1981, - "step": 673 - }, - { - "epoch": 3.2878048780487803, - "grad_norm": 3.368561029434204, - "learning_rate": 3.7841270719522895e-06, - "loss": 0.2934, - "step": 674 - }, - { - "epoch": 3.292682926829268, - "grad_norm": 4.374331951141357, - "learning_rate": 3.7808383893142692e-06, - "loss": 0.1359, - "step": 675 - }, - { - "epoch": 3.297560975609756, - "grad_norm": 3.297102451324463, - "learning_rate": 3.7775466986177763e-06, - "loss": 0.2498, - "step": 676 - }, - { - "epoch": 3.3024390243902437, - "grad_norm": 2.8914761543273926, - "learning_rate": 3.774252007593371e-06, - "loss": 0.1308, - "step": 677 - }, - { - "epoch": 3.3073170731707315, - "grad_norm": 3.1550722122192383, - "learning_rate": 3.7709543239786593e-06, - "loss": 0.3915, - "step": 678 - }, - { - "epoch": 3.3121951219512193, - "grad_norm": 3.2302658557891846, - "learning_rate": 3.767653655518277e-06, - "loss": 0.2558, - "step": 679 - }, - { - "epoch": 3.317073170731707, - "grad_norm": 4.4321770668029785, - "learning_rate": 3.7643500099638673e-06, - "loss": 0.1988, - "step": 680 - }, - { - "epoch": 3.321951219512195, - "grad_norm": 2.970566749572754, - "learning_rate": 3.7610433950740667e-06, - "loss": 0.4908, - "step": 681 - }, - { - "epoch": 3.3268292682926828, - "grad_norm": 3.5516228675842285, - "learning_rate": 3.757733818614485e-06, - "loss": 0.304, - "step": 682 - }, - { - "epoch": 3.3317073170731706, - "grad_norm": 2.7555387020111084, - "learning_rate": 3.7544212883576856e-06, - "loss": 0.2533, - "step": 683 - }, - { - "epoch": 3.3365853658536584, - "grad_norm": 3.61226749420166, - "learning_rate": 3.751105812083172e-06, - "loss": 0.1771, - "step": 684 - }, - { - "epoch": 3.341463414634146, - "grad_norm": 3.0466206073760986, - "learning_rate": 3.7477873975773655e-06, - "loss": 0.4213, - "step": 685 - }, - { - "epoch": 3.346341463414634, - "grad_norm": 3.6091527938842773, - "learning_rate": 3.7444660526335853e-06, - "loss": 0.3808, - "step": 686 - }, - { - "epoch": 3.351219512195122, - "grad_norm": 3.8443002700805664, - "learning_rate": 3.741141785052036e-06, - "loss": 0.6438, - "step": 687 - }, - { - "epoch": 3.3560975609756096, - "grad_norm": 3.845909833908081, - "learning_rate": 3.737814602639784e-06, - "loss": 0.3686, - "step": 688 - }, - { - "epoch": 3.3609756097560974, - "grad_norm": 2.904892921447754, - "learning_rate": 3.7344845132107427e-06, - "loss": 0.2934, - "step": 689 - }, - { - "epoch": 3.3658536585365852, - "grad_norm": 3.4766387939453125, - "learning_rate": 3.731151524585651e-06, - "loss": 0.3299, - "step": 690 - }, - { - "epoch": 3.370731707317073, - "grad_norm": 4.236767768859863, - "learning_rate": 3.7278156445920584e-06, - "loss": 0.6303, - "step": 691 - }, - { - "epoch": 3.375609756097561, - "grad_norm": 3.1122591495513916, - "learning_rate": 3.724476881064303e-06, - "loss": 0.2432, - "step": 692 - }, - { - "epoch": 3.3804878048780487, - "grad_norm": 3.0971457958221436, - "learning_rate": 3.721135241843496e-06, - "loss": 0.3131, - "step": 693 - }, - { - "epoch": 3.3853658536585365, - "grad_norm": 3.9365804195404053, - "learning_rate": 3.7177907347775016e-06, - "loss": 0.3372, - "step": 694 - }, - { - "epoch": 3.3902439024390243, - "grad_norm": 3.760373115539551, - "learning_rate": 3.71444336772092e-06, - "loss": 0.5055, - "step": 695 - }, - { - "epoch": 3.395121951219512, - "grad_norm": 4.360848426818848, - "learning_rate": 3.711093148535068e-06, - "loss": 0.6183, - "step": 696 - }, - { - "epoch": 3.4, - "grad_norm": 3.7713537216186523, - "learning_rate": 3.707740085087959e-06, - "loss": 0.1568, - "step": 697 - }, - { - "epoch": 3.4048780487804877, - "grad_norm": 3.8532230854034424, - "learning_rate": 3.7043841852542884e-06, - "loss": 0.2826, - "step": 698 - }, - { - "epoch": 3.4097560975609755, - "grad_norm": 3.0548605918884277, - "learning_rate": 3.701025456915411e-06, - "loss": 0.1918, - "step": 699 - }, - { - "epoch": 3.4146341463414633, - "grad_norm": 3.2431821823120117, - "learning_rate": 3.697663907959327e-06, - "loss": 0.2493, - "step": 700 - }, - { - "epoch": 3.419512195121951, - "grad_norm": 3.7301864624023438, - "learning_rate": 3.6942995462806574e-06, - "loss": 0.4913, - "step": 701 - }, - { - "epoch": 3.424390243902439, - "grad_norm": 2.5468900203704834, - "learning_rate": 3.6909323797806314e-06, - "loss": 0.1788, - "step": 702 - }, - { - "epoch": 3.4292682926829268, - "grad_norm": 3.3719515800476074, - "learning_rate": 3.6875624163670635e-06, - "loss": 0.4162, - "step": 703 - }, - { - "epoch": 3.4341463414634146, - "grad_norm": 3.528010368347168, - "learning_rate": 3.6841896639543394e-06, - "loss": 0.1924, - "step": 704 - }, - { - "epoch": 3.4390243902439024, - "grad_norm": 3.3636631965637207, - "learning_rate": 3.6808141304633924e-06, - "loss": 0.3177, - "step": 705 - }, - { - "epoch": 3.44390243902439, - "grad_norm": 3.418705463409424, - "learning_rate": 3.6774358238216878e-06, - "loss": 0.2301, - "step": 706 - }, - { - "epoch": 3.448780487804878, - "grad_norm": 4.720373630523682, - "learning_rate": 3.6740547519632048e-06, - "loss": 0.1894, - "step": 707 - }, - { - "epoch": 3.453658536585366, - "grad_norm": 2.9635703563690186, - "learning_rate": 3.670670922828414e-06, - "loss": 0.2642, - "step": 708 - }, - { - "epoch": 3.4585365853658536, - "grad_norm": 4.934754371643066, - "learning_rate": 3.667284344364264e-06, - "loss": 0.2275, - "step": 709 - }, - { - "epoch": 3.4634146341463414, - "grad_norm": 3.090585231781006, - "learning_rate": 3.6638950245241604e-06, - "loss": 0.4447, - "step": 710 - }, - { - "epoch": 3.4682926829268292, - "grad_norm": 4.360495090484619, - "learning_rate": 3.660502971267945e-06, - "loss": 0.2415, - "step": 711 - }, - { - "epoch": 3.473170731707317, - "grad_norm": 3.4893476963043213, - "learning_rate": 3.65710819256188e-06, - "loss": 0.0921, - "step": 712 - }, - { - "epoch": 3.478048780487805, - "grad_norm": 3.2423770427703857, - "learning_rate": 3.65371069637863e-06, - "loss": 0.2371, - "step": 713 - }, - { - "epoch": 3.4829268292682927, - "grad_norm": 3.0775890350341797, - "learning_rate": 3.650310490697238e-06, - "loss": 0.4026, - "step": 714 - }, - { - "epoch": 3.4878048780487805, - "grad_norm": 3.906625270843506, - "learning_rate": 3.646907583503114e-06, - "loss": 0.4312, - "step": 715 - }, - { - "epoch": 3.4926829268292683, - "grad_norm": 3.2140414714813232, - "learning_rate": 3.6435019827880093e-06, - "loss": 0.2309, - "step": 716 - }, - { - "epoch": 3.497560975609756, - "grad_norm": 3.048523426055908, - "learning_rate": 3.640093696550003e-06, - "loss": 0.296, - "step": 717 - }, - { - "epoch": 3.502439024390244, - "grad_norm": 2.9669039249420166, - "learning_rate": 3.6366827327934817e-06, - "loss": 0.2723, - "step": 718 - }, - { - "epoch": 3.5073170731707317, - "grad_norm": 3.6941726207733154, - "learning_rate": 3.6332690995291176e-06, - "loss": 0.3797, - "step": 719 - }, - { - "epoch": 3.5121951219512195, - "grad_norm": 5.135766506195068, - "learning_rate": 3.6298528047738545e-06, - "loss": 0.9868, - "step": 720 - }, - { - "epoch": 3.5170731707317073, - "grad_norm": 3.2021052837371826, - "learning_rate": 3.626433856550886e-06, - "loss": 0.4069, - "step": 721 - }, - { - "epoch": 3.521951219512195, - "grad_norm": 3.094444513320923, - "learning_rate": 3.623012262889637e-06, - "loss": 0.3368, - "step": 722 - }, - { - "epoch": 3.526829268292683, - "grad_norm": 3.609285354614258, - "learning_rate": 3.6195880318257465e-06, - "loss": 0.3972, - "step": 723 - }, - { - "epoch": 3.5317073170731708, - "grad_norm": 4.236501216888428, - "learning_rate": 3.616161171401046e-06, - "loss": 0.52, - "step": 724 - }, - { - "epoch": 3.5365853658536586, - "grad_norm": 3.504526376724243, - "learning_rate": 3.612731689663542e-06, - "loss": 0.23, - "step": 725 - }, - { - "epoch": 3.5414634146341464, - "grad_norm": 3.233591079711914, - "learning_rate": 3.6092995946673996e-06, - "loss": 0.4151, - "step": 726 - }, - { - "epoch": 3.546341463414634, - "grad_norm": 3.6701886653900146, - "learning_rate": 3.605864894472918e-06, - "loss": 0.2798, - "step": 727 - }, - { - "epoch": 3.551219512195122, - "grad_norm": 3.8713181018829346, - "learning_rate": 3.602427597146516e-06, - "loss": 0.4336, - "step": 728 - }, - { - "epoch": 3.55609756097561, - "grad_norm": 5.49612283706665, - "learning_rate": 3.5989877107607134e-06, - "loss": 0.4803, - "step": 729 - }, - { - "epoch": 3.5609756097560976, - "grad_norm": 3.771005392074585, - "learning_rate": 3.5955452433941075e-06, - "loss": 0.3698, - "step": 730 - }, - { - "epoch": 3.5658536585365854, - "grad_norm": 2.970822334289551, - "learning_rate": 3.5921002031313586e-06, - "loss": 0.2373, - "step": 731 - }, - { - "epoch": 3.5707317073170732, - "grad_norm": 3.517249584197998, - "learning_rate": 3.58865259806317e-06, - "loss": 0.1908, - "step": 732 - }, - { - "epoch": 3.575609756097561, - "grad_norm": 3.6825428009033203, - "learning_rate": 3.585202436286267e-06, - "loss": 0.3993, - "step": 733 - }, - { - "epoch": 3.580487804878049, - "grad_norm": 3.387479066848755, - "learning_rate": 3.581749725903381e-06, - "loss": 0.4237, - "step": 734 - }, - { - "epoch": 3.5853658536585367, - "grad_norm": 3.5004806518554688, - "learning_rate": 3.5782944750232274e-06, - "loss": 0.3011, - "step": 735 - }, - { - "epoch": 3.5902439024390245, - "grad_norm": 3.461731433868408, - "learning_rate": 3.574836691760489e-06, - "loss": 0.0896, - "step": 736 - }, - { - "epoch": 3.5951219512195123, - "grad_norm": 3.9598381519317627, - "learning_rate": 3.571376384235795e-06, - "loss": 0.2751, - "step": 737 - }, - { - "epoch": 3.6, - "grad_norm": 4.053933143615723, - "learning_rate": 3.5679135605757035e-06, - "loss": 0.2086, - "step": 738 - }, - { - "epoch": 3.604878048780488, - "grad_norm": 2.9683544635772705, - "learning_rate": 3.564448228912682e-06, - "loss": 0.1659, - "step": 739 - }, - { - "epoch": 3.6097560975609757, - "grad_norm": 3.6598448753356934, - "learning_rate": 3.5609803973850877e-06, - "loss": 0.2469, - "step": 740 - }, - { - "epoch": 3.6146341463414635, - "grad_norm": 3.449335813522339, - "learning_rate": 3.557510074137147e-06, - "loss": 0.375, - "step": 741 - }, - { - "epoch": 3.6195121951219513, - "grad_norm": 2.7666923999786377, - "learning_rate": 3.554037267318942e-06, - "loss": 0.3133, - "step": 742 - }, - { - "epoch": 3.624390243902439, - "grad_norm": 2.8951869010925293, - "learning_rate": 3.5505619850863847e-06, - "loss": 0.2243, - "step": 743 - }, - { - "epoch": 3.629268292682927, - "grad_norm": 3.477747678756714, - "learning_rate": 3.5470842356012007e-06, - "loss": 0.1321, - "step": 744 - }, - { - "epoch": 3.6341463414634148, - "grad_norm": 3.810480833053589, - "learning_rate": 3.5436040270309113e-06, - "loss": 0.361, - "step": 745 - }, - { - "epoch": 3.6390243902439026, - "grad_norm": 3.0730793476104736, - "learning_rate": 3.540121367548811e-06, - "loss": 0.1523, - "step": 746 - }, - { - "epoch": 3.6439024390243904, - "grad_norm": 3.6878390312194824, - "learning_rate": 3.5366362653339524e-06, - "loss": 0.4898, - "step": 747 - }, - { - "epoch": 3.648780487804878, - "grad_norm": 3.6432242393493652, - "learning_rate": 3.533148728571124e-06, - "loss": 0.1397, - "step": 748 - }, - { - "epoch": 3.653658536585366, - "grad_norm": 3.7047760486602783, - "learning_rate": 3.5296587654508317e-06, - "loss": 0.323, - "step": 749 - }, - { - "epoch": 3.658536585365854, - "grad_norm": 3.777132749557495, - "learning_rate": 3.526166384169279e-06, - "loss": 0.5577, - "step": 750 - }, - { - "epoch": 3.6634146341463416, - "grad_norm": 3.7970924377441406, - "learning_rate": 3.5226715929283507e-06, - "loss": 0.245, - "step": 751 - }, - { - "epoch": 3.6682926829268294, - "grad_norm": 2.8203537464141846, - "learning_rate": 3.519174399935588e-06, - "loss": 0.1619, - "step": 752 - }, - { - "epoch": 3.6731707317073172, - "grad_norm": 3.4040987491607666, - "learning_rate": 3.5156748134041767e-06, - "loss": 0.1047, - "step": 753 - }, - { - "epoch": 3.678048780487805, - "grad_norm": 3.927960157394409, - "learning_rate": 3.5121728415529203e-06, - "loss": 0.5713, - "step": 754 - }, - { - "epoch": 3.682926829268293, - "grad_norm": 3.3833277225494385, - "learning_rate": 3.5086684926062266e-06, - "loss": 0.2174, - "step": 755 - }, - { - "epoch": 3.68780487804878, - "grad_norm": 3.989307403564453, - "learning_rate": 3.505161774794085e-06, - "loss": 0.285, - "step": 756 - }, - { - "epoch": 3.692682926829268, - "grad_norm": 2.742429494857788, - "learning_rate": 3.5016526963520474e-06, - "loss": 0.1602, - "step": 757 - }, - { - "epoch": 3.697560975609756, - "grad_norm": 3.7082698345184326, - "learning_rate": 3.498141265521212e-06, - "loss": 0.666, - "step": 758 - }, - { - "epoch": 3.7024390243902436, - "grad_norm": 3.033196210861206, - "learning_rate": 3.4946274905481997e-06, - "loss": 0.2024, - "step": 759 - }, - { - "epoch": 3.7073170731707314, - "grad_norm": 3.7145371437072754, - "learning_rate": 3.4911113796851364e-06, - "loss": 0.2719, - "step": 760 - }, - { - "epoch": 3.7121951219512193, - "grad_norm": 3.580298900604248, - "learning_rate": 3.487592941189636e-06, - "loss": 0.1537, - "step": 761 - }, - { - "epoch": 3.717073170731707, - "grad_norm": 4.753757953643799, - "learning_rate": 3.484072183324776e-06, - "loss": 0.6149, - "step": 762 - }, - { - "epoch": 3.721951219512195, - "grad_norm": 3.5575687885284424, - "learning_rate": 3.4805491143590823e-06, - "loss": 0.4241, - "step": 763 - }, - { - "epoch": 3.7268292682926827, - "grad_norm": 3.215224266052246, - "learning_rate": 3.4770237425665103e-06, - "loss": 0.3037, - "step": 764 - }, - { - "epoch": 3.7317073170731705, - "grad_norm": 2.9899685382843018, - "learning_rate": 3.4734960762264204e-06, - "loss": 0.4854, - "step": 765 - }, - { - "epoch": 3.7365853658536583, - "grad_norm": 3.5880227088928223, - "learning_rate": 3.469966123623563e-06, - "loss": 0.3849, - "step": 766 - }, - { - "epoch": 3.741463414634146, - "grad_norm": 3.472750186920166, - "learning_rate": 3.46643389304806e-06, - "loss": 0.3159, - "step": 767 - }, - { - "epoch": 3.746341463414634, - "grad_norm": 4.355650901794434, - "learning_rate": 3.4628993927953786e-06, - "loss": 0.7527, - "step": 768 - }, - { - "epoch": 3.7512195121951217, - "grad_norm": 2.94575834274292, - "learning_rate": 3.45936263116632e-06, - "loss": 0.1716, - "step": 769 - }, - { - "epoch": 3.7560975609756095, - "grad_norm": 2.991525173187256, - "learning_rate": 3.4558236164669957e-06, - "loss": 0.2061, - "step": 770 - }, - { - "epoch": 3.7609756097560973, - "grad_norm": 3.134000301361084, - "learning_rate": 3.4522823570088073e-06, - "loss": 0.1338, - "step": 771 - }, - { - "epoch": 3.765853658536585, - "grad_norm": 3.722140312194824, - "learning_rate": 3.4487388611084295e-06, - "loss": 0.2615, - "step": 772 - }, - { - "epoch": 3.770731707317073, - "grad_norm": 3.7941153049468994, - "learning_rate": 3.445193137087788e-06, - "loss": 0.1401, - "step": 773 - }, - { - "epoch": 3.7756097560975608, - "grad_norm": 2.872941732406616, - "learning_rate": 3.4416451932740424e-06, - "loss": 0.2934, - "step": 774 - }, - { - "epoch": 3.7804878048780486, - "grad_norm": 4.5019941329956055, - "learning_rate": 3.4380950379995652e-06, - "loss": 0.4579, - "step": 775 - }, - { - "epoch": 3.7853658536585364, - "grad_norm": 2.682884931564331, - "learning_rate": 3.434542679601922e-06, - "loss": 0.2979, - "step": 776 - }, - { - "epoch": 3.790243902439024, - "grad_norm": 3.3044273853302, - "learning_rate": 3.4309881264238538e-06, - "loss": 0.1196, - "step": 777 - }, - { - "epoch": 3.795121951219512, - "grad_norm": 3.102760076522827, - "learning_rate": 3.4274313868132547e-06, - "loss": 0.2026, - "step": 778 - }, - { - "epoch": 3.8, - "grad_norm": 3.3304500579833984, - "learning_rate": 3.4238724691231534e-06, - "loss": 0.2135, - "step": 779 - }, - { - "epoch": 3.8048780487804876, - "grad_norm": 3.295119047164917, - "learning_rate": 3.4203113817116955e-06, - "loss": 0.4418, - "step": 780 - }, - { - "epoch": 3.8097560975609754, - "grad_norm": 3.6655640602111816, - "learning_rate": 3.4167481329421204e-06, - "loss": 0.203, - "step": 781 - }, - { - "epoch": 3.8146341463414632, - "grad_norm": 3.387830972671509, - "learning_rate": 3.4131827311827447e-06, - "loss": 0.3225, - "step": 782 - }, - { - "epoch": 3.819512195121951, - "grad_norm": 2.621633529663086, - "learning_rate": 3.4096151848069416e-06, - "loss": 0.1704, - "step": 783 - }, - { - "epoch": 3.824390243902439, - "grad_norm": 2.974344491958618, - "learning_rate": 3.4060455021931195e-06, - "loss": 0.2785, - "step": 784 - }, - { - "epoch": 3.8292682926829267, - "grad_norm": 3.452131748199463, - "learning_rate": 3.402473691724704e-06, - "loss": 0.223, - "step": 785 - }, - { - "epoch": 3.8341463414634145, - "grad_norm": 2.6373705863952637, - "learning_rate": 3.39889976179012e-06, - "loss": 0.2368, - "step": 786 - }, - { - "epoch": 3.8390243902439023, - "grad_norm": 2.863184928894043, - "learning_rate": 3.3953237207827673e-06, - "loss": 0.3294, - "step": 787 - }, - { - "epoch": 3.84390243902439, - "grad_norm": 5.104704856872559, - "learning_rate": 3.391745577101005e-06, - "loss": 0.5431, - "step": 788 - }, - { - "epoch": 3.848780487804878, - "grad_norm": 3.951310634613037, - "learning_rate": 3.3881653391481306e-06, - "loss": 0.2546, - "step": 789 - }, - { - "epoch": 3.8536585365853657, - "grad_norm": 3.9903225898742676, - "learning_rate": 3.384583015332359e-06, - "loss": 0.3293, - "step": 790 - }, - { - "epoch": 3.8585365853658535, - "grad_norm": 3.3149220943450928, - "learning_rate": 3.380998614066805e-06, - "loss": 0.1861, - "step": 791 - }, - { - "epoch": 3.8634146341463413, - "grad_norm": 3.6755223274230957, - "learning_rate": 3.3774121437694606e-06, - "loss": 0.2498, - "step": 792 - }, - { - "epoch": 3.868292682926829, - "grad_norm": 3.192918300628662, - "learning_rate": 3.3738236128631786e-06, - "loss": 0.1525, - "step": 793 - }, - { - "epoch": 3.873170731707317, - "grad_norm": 3.5358777046203613, - "learning_rate": 3.3702330297756503e-06, - "loss": 0.3622, - "step": 794 - }, - { - "epoch": 3.8780487804878048, - "grad_norm": 3.619878053665161, - "learning_rate": 3.366640402939387e-06, - "loss": 0.1051, - "step": 795 - }, - { - "epoch": 3.8829268292682926, - "grad_norm": 7.085352420806885, - "learning_rate": 3.363045740791698e-06, - "loss": 0.4606, - "step": 796 - }, - { - "epoch": 3.8878048780487804, - "grad_norm": 2.523165464401245, - "learning_rate": 3.3594490517746774e-06, - "loss": 0.2267, - "step": 797 - }, - { - "epoch": 3.892682926829268, - "grad_norm": 2.7026922702789307, - "learning_rate": 3.3558503443351733e-06, - "loss": 0.2792, - "step": 798 - }, - { - "epoch": 3.897560975609756, - "grad_norm": 2.9232428073883057, - "learning_rate": 3.352249626924777e-06, - "loss": 0.2579, - "step": 799 - }, - { - "epoch": 3.902439024390244, - "grad_norm": 4.760788440704346, - "learning_rate": 3.348646907999801e-06, - "loss": 0.6983, - "step": 800 - }, - { - "epoch": 3.9073170731707316, - "grad_norm": 3.198249578475952, - "learning_rate": 3.345042196021257e-06, - "loss": 0.3265, - "step": 801 - }, - { - "epoch": 3.9121951219512194, - "grad_norm": 4.069286823272705, - "learning_rate": 3.3414354994548385e-06, - "loss": 0.497, - "step": 802 - }, - { - "epoch": 3.9170731707317072, - "grad_norm": 3.4435410499572754, - "learning_rate": 3.337826826770898e-06, - "loss": 0.2812, - "step": 803 - }, - { - "epoch": 3.921951219512195, - "grad_norm": 3.9805212020874023, - "learning_rate": 3.3342161864444312e-06, - "loss": 0.2277, - "step": 804 - }, - { - "epoch": 3.926829268292683, - "grad_norm": 3.348925828933716, - "learning_rate": 3.3306035869550534e-06, - "loss": 0.1614, - "step": 805 - }, - { - "epoch": 3.9317073170731707, - "grad_norm": 4.7613701820373535, - "learning_rate": 3.326989036786981e-06, - "loss": 0.3269, - "step": 806 - }, - { - "epoch": 3.9365853658536585, - "grad_norm": 3.807502508163452, - "learning_rate": 3.3233725444290126e-06, - "loss": 0.2619, - "step": 807 - }, - { - "epoch": 3.9414634146341463, - "grad_norm": 3.2690203189849854, - "learning_rate": 3.3197541183745065e-06, - "loss": 0.4334, - "step": 808 - }, - { - "epoch": 3.946341463414634, - "grad_norm": 3.396993398666382, - "learning_rate": 3.3161337671213634e-06, - "loss": 0.2738, - "step": 809 - }, - { - "epoch": 3.951219512195122, - "grad_norm": 3.086669921875, - "learning_rate": 3.312511499172006e-06, - "loss": 0.1597, - "step": 810 - }, - { - "epoch": 3.9560975609756097, - "grad_norm": 3.5688745975494385, - "learning_rate": 3.3088873230333562e-06, - "loss": 0.3195, - "step": 811 - }, - { - "epoch": 3.9609756097560975, - "grad_norm": 3.4843621253967285, - "learning_rate": 3.3052612472168193e-06, - "loss": 0.1865, - "step": 812 - }, - { - "epoch": 3.9658536585365853, - "grad_norm": 2.8479580879211426, - "learning_rate": 3.3016332802382618e-06, - "loss": 0.3108, - "step": 813 - }, - { - "epoch": 3.970731707317073, - "grad_norm": 3.3241543769836426, - "learning_rate": 3.2980034306179897e-06, - "loss": 0.2099, - "step": 814 - }, - { - "epoch": 3.975609756097561, - "grad_norm": 2.817675828933716, - "learning_rate": 3.294371706880733e-06, - "loss": 0.3073, - "step": 815 - }, - { - "epoch": 3.9804878048780488, - "grad_norm": 2.9535388946533203, - "learning_rate": 3.290738117555622e-06, - "loss": 0.2024, - "step": 816 - }, - { - "epoch": 3.9853658536585366, - "grad_norm": 5.021281719207764, - "learning_rate": 3.2871026711761666e-06, - "loss": 0.508, - "step": 817 - }, - { - "epoch": 3.9902439024390244, - "grad_norm": 3.3377649784088135, - "learning_rate": 3.2834653762802414e-06, - "loss": 0.2116, - "step": 818 - }, - { - "epoch": 3.995121951219512, - "grad_norm": 4.412073135375977, - "learning_rate": 3.2798262414100594e-06, - "loss": 0.2177, - "step": 819 - }, - { - "epoch": 4.0, - "grad_norm": 3.174323797225952, - "learning_rate": 3.2761852751121566e-06, - "loss": 0.1737, - "step": 820 - } - ], - "logging_steps": 1, - "max_steps": 2050, - "num_input_tokens_seen": 0, - "num_train_epochs": 10, - "save_steps": 206, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 2.3594383853800653e+17, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/metallama3_8b/limo/config.json b/metallama3_8b/limo/config.json deleted file mode 100644 index ec5612543540085e09eed37e81b17ae51d1a6973..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 128000, - "eos_token_id": 128009, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 8192, - "mlp_bias": false, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 500000.0, - "tie_word_embeddings": false, - "torch_dtype": "float32", - "transformers_version": "4.55.0", - "use_cache": false, - "vocab_size": 128256 -} diff --git a/metallama3_8b/limo/generation_config.json b/metallama3_8b/limo/generation_config.json deleted file mode 100644 index f53ccb516e57388491adda6b9950bcfa872e93ae..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/generation_config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 128000, - "eos_token_id": 128009, - "transformers_version": "4.55.0", - "use_cache": false -} diff --git a/metallama3_8b/limo/model-00001-of-00007.safetensors b/metallama3_8b/limo/model-00001-of-00007.safetensors deleted file mode 100644 index 4d5a84d922df222c932eac589c15ea3357de75fe..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/model-00001-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:225412b1113ad13202cf4125509a242705ca9fc3d2662195a561bd8e48fd4db8 -size 4886466168 diff --git a/metallama3_8b/limo/model-00002-of-00007.safetensors b/metallama3_8b/limo/model-00002-of-00007.safetensors deleted file mode 100644 index 1c192eeae2946f3f0c35b0a90bf16005c749f19d..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/model-00002-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:14e92e96e88dfee47e5a50580659bc951b35026bf2889984fa38d1923c957533 -size 4832007448 diff --git a/metallama3_8b/limo/model-00003-of-00007.safetensors b/metallama3_8b/limo/model-00003-of-00007.safetensors deleted file mode 100644 index f8e19a4384cdbd25ddec25cf62d2429514cccbd3..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/model-00003-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:00617c2642eaec9556936462f59084becdfd958740be2fff056bc36201ae2c12 -size 4999813112 diff --git a/metallama3_8b/limo/model-00004-of-00007.safetensors b/metallama3_8b/limo/model-00004-of-00007.safetensors deleted file mode 100644 index 61e62a8f5bafd115a90a0dba23a535050307bde6..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/model-00004-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ec16549b532ec841a3b942995c41e00a57fa5ca1c365037d63ac498736687686 -size 4999813128 diff --git a/metallama3_8b/limo/model-00005-of-00007.safetensors b/metallama3_8b/limo/model-00005-of-00007.safetensors deleted file mode 100644 index a98f8be8e21c31b18c87631a894ac368a70b96e8..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/model-00005-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e1cc4772012f82ad7210046779c85e46d634c9fd6cd53880731c9606da82dd82 -size 4832007496 diff --git a/metallama3_8b/limo/model-00006-of-00007.safetensors b/metallama3_8b/limo/model-00006-of-00007.safetensors deleted file mode 100644 index 0a8be7e6a5f85506f886978c5d062ad47b841315..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/model-00006-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:03bf8eebafad01810b57fec7d7c54ca9b4142b99cff668b5b42b9e0bba965f8a -size 4999813120 diff --git a/metallama3_8b/limo/model-00007-of-00007.safetensors b/metallama3_8b/limo/model-00007-of-00007.safetensors deleted file mode 100644 index 3cfb796f8767c5183f765a9507214741bb12412d..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/model-00007-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bec445108b3bdb10d010181fb6e7030de70365a0c4e5f34cabb912e3b4830e60 -size 2571158184 diff --git a/metallama3_8b/limo/model.safetensors.index.json b/metallama3_8b/limo/model.safetensors.index.json deleted file mode 100644 index 30d31d54f352f0c71ad48745af612a088822fa48..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/model.safetensors.index.json +++ /dev/null @@ -1,299 +0,0 @@ -{ - "metadata": { - "total_parameters": 2007565312, - "total_size": 32121044992 - }, - "weight_map": { - "lm_head.weight": "model-00007-of-00007.safetensors", - "model.embed_tokens.weight": "model-00001-of-00007.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.norm.weight": "model-00007-of-00007.safetensors" - } -} diff --git a/metallama3_8b/limo/special_tokens_map.json b/metallama3_8b/limo/special_tokens_map.json deleted file mode 100644 index 14daf4588e61b4e4983af0fccaba4d5500c0977c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/special_tokens_map.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "additional_special_tokens": [ - { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } - ], - "bos_token": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": "<|eot_id|>" -} diff --git a/metallama3_8b/limo/tokenizer.json b/metallama3_8b/limo/tokenizer.json deleted file mode 100644 index 172311123ab62378f1f6d90f3068a676b7d939ed..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c1dcab308e7cf5970ea38815e0a62887d705c5b436f869ca27a5dcdd40c36a6 -size 17210148 diff --git a/metallama3_8b/limo/tokenizer_config.json b/metallama3_8b/limo/tokenizer_config.json deleted file mode 100644 index 6739fcd129e717b71b64001dcb25a03c143d66f5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/tokenizer_config.json +++ /dev/null @@ -1,2076 +0,0 @@ -{ - "added_tokens_decoder": { - "128000": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128001": { - "content": "<|end_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128002": { - "content": "<|reserved_special_token_0|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128003": { - "content": "<|reserved_special_token_1|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128004": { - "content": "<|reserved_special_token_2|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128005": { - "content": "<|reserved_special_token_3|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128006": { - "content": "<|start_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128007": { - "content": "<|end_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128008": { - "content": "<|reserved_special_token_4|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128009": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128010": { - "content": "<|reserved_special_token_5|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128011": { - "content": "<|reserved_special_token_6|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128012": { - "content": "<|reserved_special_token_7|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128013": { - "content": "<|reserved_special_token_8|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128014": { - "content": "<|reserved_special_token_9|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128015": { - "content": "<|reserved_special_token_10|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128016": { - "content": "<|reserved_special_token_11|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128017": { - "content": "<|reserved_special_token_12|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128018": { - "content": "<|reserved_special_token_13|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128019": { - "content": "<|reserved_special_token_14|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128020": { - "content": "<|reserved_special_token_15|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128021": { - "content": "<|reserved_special_token_16|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128022": { - "content": "<|reserved_special_token_17|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128023": { - "content": "<|reserved_special_token_18|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128024": { - "content": "<|reserved_special_token_19|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128025": { - "content": "<|reserved_special_token_20|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128026": { - "content": "<|reserved_special_token_21|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128027": { - "content": "<|reserved_special_token_22|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128028": { - "content": "<|reserved_special_token_23|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128029": { - "content": "<|reserved_special_token_24|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128030": { - "content": "<|reserved_special_token_25|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128031": { - "content": "<|reserved_special_token_26|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128032": { - "content": "<|reserved_special_token_27|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128033": { - "content": "<|reserved_special_token_28|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128034": { - "content": "<|reserved_special_token_29|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128035": { - "content": "<|reserved_special_token_30|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128036": { - "content": "<|reserved_special_token_31|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128037": { - "content": "<|reserved_special_token_32|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128038": { - "content": "<|reserved_special_token_33|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128039": { - "content": "<|reserved_special_token_34|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128040": { - "content": "<|reserved_special_token_35|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128041": { - "content": "<|reserved_special_token_36|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128042": { - "content": "<|reserved_special_token_37|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128043": { - "content": "<|reserved_special_token_38|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128044": { - "content": "<|reserved_special_token_39|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128045": { - "content": "<|reserved_special_token_40|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128046": { - "content": "<|reserved_special_token_41|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128047": { - "content": "<|reserved_special_token_42|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128048": { - "content": "<|reserved_special_token_43|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128049": { - "content": "<|reserved_special_token_44|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128050": { - "content": "<|reserved_special_token_45|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128051": { - "content": "<|reserved_special_token_46|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128052": { - "content": "<|reserved_special_token_47|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128053": { - "content": "<|reserved_special_token_48|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128054": { - "content": "<|reserved_special_token_49|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128055": { - "content": "<|reserved_special_token_50|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128056": { - "content": "<|reserved_special_token_51|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128057": { - "content": "<|reserved_special_token_52|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128058": { - "content": "<|reserved_special_token_53|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128059": { - "content": "<|reserved_special_token_54|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128060": { - "content": "<|reserved_special_token_55|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128061": { - "content": "<|reserved_special_token_56|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128062": { - "content": "<|reserved_special_token_57|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128063": { - "content": "<|reserved_special_token_58|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128064": { - "content": "<|reserved_special_token_59|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128065": { - "content": "<|reserved_special_token_60|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128066": { - "content": "<|reserved_special_token_61|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128067": { - "content": "<|reserved_special_token_62|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128068": { - "content": "<|reserved_special_token_63|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128069": { - "content": "<|reserved_special_token_64|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128070": { - "content": "<|reserved_special_token_65|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128071": { - "content": "<|reserved_special_token_66|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128072": { - "content": "<|reserved_special_token_67|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128073": { - "content": "<|reserved_special_token_68|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128074": { - "content": "<|reserved_special_token_69|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128075": { - "content": "<|reserved_special_token_70|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128076": { - "content": "<|reserved_special_token_71|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128077": { - "content": "<|reserved_special_token_72|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128078": { - "content": "<|reserved_special_token_73|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128079": { - "content": "<|reserved_special_token_74|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128080": { - "content": "<|reserved_special_token_75|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128081": { - "content": "<|reserved_special_token_76|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128082": { - "content": "<|reserved_special_token_77|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128083": { - "content": "<|reserved_special_token_78|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128084": { - "content": "<|reserved_special_token_79|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128085": { - "content": "<|reserved_special_token_80|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128086": { - "content": "<|reserved_special_token_81|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128087": { - "content": "<|reserved_special_token_82|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128088": { - "content": "<|reserved_special_token_83|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128089": { - "content": "<|reserved_special_token_84|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128090": { - "content": "<|reserved_special_token_85|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128091": { - "content": "<|reserved_special_token_86|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128092": { - "content": "<|reserved_special_token_87|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128093": { - "content": "<|reserved_special_token_88|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128094": { - "content": "<|reserved_special_token_89|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128095": { - "content": "<|reserved_special_token_90|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128096": { - "content": "<|reserved_special_token_91|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128097": { - "content": "<|reserved_special_token_92|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128098": { - "content": "<|reserved_special_token_93|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128099": { - "content": "<|reserved_special_token_94|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128100": { - "content": "<|reserved_special_token_95|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128101": { - "content": "<|reserved_special_token_96|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128102": { - "content": "<|reserved_special_token_97|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128103": { - "content": "<|reserved_special_token_98|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128104": { - "content": "<|reserved_special_token_99|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128105": { - "content": "<|reserved_special_token_100|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128106": { - "content": "<|reserved_special_token_101|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128107": { - "content": "<|reserved_special_token_102|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128108": { - "content": "<|reserved_special_token_103|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128109": { - "content": "<|reserved_special_token_104|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128110": { - "content": "<|reserved_special_token_105|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128111": { - "content": "<|reserved_special_token_106|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128112": { - "content": "<|reserved_special_token_107|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128113": { - "content": "<|reserved_special_token_108|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128114": { - "content": "<|reserved_special_token_109|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128115": { - "content": "<|reserved_special_token_110|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128116": { - "content": "<|reserved_special_token_111|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128117": { - "content": "<|reserved_special_token_112|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128118": { - "content": "<|reserved_special_token_113|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128119": { - "content": "<|reserved_special_token_114|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128120": { - "content": "<|reserved_special_token_115|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128121": { - "content": "<|reserved_special_token_116|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128122": { - "content": "<|reserved_special_token_117|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128123": { - "content": "<|reserved_special_token_118|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128124": { - "content": "<|reserved_special_token_119|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128125": { - "content": "<|reserved_special_token_120|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128126": { - "content": "<|reserved_special_token_121|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128127": { - "content": "<|reserved_special_token_122|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128128": { - "content": "<|reserved_special_token_123|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128129": { - "content": "<|reserved_special_token_124|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128130": { - "content": "<|reserved_special_token_125|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128131": { - "content": "<|reserved_special_token_126|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128132": { - "content": "<|reserved_special_token_127|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128133": { - "content": "<|reserved_special_token_128|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128134": { - "content": "<|reserved_special_token_129|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128135": { - "content": "<|reserved_special_token_130|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128136": { - "content": "<|reserved_special_token_131|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128137": { - "content": "<|reserved_special_token_132|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128138": { - "content": "<|reserved_special_token_133|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128139": { - "content": "<|reserved_special_token_134|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128140": { - "content": "<|reserved_special_token_135|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128141": { - "content": "<|reserved_special_token_136|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128142": { - "content": "<|reserved_special_token_137|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128143": { - "content": "<|reserved_special_token_138|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128144": { - "content": "<|reserved_special_token_139|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128145": { - "content": "<|reserved_special_token_140|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128146": { - "content": "<|reserved_special_token_141|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128147": { - "content": "<|reserved_special_token_142|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128148": { - "content": "<|reserved_special_token_143|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128149": { - "content": "<|reserved_special_token_144|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128150": { - "content": "<|reserved_special_token_145|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128151": { - "content": "<|reserved_special_token_146|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128152": { - "content": "<|reserved_special_token_147|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128153": { - "content": "<|reserved_special_token_148|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128154": { - "content": "<|reserved_special_token_149|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128155": { - "content": "<|reserved_special_token_150|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128156": { - "content": "<|reserved_special_token_151|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128157": { - "content": "<|reserved_special_token_152|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128158": { - "content": "<|reserved_special_token_153|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128159": { - "content": "<|reserved_special_token_154|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128160": { - "content": "<|reserved_special_token_155|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128161": { - "content": "<|reserved_special_token_156|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128162": { - "content": "<|reserved_special_token_157|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128163": { - "content": "<|reserved_special_token_158|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128164": { - "content": "<|reserved_special_token_159|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128165": { - "content": "<|reserved_special_token_160|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128166": { - "content": "<|reserved_special_token_161|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128167": { - "content": "<|reserved_special_token_162|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128168": { - "content": "<|reserved_special_token_163|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128169": { - "content": "<|reserved_special_token_164|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128170": { - "content": "<|reserved_special_token_165|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128171": { - "content": "<|reserved_special_token_166|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128172": { - "content": "<|reserved_special_token_167|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128173": { - "content": "<|reserved_special_token_168|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128174": { - "content": "<|reserved_special_token_169|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128175": { - "content": "<|reserved_special_token_170|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128176": { - "content": "<|reserved_special_token_171|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128177": { - "content": "<|reserved_special_token_172|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128178": { - "content": "<|reserved_special_token_173|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128179": { - "content": "<|reserved_special_token_174|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128180": { - "content": "<|reserved_special_token_175|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128181": { - "content": "<|reserved_special_token_176|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128182": { - "content": "<|reserved_special_token_177|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128183": { - "content": "<|reserved_special_token_178|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128184": { - "content": "<|reserved_special_token_179|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128185": { - "content": "<|reserved_special_token_180|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128186": { - "content": "<|reserved_special_token_181|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128187": { - "content": "<|reserved_special_token_182|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128188": { - "content": "<|reserved_special_token_183|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128189": { - "content": "<|reserved_special_token_184|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128190": { - "content": "<|reserved_special_token_185|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128191": { - "content": "<|reserved_special_token_186|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128192": { - "content": "<|reserved_special_token_187|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128193": { - "content": "<|reserved_special_token_188|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128194": { - "content": "<|reserved_special_token_189|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128195": { - "content": "<|reserved_special_token_190|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128196": { - "content": "<|reserved_special_token_191|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128197": { - "content": "<|reserved_special_token_192|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128198": { - "content": "<|reserved_special_token_193|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128199": { - "content": "<|reserved_special_token_194|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128200": { - "content": "<|reserved_special_token_195|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128201": { - "content": "<|reserved_special_token_196|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128202": { - "content": "<|reserved_special_token_197|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128203": { - "content": "<|reserved_special_token_198|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128204": { - "content": "<|reserved_special_token_199|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128205": { - "content": "<|reserved_special_token_200|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128206": { - "content": "<|reserved_special_token_201|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128207": { - "content": "<|reserved_special_token_202|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128208": { - "content": "<|reserved_special_token_203|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128209": { - "content": "<|reserved_special_token_204|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128210": { - "content": "<|reserved_special_token_205|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128211": { - "content": "<|reserved_special_token_206|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128212": { - "content": "<|reserved_special_token_207|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128213": { - "content": "<|reserved_special_token_208|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128214": { - "content": "<|reserved_special_token_209|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128215": { - "content": "<|reserved_special_token_210|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128216": { - "content": "<|reserved_special_token_211|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128217": { - "content": "<|reserved_special_token_212|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128218": { - "content": "<|reserved_special_token_213|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128219": { - "content": "<|reserved_special_token_214|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128220": { - "content": "<|reserved_special_token_215|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128221": { - "content": "<|reserved_special_token_216|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128222": { - "content": "<|reserved_special_token_217|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128223": { - "content": "<|reserved_special_token_218|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128224": { - "content": "<|reserved_special_token_219|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128225": { - "content": "<|reserved_special_token_220|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128226": { - "content": "<|reserved_special_token_221|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128227": { - "content": "<|reserved_special_token_222|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128228": { - "content": "<|reserved_special_token_223|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128229": { - "content": "<|reserved_special_token_224|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128230": { - "content": "<|reserved_special_token_225|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128231": { - "content": "<|reserved_special_token_226|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128232": { - "content": "<|reserved_special_token_227|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128233": { - "content": "<|reserved_special_token_228|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128234": { - "content": "<|reserved_special_token_229|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128235": { - "content": "<|reserved_special_token_230|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128236": { - "content": "<|reserved_special_token_231|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128237": { - "content": "<|reserved_special_token_232|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128238": { - "content": "<|reserved_special_token_233|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128239": { - "content": "<|reserved_special_token_234|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128240": { - "content": "<|reserved_special_token_235|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128241": { - "content": "<|reserved_special_token_236|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128242": { - "content": "<|reserved_special_token_237|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128243": { - "content": "<|reserved_special_token_238|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128244": { - "content": "<|reserved_special_token_239|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128245": { - "content": "<|reserved_special_token_240|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128246": { - "content": "<|reserved_special_token_241|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128247": { - "content": "<|reserved_special_token_242|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128248": { - "content": "<|reserved_special_token_243|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128249": { - "content": "<|reserved_special_token_244|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128250": { - "content": "<|reserved_special_token_245|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128251": { - "content": "<|reserved_special_token_246|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128252": { - "content": "<|reserved_special_token_247|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128253": { - "content": "<|reserved_special_token_248|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128254": { - "content": "<|reserved_special_token_249|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128255": { - "content": "<|reserved_special_token_250|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128256": { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - } - }, - "additional_special_tokens": [ - "<|eom_id|>" - ], - "bos_token": "<|begin_of_text|>", - "clean_up_tokenization_spaces": true, - "eos_token": "<|eot_id|>", - "extra_special_tokens": {}, - "model_input_names": [ - "input_ids", - "attention_mask" - ], - "model_max_length": 1000000000000000019884624838656, - "pad_token": "<|eot_id|>", - "padding_side": "right", - "split_special_tokens": false, - "tokenizer_class": "PreTrainedTokenizerFast" -} diff --git a/metallama3_8b/limo/train_results.json b/metallama3_8b/limo/train_results.json deleted file mode 100644 index e90c777551753b9357fb0a16c66657ba945482d0..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/train_results.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "epoch": 10.0, - "total_flos": 5.892331269877924e+17, - "train_loss": 0.2632100873960966, - "train_runtime": 9760.8256, - "train_samples_per_second": 0.837, - "train_steps_per_second": 0.21 -} \ No newline at end of file diff --git a/metallama3_8b/limo/trainer_log.jsonl b/metallama3_8b/limo/trainer_log.jsonl deleted file mode 100644 index 967814d4101844eb428747c1d0299b6eb27f6d37..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/trainer_log.jsonl +++ /dev/null @@ -1,2051 +0,0 @@ -{"current_steps": 1, "total_steps": 2050, "loss": 1.4179, "lr": 5e-06, "epoch": 0.004878048780487805, "percentage": 0.05, "elapsed_time": "0:00:05", "remaining_time": "2:59:44"} -{"current_steps": 2, "total_steps": 2050, "loss": 1.1405, "lr": 4.999997064365715e-06, "epoch": 0.00975609756097561, "percentage": 0.1, "elapsed_time": "0:00:06", "remaining_time": "1:55:51"} -{"current_steps": 3, "total_steps": 2050, "loss": 0.8682, "lr": 4.999988257469751e-06, "epoch": 0.014634146341463415, "percentage": 0.15, "elapsed_time": "0:00:10", "remaining_time": "1:53:57"} -{"current_steps": 4, "total_steps": 2050, "loss": 0.9961, "lr": 4.999973579332793e-06, "epoch": 0.01951219512195122, "percentage": 0.2, "elapsed_time": "0:00:11", "remaining_time": "1:41:32"} -{"current_steps": 5, "total_steps": 2050, "loss": 1.0173, "lr": 4.999953029989312e-06, "epoch": 0.024390243902439025, "percentage": 0.24, "elapsed_time": "0:00:15", "remaining_time": "1:47:47"} -{"current_steps": 6, "total_steps": 2050, "loss": 1.1083, "lr": 4.999926609487568e-06, "epoch": 0.02926829268292683, "percentage": 0.29, "elapsed_time": "0:00:16", "remaining_time": "1:35:10"} -{"current_steps": 7, "total_steps": 2050, "loss": 1.1109, "lr": 4.9998943178896106e-06, "epoch": 0.03414634146341464, "percentage": 0.34, "elapsed_time": "0:00:18", "remaining_time": "1:31:34"} -{"current_steps": 8, "total_steps": 2050, "loss": 1.821, "lr": 4.999856155271276e-06, "epoch": 0.03902439024390244, "percentage": 0.39, "elapsed_time": "0:00:24", "remaining_time": "1:44:32"} -{"current_steps": 9, "total_steps": 2050, "loss": 1.0417, "lr": 4.999812121722191e-06, "epoch": 0.04390243902439024, "percentage": 0.44, "elapsed_time": "0:00:28", "remaining_time": "1:47:08"} -{"current_steps": 10, "total_steps": 2050, "loss": 1.5672, "lr": 4.999762217345766e-06, "epoch": 0.04878048780487805, "percentage": 0.49, "elapsed_time": "0:00:35", "remaining_time": "2:00:57"} -{"current_steps": 11, "total_steps": 2050, "loss": 0.7297, "lr": 4.999706442259205e-06, "epoch": 0.05365853658536585, "percentage": 0.54, "elapsed_time": "0:00:38", "remaining_time": "1:58:21"} -{"current_steps": 12, "total_steps": 2050, "loss": 0.9112, "lr": 4.999644796593492e-06, "epoch": 0.05853658536585366, "percentage": 0.59, "elapsed_time": "0:00:40", "remaining_time": "1:54:14"} -{"current_steps": 13, "total_steps": 2050, "loss": 0.7854, "lr": 4.999577280493407e-06, "epoch": 0.06341463414634146, "percentage": 0.63, "elapsed_time": "0:00:43", "remaining_time": "1:54:13"} -{"current_steps": 14, "total_steps": 2050, "loss": 1.1317, "lr": 4.99950389411751e-06, "epoch": 0.06829268292682927, "percentage": 0.68, "elapsed_time": "0:00:48", "remaining_time": "1:56:22"} -{"current_steps": 15, "total_steps": 2050, "loss": 0.7864, "lr": 4.999424637638148e-06, "epoch": 0.07317073170731707, "percentage": 0.73, "elapsed_time": "0:00:49", "remaining_time": "1:50:50"} -{"current_steps": 16, "total_steps": 2050, "loss": 0.8494, "lr": 4.999339511241458e-06, "epoch": 0.07804878048780488, "percentage": 0.78, "elapsed_time": "0:00:50", "remaining_time": "1:47:54"} -{"current_steps": 17, "total_steps": 2050, "loss": 1.2189, "lr": 4.9992485151273584e-06, "epoch": 0.08292682926829269, "percentage": 0.83, "elapsed_time": "0:00:57", "remaining_time": "1:53:47"} -{"current_steps": 18, "total_steps": 2050, "loss": 1.0532, "lr": 4.999151649509554e-06, "epoch": 0.08780487804878048, "percentage": 0.88, "elapsed_time": "0:01:00", "remaining_time": "1:53:00"} -{"current_steps": 19, "total_steps": 2050, "loss": 1.088, "lr": 4.9990489146155356e-06, "epoch": 0.09268292682926829, "percentage": 0.93, "elapsed_time": "0:01:02", "remaining_time": "1:52:06"} -{"current_steps": 20, "total_steps": 2050, "loss": 1.0414, "lr": 4.9989403106865765e-06, "epoch": 0.0975609756097561, "percentage": 0.98, "elapsed_time": "0:01:07", "remaining_time": "1:54:15"} -{"current_steps": 21, "total_steps": 2050, "loss": 0.8878, "lr": 4.9988258379777334e-06, "epoch": 0.1024390243902439, "percentage": 1.02, "elapsed_time": "0:01:08", "remaining_time": "1:51:04"} -{"current_steps": 22, "total_steps": 2050, "loss": 0.9151, "lr": 4.998705496757846e-06, "epoch": 0.1073170731707317, "percentage": 1.07, "elapsed_time": "0:01:11", "remaining_time": "1:49:16"} -{"current_steps": 23, "total_steps": 2050, "loss": 1.4304, "lr": 4.998579287309538e-06, "epoch": 0.11219512195121951, "percentage": 1.12, "elapsed_time": "0:01:14", "remaining_time": "1:49:32"} -{"current_steps": 24, "total_steps": 2050, "loss": 1.0858, "lr": 4.998447209929211e-06, "epoch": 0.11707317073170732, "percentage": 1.17, "elapsed_time": "0:01:17", "remaining_time": "1:49:32"} -{"current_steps": 25, "total_steps": 2050, "loss": 0.6571, "lr": 4.998309264927053e-06, "epoch": 0.12195121951219512, "percentage": 1.22, "elapsed_time": "0:01:19", "remaining_time": "1:46:41"} -{"current_steps": 26, "total_steps": 2050, "loss": 0.8493, "lr": 4.998165452627025e-06, "epoch": 0.12682926829268293, "percentage": 1.27, "elapsed_time": "0:01:20", "remaining_time": "1:44:25"} -{"current_steps": 27, "total_steps": 2050, "loss": 0.9224, "lr": 4.998015773366874e-06, "epoch": 0.13170731707317074, "percentage": 1.32, "elapsed_time": "0:01:25", "remaining_time": "1:46:44"} -{"current_steps": 28, "total_steps": 2050, "loss": 0.7588, "lr": 4.997860227498122e-06, "epoch": 0.13658536585365855, "percentage": 1.37, "elapsed_time": "0:01:26", "remaining_time": "1:44:39"} -{"current_steps": 29, "total_steps": 2050, "loss": 1.1817, "lr": 4.99769881538607e-06, "epoch": 0.14146341463414633, "percentage": 1.41, "elapsed_time": "0:01:33", "remaining_time": "1:48:42"} -{"current_steps": 30, "total_steps": 2050, "loss": 1.0737, "lr": 4.997531537409794e-06, "epoch": 0.14634146341463414, "percentage": 1.46, "elapsed_time": "0:01:39", "remaining_time": "1:51:37"} -{"current_steps": 31, "total_steps": 2050, "loss": 0.7899, "lr": 4.99735839396215e-06, "epoch": 0.15121951219512195, "percentage": 1.51, "elapsed_time": "0:01:41", "remaining_time": "1:50:23"} -{"current_steps": 32, "total_steps": 2050, "loss": 0.7745, "lr": 4.9971793854497655e-06, "epoch": 0.15609756097560976, "percentage": 1.56, "elapsed_time": "0:01:44", "remaining_time": "1:49:31"} -{"current_steps": 33, "total_steps": 2050, "loss": 0.984, "lr": 4.996994512293042e-06, "epoch": 0.16097560975609757, "percentage": 1.61, "elapsed_time": "0:01:49", "remaining_time": "1:51:34"} -{"current_steps": 34, "total_steps": 2050, "loss": 0.8235, "lr": 4.996803774926157e-06, "epoch": 0.16585365853658537, "percentage": 1.66, "elapsed_time": "0:01:54", "remaining_time": "1:52:55"} -{"current_steps": 35, "total_steps": 2050, "loss": 1.3227, "lr": 4.996607173797059e-06, "epoch": 0.17073170731707318, "percentage": 1.71, "elapsed_time": "0:01:55", "remaining_time": "1:51:01"} -{"current_steps": 36, "total_steps": 2050, "loss": 0.8854, "lr": 4.996404709367466e-06, "epoch": 0.17560975609756097, "percentage": 1.76, "elapsed_time": "0:02:00", "remaining_time": "1:52:16"} -{"current_steps": 37, "total_steps": 2050, "loss": 0.6786, "lr": 4.996196382112868e-06, "epoch": 0.18048780487804877, "percentage": 1.8, "elapsed_time": "0:02:03", "remaining_time": "1:51:54"} -{"current_steps": 38, "total_steps": 2050, "loss": 0.9344, "lr": 4.9959821925225235e-06, "epoch": 0.18536585365853658, "percentage": 1.85, "elapsed_time": "0:02:06", "remaining_time": "1:51:46"} -{"current_steps": 39, "total_steps": 2050, "loss": 0.814, "lr": 4.995762141099456e-06, "epoch": 0.1902439024390244, "percentage": 1.9, "elapsed_time": "0:02:09", "remaining_time": "1:50:59"} -{"current_steps": 40, "total_steps": 2050, "loss": 1.0276, "lr": 4.995536228360461e-06, "epoch": 0.1951219512195122, "percentage": 1.95, "elapsed_time": "0:02:11", "remaining_time": "1:50:00"} -{"current_steps": 41, "total_steps": 2050, "loss": 0.9291, "lr": 4.995304454836095e-06, "epoch": 0.2, "percentage": 2.0, "elapsed_time": "0:02:13", "remaining_time": "1:48:44"} -{"current_steps": 42, "total_steps": 2050, "loss": 0.8145, "lr": 4.9950668210706795e-06, "epoch": 0.2048780487804878, "percentage": 2.05, "elapsed_time": "0:02:17", "remaining_time": "1:49:42"} -{"current_steps": 43, "total_steps": 2050, "loss": 0.8779, "lr": 4.994823327622299e-06, "epoch": 0.2097560975609756, "percentage": 2.1, "elapsed_time": "0:02:21", "remaining_time": "1:50:02"} -{"current_steps": 44, "total_steps": 2050, "loss": 0.8196, "lr": 4.9945739750628e-06, "epoch": 0.2146341463414634, "percentage": 2.15, "elapsed_time": "0:02:24", "remaining_time": "1:49:40"} -{"current_steps": 45, "total_steps": 2050, "loss": 0.8443, "lr": 4.994318763977789e-06, "epoch": 0.21951219512195122, "percentage": 2.2, "elapsed_time": "0:02:28", "remaining_time": "1:49:55"} -{"current_steps": 46, "total_steps": 2050, "loss": 1.0328, "lr": 4.994057694966632e-06, "epoch": 0.22439024390243903, "percentage": 2.24, "elapsed_time": "0:02:31", "remaining_time": "1:49:50"} -{"current_steps": 47, "total_steps": 2050, "loss": 1.0673, "lr": 4.993790768642449e-06, "epoch": 0.22926829268292684, "percentage": 2.29, "elapsed_time": "0:02:33", "remaining_time": "1:48:45"} -{"current_steps": 48, "total_steps": 2050, "loss": 1.3198, "lr": 4.99351798563212e-06, "epoch": 0.23414634146341465, "percentage": 2.34, "elapsed_time": "0:02:39", "remaining_time": "1:50:59"} -{"current_steps": 49, "total_steps": 2050, "loss": 0.8743, "lr": 4.993239346576278e-06, "epoch": 0.23902439024390243, "percentage": 2.39, "elapsed_time": "0:02:43", "remaining_time": "1:51:03"} -{"current_steps": 50, "total_steps": 2050, "loss": 1.109, "lr": 4.99295485212931e-06, "epoch": 0.24390243902439024, "percentage": 2.44, "elapsed_time": "0:02:47", "remaining_time": "1:51:53"} -{"current_steps": 51, "total_steps": 2050, "loss": 0.9291, "lr": 4.992664502959351e-06, "epoch": 0.24878048780487805, "percentage": 2.49, "elapsed_time": "0:02:53", "remaining_time": "1:53:34"} -{"current_steps": 52, "total_steps": 2050, "loss": 0.8159, "lr": 4.99236829974829e-06, "epoch": 0.25365853658536586, "percentage": 2.54, "elapsed_time": "0:02:54", "remaining_time": "1:52:03"} -{"current_steps": 53, "total_steps": 2050, "loss": 1.0359, "lr": 4.992066243191762e-06, "epoch": 0.25853658536585367, "percentage": 2.59, "elapsed_time": "0:02:57", "remaining_time": "1:51:24"} -{"current_steps": 54, "total_steps": 2050, "loss": 0.8091, "lr": 4.991758333999148e-06, "epoch": 0.2634146341463415, "percentage": 2.63, "elapsed_time": "0:02:59", "remaining_time": "1:50:33"} -{"current_steps": 55, "total_steps": 2050, "loss": 0.6925, "lr": 4.991444572893575e-06, "epoch": 0.2682926829268293, "percentage": 2.68, "elapsed_time": "0:03:02", "remaining_time": "1:50:09"} -{"current_steps": 56, "total_steps": 2050, "loss": 0.6329, "lr": 4.991124960611916e-06, "epoch": 0.2731707317073171, "percentage": 2.73, "elapsed_time": "0:03:03", "remaining_time": "1:49:01"} -{"current_steps": 57, "total_steps": 2050, "loss": 0.8069, "lr": 4.99079949790478e-06, "epoch": 0.2780487804878049, "percentage": 2.78, "elapsed_time": "0:03:08", "remaining_time": "1:49:39"} -{"current_steps": 58, "total_steps": 2050, "loss": 0.8682, "lr": 4.99046818553652e-06, "epoch": 0.28292682926829266, "percentage": 2.83, "elapsed_time": "0:03:12", "remaining_time": "1:49:54"} -{"current_steps": 59, "total_steps": 2050, "loss": 1.1069, "lr": 4.9901310242852246e-06, "epoch": 0.28780487804878047, "percentage": 2.88, "elapsed_time": "0:03:15", "remaining_time": "1:49:51"} -{"current_steps": 60, "total_steps": 2050, "loss": 0.9465, "lr": 4.9897880149427206e-06, "epoch": 0.2926829268292683, "percentage": 2.93, "elapsed_time": "0:03:18", "remaining_time": "1:49:57"} -{"current_steps": 61, "total_steps": 2050, "loss": 0.9738, "lr": 4.989439158314566e-06, "epoch": 0.2975609756097561, "percentage": 2.98, "elapsed_time": "0:03:24", "remaining_time": "1:51:06"} -{"current_steps": 62, "total_steps": 2050, "loss": 0.6417, "lr": 4.989084455220056e-06, "epoch": 0.3024390243902439, "percentage": 3.02, "elapsed_time": "0:03:25", "remaining_time": "1:50:02"} -{"current_steps": 63, "total_steps": 2050, "loss": 1.0092, "lr": 4.988723906492212e-06, "epoch": 0.3073170731707317, "percentage": 3.07, "elapsed_time": "0:03:26", "remaining_time": "1:48:46"} -{"current_steps": 64, "total_steps": 2050, "loss": 0.6691, "lr": 4.988357512977785e-06, "epoch": 0.3121951219512195, "percentage": 3.12, "elapsed_time": "0:03:29", "remaining_time": "1:48:06"} -{"current_steps": 65, "total_steps": 2050, "loss": 0.6651, "lr": 4.987985275537252e-06, "epoch": 0.3170731707317073, "percentage": 3.17, "elapsed_time": "0:03:30", "remaining_time": "1:47:08"} -{"current_steps": 66, "total_steps": 2050, "loss": 0.9227, "lr": 4.9876071950448185e-06, "epoch": 0.32195121951219513, "percentage": 3.22, "elapsed_time": "0:03:33", "remaining_time": "1:47:07"} -{"current_steps": 67, "total_steps": 2050, "loss": 0.6664, "lr": 4.987223272388407e-06, "epoch": 0.32682926829268294, "percentage": 3.27, "elapsed_time": "0:03:37", "remaining_time": "1:47:11"} -{"current_steps": 68, "total_steps": 2050, "loss": 0.997, "lr": 4.986833508469663e-06, "epoch": 0.33170731707317075, "percentage": 3.32, "elapsed_time": "0:03:38", "remaining_time": "1:46:20"} -{"current_steps": 69, "total_steps": 2050, "loss": 0.8551, "lr": 4.98643790420395e-06, "epoch": 0.33658536585365856, "percentage": 3.37, "elapsed_time": "0:03:44", "remaining_time": "1:47:29"} -{"current_steps": 70, "total_steps": 2050, "loss": 0.8874, "lr": 4.986036460520348e-06, "epoch": 0.34146341463414637, "percentage": 3.41, "elapsed_time": "0:03:48", "remaining_time": "1:47:29"} -{"current_steps": 71, "total_steps": 2050, "loss": 1.1393, "lr": 4.98562917836165e-06, "epoch": 0.3463414634146341, "percentage": 3.46, "elapsed_time": "0:03:54", "remaining_time": "1:48:56"} -{"current_steps": 72, "total_steps": 2050, "loss": 0.6379, "lr": 4.985216058684362e-06, "epoch": 0.35121951219512193, "percentage": 3.51, "elapsed_time": "0:03:58", "remaining_time": "1:49:02"} -{"current_steps": 73, "total_steps": 2050, "loss": 1.0292, "lr": 4.984797102458697e-06, "epoch": 0.35609756097560974, "percentage": 3.56, "elapsed_time": "0:04:01", "remaining_time": "1:48:52"} -{"current_steps": 74, "total_steps": 2050, "loss": 0.7048, "lr": 4.984372310668579e-06, "epoch": 0.36097560975609755, "percentage": 3.61, "elapsed_time": "0:04:02", "remaining_time": "1:48:05"} -{"current_steps": 75, "total_steps": 2050, "loss": 1.2353, "lr": 4.983941684311633e-06, "epoch": 0.36585365853658536, "percentage": 3.66, "elapsed_time": "0:04:07", "remaining_time": "1:48:32"} -{"current_steps": 76, "total_steps": 2050, "loss": 0.8933, "lr": 4.983505224399188e-06, "epoch": 0.37073170731707317, "percentage": 3.71, "elapsed_time": "0:04:10", "remaining_time": "1:48:27"} -{"current_steps": 77, "total_steps": 2050, "loss": 0.8221, "lr": 4.983062931956275e-06, "epoch": 0.375609756097561, "percentage": 3.76, "elapsed_time": "0:04:15", "remaining_time": "1:49:15"} -{"current_steps": 78, "total_steps": 2050, "loss": 0.8899, "lr": 4.9826148080216195e-06, "epoch": 0.3804878048780488, "percentage": 3.8, "elapsed_time": "0:04:21", "remaining_time": "1:50:04"} -{"current_steps": 79, "total_steps": 2050, "loss": 1.2451, "lr": 4.9821608536476445e-06, "epoch": 0.3853658536585366, "percentage": 3.85, "elapsed_time": "0:04:23", "remaining_time": "1:49:43"} -{"current_steps": 80, "total_steps": 2050, "loss": 0.8536, "lr": 4.981701069900465e-06, "epoch": 0.3902439024390244, "percentage": 3.9, "elapsed_time": "0:04:27", "remaining_time": "1:49:38"} -{"current_steps": 81, "total_steps": 2050, "loss": 0.7857, "lr": 4.9812354578598876e-06, "epoch": 0.3951219512195122, "percentage": 3.95, "elapsed_time": "0:04:30", "remaining_time": "1:49:27"} -{"current_steps": 82, "total_steps": 2050, "loss": 0.8332, "lr": 4.980764018619405e-06, "epoch": 0.4, "percentage": 4.0, "elapsed_time": "0:04:32", "remaining_time": "1:48:58"} -{"current_steps": 83, "total_steps": 2050, "loss": 0.9927, "lr": 4.980286753286196e-06, "epoch": 0.40487804878048783, "percentage": 4.05, "elapsed_time": "0:04:35", "remaining_time": "1:48:54"} -{"current_steps": 84, "total_steps": 2050, "loss": 0.8161, "lr": 4.97980366298112e-06, "epoch": 0.4097560975609756, "percentage": 4.1, "elapsed_time": "0:04:41", "remaining_time": "1:49:40"} -{"current_steps": 85, "total_steps": 2050, "loss": 0.8017, "lr": 4.97931474883872e-06, "epoch": 0.4146341463414634, "percentage": 4.15, "elapsed_time": "0:04:45", "remaining_time": "1:50:09"} -{"current_steps": 86, "total_steps": 2050, "loss": 0.8811, "lr": 4.978820012007213e-06, "epoch": 0.4195121951219512, "percentage": 4.2, "elapsed_time": "0:04:50", "remaining_time": "1:50:27"} -{"current_steps": 87, "total_steps": 2050, "loss": 0.9461, "lr": 4.978319453648495e-06, "epoch": 0.424390243902439, "percentage": 4.24, "elapsed_time": "0:04:54", "remaining_time": "1:50:46"} -{"current_steps": 88, "total_steps": 2050, "loss": 0.8835, "lr": 4.977813074938128e-06, "epoch": 0.4292682926829268, "percentage": 4.29, "elapsed_time": "0:05:00", "remaining_time": "1:51:44"} -{"current_steps": 89, "total_steps": 2050, "loss": 0.8466, "lr": 4.977300877065347e-06, "epoch": 0.43414634146341463, "percentage": 4.34, "elapsed_time": "0:05:07", "remaining_time": "1:52:48"} -{"current_steps": 90, "total_steps": 2050, "loss": 0.7132, "lr": 4.976782861233053e-06, "epoch": 0.43902439024390244, "percentage": 4.39, "elapsed_time": "0:05:12", "remaining_time": "1:53:32"} -{"current_steps": 91, "total_steps": 2050, "loss": 0.7639, "lr": 4.976259028657812e-06, "epoch": 0.44390243902439025, "percentage": 4.44, "elapsed_time": "0:05:20", "remaining_time": "1:54:51"} -{"current_steps": 92, "total_steps": 2050, "loss": 0.8055, "lr": 4.975729380569845e-06, "epoch": 0.44878048780487806, "percentage": 4.49, "elapsed_time": "0:05:25", "remaining_time": "1:55:27"} -{"current_steps": 93, "total_steps": 2050, "loss": 0.6042, "lr": 4.975193918213035e-06, "epoch": 0.45365853658536587, "percentage": 4.54, "elapsed_time": "0:05:26", "remaining_time": "1:54:28"} -{"current_steps": 94, "total_steps": 2050, "loss": 0.7672, "lr": 4.974652642844921e-06, "epoch": 0.4585365853658537, "percentage": 4.59, "elapsed_time": "0:05:28", "remaining_time": "1:54:00"} -{"current_steps": 95, "total_steps": 2050, "loss": 1.0682, "lr": 4.974105555736693e-06, "epoch": 0.4634146341463415, "percentage": 4.63, "elapsed_time": "0:05:32", "remaining_time": "1:54:01"} -{"current_steps": 96, "total_steps": 2050, "loss": 0.7841, "lr": 4.973552658173186e-06, "epoch": 0.4682926829268293, "percentage": 4.68, "elapsed_time": "0:05:33", "remaining_time": "1:53:15"} -{"current_steps": 97, "total_steps": 2050, "loss": 0.8851, "lr": 4.972993951452887e-06, "epoch": 0.47317073170731705, "percentage": 4.73, "elapsed_time": "0:05:37", "remaining_time": "1:53:11"} -{"current_steps": 98, "total_steps": 2050, "loss": 0.9059, "lr": 4.9724294368879214e-06, "epoch": 0.47804878048780486, "percentage": 4.78, "elapsed_time": "0:05:40", "remaining_time": "1:53:04"} -{"current_steps": 99, "total_steps": 2050, "loss": 1.0152, "lr": 4.971859115804055e-06, "epoch": 0.48292682926829267, "percentage": 4.83, "elapsed_time": "0:05:43", "remaining_time": "1:52:56"} -{"current_steps": 100, "total_steps": 2050, "loss": 0.8092, "lr": 4.9712829895406935e-06, "epoch": 0.4878048780487805, "percentage": 4.88, "elapsed_time": "0:05:47", "remaining_time": "1:53:03"} -{"current_steps": 101, "total_steps": 2050, "loss": 0.8239, "lr": 4.970701059450872e-06, "epoch": 0.4926829268292683, "percentage": 4.93, "elapsed_time": "0:05:51", "remaining_time": "1:52:55"} -{"current_steps": 102, "total_steps": 2050, "loss": 0.9283, "lr": 4.970113326901258e-06, "epoch": 0.4975609756097561, "percentage": 4.98, "elapsed_time": "0:05:54", "remaining_time": "1:52:43"} -{"current_steps": 103, "total_steps": 2050, "loss": 0.9429, "lr": 4.9695197932721455e-06, "epoch": 0.5024390243902439, "percentage": 5.02, "elapsed_time": "0:05:58", "remaining_time": "1:52:47"} -{"current_steps": 104, "total_steps": 2050, "loss": 0.9231, "lr": 4.968920459957453e-06, "epoch": 0.5073170731707317, "percentage": 5.07, "elapsed_time": "0:06:04", "remaining_time": "1:53:39"} -{"current_steps": 105, "total_steps": 2050, "loss": 1.0005, "lr": 4.968315328364719e-06, "epoch": 0.5121951219512195, "percentage": 5.12, "elapsed_time": "0:06:07", "remaining_time": "1:53:23"} -{"current_steps": 106, "total_steps": 2050, "loss": 1.1326, "lr": 4.9677043999151e-06, "epoch": 0.5170731707317073, "percentage": 5.17, "elapsed_time": "0:06:09", "remaining_time": "1:52:59"} -{"current_steps": 107, "total_steps": 2050, "loss": 0.541, "lr": 4.967087676043366e-06, "epoch": 0.5219512195121951, "percentage": 5.22, "elapsed_time": "0:06:12", "remaining_time": "1:52:52"} -{"current_steps": 108, "total_steps": 2050, "loss": 0.9473, "lr": 4.966465158197897e-06, "epoch": 0.526829268292683, "percentage": 5.27, "elapsed_time": "0:06:19", "remaining_time": "1:53:49"} -{"current_steps": 109, "total_steps": 2050, "loss": 0.6678, "lr": 4.965836847840681e-06, "epoch": 0.5317073170731708, "percentage": 5.32, "elapsed_time": "0:06:23", "remaining_time": "1:53:50"} -{"current_steps": 110, "total_steps": 2050, "loss": 0.9251, "lr": 4.96520274644731e-06, "epoch": 0.5365853658536586, "percentage": 5.37, "elapsed_time": "0:06:29", "remaining_time": "1:54:25"} -{"current_steps": 111, "total_steps": 2050, "loss": 0.7807, "lr": 4.964562855506976e-06, "epoch": 0.5414634146341464, "percentage": 5.41, "elapsed_time": "0:06:30", "remaining_time": "1:53:49"} -{"current_steps": 112, "total_steps": 2050, "loss": 0.6395, "lr": 4.963917176522466e-06, "epoch": 0.5463414634146342, "percentage": 5.46, "elapsed_time": "0:06:33", "remaining_time": "1:53:35"} -{"current_steps": 113, "total_steps": 2050, "loss": 1.0658, "lr": 4.963265711010164e-06, "epoch": 0.551219512195122, "percentage": 5.51, "elapsed_time": "0:06:37", "remaining_time": "1:53:28"} -{"current_steps": 114, "total_steps": 2050, "loss": 0.8974, "lr": 4.9626084605000395e-06, "epoch": 0.5560975609756098, "percentage": 5.56, "elapsed_time": "0:06:44", "remaining_time": "1:54:23"} -{"current_steps": 115, "total_steps": 2050, "loss": 0.6144, "lr": 4.961945426535652e-06, "epoch": 0.5609756097560976, "percentage": 5.61, "elapsed_time": "0:06:46", "remaining_time": "1:53:58"} -{"current_steps": 116, "total_steps": 2050, "loss": 0.9083, "lr": 4.961276610674141e-06, "epoch": 0.5658536585365853, "percentage": 5.66, "elapsed_time": "0:06:52", "remaining_time": "1:54:38"} -{"current_steps": 117, "total_steps": 2050, "loss": 1.0101, "lr": 4.960602014486225e-06, "epoch": 0.5707317073170731, "percentage": 5.71, "elapsed_time": "0:06:55", "remaining_time": "1:54:21"} -{"current_steps": 118, "total_steps": 2050, "loss": 0.8391, "lr": 4.959921639556199e-06, "epoch": 0.5756097560975609, "percentage": 5.76, "elapsed_time": "0:07:01", "remaining_time": "1:55:04"} -{"current_steps": 119, "total_steps": 2050, "loss": 1.0431, "lr": 4.959235487481928e-06, "epoch": 0.5804878048780487, "percentage": 5.8, "elapsed_time": "0:07:03", "remaining_time": "1:54:36"} -{"current_steps": 120, "total_steps": 2050, "loss": 0.5864, "lr": 4.958543559874846e-06, "epoch": 0.5853658536585366, "percentage": 5.85, "elapsed_time": "0:07:06", "remaining_time": "1:54:24"} -{"current_steps": 121, "total_steps": 2050, "loss": 0.7868, "lr": 4.9578458583599495e-06, "epoch": 0.5902439024390244, "percentage": 5.9, "elapsed_time": "0:07:11", "remaining_time": "1:54:44"} -{"current_steps": 122, "total_steps": 2050, "loss": 0.7901, "lr": 4.957142384575795e-06, "epoch": 0.5951219512195122, "percentage": 5.95, "elapsed_time": "0:07:15", "remaining_time": "1:54:35"} -{"current_steps": 123, "total_steps": 2050, "loss": 0.9067, "lr": 4.956433140174498e-06, "epoch": 0.6, "percentage": 6.0, "elapsed_time": "0:07:16", "remaining_time": "1:54:03"} -{"current_steps": 124, "total_steps": 2050, "loss": 0.8971, "lr": 4.9557181268217225e-06, "epoch": 0.6048780487804878, "percentage": 6.05, "elapsed_time": "0:07:18", "remaining_time": "1:53:33"} -{"current_steps": 125, "total_steps": 2050, "loss": 1.2123, "lr": 4.954997346196683e-06, "epoch": 0.6097560975609756, "percentage": 6.1, "elapsed_time": "0:07:22", "remaining_time": "1:53:31"} -{"current_steps": 126, "total_steps": 2050, "loss": 0.7696, "lr": 4.954270799992138e-06, "epoch": 0.6146341463414634, "percentage": 6.15, "elapsed_time": "0:07:24", "remaining_time": "1:53:02"} -{"current_steps": 127, "total_steps": 2050, "loss": 0.7919, "lr": 4.953538489914387e-06, "epoch": 0.6195121951219512, "percentage": 6.2, "elapsed_time": "0:07:25", "remaining_time": "1:52:31"} -{"current_steps": 128, "total_steps": 2050, "loss": 0.6494, "lr": 4.9528004176832654e-06, "epoch": 0.624390243902439, "percentage": 6.24, "elapsed_time": "0:07:29", "remaining_time": "1:52:24"} -{"current_steps": 129, "total_steps": 2050, "loss": 1.0546, "lr": 4.952056585032142e-06, "epoch": 0.6292682926829268, "percentage": 6.29, "elapsed_time": "0:07:31", "remaining_time": "1:51:59"} -{"current_steps": 130, "total_steps": 2050, "loss": 0.7907, "lr": 4.951306993707913e-06, "epoch": 0.6341463414634146, "percentage": 6.34, "elapsed_time": "0:07:33", "remaining_time": "1:51:36"} -{"current_steps": 131, "total_steps": 2050, "loss": 0.7433, "lr": 4.950551645470998e-06, "epoch": 0.6390243902439025, "percentage": 6.39, "elapsed_time": "0:07:36", "remaining_time": "1:51:33"} -{"current_steps": 132, "total_steps": 2050, "loss": 0.7682, "lr": 4.9497905420953406e-06, "epoch": 0.6439024390243903, "percentage": 6.44, "elapsed_time": "0:07:38", "remaining_time": "1:50:58"} -{"current_steps": 133, "total_steps": 2050, "loss": 0.8411, "lr": 4.949023685368395e-06, "epoch": 0.6487804878048781, "percentage": 6.49, "elapsed_time": "0:07:39", "remaining_time": "1:50:22"} -{"current_steps": 134, "total_steps": 2050, "loss": 1.0792, "lr": 4.948251077091131e-06, "epoch": 0.6536585365853659, "percentage": 6.54, "elapsed_time": "0:07:42", "remaining_time": "1:50:09"} -{"current_steps": 135, "total_steps": 2050, "loss": 0.8033, "lr": 4.947472719078025e-06, "epoch": 0.6585365853658537, "percentage": 6.59, "elapsed_time": "0:07:45", "remaining_time": "1:49:59"} -{"current_steps": 136, "total_steps": 2050, "loss": 0.939, "lr": 4.9466886131570565e-06, "epoch": 0.6634146341463415, "percentage": 6.63, "elapsed_time": "0:07:47", "remaining_time": "1:49:46"} -{"current_steps": 137, "total_steps": 2050, "loss": 1.0418, "lr": 4.945898761169704e-06, "epoch": 0.6682926829268293, "percentage": 6.68, "elapsed_time": "0:07:51", "remaining_time": "1:49:42"} -{"current_steps": 138, "total_steps": 2050, "loss": 0.6158, "lr": 4.945103164970941e-06, "epoch": 0.6731707317073171, "percentage": 6.73, "elapsed_time": "0:07:55", "remaining_time": "1:49:49"} -{"current_steps": 139, "total_steps": 2050, "loss": 0.6995, "lr": 4.9443018264292304e-06, "epoch": 0.6780487804878049, "percentage": 6.78, "elapsed_time": "0:07:59", "remaining_time": "1:49:53"} -{"current_steps": 140, "total_steps": 2050, "loss": 1.0382, "lr": 4.9434947474265225e-06, "epoch": 0.6829268292682927, "percentage": 6.83, "elapsed_time": "0:08:05", "remaining_time": "1:50:21"} -{"current_steps": 141, "total_steps": 2050, "loss": 1.037, "lr": 4.942681929858249e-06, "epoch": 0.6878048780487804, "percentage": 6.88, "elapsed_time": "0:08:07", "remaining_time": "1:50:02"} -{"current_steps": 142, "total_steps": 2050, "loss": 0.9071, "lr": 4.941863375633315e-06, "epoch": 0.6926829268292682, "percentage": 6.93, "elapsed_time": "0:08:09", "remaining_time": "1:49:40"} -{"current_steps": 143, "total_steps": 2050, "loss": 0.7908, "lr": 4.9410390866741056e-06, "epoch": 0.697560975609756, "percentage": 6.98, "elapsed_time": "0:08:13", "remaining_time": "1:49:43"} -{"current_steps": 144, "total_steps": 2050, "loss": 0.7739, "lr": 4.9402090649164655e-06, "epoch": 0.7024390243902439, "percentage": 7.02, "elapsed_time": "0:08:20", "remaining_time": "1:50:20"} -{"current_steps": 145, "total_steps": 2050, "loss": 0.939, "lr": 4.9393733123097085e-06, "epoch": 0.7073170731707317, "percentage": 7.07, "elapsed_time": "0:08:24", "remaining_time": "1:50:22"} -{"current_steps": 146, "total_steps": 2050, "loss": 0.8729, "lr": 4.9385318308166065e-06, "epoch": 0.7121951219512195, "percentage": 7.12, "elapsed_time": "0:08:27", "remaining_time": "1:50:17"} -{"current_steps": 147, "total_steps": 2050, "loss": 0.6124, "lr": 4.937684622413385e-06, "epoch": 0.7170731707317073, "percentage": 7.17, "elapsed_time": "0:08:34", "remaining_time": "1:51:03"} -{"current_steps": 148, "total_steps": 2050, "loss": 0.975, "lr": 4.9368316890897185e-06, "epoch": 0.7219512195121951, "percentage": 7.22, "elapsed_time": "0:08:37", "remaining_time": "1:50:50"} -{"current_steps": 149, "total_steps": 2050, "loss": 0.5832, "lr": 4.9359730328487264e-06, "epoch": 0.7268292682926829, "percentage": 7.27, "elapsed_time": "0:08:39", "remaining_time": "1:50:29"} -{"current_steps": 150, "total_steps": 2050, "loss": 0.8124, "lr": 4.935108655706972e-06, "epoch": 0.7317073170731707, "percentage": 7.32, "elapsed_time": "0:08:44", "remaining_time": "1:50:39"} -{"current_steps": 151, "total_steps": 2050, "loss": 1.1446, "lr": 4.934238559694448e-06, "epoch": 0.7365853658536585, "percentage": 7.37, "elapsed_time": "0:08:46", "remaining_time": "1:50:17"} -{"current_steps": 152, "total_steps": 2050, "loss": 0.7884, "lr": 4.9333627468545845e-06, "epoch": 0.7414634146341463, "percentage": 7.41, "elapsed_time": "0:08:52", "remaining_time": "1:50:52"} -{"current_steps": 153, "total_steps": 2050, "loss": 0.7918, "lr": 4.932481219244231e-06, "epoch": 0.7463414634146341, "percentage": 7.46, "elapsed_time": "0:08:56", "remaining_time": "1:50:46"} -{"current_steps": 154, "total_steps": 2050, "loss": 0.775, "lr": 4.931593978933666e-06, "epoch": 0.751219512195122, "percentage": 7.51, "elapsed_time": "0:08:59", "remaining_time": "1:50:42"} -{"current_steps": 155, "total_steps": 2050, "loss": 0.993, "lr": 4.930701028006577e-06, "epoch": 0.7560975609756098, "percentage": 7.56, "elapsed_time": "0:09:05", "remaining_time": "1:51:14"} -{"current_steps": 156, "total_steps": 2050, "loss": 0.7911, "lr": 4.929802368560066e-06, "epoch": 0.7609756097560976, "percentage": 7.61, "elapsed_time": "0:09:09", "remaining_time": "1:51:07"} -{"current_steps": 157, "total_steps": 2050, "loss": 0.9346, "lr": 4.928898002704642e-06, "epoch": 0.7658536585365854, "percentage": 7.66, "elapsed_time": "0:09:10", "remaining_time": "1:50:42"} -{"current_steps": 158, "total_steps": 2050, "loss": 0.817, "lr": 4.927987932564215e-06, "epoch": 0.7707317073170732, "percentage": 7.71, "elapsed_time": "0:09:11", "remaining_time": "1:50:08"} -{"current_steps": 159, "total_steps": 2050, "loss": 0.7918, "lr": 4.927072160276092e-06, "epoch": 0.775609756097561, "percentage": 7.76, "elapsed_time": "0:09:15", "remaining_time": "1:50:11"} -{"current_steps": 160, "total_steps": 2050, "loss": 0.7153, "lr": 4.926150687990969e-06, "epoch": 0.7804878048780488, "percentage": 7.8, "elapsed_time": "0:09:19", "remaining_time": "1:50:05"} -{"current_steps": 161, "total_steps": 2050, "loss": 0.8982, "lr": 4.925223517872934e-06, "epoch": 0.7853658536585366, "percentage": 7.85, "elapsed_time": "0:09:22", "remaining_time": "1:50:02"} -{"current_steps": 162, "total_steps": 2050, "loss": 0.9839, "lr": 4.9242906520994484e-06, "epoch": 0.7902439024390244, "percentage": 7.9, "elapsed_time": "0:09:25", "remaining_time": "1:49:56"} -{"current_steps": 163, "total_steps": 2050, "loss": 0.8406, "lr": 4.923352092861358e-06, "epoch": 0.7951219512195122, "percentage": 7.95, "elapsed_time": "0:09:28", "remaining_time": "1:49:37"} -{"current_steps": 164, "total_steps": 2050, "loss": 0.6602, "lr": 4.922407842362875e-06, "epoch": 0.8, "percentage": 8.0, "elapsed_time": "0:09:30", "remaining_time": "1:49:26"} -{"current_steps": 165, "total_steps": 2050, "loss": 0.9779, "lr": 4.921457902821578e-06, "epoch": 0.8048780487804879, "percentage": 8.05, "elapsed_time": "0:09:38", "remaining_time": "1:50:07"} -{"current_steps": 166, "total_steps": 2050, "loss": 0.8821, "lr": 4.920502276468408e-06, "epoch": 0.8097560975609757, "percentage": 8.1, "elapsed_time": "0:09:41", "remaining_time": "1:50:01"} -{"current_steps": 167, "total_steps": 2050, "loss": 0.7539, "lr": 4.9195409655476605e-06, "epoch": 0.8146341463414634, "percentage": 8.15, "elapsed_time": "0:09:47", "remaining_time": "1:50:23"} -{"current_steps": 168, "total_steps": 2050, "loss": 0.9807, "lr": 4.918573972316982e-06, "epoch": 0.8195121951219512, "percentage": 8.2, "elapsed_time": "0:09:51", "remaining_time": "1:50:23"} -{"current_steps": 169, "total_steps": 2050, "loss": 0.8318, "lr": 4.917601299047361e-06, "epoch": 0.824390243902439, "percentage": 8.24, "elapsed_time": "0:09:53", "remaining_time": "1:50:01"} -{"current_steps": 170, "total_steps": 2050, "loss": 0.7816, "lr": 4.916622948023129e-06, "epoch": 0.8292682926829268, "percentage": 8.29, "elapsed_time": "0:09:59", "remaining_time": "1:50:24"} -{"current_steps": 171, "total_steps": 2050, "loss": 0.6633, "lr": 4.915638921541952e-06, "epoch": 0.8341463414634146, "percentage": 8.34, "elapsed_time": "0:10:01", "remaining_time": "1:50:04"} -{"current_steps": 172, "total_steps": 2050, "loss": 0.9296, "lr": 4.914649221914822e-06, "epoch": 0.8390243902439024, "percentage": 8.39, "elapsed_time": "0:10:08", "remaining_time": "1:50:43"} -{"current_steps": 173, "total_steps": 2050, "loss": 0.6864, "lr": 4.913653851466057e-06, "epoch": 0.8439024390243902, "percentage": 8.44, "elapsed_time": "0:10:13", "remaining_time": "1:50:54"} -{"current_steps": 174, "total_steps": 2050, "loss": 0.8599, "lr": 4.912652812533291e-06, "epoch": 0.848780487804878, "percentage": 8.49, "elapsed_time": "0:10:16", "remaining_time": "1:50:47"} -{"current_steps": 175, "total_steps": 2050, "loss": 0.8949, "lr": 4.911646107467472e-06, "epoch": 0.8536585365853658, "percentage": 8.54, "elapsed_time": "0:10:19", "remaining_time": "1:50:42"} -{"current_steps": 176, "total_steps": 2050, "loss": 0.9758, "lr": 4.9106337386328524e-06, "epoch": 0.8585365853658536, "percentage": 8.59, "elapsed_time": "0:10:26", "remaining_time": "1:51:08"} -{"current_steps": 177, "total_steps": 2050, "loss": 0.8954, "lr": 4.909615708406991e-06, "epoch": 0.8634146341463415, "percentage": 8.63, "elapsed_time": "0:10:28", "remaining_time": "1:50:51"} -{"current_steps": 178, "total_steps": 2050, "loss": 0.7157, "lr": 4.908592019180738e-06, "epoch": 0.8682926829268293, "percentage": 8.68, "elapsed_time": "0:10:32", "remaining_time": "1:50:55"} -{"current_steps": 179, "total_steps": 2050, "loss": 0.6358, "lr": 4.907562673358234e-06, "epoch": 0.8731707317073171, "percentage": 8.73, "elapsed_time": "0:10:38", "remaining_time": "1:51:14"} -{"current_steps": 180, "total_steps": 2050, "loss": 0.6685, "lr": 4.906527673356907e-06, "epoch": 0.8780487804878049, "percentage": 8.78, "elapsed_time": "0:10:40", "remaining_time": "1:50:55"} -{"current_steps": 181, "total_steps": 2050, "loss": 0.5686, "lr": 4.905487021607462e-06, "epoch": 0.8829268292682927, "percentage": 8.83, "elapsed_time": "0:10:43", "remaining_time": "1:50:43"} -{"current_steps": 182, "total_steps": 2050, "loss": 0.8538, "lr": 4.904440720553876e-06, "epoch": 0.8878048780487805, "percentage": 8.88, "elapsed_time": "0:10:47", "remaining_time": "1:50:48"} -{"current_steps": 183, "total_steps": 2050, "loss": 0.8292, "lr": 4.903388772653396e-06, "epoch": 0.8926829268292683, "percentage": 8.93, "elapsed_time": "0:10:52", "remaining_time": "1:50:59"} -{"current_steps": 184, "total_steps": 2050, "loss": 0.7946, "lr": 4.902331180376529e-06, "epoch": 0.8975609756097561, "percentage": 8.98, "elapsed_time": "0:10:58", "remaining_time": "1:51:13"} -{"current_steps": 185, "total_steps": 2050, "loss": 0.9269, "lr": 4.901267946207038e-06, "epoch": 0.9024390243902439, "percentage": 9.02, "elapsed_time": "0:11:01", "remaining_time": "1:51:06"} -{"current_steps": 186, "total_steps": 2050, "loss": 0.7433, "lr": 4.900199072641937e-06, "epoch": 0.9073170731707317, "percentage": 9.07, "elapsed_time": "0:11:04", "remaining_time": "1:50:58"} -{"current_steps": 187, "total_steps": 2050, "loss": 0.6577, "lr": 4.899124562191484e-06, "epoch": 0.9121951219512195, "percentage": 9.12, "elapsed_time": "0:11:07", "remaining_time": "1:50:50"} -{"current_steps": 188, "total_steps": 2050, "loss": 0.5989, "lr": 4.8980444173791735e-06, "epoch": 0.9170731707317074, "percentage": 9.17, "elapsed_time": "0:11:09", "remaining_time": "1:50:29"} -{"current_steps": 189, "total_steps": 2050, "loss": 0.9364, "lr": 4.896958640741735e-06, "epoch": 0.9219512195121952, "percentage": 9.22, "elapsed_time": "0:11:11", "remaining_time": "1:50:11"} -{"current_steps": 190, "total_steps": 2050, "loss": 1.0328, "lr": 4.895867234829121e-06, "epoch": 0.926829268292683, "percentage": 9.27, "elapsed_time": "0:11:13", "remaining_time": "1:49:57"} -{"current_steps": 191, "total_steps": 2050, "loss": 0.772, "lr": 4.894770202204509e-06, "epoch": 0.9317073170731708, "percentage": 9.32, "elapsed_time": "0:11:15", "remaining_time": "1:49:38"} -{"current_steps": 192, "total_steps": 2050, "loss": 0.8128, "lr": 4.893667545444285e-06, "epoch": 0.9365853658536586, "percentage": 9.37, "elapsed_time": "0:11:19", "remaining_time": "1:49:32"} -{"current_steps": 193, "total_steps": 2050, "loss": 0.7418, "lr": 4.8925592671380495e-06, "epoch": 0.9414634146341463, "percentage": 9.41, "elapsed_time": "0:11:20", "remaining_time": "1:49:09"} -{"current_steps": 194, "total_steps": 2050, "loss": 0.5979, "lr": 4.891445369888601e-06, "epoch": 0.9463414634146341, "percentage": 9.46, "elapsed_time": "0:11:22", "remaining_time": "1:48:53"} -{"current_steps": 195, "total_steps": 2050, "loss": 0.9664, "lr": 4.890325856311936e-06, "epoch": 0.9512195121951219, "percentage": 9.51, "elapsed_time": "0:11:26", "remaining_time": "1:48:47"} -{"current_steps": 196, "total_steps": 2050, "loss": 0.8482, "lr": 4.889200729037241e-06, "epoch": 0.9560975609756097, "percentage": 9.56, "elapsed_time": "0:11:27", "remaining_time": "1:48:24"} -{"current_steps": 197, "total_steps": 2050, "loss": 0.7173, "lr": 4.888069990706884e-06, "epoch": 0.9609756097560975, "percentage": 9.61, "elapsed_time": "0:11:29", "remaining_time": "1:48:09"} -{"current_steps": 198, "total_steps": 2050, "loss": 0.5433, "lr": 4.886933643976414e-06, "epoch": 0.9658536585365853, "percentage": 9.66, "elapsed_time": "0:11:35", "remaining_time": "1:48:29"} -{"current_steps": 199, "total_steps": 2050, "loss": 0.5997, "lr": 4.885791691514548e-06, "epoch": 0.9707317073170731, "percentage": 9.71, "elapsed_time": "0:11:37", "remaining_time": "1:48:06"} -{"current_steps": 200, "total_steps": 2050, "loss": 0.6477, "lr": 4.884644136003172e-06, "epoch": 0.975609756097561, "percentage": 9.76, "elapsed_time": "0:11:41", "remaining_time": "1:48:08"} -{"current_steps": 201, "total_steps": 2050, "loss": 1.3465, "lr": 4.883490980137327e-06, "epoch": 0.9804878048780488, "percentage": 9.8, "elapsed_time": "0:11:44", "remaining_time": "1:47:57"} -{"current_steps": 202, "total_steps": 2050, "loss": 0.7533, "lr": 4.882332226625208e-06, "epoch": 0.9853658536585366, "percentage": 9.85, "elapsed_time": "0:11:48", "remaining_time": "1:47:59"} -{"current_steps": 203, "total_steps": 2050, "loss": 0.8555, "lr": 4.881167878188158e-06, "epoch": 0.9902439024390244, "percentage": 9.9, "elapsed_time": "0:11:49", "remaining_time": "1:47:39"} -{"current_steps": 204, "total_steps": 2050, "loss": 0.7634, "lr": 4.8799979375606565e-06, "epoch": 0.9951219512195122, "percentage": 9.95, "elapsed_time": "0:11:55", "remaining_time": "1:47:53"} -{"current_steps": 205, "total_steps": 2050, "loss": 0.66, "lr": 4.878822407490319e-06, "epoch": 1.0, "percentage": 10.0, "elapsed_time": "0:11:59", "remaining_time": "1:47:59"} -{"current_steps": 206, "total_steps": 2050, "loss": 0.7429, "lr": 4.8776412907378845e-06, "epoch": 1.0048780487804878, "percentage": 10.05, "elapsed_time": "0:16:09", "remaining_time": "2:24:40"} -{"current_steps": 207, "total_steps": 2050, "loss": 0.5735, "lr": 4.876454590077216e-06, "epoch": 1.0097560975609756, "percentage": 10.1, "elapsed_time": "0:16:13", "remaining_time": "2:24:23"} -{"current_steps": 208, "total_steps": 2050, "loss": 0.8065, "lr": 4.875262308295289e-06, "epoch": 1.0146341463414634, "percentage": 10.15, "elapsed_time": "0:16:16", "remaining_time": "2:24:05"} -{"current_steps": 209, "total_steps": 2050, "loss": 0.7148, "lr": 4.874064448192185e-06, "epoch": 1.0195121951219512, "percentage": 10.2, "elapsed_time": "0:16:19", "remaining_time": "2:23:48"} -{"current_steps": 210, "total_steps": 2050, "loss": 0.5606, "lr": 4.872861012581088e-06, "epoch": 1.024390243902439, "percentage": 10.24, "elapsed_time": "0:16:25", "remaining_time": "2:23:57"} -{"current_steps": 211, "total_steps": 2050, "loss": 0.6492, "lr": 4.871652004288275e-06, "epoch": 1.0292682926829269, "percentage": 10.29, "elapsed_time": "0:16:26", "remaining_time": "2:23:20"} -{"current_steps": 212, "total_steps": 2050, "loss": 0.633, "lr": 4.870437426153113e-06, "epoch": 1.0341463414634147, "percentage": 10.34, "elapsed_time": "0:16:27", "remaining_time": "2:22:43"} -{"current_steps": 213, "total_steps": 2050, "loss": 0.842, "lr": 4.869217281028045e-06, "epoch": 1.0390243902439025, "percentage": 10.39, "elapsed_time": "0:16:34", "remaining_time": "2:22:54"} -{"current_steps": 214, "total_steps": 2050, "loss": 0.8371, "lr": 4.867991571778592e-06, "epoch": 1.0439024390243903, "percentage": 10.44, "elapsed_time": "0:16:41", "remaining_time": "2:23:11"} -{"current_steps": 215, "total_steps": 2050, "loss": 0.4728, "lr": 4.866760301283342e-06, "epoch": 1.048780487804878, "percentage": 10.49, "elapsed_time": "0:16:44", "remaining_time": "2:22:56"} -{"current_steps": 216, "total_steps": 2050, "loss": 0.651, "lr": 4.865523472433942e-06, "epoch": 1.053658536585366, "percentage": 10.54, "elapsed_time": "0:16:49", "remaining_time": "2:22:54"} -{"current_steps": 217, "total_steps": 2050, "loss": 0.6361, "lr": 4.8642810881350935e-06, "epoch": 1.0585365853658537, "percentage": 10.59, "elapsed_time": "0:16:53", "remaining_time": "2:22:39"} -{"current_steps": 218, "total_steps": 2050, "loss": 0.6206, "lr": 4.863033151304546e-06, "epoch": 1.0634146341463415, "percentage": 10.63, "elapsed_time": "0:16:55", "remaining_time": "2:22:14"} -{"current_steps": 219, "total_steps": 2050, "loss": 0.7782, "lr": 4.861779664873088e-06, "epoch": 1.0682926829268293, "percentage": 10.68, "elapsed_time": "0:16:57", "remaining_time": "2:21:49"} -{"current_steps": 220, "total_steps": 2050, "loss": 0.8504, "lr": 4.8605206317845425e-06, "epoch": 1.0731707317073171, "percentage": 10.73, "elapsed_time": "0:17:04", "remaining_time": "2:21:59"} -{"current_steps": 221, "total_steps": 2050, "loss": 0.7771, "lr": 4.859256054995758e-06, "epoch": 1.078048780487805, "percentage": 10.78, "elapsed_time": "0:17:07", "remaining_time": "2:21:44"} -{"current_steps": 222, "total_steps": 2050, "loss": 0.4308, "lr": 4.8579859374766e-06, "epoch": 1.0829268292682928, "percentage": 10.83, "elapsed_time": "0:17:10", "remaining_time": "2:21:24"} -{"current_steps": 223, "total_steps": 2050, "loss": 0.3739, "lr": 4.856710282209952e-06, "epoch": 1.0878048780487806, "percentage": 10.88, "elapsed_time": "0:17:13", "remaining_time": "2:21:08"} -{"current_steps": 224, "total_steps": 2050, "loss": 0.6548, "lr": 4.855429092191698e-06, "epoch": 1.0926829268292684, "percentage": 10.93, "elapsed_time": "0:17:17", "remaining_time": "2:21:01"} -{"current_steps": 225, "total_steps": 2050, "loss": 0.6932, "lr": 4.854142370430725e-06, "epoch": 1.0975609756097562, "percentage": 10.98, "elapsed_time": "0:17:21", "remaining_time": "2:20:45"} -{"current_steps": 226, "total_steps": 2050, "loss": 0.6491, "lr": 4.8528501199489045e-06, "epoch": 1.102439024390244, "percentage": 11.02, "elapsed_time": "0:17:22", "remaining_time": "2:20:17"} -{"current_steps": 227, "total_steps": 2050, "loss": 0.7946, "lr": 4.851552343781099e-06, "epoch": 1.1073170731707318, "percentage": 11.07, "elapsed_time": "0:17:24", "remaining_time": "2:19:51"} -{"current_steps": 228, "total_steps": 2050, "loss": 0.7629, "lr": 4.850249044975145e-06, "epoch": 1.1121951219512196, "percentage": 11.12, "elapsed_time": "0:17:28", "remaining_time": "2:19:40"} -{"current_steps": 229, "total_steps": 2050, "loss": 0.9114, "lr": 4.848940226591849e-06, "epoch": 1.1170731707317074, "percentage": 11.17, "elapsed_time": "0:17:30", "remaining_time": "2:19:15"} -{"current_steps": 230, "total_steps": 2050, "loss": 0.535, "lr": 4.847625891704982e-06, "epoch": 1.1219512195121952, "percentage": 11.22, "elapsed_time": "0:17:32", "remaining_time": "2:18:49"} -{"current_steps": 231, "total_steps": 2050, "loss": 0.7134, "lr": 4.846306043401268e-06, "epoch": 1.126829268292683, "percentage": 11.27, "elapsed_time": "0:17:36", "remaining_time": "2:18:38"} -{"current_steps": 232, "total_steps": 2050, "loss": 0.5375, "lr": 4.844980684780381e-06, "epoch": 1.1317073170731708, "percentage": 11.32, "elapsed_time": "0:17:40", "remaining_time": "2:18:28"} -{"current_steps": 233, "total_steps": 2050, "loss": 0.5486, "lr": 4.8436498189549345e-06, "epoch": 1.1365853658536587, "percentage": 11.37, "elapsed_time": "0:17:41", "remaining_time": "2:17:59"} -{"current_steps": 234, "total_steps": 2050, "loss": 0.5203, "lr": 4.842313449050477e-06, "epoch": 1.1414634146341462, "percentage": 11.41, "elapsed_time": "0:17:45", "remaining_time": "2:17:45"} -{"current_steps": 235, "total_steps": 2050, "loss": 0.4978, "lr": 4.840971578205486e-06, "epoch": 1.146341463414634, "percentage": 11.46, "elapsed_time": "0:17:48", "remaining_time": "2:17:34"} -{"current_steps": 236, "total_steps": 2050, "loss": 0.348, "lr": 4.839624209571352e-06, "epoch": 1.1512195121951219, "percentage": 11.51, "elapsed_time": "0:17:51", "remaining_time": "2:17:14"} -{"current_steps": 237, "total_steps": 2050, "loss": 0.8068, "lr": 4.838271346312381e-06, "epoch": 1.1560975609756097, "percentage": 11.56, "elapsed_time": "0:17:57", "remaining_time": "2:17:24"} -{"current_steps": 238, "total_steps": 2050, "loss": 0.8823, "lr": 4.836912991605782e-06, "epoch": 1.1609756097560975, "percentage": 11.61, "elapsed_time": "0:18:03", "remaining_time": "2:17:28"} -{"current_steps": 239, "total_steps": 2050, "loss": 0.501, "lr": 4.835549148641663e-06, "epoch": 1.1658536585365853, "percentage": 11.66, "elapsed_time": "0:18:06", "remaining_time": "2:17:11"} -{"current_steps": 240, "total_steps": 2050, "loss": 0.6406, "lr": 4.834179820623018e-06, "epoch": 1.170731707317073, "percentage": 11.71, "elapsed_time": "0:18:08", "remaining_time": "2:16:47"} -{"current_steps": 241, "total_steps": 2050, "loss": 0.537, "lr": 4.832805010765724e-06, "epoch": 1.175609756097561, "percentage": 11.76, "elapsed_time": "0:18:13", "remaining_time": "2:16:48"} -{"current_steps": 242, "total_steps": 2050, "loss": 0.6464, "lr": 4.831424722298531e-06, "epoch": 1.1804878048780487, "percentage": 11.8, "elapsed_time": "0:18:20", "remaining_time": "2:17:02"} -{"current_steps": 243, "total_steps": 2050, "loss": 0.6888, "lr": 4.830038958463061e-06, "epoch": 1.1853658536585365, "percentage": 11.85, "elapsed_time": "0:18:26", "remaining_time": "2:17:05"} -{"current_steps": 244, "total_steps": 2050, "loss": 0.8342, "lr": 4.828647722513785e-06, "epoch": 1.1902439024390243, "percentage": 11.9, "elapsed_time": "0:18:29", "remaining_time": "2:16:51"} -{"current_steps": 245, "total_steps": 2050, "loss": 0.7849, "lr": 4.827251017718034e-06, "epoch": 1.1951219512195121, "percentage": 11.95, "elapsed_time": "0:18:33", "remaining_time": "2:16:40"} -{"current_steps": 246, "total_steps": 2050, "loss": 0.7995, "lr": 4.8258488473559794e-06, "epoch": 1.2, "percentage": 12.0, "elapsed_time": "0:18:35", "remaining_time": "2:16:17"} -{"current_steps": 247, "total_steps": 2050, "loss": 0.8718, "lr": 4.824441214720629e-06, "epoch": 1.2048780487804878, "percentage": 12.05, "elapsed_time": "0:18:38", "remaining_time": "2:16:04"} -{"current_steps": 248, "total_steps": 2050, "loss": 0.3731, "lr": 4.823028123117818e-06, "epoch": 1.2097560975609756, "percentage": 12.1, "elapsed_time": "0:18:43", "remaining_time": "2:16:01"} -{"current_steps": 249, "total_steps": 2050, "loss": 0.7321, "lr": 4.8216095758662015e-06, "epoch": 1.2146341463414634, "percentage": 12.15, "elapsed_time": "0:18:49", "remaining_time": "2:16:08"} -{"current_steps": 250, "total_steps": 2050, "loss": 0.7561, "lr": 4.82018557629725e-06, "epoch": 1.2195121951219512, "percentage": 12.2, "elapsed_time": "0:18:54", "remaining_time": "2:16:09"} -{"current_steps": 251, "total_steps": 2050, "loss": 0.638, "lr": 4.8187561277552376e-06, "epoch": 1.224390243902439, "percentage": 12.24, "elapsed_time": "0:19:00", "remaining_time": "2:16:13"} -{"current_steps": 252, "total_steps": 2050, "loss": 0.6996, "lr": 4.817321233597232e-06, "epoch": 1.2292682926829268, "percentage": 12.29, "elapsed_time": "0:19:04", "remaining_time": "2:16:04"} -{"current_steps": 253, "total_steps": 2050, "loss": 0.5432, "lr": 4.815880897193095e-06, "epoch": 1.2341463414634146, "percentage": 12.34, "elapsed_time": "0:19:10", "remaining_time": "2:16:13"} -{"current_steps": 254, "total_steps": 2050, "loss": 0.781, "lr": 4.814435121925466e-06, "epoch": 1.2390243902439024, "percentage": 12.39, "elapsed_time": "0:19:18", "remaining_time": "2:16:29"} -{"current_steps": 255, "total_steps": 2050, "loss": 0.6884, "lr": 4.812983911189761e-06, "epoch": 1.2439024390243902, "percentage": 12.44, "elapsed_time": "0:19:23", "remaining_time": "2:16:32"} -{"current_steps": 256, "total_steps": 2050, "loss": 0.4984, "lr": 4.811527268394157e-06, "epoch": 1.248780487804878, "percentage": 12.49, "elapsed_time": "0:19:26", "remaining_time": "2:16:12"} -{"current_steps": 257, "total_steps": 2050, "loss": 0.553, "lr": 4.810065196959591e-06, "epoch": 1.2536585365853659, "percentage": 12.54, "elapsed_time": "0:19:27", "remaining_time": "2:15:45"} -{"current_steps": 258, "total_steps": 2050, "loss": 0.7955, "lr": 4.8085977003197496e-06, "epoch": 1.2585365853658537, "percentage": 12.59, "elapsed_time": "0:19:30", "remaining_time": "2:15:32"} -{"current_steps": 259, "total_steps": 2050, "loss": 0.9715, "lr": 4.807124781921059e-06, "epoch": 1.2634146341463415, "percentage": 12.63, "elapsed_time": "0:19:33", "remaining_time": "2:15:14"} -{"current_steps": 260, "total_steps": 2050, "loss": 0.6306, "lr": 4.805646445222679e-06, "epoch": 1.2682926829268293, "percentage": 12.68, "elapsed_time": "0:19:39", "remaining_time": "2:15:17"} -{"current_steps": 261, "total_steps": 2050, "loss": 0.5192, "lr": 4.804162693696494e-06, "epoch": 1.273170731707317, "percentage": 12.73, "elapsed_time": "0:19:45", "remaining_time": "2:15:26"} -{"current_steps": 262, "total_steps": 2050, "loss": 0.5369, "lr": 4.802673530827105e-06, "epoch": 1.278048780487805, "percentage": 12.78, "elapsed_time": "0:19:48", "remaining_time": "2:15:11"} -{"current_steps": 263, "total_steps": 2050, "loss": 0.5864, "lr": 4.801178960111823e-06, "epoch": 1.2829268292682927, "percentage": 12.83, "elapsed_time": "0:19:51", "remaining_time": "2:14:54"} -{"current_steps": 264, "total_steps": 2050, "loss": 0.7864, "lr": 4.799678985060658e-06, "epoch": 1.2878048780487805, "percentage": 12.88, "elapsed_time": "0:19:55", "remaining_time": "2:14:45"} -{"current_steps": 265, "total_steps": 2050, "loss": 0.8198, "lr": 4.798173609196314e-06, "epoch": 1.2926829268292683, "percentage": 12.93, "elapsed_time": "0:19:59", "remaining_time": "2:14:42"} -{"current_steps": 266, "total_steps": 2050, "loss": 0.4621, "lr": 4.796662836054176e-06, "epoch": 1.2975609756097561, "percentage": 12.98, "elapsed_time": "0:20:02", "remaining_time": "2:14:24"} -{"current_steps": 267, "total_steps": 2050, "loss": 0.6237, "lr": 4.795146669182304e-06, "epoch": 1.302439024390244, "percentage": 13.02, "elapsed_time": "0:20:08", "remaining_time": "2:14:27"} -{"current_steps": 268, "total_steps": 2050, "loss": 0.4981, "lr": 4.793625112141431e-06, "epoch": 1.3073170731707318, "percentage": 13.07, "elapsed_time": "0:20:11", "remaining_time": "2:14:18"} -{"current_steps": 269, "total_steps": 2050, "loss": 0.5384, "lr": 4.792098168504943e-06, "epoch": 1.3121951219512196, "percentage": 13.12, "elapsed_time": "0:20:13", "remaining_time": "2:13:56"} -{"current_steps": 270, "total_steps": 2050, "loss": 0.5535, "lr": 4.790565841858879e-06, "epoch": 1.3170731707317074, "percentage": 13.17, "elapsed_time": "0:20:20", "remaining_time": "2:14:04"} -{"current_steps": 271, "total_steps": 2050, "loss": 0.7492, "lr": 4.789028135801919e-06, "epoch": 1.3219512195121952, "percentage": 13.22, "elapsed_time": "0:20:22", "remaining_time": "2:13:45"} -{"current_steps": 272, "total_steps": 2050, "loss": 0.8367, "lr": 4.787485053945377e-06, "epoch": 1.326829268292683, "percentage": 13.27, "elapsed_time": "0:20:24", "remaining_time": "2:13:22"} -{"current_steps": 273, "total_steps": 2050, "loss": 0.6875, "lr": 4.785936599913193e-06, "epoch": 1.3317073170731708, "percentage": 13.32, "elapsed_time": "0:20:29", "remaining_time": "2:13:24"} -{"current_steps": 274, "total_steps": 2050, "loss": 0.733, "lr": 4.784382777341922e-06, "epoch": 1.3365853658536586, "percentage": 13.37, "elapsed_time": "0:20:34", "remaining_time": "2:13:23"} -{"current_steps": 275, "total_steps": 2050, "loss": 0.9719, "lr": 4.782823589880729e-06, "epoch": 1.3414634146341464, "percentage": 13.41, "elapsed_time": "0:20:37", "remaining_time": "2:13:05"} -{"current_steps": 276, "total_steps": 2050, "loss": 0.6979, "lr": 4.7812590411913755e-06, "epoch": 1.346341463414634, "percentage": 13.46, "elapsed_time": "0:20:41", "remaining_time": "2:12:58"} -{"current_steps": 277, "total_steps": 2050, "loss": 0.9697, "lr": 4.779689134948217e-06, "epoch": 1.3512195121951218, "percentage": 13.51, "elapsed_time": "0:20:45", "remaining_time": "2:12:50"} -{"current_steps": 278, "total_steps": 2050, "loss": 0.4799, "lr": 4.77811387483819e-06, "epoch": 1.3560975609756096, "percentage": 13.56, "elapsed_time": "0:20:48", "remaining_time": "2:12:38"} -{"current_steps": 279, "total_steps": 2050, "loss": 0.7478, "lr": 4.776533264560804e-06, "epoch": 1.3609756097560974, "percentage": 13.61, "elapsed_time": "0:20:50", "remaining_time": "2:12:14"} -{"current_steps": 280, "total_steps": 2050, "loss": 0.8622, "lr": 4.774947307828134e-06, "epoch": 1.3658536585365852, "percentage": 13.66, "elapsed_time": "0:20:53", "remaining_time": "2:12:06"} -{"current_steps": 281, "total_steps": 2050, "loss": 0.5792, "lr": 4.773356008364812e-06, "epoch": 1.370731707317073, "percentage": 13.71, "elapsed_time": "0:20:58", "remaining_time": "2:12:03"} -{"current_steps": 282, "total_steps": 2050, "loss": 0.4368, "lr": 4.771759369908017e-06, "epoch": 1.3756097560975609, "percentage": 13.76, "elapsed_time": "0:21:00", "remaining_time": "2:11:41"} -{"current_steps": 283, "total_steps": 2050, "loss": 0.6337, "lr": 4.7701573962074635e-06, "epoch": 1.3804878048780487, "percentage": 13.8, "elapsed_time": "0:21:02", "remaining_time": "2:11:21"} -{"current_steps": 284, "total_steps": 2050, "loss": 0.5042, "lr": 4.7685500910254015e-06, "epoch": 1.3853658536585365, "percentage": 13.85, "elapsed_time": "0:21:07", "remaining_time": "2:11:19"} -{"current_steps": 285, "total_steps": 2050, "loss": 0.7427, "lr": 4.766937458136598e-06, "epoch": 1.3902439024390243, "percentage": 13.9, "elapsed_time": "0:21:12", "remaining_time": "2:11:21"} -{"current_steps": 286, "total_steps": 2050, "loss": 0.6956, "lr": 4.765319501328332e-06, "epoch": 1.395121951219512, "percentage": 13.95, "elapsed_time": "0:21:15", "remaining_time": "2:11:08"} -{"current_steps": 287, "total_steps": 2050, "loss": 0.5152, "lr": 4.763696224400391e-06, "epoch": 1.4, "percentage": 14.0, "elapsed_time": "0:21:18", "remaining_time": "2:10:55"} -{"current_steps": 288, "total_steps": 2050, "loss": 0.5583, "lr": 4.762067631165049e-06, "epoch": 1.4048780487804877, "percentage": 14.05, "elapsed_time": "0:21:22", "remaining_time": "2:10:48"} -{"current_steps": 289, "total_steps": 2050, "loss": 0.6824, "lr": 4.760433725447071e-06, "epoch": 1.4097560975609755, "percentage": 14.1, "elapsed_time": "0:21:28", "remaining_time": "2:10:50"} -{"current_steps": 290, "total_steps": 2050, "loss": 0.7914, "lr": 4.758794511083697e-06, "epoch": 1.4146341463414633, "percentage": 14.15, "elapsed_time": "0:21:29", "remaining_time": "2:10:27"} -{"current_steps": 291, "total_steps": 2050, "loss": 0.6827, "lr": 4.757149991924633e-06, "epoch": 1.4195121951219511, "percentage": 14.2, "elapsed_time": "0:21:31", "remaining_time": "2:10:06"} -{"current_steps": 292, "total_steps": 2050, "loss": 0.5908, "lr": 4.755500171832045e-06, "epoch": 1.424390243902439, "percentage": 14.24, "elapsed_time": "0:21:34", "remaining_time": "2:09:54"} -{"current_steps": 293, "total_steps": 2050, "loss": 0.6469, "lr": 4.753845054680548e-06, "epoch": 1.4292682926829268, "percentage": 14.29, "elapsed_time": "0:21:39", "remaining_time": "2:09:51"} -{"current_steps": 294, "total_steps": 2050, "loss": 0.5412, "lr": 4.752184644357197e-06, "epoch": 1.4341463414634146, "percentage": 14.34, "elapsed_time": "0:21:41", "remaining_time": "2:09:35"} -{"current_steps": 295, "total_steps": 2050, "loss": 0.5324, "lr": 4.750518944761477e-06, "epoch": 1.4390243902439024, "percentage": 14.39, "elapsed_time": "0:21:46", "remaining_time": "2:09:35"} -{"current_steps": 296, "total_steps": 2050, "loss": 0.5317, "lr": 4.748847959805297e-06, "epoch": 1.4439024390243902, "percentage": 14.44, "elapsed_time": "0:21:50", "remaining_time": "2:09:24"} -{"current_steps": 297, "total_steps": 2050, "loss": 0.5199, "lr": 4.7471716934129774e-06, "epoch": 1.448780487804878, "percentage": 14.49, "elapsed_time": "0:21:54", "remaining_time": "2:09:18"} -{"current_steps": 298, "total_steps": 2050, "loss": 0.4874, "lr": 4.745490149521242e-06, "epoch": 1.4536585365853658, "percentage": 14.54, "elapsed_time": "0:21:56", "remaining_time": "2:09:01"} -{"current_steps": 299, "total_steps": 2050, "loss": 0.5416, "lr": 4.743803332079209e-06, "epoch": 1.4585365853658536, "percentage": 14.59, "elapsed_time": "0:21:59", "remaining_time": "2:08:48"} -{"current_steps": 300, "total_steps": 2050, "loss": 0.5628, "lr": 4.742111245048382e-06, "epoch": 1.4634146341463414, "percentage": 14.63, "elapsed_time": "0:22:05", "remaining_time": "2:08:53"} -{"current_steps": 301, "total_steps": 2050, "loss": 0.5847, "lr": 4.740413892402639e-06, "epoch": 1.4682926829268292, "percentage": 14.68, "elapsed_time": "0:22:09", "remaining_time": "2:08:42"} -{"current_steps": 302, "total_steps": 2050, "loss": 0.5889, "lr": 4.738711278128228e-06, "epoch": 1.473170731707317, "percentage": 14.73, "elapsed_time": "0:22:11", "remaining_time": "2:08:27"} -{"current_steps": 303, "total_steps": 2050, "loss": 0.3917, "lr": 4.7370034062237476e-06, "epoch": 1.4780487804878049, "percentage": 14.78, "elapsed_time": "0:22:12", "remaining_time": "2:08:02"} -{"current_steps": 304, "total_steps": 2050, "loss": 0.5592, "lr": 4.73529028070015e-06, "epoch": 1.4829268292682927, "percentage": 14.83, "elapsed_time": "0:22:15", "remaining_time": "2:07:51"} -{"current_steps": 305, "total_steps": 2050, "loss": 0.843, "lr": 4.733571905580723e-06, "epoch": 1.4878048780487805, "percentage": 14.88, "elapsed_time": "0:22:18", "remaining_time": "2:07:37"} -{"current_steps": 306, "total_steps": 2050, "loss": 0.7041, "lr": 4.731848284901082e-06, "epoch": 1.4926829268292683, "percentage": 14.93, "elapsed_time": "0:22:20", "remaining_time": "2:07:22"} -{"current_steps": 307, "total_steps": 2050, "loss": 0.4914, "lr": 4.730119422709165e-06, "epoch": 1.497560975609756, "percentage": 14.98, "elapsed_time": "0:22:22", "remaining_time": "2:07:03"} -{"current_steps": 308, "total_steps": 2050, "loss": 0.644, "lr": 4.728385323065215e-06, "epoch": 1.502439024390244, "percentage": 15.02, "elapsed_time": "0:22:25", "remaining_time": "2:06:50"} -{"current_steps": 309, "total_steps": 2050, "loss": 0.5335, "lr": 4.7266459900417815e-06, "epoch": 1.5073170731707317, "percentage": 15.07, "elapsed_time": "0:22:28", "remaining_time": "2:06:40"} -{"current_steps": 310, "total_steps": 2050, "loss": 0.8275, "lr": 4.724901427723698e-06, "epoch": 1.5121951219512195, "percentage": 15.12, "elapsed_time": "0:22:31", "remaining_time": "2:06:26"} -{"current_steps": 311, "total_steps": 2050, "loss": 0.4091, "lr": 4.723151640208084e-06, "epoch": 1.5170731707317073, "percentage": 15.17, "elapsed_time": "0:22:32", "remaining_time": "2:06:01"} -{"current_steps": 312, "total_steps": 2050, "loss": 0.4414, "lr": 4.721396631604327e-06, "epoch": 1.5219512195121951, "percentage": 15.22, "elapsed_time": "0:22:35", "remaining_time": "2:05:52"} -{"current_steps": 313, "total_steps": 2050, "loss": 0.5423, "lr": 4.7196364060340785e-06, "epoch": 1.526829268292683, "percentage": 15.27, "elapsed_time": "0:22:39", "remaining_time": "2:05:42"} -{"current_steps": 314, "total_steps": 2050, "loss": 0.8072, "lr": 4.7178709676312416e-06, "epoch": 1.5317073170731708, "percentage": 15.32, "elapsed_time": "0:22:42", "remaining_time": "2:05:32"} -{"current_steps": 315, "total_steps": 2050, "loss": 1.0254, "lr": 4.716100320541961e-06, "epoch": 1.5365853658536586, "percentage": 15.37, "elapsed_time": "0:22:45", "remaining_time": "2:05:23"} -{"current_steps": 316, "total_steps": 2050, "loss": 0.6541, "lr": 4.714324468924614e-06, "epoch": 1.5414634146341464, "percentage": 15.41, "elapsed_time": "0:22:49", "remaining_time": "2:05:14"} -{"current_steps": 317, "total_steps": 2050, "loss": 0.7519, "lr": 4.712543416949803e-06, "epoch": 1.5463414634146342, "percentage": 15.46, "elapsed_time": "0:22:52", "remaining_time": "2:05:03"} -{"current_steps": 318, "total_steps": 2050, "loss": 0.7232, "lr": 4.71075716880034e-06, "epoch": 1.551219512195122, "percentage": 15.51, "elapsed_time": "0:22:56", "remaining_time": "2:04:59"} -{"current_steps": 319, "total_steps": 2050, "loss": 0.8059, "lr": 4.708965728671243e-06, "epoch": 1.5560975609756098, "percentage": 15.56, "elapsed_time": "0:22:59", "remaining_time": "2:04:48"} -{"current_steps": 320, "total_steps": 2050, "loss": 0.6579, "lr": 4.7071691007697214e-06, "epoch": 1.5609756097560976, "percentage": 15.61, "elapsed_time": "0:23:03", "remaining_time": "2:04:39"} -{"current_steps": 321, "total_steps": 2050, "loss": 0.6989, "lr": 4.705367289315172e-06, "epoch": 1.5658536585365854, "percentage": 15.66, "elapsed_time": "0:23:06", "remaining_time": "2:04:29"} -{"current_steps": 322, "total_steps": 2050, "loss": 0.4916, "lr": 4.703560298539158e-06, "epoch": 1.5707317073170732, "percentage": 15.71, "elapsed_time": "0:23:10", "remaining_time": "2:04:19"} -{"current_steps": 323, "total_steps": 2050, "loss": 0.5076, "lr": 4.701748132685415e-06, "epoch": 1.575609756097561, "percentage": 15.76, "elapsed_time": "0:23:16", "remaining_time": "2:04:24"} -{"current_steps": 324, "total_steps": 2050, "loss": 0.559, "lr": 4.699930796009825e-06, "epoch": 1.5804878048780489, "percentage": 15.8, "elapsed_time": "0:23:20", "remaining_time": "2:04:21"} -{"current_steps": 325, "total_steps": 2050, "loss": 0.7388, "lr": 4.698108292780418e-06, "epoch": 1.5853658536585367, "percentage": 15.85, "elapsed_time": "0:23:22", "remaining_time": "2:04:01"} -{"current_steps": 326, "total_steps": 2050, "loss": 0.5469, "lr": 4.696280627277356e-06, "epoch": 1.5902439024390245, "percentage": 15.9, "elapsed_time": "0:23:24", "remaining_time": "2:03:47"} -{"current_steps": 327, "total_steps": 2050, "loss": 0.5494, "lr": 4.6944478037929255e-06, "epoch": 1.5951219512195123, "percentage": 15.95, "elapsed_time": "0:23:27", "remaining_time": "2:03:38"} -{"current_steps": 328, "total_steps": 2050, "loss": 0.7536, "lr": 4.692609826631525e-06, "epoch": 1.6, "percentage": 16.0, "elapsed_time": "0:23:31", "remaining_time": "2:03:28"} -{"current_steps": 329, "total_steps": 2050, "loss": 0.4586, "lr": 4.690766700109659e-06, "epoch": 1.604878048780488, "percentage": 16.05, "elapsed_time": "0:23:33", "remaining_time": "2:03:13"} -{"current_steps": 330, "total_steps": 2050, "loss": 0.4799, "lr": 4.6889184285559234e-06, "epoch": 1.6097560975609757, "percentage": 16.1, "elapsed_time": "0:23:39", "remaining_time": "2:03:16"} -{"current_steps": 331, "total_steps": 2050, "loss": 0.7502, "lr": 4.687065016310996e-06, "epoch": 1.6146341463414635, "percentage": 16.15, "elapsed_time": "0:23:41", "remaining_time": "2:03:00"} -{"current_steps": 332, "total_steps": 2050, "loss": 0.5923, "lr": 4.685206467727631e-06, "epoch": 1.6195121951219513, "percentage": 16.2, "elapsed_time": "0:23:43", "remaining_time": "2:02:45"} -{"current_steps": 333, "total_steps": 2050, "loss": 0.5619, "lr": 4.683342787170644e-06, "epoch": 1.6243902439024391, "percentage": 16.24, "elapsed_time": "0:23:49", "remaining_time": "2:02:51"} -{"current_steps": 334, "total_steps": 2050, "loss": 0.55, "lr": 4.6814739790169006e-06, "epoch": 1.629268292682927, "percentage": 16.29, "elapsed_time": "0:23:51", "remaining_time": "2:02:35"} -{"current_steps": 335, "total_steps": 2050, "loss": 0.7243, "lr": 4.679600047655313e-06, "epoch": 1.6341463414634148, "percentage": 16.34, "elapsed_time": "0:23:53", "remaining_time": "2:02:19"} -{"current_steps": 336, "total_steps": 2050, "loss": 1.132, "lr": 4.6777209974868194e-06, "epoch": 1.6390243902439026, "percentage": 16.39, "elapsed_time": "0:23:55", "remaining_time": "2:02:04"} -{"current_steps": 337, "total_steps": 2050, "loss": 0.55, "lr": 4.675836832924387e-06, "epoch": 1.6439024390243904, "percentage": 16.44, "elapsed_time": "0:24:00", "remaining_time": "2:02:00"} -{"current_steps": 338, "total_steps": 2050, "loss": 0.4418, "lr": 4.673947558392989e-06, "epoch": 1.6487804878048782, "percentage": 16.49, "elapsed_time": "0:24:07", "remaining_time": "2:02:11"} -{"current_steps": 339, "total_steps": 2050, "loss": 0.5897, "lr": 4.6720531783296e-06, "epoch": 1.653658536585366, "percentage": 16.54, "elapsed_time": "0:24:15", "remaining_time": "2:02:23"} -{"current_steps": 340, "total_steps": 2050, "loss": 0.5889, "lr": 4.670153697183185e-06, "epoch": 1.6585365853658538, "percentage": 16.59, "elapsed_time": "0:24:17", "remaining_time": "2:02:09"} -{"current_steps": 341, "total_steps": 2050, "loss": 0.5607, "lr": 4.668249119414692e-06, "epoch": 1.6634146341463416, "percentage": 16.63, "elapsed_time": "0:24:19", "remaining_time": "2:01:54"} -{"current_steps": 342, "total_steps": 2050, "loss": 0.6284, "lr": 4.666339449497033e-06, "epoch": 1.6682926829268294, "percentage": 16.68, "elapsed_time": "0:24:21", "remaining_time": "2:01:37"} -{"current_steps": 343, "total_steps": 2050, "loss": 0.5751, "lr": 4.664424691915084e-06, "epoch": 1.6731707317073172, "percentage": 16.73, "elapsed_time": "0:24:22", "remaining_time": "2:01:19"} -{"current_steps": 344, "total_steps": 2050, "loss": 0.586, "lr": 4.6625048511656675e-06, "epoch": 1.678048780487805, "percentage": 16.78, "elapsed_time": "0:24:24", "remaining_time": "2:01:00"} -{"current_steps": 345, "total_steps": 2050, "loss": 0.5086, "lr": 4.660579931757543e-06, "epoch": 1.6829268292682928, "percentage": 16.83, "elapsed_time": "0:24:25", "remaining_time": "2:00:42"} -{"current_steps": 346, "total_steps": 2050, "loss": 0.5934, "lr": 4.6586499382113985e-06, "epoch": 1.6878048780487804, "percentage": 16.88, "elapsed_time": "0:24:31", "remaining_time": "2:00:45"} -{"current_steps": 347, "total_steps": 2050, "loss": 0.7654, "lr": 4.6567148750598375e-06, "epoch": 1.6926829268292682, "percentage": 16.93, "elapsed_time": "0:24:34", "remaining_time": "2:00:38"} -{"current_steps": 348, "total_steps": 2050, "loss": 0.8908, "lr": 4.6547747468473705e-06, "epoch": 1.697560975609756, "percentage": 16.98, "elapsed_time": "0:24:37", "remaining_time": "2:00:24"} -{"current_steps": 349, "total_steps": 2050, "loss": 0.4383, "lr": 4.652829558130404e-06, "epoch": 1.7024390243902439, "percentage": 17.02, "elapsed_time": "0:24:38", "remaining_time": "2:00:07"} -{"current_steps": 350, "total_steps": 2050, "loss": 0.6031, "lr": 4.6508793134772265e-06, "epoch": 1.7073170731707317, "percentage": 17.07, "elapsed_time": "0:24:40", "remaining_time": "1:59:51"} -{"current_steps": 351, "total_steps": 2050, "loss": 0.533, "lr": 4.648924017468003e-06, "epoch": 1.7121951219512195, "percentage": 17.12, "elapsed_time": "0:24:46", "remaining_time": "1:59:55"} -{"current_steps": 352, "total_steps": 2050, "loss": 0.8125, "lr": 4.646963674694761e-06, "epoch": 1.7170731707317073, "percentage": 17.17, "elapsed_time": "0:24:50", "remaining_time": "1:59:51"} -{"current_steps": 353, "total_steps": 2050, "loss": 0.386, "lr": 4.64499828976138e-06, "epoch": 1.721951219512195, "percentage": 17.22, "elapsed_time": "0:24:54", "remaining_time": "1:59:42"} -{"current_steps": 354, "total_steps": 2050, "loss": 0.4792, "lr": 4.64302786728358e-06, "epoch": 1.726829268292683, "percentage": 17.27, "elapsed_time": "0:24:55", "remaining_time": "1:59:23"} -{"current_steps": 355, "total_steps": 2050, "loss": 0.5031, "lr": 4.641052411888913e-06, "epoch": 1.7317073170731707, "percentage": 17.32, "elapsed_time": "0:24:58", "remaining_time": "1:59:13"} -{"current_steps": 356, "total_steps": 2050, "loss": 0.4726, "lr": 4.6390719282167515e-06, "epoch": 1.7365853658536585, "percentage": 17.37, "elapsed_time": "0:24:59", "remaining_time": "1:58:53"} -{"current_steps": 357, "total_steps": 2050, "loss": 0.7794, "lr": 4.637086420918276e-06, "epoch": 1.7414634146341463, "percentage": 17.41, "elapsed_time": "0:25:01", "remaining_time": "1:58:40"} -{"current_steps": 358, "total_steps": 2050, "loss": 0.6202, "lr": 4.635095894656465e-06, "epoch": 1.7463414634146341, "percentage": 17.46, "elapsed_time": "0:25:04", "remaining_time": "1:58:32"} -{"current_steps": 359, "total_steps": 2050, "loss": 0.3743, "lr": 4.633100354106085e-06, "epoch": 1.751219512195122, "percentage": 17.51, "elapsed_time": "0:25:07", "remaining_time": "1:58:21"} -{"current_steps": 360, "total_steps": 2050, "loss": 0.8143, "lr": 4.631099803953677e-06, "epoch": 1.7560975609756098, "percentage": 17.56, "elapsed_time": "0:25:09", "remaining_time": "1:58:06"} -{"current_steps": 361, "total_steps": 2050, "loss": 0.4986, "lr": 4.629094248897546e-06, "epoch": 1.7609756097560976, "percentage": 17.61, "elapsed_time": "0:25:11", "remaining_time": "1:57:52"} -{"current_steps": 362, "total_steps": 2050, "loss": 0.5833, "lr": 4.627083693647757e-06, "epoch": 1.7658536585365854, "percentage": 17.66, "elapsed_time": "0:25:14", "remaining_time": "1:57:41"} -{"current_steps": 363, "total_steps": 2050, "loss": 0.885, "lr": 4.625068142926111e-06, "epoch": 1.7707317073170732, "percentage": 17.71, "elapsed_time": "0:25:17", "remaining_time": "1:57:33"} -{"current_steps": 364, "total_steps": 2050, "loss": 0.7351, "lr": 4.623047601466144e-06, "epoch": 1.775609756097561, "percentage": 17.76, "elapsed_time": "0:25:21", "remaining_time": "1:57:25"} -{"current_steps": 365, "total_steps": 2050, "loss": 0.6426, "lr": 4.621022074013114e-06, "epoch": 1.7804878048780488, "percentage": 17.8, "elapsed_time": "0:25:27", "remaining_time": "1:57:30"} -{"current_steps": 366, "total_steps": 2050, "loss": 0.5588, "lr": 4.618991565323987e-06, "epoch": 1.7853658536585366, "percentage": 17.85, "elapsed_time": "0:25:28", "remaining_time": "1:57:12"} -{"current_steps": 367, "total_steps": 2050, "loss": 0.5424, "lr": 4.616956080167426e-06, "epoch": 1.7902439024390244, "percentage": 17.9, "elapsed_time": "0:25:32", "remaining_time": "1:57:09"} -{"current_steps": 368, "total_steps": 2050, "loss": 0.8664, "lr": 4.614915623323786e-06, "epoch": 1.7951219512195122, "percentage": 17.95, "elapsed_time": "0:25:36", "remaining_time": "1:57:00"} -{"current_steps": 369, "total_steps": 2050, "loss": 0.4495, "lr": 4.612870199585092e-06, "epoch": 1.8, "percentage": 18.0, "elapsed_time": "0:25:38", "remaining_time": "1:56:49"} -{"current_steps": 370, "total_steps": 2050, "loss": 0.5099, "lr": 4.610819813755038e-06, "epoch": 1.8048780487804879, "percentage": 18.05, "elapsed_time": "0:25:40", "remaining_time": "1:56:35"} -{"current_steps": 371, "total_steps": 2050, "loss": 0.4322, "lr": 4.608764470648971e-06, "epoch": 1.8097560975609757, "percentage": 18.1, "elapsed_time": "0:25:45", "remaining_time": "1:56:33"} -{"current_steps": 372, "total_steps": 2050, "loss": 0.4744, "lr": 4.606704175093879e-06, "epoch": 1.8146341463414632, "percentage": 18.15, "elapsed_time": "0:25:46", "remaining_time": "1:56:16"} -{"current_steps": 373, "total_steps": 2050, "loss": 0.797, "lr": 4.604638931928383e-06, "epoch": 1.819512195121951, "percentage": 18.2, "elapsed_time": "0:25:48", "remaining_time": "1:56:00"} -{"current_steps": 374, "total_steps": 2050, "loss": 0.4904, "lr": 4.602568746002718e-06, "epoch": 1.8243902439024389, "percentage": 18.24, "elapsed_time": "0:25:52", "remaining_time": "1:55:55"} -{"current_steps": 375, "total_steps": 2050, "loss": 0.8682, "lr": 4.600493622178734e-06, "epoch": 1.8292682926829267, "percentage": 18.29, "elapsed_time": "0:25:54", "remaining_time": "1:55:45"} -{"current_steps": 376, "total_steps": 2050, "loss": 0.5426, "lr": 4.598413565329876e-06, "epoch": 1.8341463414634145, "percentage": 18.34, "elapsed_time": "0:25:58", "remaining_time": "1:55:39"} -{"current_steps": 377, "total_steps": 2050, "loss": 0.5628, "lr": 4.596328580341169e-06, "epoch": 1.8390243902439023, "percentage": 18.39, "elapsed_time": "0:26:01", "remaining_time": "1:55:30"} -{"current_steps": 378, "total_steps": 2050, "loss": 0.7073, "lr": 4.5942386721092195e-06, "epoch": 1.84390243902439, "percentage": 18.44, "elapsed_time": "0:26:03", "remaining_time": "1:55:17"} -{"current_steps": 379, "total_steps": 2050, "loss": 0.6526, "lr": 4.592143845542189e-06, "epoch": 1.848780487804878, "percentage": 18.49, "elapsed_time": "0:26:05", "remaining_time": "1:55:00"} -{"current_steps": 380, "total_steps": 2050, "loss": 0.8377, "lr": 4.590044105559797e-06, "epoch": 1.8536585365853657, "percentage": 18.54, "elapsed_time": "0:26:07", "remaining_time": "1:54:48"} -{"current_steps": 381, "total_steps": 2050, "loss": 0.5485, "lr": 4.587939457093296e-06, "epoch": 1.8585365853658535, "percentage": 18.59, "elapsed_time": "0:26:13", "remaining_time": "1:54:51"} -{"current_steps": 382, "total_steps": 2050, "loss": 0.6065, "lr": 4.585829905085468e-06, "epoch": 1.8634146341463413, "percentage": 18.63, "elapsed_time": "0:26:16", "remaining_time": "1:54:42"} -{"current_steps": 383, "total_steps": 2050, "loss": 0.7812, "lr": 4.5837154544906135e-06, "epoch": 1.8682926829268292, "percentage": 18.68, "elapsed_time": "0:26:18", "remaining_time": "1:54:28"} -{"current_steps": 384, "total_steps": 2050, "loss": 0.7061, "lr": 4.581596110274535e-06, "epoch": 1.873170731707317, "percentage": 18.73, "elapsed_time": "0:26:24", "remaining_time": "1:54:35"} -{"current_steps": 385, "total_steps": 2050, "loss": 0.9446, "lr": 4.579471877414527e-06, "epoch": 1.8780487804878048, "percentage": 18.78, "elapsed_time": "0:26:28", "remaining_time": "1:54:27"} -{"current_steps": 386, "total_steps": 2050, "loss": 0.78, "lr": 4.577342760899368e-06, "epoch": 1.8829268292682926, "percentage": 18.83, "elapsed_time": "0:26:28", "remaining_time": "1:54:09"} -{"current_steps": 387, "total_steps": 2050, "loss": 0.5205, "lr": 4.575208765729302e-06, "epoch": 1.8878048780487804, "percentage": 18.88, "elapsed_time": "0:26:31", "remaining_time": "1:54:00"} -{"current_steps": 388, "total_steps": 2050, "loss": 0.7827, "lr": 4.573069896916035e-06, "epoch": 1.8926829268292682, "percentage": 18.93, "elapsed_time": "0:26:38", "remaining_time": "1:54:07"} -{"current_steps": 389, "total_steps": 2050, "loss": 0.6512, "lr": 4.5709261594827125e-06, "epoch": 1.897560975609756, "percentage": 18.98, "elapsed_time": "0:26:40", "remaining_time": "1:53:56"} -{"current_steps": 390, "total_steps": 2050, "loss": 0.5548, "lr": 4.568777558463922e-06, "epoch": 1.9024390243902438, "percentage": 19.02, "elapsed_time": "0:26:47", "remaining_time": "1:54:03"} -{"current_steps": 391, "total_steps": 2050, "loss": 0.7038, "lr": 4.566624098905665e-06, "epoch": 1.9073170731707316, "percentage": 19.07, "elapsed_time": "0:26:50", "remaining_time": "1:53:51"} -{"current_steps": 392, "total_steps": 2050, "loss": 0.5416, "lr": 4.564465785865359e-06, "epoch": 1.9121951219512194, "percentage": 19.12, "elapsed_time": "0:26:51", "remaining_time": "1:53:35"} -{"current_steps": 393, "total_steps": 2050, "loss": 0.4068, "lr": 4.56230262441182e-06, "epoch": 1.9170731707317072, "percentage": 19.17, "elapsed_time": "0:26:55", "remaining_time": "1:53:30"} -{"current_steps": 394, "total_steps": 2050, "loss": 0.6197, "lr": 4.560134619625247e-06, "epoch": 1.921951219512195, "percentage": 19.22, "elapsed_time": "0:27:00", "remaining_time": "1:53:30"} -{"current_steps": 395, "total_steps": 2050, "loss": 0.5692, "lr": 4.5579617765972155e-06, "epoch": 1.9268292682926829, "percentage": 19.27, "elapsed_time": "0:27:02", "remaining_time": "1:53:18"} -{"current_steps": 396, "total_steps": 2050, "loss": 0.4836, "lr": 4.555784100430662e-06, "epoch": 1.9317073170731707, "percentage": 19.32, "elapsed_time": "0:27:04", "remaining_time": "1:53:06"} -{"current_steps": 397, "total_steps": 2050, "loss": 0.4594, "lr": 4.553601596239877e-06, "epoch": 1.9365853658536585, "percentage": 19.37, "elapsed_time": "0:27:07", "remaining_time": "1:52:58"} -{"current_steps": 398, "total_steps": 2050, "loss": 0.6053, "lr": 4.551414269150489e-06, "epoch": 1.9414634146341463, "percentage": 19.41, "elapsed_time": "0:27:11", "remaining_time": "1:52:50"} -{"current_steps": 399, "total_steps": 2050, "loss": 0.5133, "lr": 4.54922212429945e-06, "epoch": 1.946341463414634, "percentage": 19.46, "elapsed_time": "0:27:16", "remaining_time": "1:52:51"} -{"current_steps": 400, "total_steps": 2050, "loss": 0.6984, "lr": 4.547025166835027e-06, "epoch": 1.951219512195122, "percentage": 19.51, "elapsed_time": "0:27:22", "remaining_time": "1:52:53"} -{"current_steps": 401, "total_steps": 2050, "loss": 0.7944, "lr": 4.544823401916794e-06, "epoch": 1.9560975609756097, "percentage": 19.56, "elapsed_time": "0:27:25", "remaining_time": "1:52:47"} -{"current_steps": 402, "total_steps": 2050, "loss": 0.639, "lr": 4.542616834715612e-06, "epoch": 1.9609756097560975, "percentage": 19.61, "elapsed_time": "0:27:28", "remaining_time": "1:52:39"} -{"current_steps": 403, "total_steps": 2050, "loss": 0.4229, "lr": 4.540405470413618e-06, "epoch": 1.9658536585365853, "percentage": 19.66, "elapsed_time": "0:27:31", "remaining_time": "1:52:31"} -{"current_steps": 404, "total_steps": 2050, "loss": 0.7482, "lr": 4.53818931420422e-06, "epoch": 1.9707317073170731, "percentage": 19.71, "elapsed_time": "0:27:35", "remaining_time": "1:52:24"} -{"current_steps": 405, "total_steps": 2050, "loss": 0.6146, "lr": 4.535968371292076e-06, "epoch": 1.975609756097561, "percentage": 19.76, "elapsed_time": "0:27:39", "remaining_time": "1:52:18"} -{"current_steps": 406, "total_steps": 2050, "loss": 0.6964, "lr": 4.533742646893086e-06, "epoch": 1.9804878048780488, "percentage": 19.8, "elapsed_time": "0:27:44", "remaining_time": "1:52:21"} -{"current_steps": 407, "total_steps": 2050, "loss": 0.6248, "lr": 4.531512146234383e-06, "epoch": 1.9853658536585366, "percentage": 19.85, "elapsed_time": "0:27:47", "remaining_time": "1:52:11"} -{"current_steps": 408, "total_steps": 2050, "loss": 0.8715, "lr": 4.529276874554312e-06, "epoch": 1.9902439024390244, "percentage": 19.9, "elapsed_time": "0:27:51", "remaining_time": "1:52:07"} -{"current_steps": 409, "total_steps": 2050, "loss": 0.4945, "lr": 4.527036837102426e-06, "epoch": 1.9951219512195122, "percentage": 19.95, "elapsed_time": "0:27:55", "remaining_time": "1:52:03"} -{"current_steps": 410, "total_steps": 2050, "loss": 0.7085, "lr": 4.524792039139471e-06, "epoch": 2.0, "percentage": 20.0, "elapsed_time": "0:28:01", "remaining_time": "1:52:04"} -{"current_steps": 411, "total_steps": 2050, "loss": 0.3178, "lr": 4.522542485937369e-06, "epoch": 2.004878048780488, "percentage": 20.05, "elapsed_time": "0:32:06", "remaining_time": "2:08:01"} -{"current_steps": 412, "total_steps": 2050, "loss": 0.5092, "lr": 4.520288182779214e-06, "epoch": 2.0097560975609756, "percentage": 20.1, "elapsed_time": "0:32:09", "remaining_time": "2:07:51"} -{"current_steps": 413, "total_steps": 2050, "loss": 0.314, "lr": 4.518029134959253e-06, "epoch": 2.0146341463414634, "percentage": 20.15, "elapsed_time": "0:32:13", "remaining_time": "2:07:43"} -{"current_steps": 414, "total_steps": 2050, "loss": 0.5287, "lr": 4.515765347782878e-06, "epoch": 2.0195121951219512, "percentage": 20.2, "elapsed_time": "0:32:18", "remaining_time": "2:07:41"} -{"current_steps": 415, "total_steps": 2050, "loss": 0.8221, "lr": 4.5134968265666085e-06, "epoch": 2.024390243902439, "percentage": 20.24, "elapsed_time": "0:32:21", "remaining_time": "2:07:29"} -{"current_steps": 416, "total_steps": 2050, "loss": 0.5402, "lr": 4.511223576638084e-06, "epoch": 2.029268292682927, "percentage": 20.29, "elapsed_time": "0:32:24", "remaining_time": "2:07:19"} -{"current_steps": 417, "total_steps": 2050, "loss": 0.617, "lr": 4.508945603336049e-06, "epoch": 2.0341463414634147, "percentage": 20.34, "elapsed_time": "0:32:28", "remaining_time": "2:07:09"} -{"current_steps": 418, "total_steps": 2050, "loss": 0.3541, "lr": 4.50666291201034e-06, "epoch": 2.0390243902439025, "percentage": 20.39, "elapsed_time": "0:32:33", "remaining_time": "2:07:07"} -{"current_steps": 419, "total_steps": 2050, "loss": 0.4842, "lr": 4.504375508021876e-06, "epoch": 2.0439024390243903, "percentage": 20.44, "elapsed_time": "0:32:36", "remaining_time": "2:06:57"} -{"current_steps": 420, "total_steps": 2050, "loss": 0.6168, "lr": 4.50208339674264e-06, "epoch": 2.048780487804878, "percentage": 20.49, "elapsed_time": "0:32:40", "remaining_time": "2:06:50"} -{"current_steps": 421, "total_steps": 2050, "loss": 0.6425, "lr": 4.499786583555675e-06, "epoch": 2.053658536585366, "percentage": 20.54, "elapsed_time": "0:32:45", "remaining_time": "2:06:44"} -{"current_steps": 422, "total_steps": 2050, "loss": 0.364, "lr": 4.497485073855061e-06, "epoch": 2.0585365853658537, "percentage": 20.59, "elapsed_time": "0:32:47", "remaining_time": "2:06:32"} -{"current_steps": 423, "total_steps": 2050, "loss": 0.3687, "lr": 4.495178873045913e-06, "epoch": 2.0634146341463415, "percentage": 20.63, "elapsed_time": "0:32:50", "remaining_time": "2:06:18"} -{"current_steps": 424, "total_steps": 2050, "loss": 0.4068, "lr": 4.4928679865443605e-06, "epoch": 2.0682926829268293, "percentage": 20.68, "elapsed_time": "0:32:53", "remaining_time": "2:06:08"} -{"current_steps": 425, "total_steps": 2050, "loss": 0.4759, "lr": 4.4905524197775366e-06, "epoch": 2.073170731707317, "percentage": 20.73, "elapsed_time": "0:33:00", "remaining_time": "2:06:10"} -{"current_steps": 426, "total_steps": 2050, "loss": 0.4197, "lr": 4.4882321781835666e-06, "epoch": 2.078048780487805, "percentage": 20.78, "elapsed_time": "0:33:03", "remaining_time": "2:06:01"} -{"current_steps": 427, "total_steps": 2050, "loss": 0.2294, "lr": 4.4859072672115565e-06, "epoch": 2.0829268292682928, "percentage": 20.83, "elapsed_time": "0:33:07", "remaining_time": "2:05:52"} -{"current_steps": 428, "total_steps": 2050, "loss": 0.7572, "lr": 4.483577692321577e-06, "epoch": 2.0878048780487806, "percentage": 20.88, "elapsed_time": "0:33:10", "remaining_time": "2:05:43"} -{"current_steps": 429, "total_steps": 2050, "loss": 0.4035, "lr": 4.481243458984651e-06, "epoch": 2.0926829268292684, "percentage": 20.93, "elapsed_time": "0:33:15", "remaining_time": "2:05:38"} -{"current_steps": 430, "total_steps": 2050, "loss": 0.5776, "lr": 4.478904572682743e-06, "epoch": 2.097560975609756, "percentage": 20.98, "elapsed_time": "0:33:18", "remaining_time": "2:05:27"} -{"current_steps": 431, "total_steps": 2050, "loss": 0.4005, "lr": 4.476561038908745e-06, "epoch": 2.102439024390244, "percentage": 21.02, "elapsed_time": "0:33:20", "remaining_time": "2:05:14"} -{"current_steps": 432, "total_steps": 2050, "loss": 0.5689, "lr": 4.474212863166464e-06, "epoch": 2.107317073170732, "percentage": 21.07, "elapsed_time": "0:33:23", "remaining_time": "2:05:04"} -{"current_steps": 433, "total_steps": 2050, "loss": 0.5068, "lr": 4.471860050970608e-06, "epoch": 2.1121951219512196, "percentage": 21.12, "elapsed_time": "0:33:29", "remaining_time": "2:05:03"} -{"current_steps": 434, "total_steps": 2050, "loss": 0.8349, "lr": 4.469502607846774e-06, "epoch": 2.1170731707317074, "percentage": 21.17, "elapsed_time": "0:33:31", "remaining_time": "2:04:50"} -{"current_steps": 435, "total_steps": 2050, "loss": 0.3641, "lr": 4.467140539331434e-06, "epoch": 2.1219512195121952, "percentage": 21.22, "elapsed_time": "0:33:38", "remaining_time": "2:04:52"} -{"current_steps": 436, "total_steps": 2050, "loss": 0.222, "lr": 4.464773850971924e-06, "epoch": 2.126829268292683, "percentage": 21.27, "elapsed_time": "0:33:39", "remaining_time": "2:04:34"} -{"current_steps": 437, "total_steps": 2050, "loss": 0.3799, "lr": 4.46240254832643e-06, "epoch": 2.131707317073171, "percentage": 21.32, "elapsed_time": "0:33:43", "remaining_time": "2:04:30"} -{"current_steps": 438, "total_steps": 2050, "loss": 0.4759, "lr": 4.460026636963971e-06, "epoch": 2.1365853658536587, "percentage": 21.37, "elapsed_time": "0:33:46", "remaining_time": "2:04:18"} -{"current_steps": 439, "total_steps": 2050, "loss": 0.384, "lr": 4.4576461224643965e-06, "epoch": 2.1414634146341465, "percentage": 21.41, "elapsed_time": "0:33:50", "remaining_time": "2:04:09"} -{"current_steps": 440, "total_steps": 2050, "loss": 0.391, "lr": 4.455261010418359e-06, "epoch": 2.1463414634146343, "percentage": 21.46, "elapsed_time": "0:33:53", "remaining_time": "2:04:00"} -{"current_steps": 441, "total_steps": 2050, "loss": 0.6177, "lr": 4.452871306427314e-06, "epoch": 2.151219512195122, "percentage": 21.51, "elapsed_time": "0:33:57", "remaining_time": "2:03:55"} -{"current_steps": 442, "total_steps": 2050, "loss": 0.5143, "lr": 4.450477016103498e-06, "epoch": 2.15609756097561, "percentage": 21.56, "elapsed_time": "0:33:58", "remaining_time": "2:03:36"} -{"current_steps": 443, "total_steps": 2050, "loss": 0.3783, "lr": 4.4480781450699205e-06, "epoch": 2.1609756097560977, "percentage": 21.61, "elapsed_time": "0:34:00", "remaining_time": "2:03:21"} -{"current_steps": 444, "total_steps": 2050, "loss": 0.3574, "lr": 4.4456746989603464e-06, "epoch": 2.1658536585365855, "percentage": 21.66, "elapsed_time": "0:34:01", "remaining_time": "2:03:03"} -{"current_steps": 445, "total_steps": 2050, "loss": 0.5088, "lr": 4.443266683419289e-06, "epoch": 2.1707317073170733, "percentage": 21.71, "elapsed_time": "0:34:07", "remaining_time": "2:03:05"} -{"current_steps": 446, "total_steps": 2050, "loss": 0.3773, "lr": 4.440854104101988e-06, "epoch": 2.175609756097561, "percentage": 21.76, "elapsed_time": "0:34:09", "remaining_time": "2:02:50"} -{"current_steps": 447, "total_steps": 2050, "loss": 0.5002, "lr": 4.438436966674406e-06, "epoch": 2.180487804878049, "percentage": 21.8, "elapsed_time": "0:34:12", "remaining_time": "2:02:40"} -{"current_steps": 448, "total_steps": 2050, "loss": 0.4601, "lr": 4.436015276813208e-06, "epoch": 2.1853658536585368, "percentage": 21.85, "elapsed_time": "0:34:15", "remaining_time": "2:02:30"} -{"current_steps": 449, "total_steps": 2050, "loss": 0.5422, "lr": 4.4335890402057505e-06, "epoch": 2.1902439024390246, "percentage": 21.9, "elapsed_time": "0:34:20", "remaining_time": "2:02:26"} -{"current_steps": 450, "total_steps": 2050, "loss": 0.4684, "lr": 4.431158262550067e-06, "epoch": 2.1951219512195124, "percentage": 21.95, "elapsed_time": "0:34:21", "remaining_time": "2:02:10"} -{"current_steps": 451, "total_steps": 2050, "loss": 0.2528, "lr": 4.428722949554858e-06, "epoch": 2.2, "percentage": 22.0, "elapsed_time": "0:34:23", "remaining_time": "2:01:57"} -{"current_steps": 452, "total_steps": 2050, "loss": 0.4061, "lr": 4.426283106939474e-06, "epoch": 2.204878048780488, "percentage": 22.05, "elapsed_time": "0:34:30", "remaining_time": "2:01:59"} -{"current_steps": 453, "total_steps": 2050, "loss": 0.4779, "lr": 4.423838740433903e-06, "epoch": 2.209756097560976, "percentage": 22.1, "elapsed_time": "0:34:35", "remaining_time": "2:01:57"} -{"current_steps": 454, "total_steps": 2050, "loss": 0.233, "lr": 4.4213898557787586e-06, "epoch": 2.2146341463414636, "percentage": 22.15, "elapsed_time": "0:34:37", "remaining_time": "2:01:43"} -{"current_steps": 455, "total_steps": 2050, "loss": 0.7756, "lr": 4.4189364587252636e-06, "epoch": 2.2195121951219514, "percentage": 22.2, "elapsed_time": "0:34:40", "remaining_time": "2:01:32"} -{"current_steps": 456, "total_steps": 2050, "loss": 0.2806, "lr": 4.416478555035241e-06, "epoch": 2.2243902439024392, "percentage": 22.24, "elapsed_time": "0:34:42", "remaining_time": "2:01:18"} -{"current_steps": 457, "total_steps": 2050, "loss": 0.3923, "lr": 4.4140161504810935e-06, "epoch": 2.229268292682927, "percentage": 22.29, "elapsed_time": "0:34:45", "remaining_time": "2:01:07"} -{"current_steps": 458, "total_steps": 2050, "loss": 0.289, "lr": 4.4115492508457986e-06, "epoch": 2.234146341463415, "percentage": 22.34, "elapsed_time": "0:34:48", "remaining_time": "2:01:00"} -{"current_steps": 459, "total_steps": 2050, "loss": 0.5053, "lr": 4.409077861922887e-06, "epoch": 2.2390243902439027, "percentage": 22.39, "elapsed_time": "0:34:50", "remaining_time": "2:00:45"} -{"current_steps": 460, "total_steps": 2050, "loss": 0.3363, "lr": 4.406601989516435e-06, "epoch": 2.2439024390243905, "percentage": 22.44, "elapsed_time": "0:34:52", "remaining_time": "2:00:32"} -{"current_steps": 461, "total_steps": 2050, "loss": 0.2367, "lr": 4.404121639441047e-06, "epoch": 2.2487804878048783, "percentage": 22.49, "elapsed_time": "0:34:54", "remaining_time": "2:00:20"} -{"current_steps": 462, "total_steps": 2050, "loss": 0.4942, "lr": 4.401636817521843e-06, "epoch": 2.253658536585366, "percentage": 22.54, "elapsed_time": "0:35:01", "remaining_time": "2:00:24"} -{"current_steps": 463, "total_steps": 2050, "loss": 0.3328, "lr": 4.399147529594447e-06, "epoch": 2.258536585365854, "percentage": 22.59, "elapsed_time": "0:35:07", "remaining_time": "2:00:24"} -{"current_steps": 464, "total_steps": 2050, "loss": 0.3917, "lr": 4.3966537815049686e-06, "epoch": 2.2634146341463417, "percentage": 22.63, "elapsed_time": "0:35:09", "remaining_time": "2:00:11"} -{"current_steps": 465, "total_steps": 2050, "loss": 0.5203, "lr": 4.394155579109994e-06, "epoch": 2.2682926829268295, "percentage": 22.68, "elapsed_time": "0:35:13", "remaining_time": "2:00:04"} -{"current_steps": 466, "total_steps": 2050, "loss": 0.729, "lr": 4.391652928276572e-06, "epoch": 2.2731707317073173, "percentage": 22.73, "elapsed_time": "0:35:16", "remaining_time": "1:59:55"} -{"current_steps": 467, "total_steps": 2050, "loss": 0.4822, "lr": 4.389145834882195e-06, "epoch": 2.278048780487805, "percentage": 22.78, "elapsed_time": "0:35:19", "remaining_time": "1:59:42"} -{"current_steps": 468, "total_steps": 2050, "loss": 0.3962, "lr": 4.386634304814789e-06, "epoch": 2.2829268292682925, "percentage": 22.83, "elapsed_time": "0:35:19", "remaining_time": "1:59:25"} -{"current_steps": 469, "total_steps": 2050, "loss": 0.5996, "lr": 4.384118343972704e-06, "epoch": 2.2878048780487803, "percentage": 22.88, "elapsed_time": "0:35:21", "remaining_time": "1:59:12"} -{"current_steps": 470, "total_steps": 2050, "loss": 0.6328, "lr": 4.381597958264692e-06, "epoch": 2.292682926829268, "percentage": 22.93, "elapsed_time": "0:35:27", "remaining_time": "1:59:13"} -{"current_steps": 471, "total_steps": 2050, "loss": 0.6254, "lr": 4.379073153609896e-06, "epoch": 2.297560975609756, "percentage": 22.98, "elapsed_time": "0:35:31", "remaining_time": "1:59:04"} -{"current_steps": 472, "total_steps": 2050, "loss": 0.6793, "lr": 4.37654393593784e-06, "epoch": 2.3024390243902437, "percentage": 23.02, "elapsed_time": "0:35:34", "remaining_time": "1:58:55"} -{"current_steps": 473, "total_steps": 2050, "loss": 0.4161, "lr": 4.3740103111884096e-06, "epoch": 2.3073170731707315, "percentage": 23.07, "elapsed_time": "0:35:36", "remaining_time": "1:58:43"} -{"current_steps": 474, "total_steps": 2050, "loss": 0.3329, "lr": 4.371472285311842e-06, "epoch": 2.3121951219512193, "percentage": 23.12, "elapsed_time": "0:35:39", "remaining_time": "1:58:32"} -{"current_steps": 475, "total_steps": 2050, "loss": 0.2687, "lr": 4.368929864268709e-06, "epoch": 2.317073170731707, "percentage": 23.17, "elapsed_time": "0:35:46", "remaining_time": "1:58:37"} -{"current_steps": 476, "total_steps": 2050, "loss": 0.5934, "lr": 4.366383054029907e-06, "epoch": 2.321951219512195, "percentage": 23.22, "elapsed_time": "0:35:50", "remaining_time": "1:58:32"} -{"current_steps": 477, "total_steps": 2050, "loss": 0.5033, "lr": 4.363831860576638e-06, "epoch": 2.3268292682926828, "percentage": 23.27, "elapsed_time": "0:35:54", "remaining_time": "1:58:25"} -{"current_steps": 478, "total_steps": 2050, "loss": 0.4492, "lr": 4.361276289900396e-06, "epoch": 2.3317073170731706, "percentage": 23.32, "elapsed_time": "0:35:57", "remaining_time": "1:58:16"} -{"current_steps": 479, "total_steps": 2050, "loss": 0.619, "lr": 4.358716348002962e-06, "epoch": 2.3365853658536584, "percentage": 23.37, "elapsed_time": "0:35:59", "remaining_time": "1:58:02"} -{"current_steps": 480, "total_steps": 2050, "loss": 0.4018, "lr": 4.356152040896376e-06, "epoch": 2.341463414634146, "percentage": 23.41, "elapsed_time": "0:36:03", "remaining_time": "1:57:56"} -{"current_steps": 481, "total_steps": 2050, "loss": 0.3062, "lr": 4.3535833746029335e-06, "epoch": 2.346341463414634, "percentage": 23.46, "elapsed_time": "0:36:07", "remaining_time": "1:57:50"} -{"current_steps": 482, "total_steps": 2050, "loss": 0.3387, "lr": 4.351010355155165e-06, "epoch": 2.351219512195122, "percentage": 23.51, "elapsed_time": "0:36:08", "remaining_time": "1:57:35"} -{"current_steps": 483, "total_steps": 2050, "loss": 0.3103, "lr": 4.348432988595828e-06, "epoch": 2.3560975609756096, "percentage": 23.56, "elapsed_time": "0:36:12", "remaining_time": "1:57:27"} -{"current_steps": 484, "total_steps": 2050, "loss": 0.6782, "lr": 4.345851280977885e-06, "epoch": 2.3609756097560974, "percentage": 23.61, "elapsed_time": "0:36:19", "remaining_time": "1:57:32"} -{"current_steps": 485, "total_steps": 2050, "loss": 0.3195, "lr": 4.343265238364496e-06, "epoch": 2.3658536585365852, "percentage": 23.66, "elapsed_time": "0:36:21", "remaining_time": "1:57:18"} -{"current_steps": 486, "total_steps": 2050, "loss": 0.4639, "lr": 4.340674866829001e-06, "epoch": 2.370731707317073, "percentage": 23.71, "elapsed_time": "0:36:27", "remaining_time": "1:57:19"} -{"current_steps": 487, "total_steps": 2050, "loss": 0.7229, "lr": 4.338080172454908e-06, "epoch": 2.375609756097561, "percentage": 23.76, "elapsed_time": "0:36:30", "remaining_time": "1:57:11"} -{"current_steps": 488, "total_steps": 2050, "loss": 0.4334, "lr": 4.335481161335875e-06, "epoch": 2.3804878048780487, "percentage": 23.8, "elapsed_time": "0:36:34", "remaining_time": "1:57:04"} -{"current_steps": 489, "total_steps": 2050, "loss": 0.3409, "lr": 4.332877839575699e-06, "epoch": 2.3853658536585365, "percentage": 23.85, "elapsed_time": "0:36:36", "remaining_time": "1:56:52"} -{"current_steps": 490, "total_steps": 2050, "loss": 0.5221, "lr": 4.330270213288301e-06, "epoch": 2.3902439024390243, "percentage": 23.9, "elapsed_time": "0:36:43", "remaining_time": "1:56:55"} -{"current_steps": 491, "total_steps": 2050, "loss": 0.7078, "lr": 4.32765828859771e-06, "epoch": 2.395121951219512, "percentage": 23.95, "elapsed_time": "0:36:48", "remaining_time": "1:56:52"} -{"current_steps": 492, "total_steps": 2050, "loss": 0.5902, "lr": 4.325042071638051e-06, "epoch": 2.4, "percentage": 24.0, "elapsed_time": "0:36:51", "remaining_time": "1:56:43"} -{"current_steps": 493, "total_steps": 2050, "loss": 0.3746, "lr": 4.322421568553529e-06, "epoch": 2.4048780487804877, "percentage": 24.05, "elapsed_time": "0:36:58", "remaining_time": "1:56:45"} -{"current_steps": 494, "total_steps": 2050, "loss": 0.3474, "lr": 4.319796785498416e-06, "epoch": 2.4097560975609755, "percentage": 24.1, "elapsed_time": "0:37:01", "remaining_time": "1:56:35"} -{"current_steps": 495, "total_steps": 2050, "loss": 0.5171, "lr": 4.317167728637032e-06, "epoch": 2.4146341463414633, "percentage": 24.15, "elapsed_time": "0:37:04", "remaining_time": "1:56:27"} -{"current_steps": 496, "total_steps": 2050, "loss": 0.4263, "lr": 4.314534404143738e-06, "epoch": 2.419512195121951, "percentage": 24.2, "elapsed_time": "0:37:08", "remaining_time": "1:56:21"} -{"current_steps": 497, "total_steps": 2050, "loss": 0.5072, "lr": 4.3118968182029155e-06, "epoch": 2.424390243902439, "percentage": 24.24, "elapsed_time": "0:37:11", "remaining_time": "1:56:14"} -{"current_steps": 498, "total_steps": 2050, "loss": 0.2742, "lr": 4.3092549770089566e-06, "epoch": 2.4292682926829268, "percentage": 24.29, "elapsed_time": "0:37:14", "remaining_time": "1:56:02"} -{"current_steps": 499, "total_steps": 2050, "loss": 0.4814, "lr": 4.306608886766243e-06, "epoch": 2.4341463414634146, "percentage": 24.34, "elapsed_time": "0:37:15", "remaining_time": "1:55:49"} -{"current_steps": 500, "total_steps": 2050, "loss": 0.4188, "lr": 4.303958553689137e-06, "epoch": 2.4390243902439024, "percentage": 24.39, "elapsed_time": "0:37:18", "remaining_time": "1:55:38"} -{"current_steps": 501, "total_steps": 2050, "loss": 0.6436, "lr": 4.3013039840019675e-06, "epoch": 2.44390243902439, "percentage": 24.44, "elapsed_time": "0:37:20", "remaining_time": "1:55:26"} -{"current_steps": 502, "total_steps": 2050, "loss": 0.2862, "lr": 4.2986451839390105e-06, "epoch": 2.448780487804878, "percentage": 24.49, "elapsed_time": "0:37:21", "remaining_time": "1:55:12"} -{"current_steps": 503, "total_steps": 2050, "loss": 0.4926, "lr": 4.295982159744476e-06, "epoch": 2.453658536585366, "percentage": 24.54, "elapsed_time": "0:37:27", "remaining_time": "1:55:12"} -{"current_steps": 504, "total_steps": 2050, "loss": 0.5717, "lr": 4.293314917672498e-06, "epoch": 2.4585365853658536, "percentage": 24.59, "elapsed_time": "0:37:30", "remaining_time": "1:55:04"} -{"current_steps": 505, "total_steps": 2050, "loss": 0.2707, "lr": 4.290643463987114e-06, "epoch": 2.4634146341463414, "percentage": 24.63, "elapsed_time": "0:37:33", "remaining_time": "1:54:53"} -{"current_steps": 506, "total_steps": 2050, "loss": 0.347, "lr": 4.287967804962252e-06, "epoch": 2.4682926829268292, "percentage": 24.68, "elapsed_time": "0:37:38", "remaining_time": "1:54:52"} -{"current_steps": 507, "total_steps": 2050, "loss": 0.2103, "lr": 4.285287946881718e-06, "epoch": 2.473170731707317, "percentage": 24.73, "elapsed_time": "0:37:39", "remaining_time": "1:54:36"} -{"current_steps": 508, "total_steps": 2050, "loss": 0.6405, "lr": 4.282603896039178e-06, "epoch": 2.478048780487805, "percentage": 24.78, "elapsed_time": "0:37:43", "remaining_time": "1:54:30"} -{"current_steps": 509, "total_steps": 2050, "loss": 0.4027, "lr": 4.279915658738145e-06, "epoch": 2.4829268292682927, "percentage": 24.83, "elapsed_time": "0:37:48", "remaining_time": "1:54:29"} -{"current_steps": 510, "total_steps": 2050, "loss": 0.6503, "lr": 4.277223241291966e-06, "epoch": 2.4878048780487805, "percentage": 24.88, "elapsed_time": "0:37:51", "remaining_time": "1:54:17"} -{"current_steps": 511, "total_steps": 2050, "loss": 0.5006, "lr": 4.274526650023801e-06, "epoch": 2.4926829268292683, "percentage": 24.93, "elapsed_time": "0:37:56", "remaining_time": "1:54:15"} -{"current_steps": 512, "total_steps": 2050, "loss": 0.479, "lr": 4.271825891266617e-06, "epoch": 2.497560975609756, "percentage": 24.98, "elapsed_time": "0:37:57", "remaining_time": "1:54:02"} -{"current_steps": 513, "total_steps": 2050, "loss": 0.6667, "lr": 4.269120971363164e-06, "epoch": 2.502439024390244, "percentage": 25.02, "elapsed_time": "0:38:02", "remaining_time": "1:53:57"} -{"current_steps": 514, "total_steps": 2050, "loss": 0.2977, "lr": 4.266411896665967e-06, "epoch": 2.5073170731707317, "percentage": 25.07, "elapsed_time": "0:38:04", "remaining_time": "1:53:47"} -{"current_steps": 515, "total_steps": 2050, "loss": 0.3912, "lr": 4.263698673537309e-06, "epoch": 2.5121951219512195, "percentage": 25.12, "elapsed_time": "0:38:07", "remaining_time": "1:53:39"} -{"current_steps": 516, "total_steps": 2050, "loss": 0.615, "lr": 4.260981308349214e-06, "epoch": 2.5170731707317073, "percentage": 25.17, "elapsed_time": "0:38:09", "remaining_time": "1:53:27"} -{"current_steps": 517, "total_steps": 2050, "loss": 0.4559, "lr": 4.258259807483434e-06, "epoch": 2.521951219512195, "percentage": 25.22, "elapsed_time": "0:38:15", "remaining_time": "1:53:26"} -{"current_steps": 518, "total_steps": 2050, "loss": 0.4993, "lr": 4.255534177331435e-06, "epoch": 2.526829268292683, "percentage": 25.27, "elapsed_time": "0:38:18", "remaining_time": "1:53:18"} -{"current_steps": 519, "total_steps": 2050, "loss": 0.4581, "lr": 4.252804424294378e-06, "epoch": 2.5317073170731708, "percentage": 25.32, "elapsed_time": "0:38:20", "remaining_time": "1:53:05"} -{"current_steps": 520, "total_steps": 2050, "loss": 0.5403, "lr": 4.25007055478311e-06, "epoch": 2.5365853658536586, "percentage": 25.37, "elapsed_time": "0:38:21", "remaining_time": "1:52:53"} -{"current_steps": 521, "total_steps": 2050, "loss": 0.3658, "lr": 4.247332575218144e-06, "epoch": 2.5414634146341464, "percentage": 25.41, "elapsed_time": "0:38:22", "remaining_time": "1:52:38"} -{"current_steps": 522, "total_steps": 2050, "loss": 0.6342, "lr": 4.244590492029643e-06, "epoch": 2.546341463414634, "percentage": 25.46, "elapsed_time": "0:38:29", "remaining_time": "1:52:39"} -{"current_steps": 523, "total_steps": 2050, "loss": 0.3411, "lr": 4.241844311657411e-06, "epoch": 2.551219512195122, "percentage": 25.51, "elapsed_time": "0:38:34", "remaining_time": "1:52:37"} -{"current_steps": 524, "total_steps": 2050, "loss": 0.2829, "lr": 4.239094040550875e-06, "epoch": 2.55609756097561, "percentage": 25.56, "elapsed_time": "0:38:35", "remaining_time": "1:52:23"} -{"current_steps": 525, "total_steps": 2050, "loss": 0.4749, "lr": 4.236339685169065e-06, "epoch": 2.5609756097560976, "percentage": 25.61, "elapsed_time": "0:38:41", "remaining_time": "1:52:21"} -{"current_steps": 526, "total_steps": 2050, "loss": 0.2485, "lr": 4.233581251980604e-06, "epoch": 2.5658536585365854, "percentage": 25.66, "elapsed_time": "0:38:42", "remaining_time": "1:52:08"} -{"current_steps": 527, "total_steps": 2050, "loss": 0.4488, "lr": 4.230818747463696e-06, "epoch": 2.5707317073170732, "percentage": 25.71, "elapsed_time": "0:38:45", "remaining_time": "1:52:00"} -{"current_steps": 528, "total_steps": 2050, "loss": 0.4495, "lr": 4.228052178106101e-06, "epoch": 2.575609756097561, "percentage": 25.76, "elapsed_time": "0:38:48", "remaining_time": "1:51:52"} -{"current_steps": 529, "total_steps": 2050, "loss": 0.2396, "lr": 4.2252815504051285e-06, "epoch": 2.580487804878049, "percentage": 25.8, "elapsed_time": "0:38:50", "remaining_time": "1:51:41"} -{"current_steps": 530, "total_steps": 2050, "loss": 0.6784, "lr": 4.222506870867618e-06, "epoch": 2.5853658536585367, "percentage": 25.85, "elapsed_time": "0:38:55", "remaining_time": "1:51:37"} -{"current_steps": 531, "total_steps": 2050, "loss": 0.5543, "lr": 4.2197281460099245e-06, "epoch": 2.5902439024390245, "percentage": 25.9, "elapsed_time": "0:38:58", "remaining_time": "1:51:28"} -{"current_steps": 532, "total_steps": 2050, "loss": 0.5281, "lr": 4.216945382357905e-06, "epoch": 2.5951219512195123, "percentage": 25.95, "elapsed_time": "0:39:01", "remaining_time": "1:51:22"} -{"current_steps": 533, "total_steps": 2050, "loss": 0.8019, "lr": 4.214158586446901e-06, "epoch": 2.6, "percentage": 26.0, "elapsed_time": "0:39:03", "remaining_time": "1:51:10"} -{"current_steps": 534, "total_steps": 2050, "loss": 0.7769, "lr": 4.211367764821722e-06, "epoch": 2.604878048780488, "percentage": 26.05, "elapsed_time": "0:39:05", "remaining_time": "1:50:59"} -{"current_steps": 535, "total_steps": 2050, "loss": 0.4077, "lr": 4.208572924036634e-06, "epoch": 2.6097560975609757, "percentage": 26.1, "elapsed_time": "0:39:07", "remaining_time": "1:50:47"} -{"current_steps": 536, "total_steps": 2050, "loss": 0.433, "lr": 4.2057740706553415e-06, "epoch": 2.6146341463414635, "percentage": 26.15, "elapsed_time": "0:39:13", "remaining_time": "1:50:47"} -{"current_steps": 537, "total_steps": 2050, "loss": 0.5957, "lr": 4.202971211250971e-06, "epoch": 2.6195121951219513, "percentage": 26.2, "elapsed_time": "0:39:19", "remaining_time": "1:50:47"} -{"current_steps": 538, "total_steps": 2050, "loss": 0.3013, "lr": 4.200164352406061e-06, "epoch": 2.624390243902439, "percentage": 26.24, "elapsed_time": "0:39:26", "remaining_time": "1:50:51"} -{"current_steps": 539, "total_steps": 2050, "loss": 0.5646, "lr": 4.197353500712539e-06, "epoch": 2.629268292682927, "percentage": 26.29, "elapsed_time": "0:39:29", "remaining_time": "1:50:43"} -{"current_steps": 540, "total_steps": 2050, "loss": 0.4529, "lr": 4.1945386627717115e-06, "epoch": 2.6341463414634148, "percentage": 26.34, "elapsed_time": "0:39:36", "remaining_time": "1:50:46"} -{"current_steps": 541, "total_steps": 2050, "loss": 0.6076, "lr": 4.191719845194246e-06, "epoch": 2.6390243902439026, "percentage": 26.39, "elapsed_time": "0:39:39", "remaining_time": "1:50:37"} -{"current_steps": 542, "total_steps": 2050, "loss": 0.4855, "lr": 4.188897054600156e-06, "epoch": 2.6439024390243904, "percentage": 26.44, "elapsed_time": "0:39:43", "remaining_time": "1:50:30"} -{"current_steps": 543, "total_steps": 2050, "loss": 0.5836, "lr": 4.186070297618787e-06, "epoch": 2.648780487804878, "percentage": 26.49, "elapsed_time": "0:39:45", "remaining_time": "1:50:20"} -{"current_steps": 544, "total_steps": 2050, "loss": 0.6266, "lr": 4.183239580888799e-06, "epoch": 2.653658536585366, "percentage": 26.54, "elapsed_time": "0:39:50", "remaining_time": "1:50:17"} -{"current_steps": 545, "total_steps": 2050, "loss": 0.429, "lr": 4.18040491105815e-06, "epoch": 2.658536585365854, "percentage": 26.59, "elapsed_time": "0:39:51", "remaining_time": "1:50:03"} -{"current_steps": 546, "total_steps": 2050, "loss": 0.391, "lr": 4.177566294784085e-06, "epoch": 2.6634146341463416, "percentage": 26.63, "elapsed_time": "0:39:57", "remaining_time": "1:50:02"} -{"current_steps": 547, "total_steps": 2050, "loss": 0.6548, "lr": 4.174723738733114e-06, "epoch": 2.6682926829268294, "percentage": 26.68, "elapsed_time": "0:39:58", "remaining_time": "1:49:50"} -{"current_steps": 548, "total_steps": 2050, "loss": 0.5188, "lr": 4.171877249581001e-06, "epoch": 2.6731707317073172, "percentage": 26.73, "elapsed_time": "0:40:03", "remaining_time": "1:49:48"} -{"current_steps": 549, "total_steps": 2050, "loss": 0.3494, "lr": 4.169026834012748e-06, "epoch": 2.678048780487805, "percentage": 26.78, "elapsed_time": "0:40:08", "remaining_time": "1:49:45"} -{"current_steps": 550, "total_steps": 2050, "loss": 0.3621, "lr": 4.166172498722577e-06, "epoch": 2.682926829268293, "percentage": 26.83, "elapsed_time": "0:40:10", "remaining_time": "1:49:34"} -{"current_steps": 551, "total_steps": 2050, "loss": 0.7187, "lr": 4.163314250413913e-06, "epoch": 2.68780487804878, "percentage": 26.88, "elapsed_time": "0:40:13", "remaining_time": "1:49:27"} -{"current_steps": 552, "total_steps": 2050, "loss": 0.428, "lr": 4.160452095799378e-06, "epoch": 2.692682926829268, "percentage": 26.93, "elapsed_time": "0:40:15", "remaining_time": "1:49:16"} -{"current_steps": 553, "total_steps": 2050, "loss": 0.202, "lr": 4.157586041600759e-06, "epoch": 2.697560975609756, "percentage": 26.98, "elapsed_time": "0:40:18", "remaining_time": "1:49:06"} -{"current_steps": 554, "total_steps": 2050, "loss": 0.5238, "lr": 4.154716094549008e-06, "epoch": 2.7024390243902436, "percentage": 27.02, "elapsed_time": "0:40:21", "remaining_time": "1:48:59"} -{"current_steps": 555, "total_steps": 2050, "loss": 0.3073, "lr": 4.151842261384217e-06, "epoch": 2.7073170731707314, "percentage": 27.07, "elapsed_time": "0:40:24", "remaining_time": "1:48:51"} -{"current_steps": 556, "total_steps": 2050, "loss": 0.8435, "lr": 4.148964548855603e-06, "epoch": 2.7121951219512193, "percentage": 27.12, "elapsed_time": "0:40:27", "remaining_time": "1:48:41"} -{"current_steps": 557, "total_steps": 2050, "loss": 0.2562, "lr": 4.146082963721496e-06, "epoch": 2.717073170731707, "percentage": 27.17, "elapsed_time": "0:40:31", "remaining_time": "1:48:36"} -{"current_steps": 558, "total_steps": 2050, "loss": 1.0144, "lr": 4.143197512749322e-06, "epoch": 2.721951219512195, "percentage": 27.22, "elapsed_time": "0:40:34", "remaining_time": "1:48:28"} -{"current_steps": 559, "total_steps": 2050, "loss": 0.7581, "lr": 4.140308202715581e-06, "epoch": 2.7268292682926827, "percentage": 27.27, "elapsed_time": "0:40:39", "remaining_time": "1:48:27"} -{"current_steps": 560, "total_steps": 2050, "loss": 0.3114, "lr": 4.13741504040584e-06, "epoch": 2.7317073170731705, "percentage": 27.32, "elapsed_time": "0:40:41", "remaining_time": "1:48:16"} -{"current_steps": 561, "total_steps": 2050, "loss": 0.4384, "lr": 4.134518032614713e-06, "epoch": 2.7365853658536583, "percentage": 27.37, "elapsed_time": "0:40:43", "remaining_time": "1:48:05"} -{"current_steps": 562, "total_steps": 2050, "loss": 0.3141, "lr": 4.1316171861458445e-06, "epoch": 2.741463414634146, "percentage": 27.41, "elapsed_time": "0:40:48", "remaining_time": "1:48:01"} -{"current_steps": 563, "total_steps": 2050, "loss": 0.5777, "lr": 4.128712507811893e-06, "epoch": 2.746341463414634, "percentage": 27.46, "elapsed_time": "0:40:53", "remaining_time": "1:48:01"} -{"current_steps": 564, "total_steps": 2050, "loss": 0.5542, "lr": 4.125804004434517e-06, "epoch": 2.7512195121951217, "percentage": 27.51, "elapsed_time": "0:41:00", "remaining_time": "1:48:02"} -{"current_steps": 565, "total_steps": 2050, "loss": 0.3442, "lr": 4.12289168284436e-06, "epoch": 2.7560975609756095, "percentage": 27.56, "elapsed_time": "0:41:06", "remaining_time": "1:48:02"} -{"current_steps": 566, "total_steps": 2050, "loss": 0.4754, "lr": 4.119975549881029e-06, "epoch": 2.7609756097560973, "percentage": 27.61, "elapsed_time": "0:41:10", "remaining_time": "1:47:56"} -{"current_steps": 567, "total_steps": 2050, "loss": 0.2988, "lr": 4.1170556123930846e-06, "epoch": 2.765853658536585, "percentage": 27.66, "elapsed_time": "0:41:13", "remaining_time": "1:47:50"} -{"current_steps": 568, "total_steps": 2050, "loss": 0.4642, "lr": 4.114131877238021e-06, "epoch": 2.770731707317073, "percentage": 27.71, "elapsed_time": "0:41:16", "remaining_time": "1:47:40"} -{"current_steps": 569, "total_steps": 2050, "loss": 0.3493, "lr": 4.111204351282254e-06, "epoch": 2.7756097560975608, "percentage": 27.76, "elapsed_time": "0:41:21", "remaining_time": "1:47:39"} -{"current_steps": 570, "total_steps": 2050, "loss": 0.4007, "lr": 4.108273041401098e-06, "epoch": 2.7804878048780486, "percentage": 27.8, "elapsed_time": "0:41:27", "remaining_time": "1:47:38"} -{"current_steps": 571, "total_steps": 2050, "loss": 0.7815, "lr": 4.105337954478756e-06, "epoch": 2.7853658536585364, "percentage": 27.85, "elapsed_time": "0:41:29", "remaining_time": "1:47:28"} -{"current_steps": 572, "total_steps": 2050, "loss": 0.6099, "lr": 4.102399097408304e-06, "epoch": 2.790243902439024, "percentage": 27.9, "elapsed_time": "0:41:32", "remaining_time": "1:47:21"} -{"current_steps": 573, "total_steps": 2050, "loss": 0.2478, "lr": 4.099456477091667e-06, "epoch": 2.795121951219512, "percentage": 27.95, "elapsed_time": "0:41:34", "remaining_time": "1:47:08"} -{"current_steps": 574, "total_steps": 2050, "loss": 0.6403, "lr": 4.096510100439611e-06, "epoch": 2.8, "percentage": 28.0, "elapsed_time": "0:41:35", "remaining_time": "1:46:58"} -{"current_steps": 575, "total_steps": 2050, "loss": 0.2509, "lr": 4.093559974371725e-06, "epoch": 2.8048780487804876, "percentage": 28.05, "elapsed_time": "0:41:38", "remaining_time": "1:46:48"} -{"current_steps": 576, "total_steps": 2050, "loss": 0.7552, "lr": 4.0906061058164e-06, "epoch": 2.8097560975609754, "percentage": 28.1, "elapsed_time": "0:41:44", "remaining_time": "1:46:49"} -{"current_steps": 577, "total_steps": 2050, "loss": 0.3146, "lr": 4.087648501710819e-06, "epoch": 2.8146341463414632, "percentage": 28.15, "elapsed_time": "0:41:46", "remaining_time": "1:46:39"} -{"current_steps": 578, "total_steps": 2050, "loss": 0.507, "lr": 4.084687169000938e-06, "epoch": 2.819512195121951, "percentage": 28.2, "elapsed_time": "0:41:49", "remaining_time": "1:46:30"} -{"current_steps": 579, "total_steps": 2050, "loss": 0.4116, "lr": 4.081722114641469e-06, "epoch": 2.824390243902439, "percentage": 28.24, "elapsed_time": "0:41:51", "remaining_time": "1:46:21"} -{"current_steps": 580, "total_steps": 2050, "loss": 0.2264, "lr": 4.0787533455958626e-06, "epoch": 2.8292682926829267, "percentage": 28.29, "elapsed_time": "0:41:53", "remaining_time": "1:46:09"} -{"current_steps": 581, "total_steps": 2050, "loss": 0.3197, "lr": 4.075780868836296e-06, "epoch": 2.8341463414634145, "percentage": 28.34, "elapsed_time": "0:41:55", "remaining_time": "1:46:00"} -{"current_steps": 582, "total_steps": 2050, "loss": 0.4045, "lr": 4.072804691343653e-06, "epoch": 2.8390243902439023, "percentage": 28.39, "elapsed_time": "0:41:57", "remaining_time": "1:45:48"} -{"current_steps": 583, "total_steps": 2050, "loss": 0.9564, "lr": 4.069824820107507e-06, "epoch": 2.84390243902439, "percentage": 28.44, "elapsed_time": "0:42:01", "remaining_time": "1:45:44"} -{"current_steps": 584, "total_steps": 2050, "loss": 0.2703, "lr": 4.06684126212611e-06, "epoch": 2.848780487804878, "percentage": 28.49, "elapsed_time": "0:42:05", "remaining_time": "1:45:39"} -{"current_steps": 585, "total_steps": 2050, "loss": 0.4828, "lr": 4.063854024406369e-06, "epoch": 2.8536585365853657, "percentage": 28.54, "elapsed_time": "0:42:09", "remaining_time": "1:45:34"} -{"current_steps": 586, "total_steps": 2050, "loss": 0.4131, "lr": 4.060863113963835e-06, "epoch": 2.8585365853658535, "percentage": 28.59, "elapsed_time": "0:42:15", "remaining_time": "1:45:33"} -{"current_steps": 587, "total_steps": 2050, "loss": 0.4464, "lr": 4.057868537822683e-06, "epoch": 2.8634146341463413, "percentage": 28.63, "elapsed_time": "0:42:18", "remaining_time": "1:45:27"} -{"current_steps": 588, "total_steps": 2050, "loss": 0.2825, "lr": 4.054870303015695e-06, "epoch": 2.868292682926829, "percentage": 28.68, "elapsed_time": "0:42:20", "remaining_time": "1:45:16"} -{"current_steps": 589, "total_steps": 2050, "loss": 0.4438, "lr": 4.05186841658425e-06, "epoch": 2.873170731707317, "percentage": 28.73, "elapsed_time": "0:42:23", "remaining_time": "1:45:09"} -{"current_steps": 590, "total_steps": 2050, "loss": 0.4817, "lr": 4.048862885578301e-06, "epoch": 2.8780487804878048, "percentage": 28.78, "elapsed_time": "0:42:28", "remaining_time": "1:45:06"} -{"current_steps": 591, "total_steps": 2050, "loss": 0.5157, "lr": 4.045853717056358e-06, "epoch": 2.8829268292682926, "percentage": 28.83, "elapsed_time": "0:42:31", "remaining_time": "1:44:58"} -{"current_steps": 592, "total_steps": 2050, "loss": 0.4029, "lr": 4.0428409180854775e-06, "epoch": 2.8878048780487804, "percentage": 28.88, "elapsed_time": "0:42:37", "remaining_time": "1:44:59"} -{"current_steps": 593, "total_steps": 2050, "loss": 0.3796, "lr": 4.039824495741238e-06, "epoch": 2.892682926829268, "percentage": 28.93, "elapsed_time": "0:42:44", "remaining_time": "1:45:00"} -{"current_steps": 594, "total_steps": 2050, "loss": 0.4467, "lr": 4.036804457107733e-06, "epoch": 2.897560975609756, "percentage": 28.98, "elapsed_time": "0:42:47", "remaining_time": "1:44:53"} -{"current_steps": 595, "total_steps": 2050, "loss": 0.7007, "lr": 4.0337808092775435e-06, "epoch": 2.902439024390244, "percentage": 29.02, "elapsed_time": "0:42:49", "remaining_time": "1:44:43"} -{"current_steps": 596, "total_steps": 2050, "loss": 0.3219, "lr": 4.030753559351728e-06, "epoch": 2.9073170731707316, "percentage": 29.07, "elapsed_time": "0:42:52", "remaining_time": "1:44:35"} -{"current_steps": 597, "total_steps": 2050, "loss": 0.3038, "lr": 4.027722714439808e-06, "epoch": 2.9121951219512194, "percentage": 29.12, "elapsed_time": "0:42:55", "remaining_time": "1:44:27"} -{"current_steps": 598, "total_steps": 2050, "loss": 0.7768, "lr": 4.024688281659743e-06, "epoch": 2.9170731707317072, "percentage": 29.17, "elapsed_time": "0:42:57", "remaining_time": "1:44:19"} -{"current_steps": 599, "total_steps": 2050, "loss": 0.4667, "lr": 4.021650268137924e-06, "epoch": 2.921951219512195, "percentage": 29.22, "elapsed_time": "0:42:59", "remaining_time": "1:44:07"} -{"current_steps": 600, "total_steps": 2050, "loss": 0.3852, "lr": 4.018608681009143e-06, "epoch": 2.926829268292683, "percentage": 29.27, "elapsed_time": "0:43:01", "remaining_time": "1:43:59"} -{"current_steps": 601, "total_steps": 2050, "loss": 0.4804, "lr": 4.015563527416596e-06, "epoch": 2.9317073170731707, "percentage": 29.32, "elapsed_time": "0:43:02", "remaining_time": "1:43:47"} -{"current_steps": 602, "total_steps": 2050, "loss": 0.4152, "lr": 4.012514814511844e-06, "epoch": 2.9365853658536585, "percentage": 29.37, "elapsed_time": "0:43:05", "remaining_time": "1:43:37"} -{"current_steps": 603, "total_steps": 2050, "loss": 0.5029, "lr": 4.009462549454816e-06, "epoch": 2.9414634146341463, "percentage": 29.41, "elapsed_time": "0:43:09", "remaining_time": "1:43:35"} -{"current_steps": 604, "total_steps": 2050, "loss": 0.4857, "lr": 4.006406739413775e-06, "epoch": 2.946341463414634, "percentage": 29.46, "elapsed_time": "0:43:11", "remaining_time": "1:43:24"} -{"current_steps": 605, "total_steps": 2050, "loss": 0.4449, "lr": 4.003347391565317e-06, "epoch": 2.951219512195122, "percentage": 29.51, "elapsed_time": "0:43:14", "remaining_time": "1:43:17"} -{"current_steps": 606, "total_steps": 2050, "loss": 0.4808, "lr": 4.000284513094342e-06, "epoch": 2.9560975609756097, "percentage": 29.56, "elapsed_time": "0:43:17", "remaining_time": "1:43:10"} -{"current_steps": 607, "total_steps": 2050, "loss": 0.4395, "lr": 3.997218111194042e-06, "epoch": 2.9609756097560975, "percentage": 29.61, "elapsed_time": "0:43:22", "remaining_time": "1:43:06"} -{"current_steps": 608, "total_steps": 2050, "loss": 0.3264, "lr": 3.994148193065886e-06, "epoch": 2.9658536585365853, "percentage": 29.66, "elapsed_time": "0:43:26", "remaining_time": "1:43:02"} -{"current_steps": 609, "total_steps": 2050, "loss": 0.3285, "lr": 3.991074765919598e-06, "epoch": 2.970731707317073, "percentage": 29.71, "elapsed_time": "0:43:34", "remaining_time": "1:43:05"} -{"current_steps": 610, "total_steps": 2050, "loss": 0.3638, "lr": 3.987997836973147e-06, "epoch": 2.975609756097561, "percentage": 29.76, "elapsed_time": "0:43:35", "remaining_time": "1:42:55"} -{"current_steps": 611, "total_steps": 2050, "loss": 0.3853, "lr": 3.984917413452721e-06, "epoch": 2.9804878048780488, "percentage": 29.8, "elapsed_time": "0:43:38", "remaining_time": "1:42:47"} -{"current_steps": 612, "total_steps": 2050, "loss": 0.6411, "lr": 3.981833502592717e-06, "epoch": 2.9853658536585366, "percentage": 29.85, "elapsed_time": "0:43:40", "remaining_time": "1:42:36"} -{"current_steps": 613, "total_steps": 2050, "loss": 0.2759, "lr": 3.978746111635725e-06, "epoch": 2.9902439024390244, "percentage": 29.9, "elapsed_time": "0:43:43", "remaining_time": "1:42:30"} -{"current_steps": 614, "total_steps": 2050, "loss": 0.4566, "lr": 3.9756552478325045e-06, "epoch": 2.995121951219512, "percentage": 29.95, "elapsed_time": "0:43:45", "remaining_time": "1:42:20"} -{"current_steps": 615, "total_steps": 2050, "loss": 0.2221, "lr": 3.972560918441972e-06, "epoch": 3.0, "percentage": 30.0, "elapsed_time": "0:43:48", "remaining_time": "1:42:12"} -{"current_steps": 616, "total_steps": 2050, "loss": 0.2403, "lr": 3.969463130731183e-06, "epoch": 3.004878048780488, "percentage": 30.05, "elapsed_time": "0:48:03", "remaining_time": "1:51:52"} -{"current_steps": 617, "total_steps": 2050, "loss": 0.2635, "lr": 3.966361891975316e-06, "epoch": 3.0097560975609756, "percentage": 30.1, "elapsed_time": "0:48:10", "remaining_time": "1:51:52"} -{"current_steps": 618, "total_steps": 2050, "loss": 0.3294, "lr": 3.963257209457652e-06, "epoch": 3.0146341463414634, "percentage": 30.15, "elapsed_time": "0:48:16", "remaining_time": "1:51:51"} -{"current_steps": 619, "total_steps": 2050, "loss": 0.1338, "lr": 3.960149090469561e-06, "epoch": 3.0195121951219512, "percentage": 30.2, "elapsed_time": "0:48:19", "remaining_time": "1:51:43"} -{"current_steps": 620, "total_steps": 2050, "loss": 0.1469, "lr": 3.957037542310484e-06, "epoch": 3.024390243902439, "percentage": 30.24, "elapsed_time": "0:48:21", "remaining_time": "1:51:31"} -{"current_steps": 621, "total_steps": 2050, "loss": 0.2788, "lr": 3.953922572287915e-06, "epoch": 3.029268292682927, "percentage": 30.29, "elapsed_time": "0:48:24", "remaining_time": "1:51:23"} -{"current_steps": 622, "total_steps": 2050, "loss": 0.4521, "lr": 3.950804187717384e-06, "epoch": 3.0341463414634147, "percentage": 30.34, "elapsed_time": "0:48:27", "remaining_time": "1:51:15"} -{"current_steps": 623, "total_steps": 2050, "loss": 0.5113, "lr": 3.947682395922439e-06, "epoch": 3.0390243902439025, "percentage": 30.39, "elapsed_time": "0:48:33", "remaining_time": "1:51:13"} -{"current_steps": 624, "total_steps": 2050, "loss": 0.0968, "lr": 3.9445572042346346e-06, "epoch": 3.0439024390243903, "percentage": 30.44, "elapsed_time": "0:48:34", "remaining_time": "1:51:01"} -{"current_steps": 625, "total_steps": 2050, "loss": 0.2462, "lr": 3.941428619993505e-06, "epoch": 3.048780487804878, "percentage": 30.49, "elapsed_time": "0:48:38", "remaining_time": "1:50:53"} -{"current_steps": 626, "total_steps": 2050, "loss": 0.1782, "lr": 3.938296650546552e-06, "epoch": 3.053658536585366, "percentage": 30.54, "elapsed_time": "0:48:41", "remaining_time": "1:50:45"} -{"current_steps": 627, "total_steps": 2050, "loss": 0.2955, "lr": 3.935161303249231e-06, "epoch": 3.0585365853658537, "percentage": 30.59, "elapsed_time": "0:48:44", "remaining_time": "1:50:36"} -{"current_steps": 628, "total_steps": 2050, "loss": 0.3259, "lr": 3.932022585464928e-06, "epoch": 3.0634146341463415, "percentage": 30.63, "elapsed_time": "0:48:49", "remaining_time": "1:50:34"} -{"current_steps": 629, "total_steps": 2050, "loss": 0.2306, "lr": 3.928880504564943e-06, "epoch": 3.0682926829268293, "percentage": 30.68, "elapsed_time": "0:48:51", "remaining_time": "1:50:21"} -{"current_steps": 630, "total_steps": 2050, "loss": 0.2197, "lr": 3.92573506792848e-06, "epoch": 3.073170731707317, "percentage": 30.73, "elapsed_time": "0:48:54", "remaining_time": "1:50:13"} -{"current_steps": 631, "total_steps": 2050, "loss": 0.1607, "lr": 3.9225862829426184e-06, "epoch": 3.078048780487805, "percentage": 30.78, "elapsed_time": "0:48:55", "remaining_time": "1:50:02"} -{"current_steps": 632, "total_steps": 2050, "loss": 0.3087, "lr": 3.919434157002303e-06, "epoch": 3.0829268292682928, "percentage": 30.83, "elapsed_time": "0:48:57", "remaining_time": "1:49:51"} -{"current_steps": 633, "total_steps": 2050, "loss": 0.2213, "lr": 3.916278697510325e-06, "epoch": 3.0878048780487806, "percentage": 30.88, "elapsed_time": "0:49:03", "remaining_time": "1:49:49"} -{"current_steps": 634, "total_steps": 2050, "loss": 0.318, "lr": 3.913119911877305e-06, "epoch": 3.0926829268292684, "percentage": 30.93, "elapsed_time": "0:49:06", "remaining_time": "1:49:40"} -{"current_steps": 635, "total_steps": 2050, "loss": 0.1757, "lr": 3.909957807521674e-06, "epoch": 3.097560975609756, "percentage": 30.98, "elapsed_time": "0:49:07", "remaining_time": "1:49:27"} -{"current_steps": 636, "total_steps": 2050, "loss": 0.2391, "lr": 3.906792391869657e-06, "epoch": 3.102439024390244, "percentage": 31.02, "elapsed_time": "0:49:10", "remaining_time": "1:49:20"} -{"current_steps": 637, "total_steps": 2050, "loss": 0.2548, "lr": 3.903623672355258e-06, "epoch": 3.107317073170732, "percentage": 31.07, "elapsed_time": "0:49:16", "remaining_time": "1:49:17"} -{"current_steps": 638, "total_steps": 2050, "loss": 0.2389, "lr": 3.900451656420237e-06, "epoch": 3.1121951219512196, "percentage": 31.12, "elapsed_time": "0:49:21", "remaining_time": "1:49:14"} -{"current_steps": 639, "total_steps": 2050, "loss": 0.1371, "lr": 3.897276351514097e-06, "epoch": 3.1170731707317074, "percentage": 31.17, "elapsed_time": "0:49:23", "remaining_time": "1:49:04"} -{"current_steps": 640, "total_steps": 2050, "loss": 0.3363, "lr": 3.894097765094065e-06, "epoch": 3.1219512195121952, "percentage": 31.22, "elapsed_time": "0:49:26", "remaining_time": "1:48:54"} -{"current_steps": 641, "total_steps": 2050, "loss": 0.1314, "lr": 3.890915904625075e-06, "epoch": 3.126829268292683, "percentage": 31.27, "elapsed_time": "0:49:29", "remaining_time": "1:48:48"} -{"current_steps": 642, "total_steps": 2050, "loss": 0.3563, "lr": 3.887730777579751e-06, "epoch": 3.131707317073171, "percentage": 31.32, "elapsed_time": "0:49:34", "remaining_time": "1:48:44"} -{"current_steps": 643, "total_steps": 2050, "loss": 0.5053, "lr": 3.884542391438387e-06, "epoch": 3.1365853658536587, "percentage": 31.37, "elapsed_time": "0:49:36", "remaining_time": "1:48:33"} -{"current_steps": 644, "total_steps": 2050, "loss": 0.6259, "lr": 3.88135075368893e-06, "epoch": 3.1414634146341465, "percentage": 31.41, "elapsed_time": "0:49:40", "remaining_time": "1:48:27"} -{"current_steps": 645, "total_steps": 2050, "loss": 0.2599, "lr": 3.878155871826968e-06, "epoch": 3.1463414634146343, "percentage": 31.46, "elapsed_time": "0:49:45", "remaining_time": "1:48:23"} -{"current_steps": 646, "total_steps": 2050, "loss": 0.2075, "lr": 3.874957753355701e-06, "epoch": 3.151219512195122, "percentage": 31.51, "elapsed_time": "0:49:49", "remaining_time": "1:48:17"} -{"current_steps": 647, "total_steps": 2050, "loss": 0.4577, "lr": 3.8717564057859365e-06, "epoch": 3.15609756097561, "percentage": 31.56, "elapsed_time": "0:49:56", "remaining_time": "1:48:17"} -{"current_steps": 648, "total_steps": 2050, "loss": 0.4023, "lr": 3.868551836636063e-06, "epoch": 3.1609756097560977, "percentage": 31.61, "elapsed_time": "0:49:59", "remaining_time": "1:48:08"} -{"current_steps": 649, "total_steps": 2050, "loss": 0.1669, "lr": 3.865344053432035e-06, "epoch": 3.1658536585365855, "percentage": 31.66, "elapsed_time": "0:50:00", "remaining_time": "1:47:57"} -{"current_steps": 650, "total_steps": 2050, "loss": 0.2766, "lr": 3.862133063707353e-06, "epoch": 3.1707317073170733, "percentage": 31.71, "elapsed_time": "0:50:04", "remaining_time": "1:47:51"} -{"current_steps": 651, "total_steps": 2050, "loss": 0.1788, "lr": 3.858918875003053e-06, "epoch": 3.175609756097561, "percentage": 31.76, "elapsed_time": "0:50:08", "remaining_time": "1:47:45"} -{"current_steps": 652, "total_steps": 2050, "loss": 0.224, "lr": 3.855701494867679e-06, "epoch": 3.180487804878049, "percentage": 31.8, "elapsed_time": "0:50:11", "remaining_time": "1:47:37"} -{"current_steps": 653, "total_steps": 2050, "loss": 0.4029, "lr": 3.852480930857275e-06, "epoch": 3.1853658536585368, "percentage": 31.85, "elapsed_time": "0:50:18", "remaining_time": "1:47:38"} -{"current_steps": 654, "total_steps": 2050, "loss": 0.2096, "lr": 3.849257190535356e-06, "epoch": 3.1902439024390246, "percentage": 31.9, "elapsed_time": "0:50:21", "remaining_time": "1:47:29"} -{"current_steps": 655, "total_steps": 2050, "loss": 0.5574, "lr": 3.846030281472902e-06, "epoch": 3.1951219512195124, "percentage": 31.95, "elapsed_time": "0:50:26", "remaining_time": "1:47:25"} -{"current_steps": 656, "total_steps": 2050, "loss": 0.2233, "lr": 3.842800211248333e-06, "epoch": 3.2, "percentage": 32.0, "elapsed_time": "0:50:31", "remaining_time": "1:47:22"} -{"current_steps": 657, "total_steps": 2050, "loss": 0.3871, "lr": 3.839566987447492e-06, "epoch": 3.204878048780488, "percentage": 32.05, "elapsed_time": "0:50:33", "remaining_time": "1:47:11"} -{"current_steps": 658, "total_steps": 2050, "loss": 0.4325, "lr": 3.8363306176636296e-06, "epoch": 3.209756097560976, "percentage": 32.1, "elapsed_time": "0:50:39", "remaining_time": "1:47:09"} -{"current_steps": 659, "total_steps": 2050, "loss": 0.5321, "lr": 3.833091109497384e-06, "epoch": 3.2146341463414636, "percentage": 32.15, "elapsed_time": "0:50:43", "remaining_time": "1:47:03"} -{"current_steps": 660, "total_steps": 2050, "loss": 0.1359, "lr": 3.829848470556765e-06, "epoch": 3.2195121951219514, "percentage": 32.2, "elapsed_time": "0:50:45", "remaining_time": "1:46:54"} -{"current_steps": 661, "total_steps": 2050, "loss": 0.3145, "lr": 3.8266027084571335e-06, "epoch": 3.2243902439024392, "percentage": 32.24, "elapsed_time": "0:50:52", "remaining_time": "1:46:53"} -{"current_steps": 662, "total_steps": 2050, "loss": 0.1252, "lr": 3.823353830821187e-06, "epoch": 3.229268292682927, "percentage": 32.29, "elapsed_time": "0:50:54", "remaining_time": "1:46:43"} -{"current_steps": 663, "total_steps": 2050, "loss": 0.2589, "lr": 3.820101845278937e-06, "epoch": 3.234146341463415, "percentage": 32.34, "elapsed_time": "0:50:56", "remaining_time": "1:46:34"} -{"current_steps": 664, "total_steps": 2050, "loss": 0.2594, "lr": 3.816846759467696e-06, "epoch": 3.2390243902439027, "percentage": 32.39, "elapsed_time": "0:50:58", "remaining_time": "1:46:24"} -{"current_steps": 665, "total_steps": 2050, "loss": 0.2998, "lr": 3.8135885810320587e-06, "epoch": 3.2439024390243905, "percentage": 32.44, "elapsed_time": "0:51:02", "remaining_time": "1:46:18"} -{"current_steps": 666, "total_steps": 2050, "loss": 0.2238, "lr": 3.810327317623881e-06, "epoch": 3.2487804878048783, "percentage": 32.49, "elapsed_time": "0:51:05", "remaining_time": "1:46:11"} -{"current_steps": 667, "total_steps": 2050, "loss": 0.3381, "lr": 3.8070629769022628e-06, "epoch": 3.253658536585366, "percentage": 32.54, "elapsed_time": "0:51:09", "remaining_time": "1:46:04"} -{"current_steps": 668, "total_steps": 2050, "loss": 0.2407, "lr": 3.8037955665335335e-06, "epoch": 3.258536585365854, "percentage": 32.59, "elapsed_time": "0:51:15", "remaining_time": "1:46:03"} -{"current_steps": 669, "total_steps": 2050, "loss": 0.2957, "lr": 3.800525094191231e-06, "epoch": 3.2634146341463417, "percentage": 32.63, "elapsed_time": "0:51:18", "remaining_time": "1:45:55"} -{"current_steps": 670, "total_steps": 2050, "loss": 0.2493, "lr": 3.797251567556083e-06, "epoch": 3.2682926829268295, "percentage": 32.68, "elapsed_time": "0:51:25", "remaining_time": "1:45:56"} -{"current_steps": 671, "total_steps": 2050, "loss": 0.1186, "lr": 3.793974994315991e-06, "epoch": 3.2731707317073173, "percentage": 32.73, "elapsed_time": "0:51:27", "remaining_time": "1:45:44"} -{"current_steps": 672, "total_steps": 2050, "loss": 0.3453, "lr": 3.790695382166013e-06, "epoch": 3.278048780487805, "percentage": 32.78, "elapsed_time": "0:51:29", "remaining_time": "1:45:34"} -{"current_steps": 673, "total_steps": 2050, "loss": 0.1981, "lr": 3.7874127388083415e-06, "epoch": 3.2829268292682925, "percentage": 32.83, "elapsed_time": "0:51:31", "remaining_time": "1:45:26"} -{"current_steps": 674, "total_steps": 2050, "loss": 0.2934, "lr": 3.7841270719522895e-06, "epoch": 3.2878048780487803, "percentage": 32.88, "elapsed_time": "0:51:33", "remaining_time": "1:45:15"} -{"current_steps": 675, "total_steps": 2050, "loss": 0.1359, "lr": 3.7808383893142692e-06, "epoch": 3.292682926829268, "percentage": 32.93, "elapsed_time": "0:51:34", "remaining_time": "1:45:03"} -{"current_steps": 676, "total_steps": 2050, "loss": 0.2498, "lr": 3.7775466986177763e-06, "epoch": 3.297560975609756, "percentage": 32.98, "elapsed_time": "0:51:40", "remaining_time": "1:45:00"} -{"current_steps": 677, "total_steps": 2050, "loss": 0.1308, "lr": 3.774252007593371e-06, "epoch": 3.3024390243902437, "percentage": 33.02, "elapsed_time": "0:51:43", "remaining_time": "1:44:54"} -{"current_steps": 678, "total_steps": 2050, "loss": 0.3915, "lr": 3.7709543239786593e-06, "epoch": 3.3073170731707315, "percentage": 33.07, "elapsed_time": "0:51:46", "remaining_time": "1:44:46"} -{"current_steps": 679, "total_steps": 2050, "loss": 0.2558, "lr": 3.767653655518277e-06, "epoch": 3.3121951219512193, "percentage": 33.12, "elapsed_time": "0:51:50", "remaining_time": "1:44:39"} -{"current_steps": 680, "total_steps": 2050, "loss": 0.1988, "lr": 3.7643500099638673e-06, "epoch": 3.317073170731707, "percentage": 33.17, "elapsed_time": "0:51:51", "remaining_time": "1:44:27"} -{"current_steps": 681, "total_steps": 2050, "loss": 0.4908, "lr": 3.7610433950740667e-06, "epoch": 3.321951219512195, "percentage": 33.22, "elapsed_time": "0:51:54", "remaining_time": "1:44:20"} -{"current_steps": 682, "total_steps": 2050, "loss": 0.304, "lr": 3.757733818614485e-06, "epoch": 3.3268292682926828, "percentage": 33.27, "elapsed_time": "0:51:57", "remaining_time": "1:44:14"} -{"current_steps": 683, "total_steps": 2050, "loss": 0.2533, "lr": 3.7544212883576856e-06, "epoch": 3.3317073170731706, "percentage": 33.32, "elapsed_time": "0:52:00", "remaining_time": "1:44:05"} -{"current_steps": 684, "total_steps": 2050, "loss": 0.1771, "lr": 3.751105812083172e-06, "epoch": 3.3365853658536584, "percentage": 33.37, "elapsed_time": "0:52:02", "remaining_time": "1:43:56"} -{"current_steps": 685, "total_steps": 2050, "loss": 0.4213, "lr": 3.7477873975773655e-06, "epoch": 3.341463414634146, "percentage": 33.41, "elapsed_time": "0:52:10", "remaining_time": "1:43:58"} -{"current_steps": 686, "total_steps": 2050, "loss": 0.3808, "lr": 3.7444660526335853e-06, "epoch": 3.346341463414634, "percentage": 33.46, "elapsed_time": "0:52:13", "remaining_time": "1:43:50"} -{"current_steps": 687, "total_steps": 2050, "loss": 0.6438, "lr": 3.741141785052036e-06, "epoch": 3.351219512195122, "percentage": 33.51, "elapsed_time": "0:52:17", "remaining_time": "1:43:44"} -{"current_steps": 688, "total_steps": 2050, "loss": 0.3686, "lr": 3.737814602639784e-06, "epoch": 3.3560975609756096, "percentage": 33.56, "elapsed_time": "0:52:20", "remaining_time": "1:43:36"} -{"current_steps": 689, "total_steps": 2050, "loss": 0.2934, "lr": 3.7344845132107427e-06, "epoch": 3.3609756097560974, "percentage": 33.61, "elapsed_time": "0:52:26", "remaining_time": "1:43:34"} -{"current_steps": 690, "total_steps": 2050, "loss": 0.3299, "lr": 3.731151524585651e-06, "epoch": 3.3658536585365852, "percentage": 33.66, "elapsed_time": "0:52:28", "remaining_time": "1:43:25"} -{"current_steps": 691, "total_steps": 2050, "loss": 0.6303, "lr": 3.7278156445920584e-06, "epoch": 3.370731707317073, "percentage": 33.71, "elapsed_time": "0:52:31", "remaining_time": "1:43:17"} -{"current_steps": 692, "total_steps": 2050, "loss": 0.2432, "lr": 3.724476881064303e-06, "epoch": 3.375609756097561, "percentage": 33.76, "elapsed_time": "0:52:34", "remaining_time": "1:43:10"} -{"current_steps": 693, "total_steps": 2050, "loss": 0.3131, "lr": 3.721135241843496e-06, "epoch": 3.3804878048780487, "percentage": 33.8, "elapsed_time": "0:52:37", "remaining_time": "1:43:03"} -{"current_steps": 694, "total_steps": 2050, "loss": 0.3372, "lr": 3.7177907347775016e-06, "epoch": 3.3853658536585365, "percentage": 33.85, "elapsed_time": "0:52:39", "remaining_time": "1:42:53"} -{"current_steps": 695, "total_steps": 2050, "loss": 0.5055, "lr": 3.71444336772092e-06, "epoch": 3.3902439024390243, "percentage": 33.9, "elapsed_time": "0:52:43", "remaining_time": "1:42:48"} -{"current_steps": 696, "total_steps": 2050, "loss": 0.6183, "lr": 3.711093148535068e-06, "epoch": 3.395121951219512, "percentage": 33.95, "elapsed_time": "0:52:46", "remaining_time": "1:42:39"} -{"current_steps": 697, "total_steps": 2050, "loss": 0.1568, "lr": 3.707740085087959e-06, "epoch": 3.4, "percentage": 34.0, "elapsed_time": "0:52:47", "remaining_time": "1:42:29"} -{"current_steps": 698, "total_steps": 2050, "loss": 0.2826, "lr": 3.7043841852542884e-06, "epoch": 3.4048780487804877, "percentage": 34.05, "elapsed_time": "0:52:50", "remaining_time": "1:42:20"} -{"current_steps": 699, "total_steps": 2050, "loss": 0.1918, "lr": 3.701025456915411e-06, "epoch": 3.4097560975609755, "percentage": 34.1, "elapsed_time": "0:52:52", "remaining_time": "1:42:12"} -{"current_steps": 700, "total_steps": 2050, "loss": 0.2493, "lr": 3.697663907959327e-06, "epoch": 3.4146341463414633, "percentage": 34.15, "elapsed_time": "0:52:57", "remaining_time": "1:42:08"} -{"current_steps": 701, "total_steps": 2050, "loss": 0.4913, "lr": 3.6942995462806574e-06, "epoch": 3.419512195121951, "percentage": 34.2, "elapsed_time": "0:53:00", "remaining_time": "1:42:00"} -{"current_steps": 702, "total_steps": 2050, "loss": 0.1788, "lr": 3.6909323797806314e-06, "epoch": 3.424390243902439, "percentage": 34.24, "elapsed_time": "0:53:03", "remaining_time": "1:41:54"} -{"current_steps": 703, "total_steps": 2050, "loss": 0.4162, "lr": 3.6875624163670635e-06, "epoch": 3.4292682926829268, "percentage": 34.29, "elapsed_time": "0:53:06", "remaining_time": "1:41:45"} -{"current_steps": 704, "total_steps": 2050, "loss": 0.1924, "lr": 3.6841896639543394e-06, "epoch": 3.4341463414634146, "percentage": 34.34, "elapsed_time": "0:53:07", "remaining_time": "1:41:34"} -{"current_steps": 705, "total_steps": 2050, "loss": 0.3177, "lr": 3.6808141304633924e-06, "epoch": 3.4390243902439024, "percentage": 34.39, "elapsed_time": "0:53:11", "remaining_time": "1:41:28"} -{"current_steps": 706, "total_steps": 2050, "loss": 0.2301, "lr": 3.6774358238216878e-06, "epoch": 3.44390243902439, "percentage": 34.44, "elapsed_time": "0:53:15", "remaining_time": "1:41:23"} -{"current_steps": 707, "total_steps": 2050, "loss": 0.1894, "lr": 3.6740547519632048e-06, "epoch": 3.448780487804878, "percentage": 34.49, "elapsed_time": "0:53:16", "remaining_time": "1:41:12"} -{"current_steps": 708, "total_steps": 2050, "loss": 0.2642, "lr": 3.670670922828414e-06, "epoch": 3.453658536585366, "percentage": 34.54, "elapsed_time": "0:53:23", "remaining_time": "1:41:11"} -{"current_steps": 709, "total_steps": 2050, "loss": 0.2275, "lr": 3.667284344364264e-06, "epoch": 3.4585365853658536, "percentage": 34.59, "elapsed_time": "0:53:24", "remaining_time": "1:41:01"} -{"current_steps": 710, "total_steps": 2050, "loss": 0.4447, "lr": 3.6638950245241604e-06, "epoch": 3.4634146341463414, "percentage": 34.63, "elapsed_time": "0:53:31", "remaining_time": "1:41:00"} -{"current_steps": 711, "total_steps": 2050, "loss": 0.2415, "lr": 3.660502971267945e-06, "epoch": 3.4682926829268292, "percentage": 34.68, "elapsed_time": "0:53:32", "remaining_time": "1:40:49"} -{"current_steps": 712, "total_steps": 2050, "loss": 0.0921, "lr": 3.65710819256188e-06, "epoch": 3.473170731707317, "percentage": 34.73, "elapsed_time": "0:53:33", "remaining_time": "1:40:38"} -{"current_steps": 713, "total_steps": 2050, "loss": 0.2371, "lr": 3.65371069637863e-06, "epoch": 3.478048780487805, "percentage": 34.78, "elapsed_time": "0:53:36", "remaining_time": "1:40:31"} -{"current_steps": 714, "total_steps": 2050, "loss": 0.4026, "lr": 3.650310490697238e-06, "epoch": 3.4829268292682927, "percentage": 34.83, "elapsed_time": "0:53:40", "remaining_time": "1:40:26"} -{"current_steps": 715, "total_steps": 2050, "loss": 0.4312, "lr": 3.646907583503114e-06, "epoch": 3.4878048780487805, "percentage": 34.88, "elapsed_time": "0:53:46", "remaining_time": "1:40:23"} -{"current_steps": 716, "total_steps": 2050, "loss": 0.2309, "lr": 3.6435019827880093e-06, "epoch": 3.4926829268292683, "percentage": 34.93, "elapsed_time": "0:53:50", "remaining_time": "1:40:19"} -{"current_steps": 717, "total_steps": 2050, "loss": 0.296, "lr": 3.640093696550003e-06, "epoch": 3.497560975609756, "percentage": 34.98, "elapsed_time": "0:53:53", "remaining_time": "1:40:12"} -{"current_steps": 718, "total_steps": 2050, "loss": 0.2723, "lr": 3.6366827327934817e-06, "epoch": 3.502439024390244, "percentage": 35.02, "elapsed_time": "0:54:00", "remaining_time": "1:40:11"} -{"current_steps": 719, "total_steps": 2050, "loss": 0.3797, "lr": 3.6332690995291176e-06, "epoch": 3.5073170731707317, "percentage": 35.07, "elapsed_time": "0:54:03", "remaining_time": "1:40:04"} -{"current_steps": 720, "total_steps": 2050, "loss": 0.9868, "lr": 3.6298528047738545e-06, "epoch": 3.5121951219512195, "percentage": 35.12, "elapsed_time": "0:54:06", "remaining_time": "1:39:57"} -{"current_steps": 721, "total_steps": 2050, "loss": 0.4069, "lr": 3.626433856550886e-06, "epoch": 3.5170731707317073, "percentage": 35.17, "elapsed_time": "0:54:12", "remaining_time": "1:39:55"} -{"current_steps": 722, "total_steps": 2050, "loss": 0.3368, "lr": 3.623012262889637e-06, "epoch": 3.521951219512195, "percentage": 35.22, "elapsed_time": "0:54:17", "remaining_time": "1:39:52"} -{"current_steps": 723, "total_steps": 2050, "loss": 0.3972, "lr": 3.6195880318257465e-06, "epoch": 3.526829268292683, "percentage": 35.27, "elapsed_time": "0:54:20", "remaining_time": "1:39:43"} -{"current_steps": 724, "total_steps": 2050, "loss": 0.52, "lr": 3.616161171401046e-06, "epoch": 3.5317073170731708, "percentage": 35.32, "elapsed_time": "0:54:21", "remaining_time": "1:39:33"} -{"current_steps": 725, "total_steps": 2050, "loss": 0.23, "lr": 3.612731689663542e-06, "epoch": 3.5365853658536586, "percentage": 35.37, "elapsed_time": "0:54:25", "remaining_time": "1:39:28"} -{"current_steps": 726, "total_steps": 2050, "loss": 0.4151, "lr": 3.6092995946673996e-06, "epoch": 3.5414634146341464, "percentage": 35.41, "elapsed_time": "0:54:29", "remaining_time": "1:39:22"} -{"current_steps": 727, "total_steps": 2050, "loss": 0.2798, "lr": 3.605864894472918e-06, "epoch": 3.546341463414634, "percentage": 35.46, "elapsed_time": "0:54:34", "remaining_time": "1:39:19"} -{"current_steps": 728, "total_steps": 2050, "loss": 0.4336, "lr": 3.602427597146516e-06, "epoch": 3.551219512195122, "percentage": 35.51, "elapsed_time": "0:54:36", "remaining_time": "1:39:10"} -{"current_steps": 729, "total_steps": 2050, "loss": 0.4803, "lr": 3.5989877107607134e-06, "epoch": 3.55609756097561, "percentage": 35.56, "elapsed_time": "0:54:40", "remaining_time": "1:39:04"} -{"current_steps": 730, "total_steps": 2050, "loss": 0.3698, "lr": 3.5955452433941075e-06, "epoch": 3.5609756097560976, "percentage": 35.61, "elapsed_time": "0:54:42", "remaining_time": "1:38:54"} -{"current_steps": 731, "total_steps": 2050, "loss": 0.2373, "lr": 3.5921002031313586e-06, "epoch": 3.5658536585365854, "percentage": 35.66, "elapsed_time": "0:54:45", "remaining_time": "1:38:47"} -{"current_steps": 732, "total_steps": 2050, "loss": 0.1908, "lr": 3.58865259806317e-06, "epoch": 3.5707317073170732, "percentage": 35.71, "elapsed_time": "0:54:48", "remaining_time": "1:38:41"} -{"current_steps": 733, "total_steps": 2050, "loss": 0.3993, "lr": 3.585202436286267e-06, "epoch": 3.575609756097561, "percentage": 35.76, "elapsed_time": "0:54:51", "remaining_time": "1:38:33"} -{"current_steps": 734, "total_steps": 2050, "loss": 0.4237, "lr": 3.581749725903381e-06, "epoch": 3.580487804878049, "percentage": 35.8, "elapsed_time": "0:54:54", "remaining_time": "1:38:26"} -{"current_steps": 735, "total_steps": 2050, "loss": 0.3011, "lr": 3.5782944750232274e-06, "epoch": 3.5853658536585367, "percentage": 35.85, "elapsed_time": "0:54:56", "remaining_time": "1:38:18"} -{"current_steps": 736, "total_steps": 2050, "loss": 0.0896, "lr": 3.574836691760489e-06, "epoch": 3.5902439024390245, "percentage": 35.9, "elapsed_time": "0:54:58", "remaining_time": "1:38:08"} -{"current_steps": 737, "total_steps": 2050, "loss": 0.2751, "lr": 3.571376384235795e-06, "epoch": 3.5951219512195123, "percentage": 35.95, "elapsed_time": "0:54:59", "remaining_time": "1:37:58"} -{"current_steps": 738, "total_steps": 2050, "loss": 0.2086, "lr": 3.5679135605757035e-06, "epoch": 3.6, "percentage": 36.0, "elapsed_time": "0:55:01", "remaining_time": "1:37:49"} -{"current_steps": 739, "total_steps": 2050, "loss": 0.1659, "lr": 3.564448228912682e-06, "epoch": 3.604878048780488, "percentage": 36.05, "elapsed_time": "0:55:04", "remaining_time": "1:37:43"} -{"current_steps": 740, "total_steps": 2050, "loss": 0.2469, "lr": 3.5609803973850877e-06, "epoch": 3.6097560975609757, "percentage": 36.1, "elapsed_time": "0:55:07", "remaining_time": "1:37:34"} -{"current_steps": 741, "total_steps": 2050, "loss": 0.375, "lr": 3.557510074137147e-06, "epoch": 3.6146341463414635, "percentage": 36.15, "elapsed_time": "0:55:10", "remaining_time": "1:37:28"} -{"current_steps": 742, "total_steps": 2050, "loss": 0.3133, "lr": 3.554037267318942e-06, "epoch": 3.6195121951219513, "percentage": 36.2, "elapsed_time": "0:55:16", "remaining_time": "1:37:26"} -{"current_steps": 743, "total_steps": 2050, "loss": 0.2243, "lr": 3.5505619850863847e-06, "epoch": 3.624390243902439, "percentage": 36.24, "elapsed_time": "0:55:21", "remaining_time": "1:37:21"} -{"current_steps": 744, "total_steps": 2050, "loss": 0.1321, "lr": 3.5470842356012007e-06, "epoch": 3.629268292682927, "percentage": 36.29, "elapsed_time": "0:55:22", "remaining_time": "1:37:12"} -{"current_steps": 745, "total_steps": 2050, "loss": 0.361, "lr": 3.5436040270309113e-06, "epoch": 3.6341463414634148, "percentage": 36.34, "elapsed_time": "0:55:24", "remaining_time": "1:37:03"} -{"current_steps": 746, "total_steps": 2050, "loss": 0.1523, "lr": 3.540121367548811e-06, "epoch": 3.6390243902439026, "percentage": 36.39, "elapsed_time": "0:55:28", "remaining_time": "1:36:57"} -{"current_steps": 747, "total_steps": 2050, "loss": 0.4898, "lr": 3.5366362653339524e-06, "epoch": 3.6439024390243904, "percentage": 36.44, "elapsed_time": "0:55:30", "remaining_time": "1:36:49"} -{"current_steps": 748, "total_steps": 2050, "loss": 0.1397, "lr": 3.533148728571124e-06, "epoch": 3.648780487804878, "percentage": 36.49, "elapsed_time": "0:55:31", "remaining_time": "1:36:39"} -{"current_steps": 749, "total_steps": 2050, "loss": 0.323, "lr": 3.5296587654508317e-06, "epoch": 3.653658536585366, "percentage": 36.54, "elapsed_time": "0:55:33", "remaining_time": "1:36:30"} -{"current_steps": 750, "total_steps": 2050, "loss": 0.5577, "lr": 3.526166384169279e-06, "epoch": 3.658536585365854, "percentage": 36.59, "elapsed_time": "0:55:36", "remaining_time": "1:36:23"} -{"current_steps": 751, "total_steps": 2050, "loss": 0.245, "lr": 3.5226715929283507e-06, "epoch": 3.6634146341463416, "percentage": 36.63, "elapsed_time": "0:55:39", "remaining_time": "1:36:16"} -{"current_steps": 752, "total_steps": 2050, "loss": 0.1619, "lr": 3.519174399935588e-06, "epoch": 3.6682926829268294, "percentage": 36.68, "elapsed_time": "0:55:45", "remaining_time": "1:36:13"} -{"current_steps": 753, "total_steps": 2050, "loss": 0.1047, "lr": 3.5156748134041767e-06, "epoch": 3.6731707317073172, "percentage": 36.73, "elapsed_time": "0:55:46", "remaining_time": "1:36:04"} -{"current_steps": 754, "total_steps": 2050, "loss": 0.5713, "lr": 3.5121728415529203e-06, "epoch": 3.678048780487805, "percentage": 36.78, "elapsed_time": "0:55:52", "remaining_time": "1:36:02"} -{"current_steps": 755, "total_steps": 2050, "loss": 0.2174, "lr": 3.5086684926062266e-06, "epoch": 3.682926829268293, "percentage": 36.83, "elapsed_time": "0:55:54", "remaining_time": "1:35:53"} -{"current_steps": 756, "total_steps": 2050, "loss": 0.285, "lr": 3.505161774794085e-06, "epoch": 3.68780487804878, "percentage": 36.88, "elapsed_time": "0:55:56", "remaining_time": "1:35:45"} -{"current_steps": 757, "total_steps": 2050, "loss": 0.1602, "lr": 3.5016526963520474e-06, "epoch": 3.692682926829268, "percentage": 36.93, "elapsed_time": "0:56:00", "remaining_time": "1:35:39"} -{"current_steps": 758, "total_steps": 2050, "loss": 0.666, "lr": 3.498141265521212e-06, "epoch": 3.697560975609756, "percentage": 36.98, "elapsed_time": "0:56:04", "remaining_time": "1:35:34"} -{"current_steps": 759, "total_steps": 2050, "loss": 0.2024, "lr": 3.4946274905481997e-06, "epoch": 3.7024390243902436, "percentage": 37.02, "elapsed_time": "0:56:09", "remaining_time": "1:35:31"} -{"current_steps": 760, "total_steps": 2050, "loss": 0.2719, "lr": 3.4911113796851364e-06, "epoch": 3.7073170731707314, "percentage": 37.07, "elapsed_time": "0:56:11", "remaining_time": "1:35:22"} -{"current_steps": 761, "total_steps": 2050, "loss": 0.1537, "lr": 3.487592941189636e-06, "epoch": 3.7121951219512193, "percentage": 37.12, "elapsed_time": "0:56:13", "remaining_time": "1:35:13"} -{"current_steps": 762, "total_steps": 2050, "loss": 0.6149, "lr": 3.484072183324776e-06, "epoch": 3.717073170731707, "percentage": 37.17, "elapsed_time": "0:56:15", "remaining_time": "1:35:04"} -{"current_steps": 763, "total_steps": 2050, "loss": 0.4241, "lr": 3.4805491143590823e-06, "epoch": 3.721951219512195, "percentage": 37.22, "elapsed_time": "0:56:18", "remaining_time": "1:34:58"} -{"current_steps": 764, "total_steps": 2050, "loss": 0.3037, "lr": 3.4770237425665103e-06, "epoch": 3.7268292682926827, "percentage": 37.27, "elapsed_time": "0:56:22", "remaining_time": "1:34:54"} -{"current_steps": 765, "total_steps": 2050, "loss": 0.4854, "lr": 3.4734960762264204e-06, "epoch": 3.7317073170731705, "percentage": 37.32, "elapsed_time": "0:56:26", "remaining_time": "1:34:47"} -{"current_steps": 766, "total_steps": 2050, "loss": 0.3849, "lr": 3.469966123623563e-06, "epoch": 3.7365853658536583, "percentage": 37.37, "elapsed_time": "0:56:28", "remaining_time": "1:34:39"} -{"current_steps": 767, "total_steps": 2050, "loss": 0.3159, "lr": 3.46643389304806e-06, "epoch": 3.741463414634146, "percentage": 37.41, "elapsed_time": "0:56:30", "remaining_time": "1:34:31"} -{"current_steps": 768, "total_steps": 2050, "loss": 0.7527, "lr": 3.4628993927953786e-06, "epoch": 3.746341463414634, "percentage": 37.46, "elapsed_time": "0:56:32", "remaining_time": "1:34:23"} -{"current_steps": 769, "total_steps": 2050, "loss": 0.1716, "lr": 3.45936263116632e-06, "epoch": 3.7512195121951217, "percentage": 37.51, "elapsed_time": "0:56:38", "remaining_time": "1:34:20"} -{"current_steps": 770, "total_steps": 2050, "loss": 0.2061, "lr": 3.4558236164669957e-06, "epoch": 3.7560975609756095, "percentage": 37.56, "elapsed_time": "0:56:44", "remaining_time": "1:34:19"} -{"current_steps": 771, "total_steps": 2050, "loss": 0.1338, "lr": 3.4522823570088073e-06, "epoch": 3.7609756097560973, "percentage": 37.61, "elapsed_time": "0:56:46", "remaining_time": "1:34:11"} -{"current_steps": 772, "total_steps": 2050, "loss": 0.2615, "lr": 3.4487388611084295e-06, "epoch": 3.765853658536585, "percentage": 37.66, "elapsed_time": "0:56:48", "remaining_time": "1:34:02"} -{"current_steps": 773, "total_steps": 2050, "loss": 0.1401, "lr": 3.445193137087788e-06, "epoch": 3.770731707317073, "percentage": 37.71, "elapsed_time": "0:56:50", "remaining_time": "1:33:53"} -{"current_steps": 774, "total_steps": 2050, "loss": 0.2934, "lr": 3.4416451932740424e-06, "epoch": 3.7756097560975608, "percentage": 37.76, "elapsed_time": "0:56:53", "remaining_time": "1:33:47"} -{"current_steps": 775, "total_steps": 2050, "loss": 0.4579, "lr": 3.4380950379995652e-06, "epoch": 3.7804878048780486, "percentage": 37.8, "elapsed_time": "0:56:55", "remaining_time": "1:33:38"} -{"current_steps": 776, "total_steps": 2050, "loss": 0.2979, "lr": 3.434542679601922e-06, "epoch": 3.7853658536585364, "percentage": 37.85, "elapsed_time": "0:57:00", "remaining_time": "1:33:34"} -{"current_steps": 777, "total_steps": 2050, "loss": 0.1196, "lr": 3.4309881264238538e-06, "epoch": 3.790243902439024, "percentage": 37.9, "elapsed_time": "0:57:02", "remaining_time": "1:33:26"} -{"current_steps": 778, "total_steps": 2050, "loss": 0.2026, "lr": 3.4274313868132547e-06, "epoch": 3.795121951219512, "percentage": 37.95, "elapsed_time": "0:57:03", "remaining_time": "1:33:17"} -{"current_steps": 779, "total_steps": 2050, "loss": 0.2135, "lr": 3.4238724691231534e-06, "epoch": 3.8, "percentage": 38.0, "elapsed_time": "0:57:08", "remaining_time": "1:33:13"} -{"current_steps": 780, "total_steps": 2050, "loss": 0.4418, "lr": 3.4203113817116955e-06, "epoch": 3.8048780487804876, "percentage": 38.05, "elapsed_time": "0:57:14", "remaining_time": "1:33:12"} -{"current_steps": 781, "total_steps": 2050, "loss": 0.203, "lr": 3.4167481329421204e-06, "epoch": 3.8097560975609754, "percentage": 38.1, "elapsed_time": "0:57:17", "remaining_time": "1:33:05"} -{"current_steps": 782, "total_steps": 2050, "loss": 0.3225, "lr": 3.4131827311827447e-06, "epoch": 3.8146341463414632, "percentage": 38.15, "elapsed_time": "0:57:20", "remaining_time": "1:32:59"} -{"current_steps": 783, "total_steps": 2050, "loss": 0.1704, "lr": 3.4096151848069416e-06, "epoch": 3.819512195121951, "percentage": 38.2, "elapsed_time": "0:57:26", "remaining_time": "1:32:56"} -{"current_steps": 784, "total_steps": 2050, "loss": 0.2785, "lr": 3.4060455021931195e-06, "epoch": 3.824390243902439, "percentage": 38.24, "elapsed_time": "0:57:33", "remaining_time": "1:32:57"} -{"current_steps": 785, "total_steps": 2050, "loss": 0.223, "lr": 3.402473691724704e-06, "epoch": 3.8292682926829267, "percentage": 38.29, "elapsed_time": "0:57:37", "remaining_time": "1:32:51"} -{"current_steps": 786, "total_steps": 2050, "loss": 0.2368, "lr": 3.39889976179012e-06, "epoch": 3.8341463414634145, "percentage": 38.34, "elapsed_time": "0:57:44", "remaining_time": "1:32:50"} -{"current_steps": 787, "total_steps": 2050, "loss": 0.3294, "lr": 3.3953237207827673e-06, "epoch": 3.8390243902439023, "percentage": 38.39, "elapsed_time": "0:57:51", "remaining_time": "1:32:50"} -{"current_steps": 788, "total_steps": 2050, "loss": 0.5431, "lr": 3.391745577101005e-06, "epoch": 3.84390243902439, "percentage": 38.44, "elapsed_time": "0:57:53", "remaining_time": "1:32:42"} -{"current_steps": 789, "total_steps": 2050, "loss": 0.2546, "lr": 3.3881653391481306e-06, "epoch": 3.848780487804878, "percentage": 38.49, "elapsed_time": "0:57:54", "remaining_time": "1:32:33"} -{"current_steps": 790, "total_steps": 2050, "loss": 0.3293, "lr": 3.384583015332359e-06, "epoch": 3.8536585365853657, "percentage": 38.54, "elapsed_time": "0:57:57", "remaining_time": "1:32:26"} -{"current_steps": 791, "total_steps": 2050, "loss": 0.1861, "lr": 3.380998614066805e-06, "epoch": 3.8585365853658535, "percentage": 38.59, "elapsed_time": "0:58:00", "remaining_time": "1:32:19"} -{"current_steps": 792, "total_steps": 2050, "loss": 0.2498, "lr": 3.3774121437694606e-06, "epoch": 3.8634146341463413, "percentage": 38.63, "elapsed_time": "0:58:02", "remaining_time": "1:32:11"} -{"current_steps": 793, "total_steps": 2050, "loss": 0.1525, "lr": 3.3738236128631786e-06, "epoch": 3.868292682926829, "percentage": 38.68, "elapsed_time": "0:58:04", "remaining_time": "1:32:03"} -{"current_steps": 794, "total_steps": 2050, "loss": 0.3622, "lr": 3.3702330297756503e-06, "epoch": 3.873170731707317, "percentage": 38.73, "elapsed_time": "0:58:07", "remaining_time": "1:31:57"} -{"current_steps": 795, "total_steps": 2050, "loss": 0.1051, "lr": 3.366640402939387e-06, "epoch": 3.8780487804878048, "percentage": 38.78, "elapsed_time": "0:58:09", "remaining_time": "1:31:47"} -{"current_steps": 796, "total_steps": 2050, "loss": 0.4606, "lr": 3.363045740791698e-06, "epoch": 3.8829268292682926, "percentage": 38.83, "elapsed_time": "0:58:15", "remaining_time": "1:31:46"} -{"current_steps": 797, "total_steps": 2050, "loss": 0.2267, "lr": 3.3594490517746774e-06, "epoch": 3.8878048780487804, "percentage": 38.88, "elapsed_time": "0:58:20", "remaining_time": "1:31:42"} -{"current_steps": 798, "total_steps": 2050, "loss": 0.2792, "lr": 3.3558503443351733e-06, "epoch": 3.892682926829268, "percentage": 38.93, "elapsed_time": "0:58:25", "remaining_time": "1:31:39"} -{"current_steps": 799, "total_steps": 2050, "loss": 0.2579, "lr": 3.352249626924777e-06, "epoch": 3.897560975609756, "percentage": 38.98, "elapsed_time": "0:58:29", "remaining_time": "1:31:34"} -{"current_steps": 800, "total_steps": 2050, "loss": 0.6983, "lr": 3.348646907999801e-06, "epoch": 3.902439024390244, "percentage": 39.02, "elapsed_time": "0:58:31", "remaining_time": "1:31:27"} -{"current_steps": 801, "total_steps": 2050, "loss": 0.3265, "lr": 3.345042196021257e-06, "epoch": 3.9073170731707316, "percentage": 39.07, "elapsed_time": "0:58:34", "remaining_time": "1:31:19"} -{"current_steps": 802, "total_steps": 2050, "loss": 0.497, "lr": 3.3414354994548385e-06, "epoch": 3.9121951219512194, "percentage": 39.12, "elapsed_time": "0:58:36", "remaining_time": "1:31:11"} -{"current_steps": 803, "total_steps": 2050, "loss": 0.2812, "lr": 3.337826826770898e-06, "epoch": 3.9170731707317072, "percentage": 39.17, "elapsed_time": "0:58:41", "remaining_time": "1:31:08"} -{"current_steps": 804, "total_steps": 2050, "loss": 0.2277, "lr": 3.3342161864444312e-06, "epoch": 3.921951219512195, "percentage": 39.22, "elapsed_time": "0:58:43", "remaining_time": "1:31:00"} -{"current_steps": 805, "total_steps": 2050, "loss": 0.1614, "lr": 3.3306035869550534e-06, "epoch": 3.926829268292683, "percentage": 39.27, "elapsed_time": "0:58:47", "remaining_time": "1:30:55"} -{"current_steps": 806, "total_steps": 2050, "loss": 0.3269, "lr": 3.326989036786981e-06, "epoch": 3.9317073170731707, "percentage": 39.32, "elapsed_time": "0:58:50", "remaining_time": "1:30:49"} -{"current_steps": 807, "total_steps": 2050, "loss": 0.2619, "lr": 3.3233725444290126e-06, "epoch": 3.9365853658536585, "percentage": 39.37, "elapsed_time": "0:58:52", "remaining_time": "1:30:41"} -{"current_steps": 808, "total_steps": 2050, "loss": 0.4334, "lr": 3.3197541183745065e-06, "epoch": 3.9414634146341463, "percentage": 39.41, "elapsed_time": "0:58:56", "remaining_time": "1:30:36"} -{"current_steps": 809, "total_steps": 2050, "loss": 0.2738, "lr": 3.3161337671213634e-06, "epoch": 3.946341463414634, "percentage": 39.46, "elapsed_time": "0:58:58", "remaining_time": "1:30:27"} -{"current_steps": 810, "total_steps": 2050, "loss": 0.1597, "lr": 3.312511499172006e-06, "epoch": 3.951219512195122, "percentage": 39.51, "elapsed_time": "0:59:01", "remaining_time": "1:30:21"} -{"current_steps": 811, "total_steps": 2050, "loss": 0.3195, "lr": 3.3088873230333562e-06, "epoch": 3.9560975609756097, "percentage": 39.56, "elapsed_time": "0:59:03", "remaining_time": "1:30:13"} -{"current_steps": 812, "total_steps": 2050, "loss": 0.1865, "lr": 3.3052612472168193e-06, "epoch": 3.9609756097560975, "percentage": 39.61, "elapsed_time": "0:59:06", "remaining_time": "1:30:07"} -{"current_steps": 813, "total_steps": 2050, "loss": 0.3108, "lr": 3.3016332802382618e-06, "epoch": 3.9658536585365853, "percentage": 39.66, "elapsed_time": "0:59:12", "remaining_time": "1:30:05"} -{"current_steps": 814, "total_steps": 2050, "loss": 0.2099, "lr": 3.2980034306179897e-06, "epoch": 3.970731707317073, "percentage": 39.71, "elapsed_time": "0:59:16", "remaining_time": "1:30:00"} -{"current_steps": 815, "total_steps": 2050, "loss": 0.3073, "lr": 3.294371706880733e-06, "epoch": 3.975609756097561, "percentage": 39.76, "elapsed_time": "0:59:21", "remaining_time": "1:29:57"} -{"current_steps": 816, "total_steps": 2050, "loss": 0.2024, "lr": 3.290738117555622e-06, "epoch": 3.9804878048780488, "percentage": 39.8, "elapsed_time": "0:59:24", "remaining_time": "1:29:51"} -{"current_steps": 817, "total_steps": 2050, "loss": 0.508, "lr": 3.2871026711761666e-06, "epoch": 3.9853658536585366, "percentage": 39.85, "elapsed_time": "0:59:27", "remaining_time": "1:29:43"} -{"current_steps": 818, "total_steps": 2050, "loss": 0.2116, "lr": 3.2834653762802414e-06, "epoch": 3.9902439024390244, "percentage": 39.9, "elapsed_time": "0:59:31", "remaining_time": "1:29:38"} -{"current_steps": 819, "total_steps": 2050, "loss": 0.2177, "lr": 3.2798262414100594e-06, "epoch": 3.995121951219512, "percentage": 39.95, "elapsed_time": "0:59:32", "remaining_time": "1:29:30"} -{"current_steps": 820, "total_steps": 2050, "loss": 0.1737, "lr": 3.2761852751121566e-06, "epoch": 4.0, "percentage": 40.0, "elapsed_time": "0:59:37", "remaining_time": "1:29:26"} -{"current_steps": 821, "total_steps": 2050, "loss": 0.2569, "lr": 3.272542485937369e-06, "epoch": 4.004878048780488, "percentage": 40.05, "elapsed_time": "1:03:30", "remaining_time": "1:35:03"} -{"current_steps": 822, "total_steps": 2050, "loss": 0.1621, "lr": 3.2688978824408136e-06, "epoch": 4.009756097560976, "percentage": 40.1, "elapsed_time": "1:03:36", "remaining_time": "1:35:01"} -{"current_steps": 823, "total_steps": 2050, "loss": 0.1121, "lr": 3.2652514731818698e-06, "epoch": 4.014634146341463, "percentage": 40.15, "elapsed_time": "1:03:38", "remaining_time": "1:34:53"} -{"current_steps": 824, "total_steps": 2050, "loss": 0.0835, "lr": 3.2616032667241564e-06, "epoch": 4.019512195121951, "percentage": 40.2, "elapsed_time": "1:03:40", "remaining_time": "1:34:44"} -{"current_steps": 825, "total_steps": 2050, "loss": 0.3731, "lr": 3.257953271635513e-06, "epoch": 4.024390243902439, "percentage": 40.24, "elapsed_time": "1:03:45", "remaining_time": "1:34:39"} -{"current_steps": 826, "total_steps": 2050, "loss": 0.1051, "lr": 3.2543014964879814e-06, "epoch": 4.029268292682927, "percentage": 40.29, "elapsed_time": "1:03:48", "remaining_time": "1:34:33"} -{"current_steps": 827, "total_steps": 2050, "loss": 0.0916, "lr": 3.250647949857781e-06, "epoch": 4.034146341463415, "percentage": 40.34, "elapsed_time": "1:03:52", "remaining_time": "1:34:27"} -{"current_steps": 828, "total_steps": 2050, "loss": 0.4037, "lr": 3.2469926403252932e-06, "epoch": 4.0390243902439025, "percentage": 40.39, "elapsed_time": "1:03:54", "remaining_time": "1:34:19"} -{"current_steps": 829, "total_steps": 2050, "loss": 0.0523, "lr": 3.2433355764750417e-06, "epoch": 4.04390243902439, "percentage": 40.44, "elapsed_time": "1:03:58", "remaining_time": "1:34:12"} -{"current_steps": 830, "total_steps": 2050, "loss": 0.2616, "lr": 3.2396767668956656e-06, "epoch": 4.048780487804878, "percentage": 40.49, "elapsed_time": "1:04:01", "remaining_time": "1:34:05"} -{"current_steps": 831, "total_steps": 2050, "loss": 0.195, "lr": 3.2360162201799085e-06, "epoch": 4.053658536585366, "percentage": 40.54, "elapsed_time": "1:04:05", "remaining_time": "1:34:01"} -{"current_steps": 832, "total_steps": 2050, "loss": 0.1245, "lr": 3.2323539449245906e-06, "epoch": 4.058536585365854, "percentage": 40.59, "elapsed_time": "1:04:11", "remaining_time": "1:33:57"} -{"current_steps": 833, "total_steps": 2050, "loss": 0.1147, "lr": 3.2286899497305917e-06, "epoch": 4.0634146341463415, "percentage": 40.63, "elapsed_time": "1:04:13", "remaining_time": "1:33:49"} -{"current_steps": 834, "total_steps": 2050, "loss": 0.2189, "lr": 3.2250242432028335e-06, "epoch": 4.068292682926829, "percentage": 40.68, "elapsed_time": "1:04:17", "remaining_time": "1:33:43"} -{"current_steps": 835, "total_steps": 2050, "loss": 0.4685, "lr": 3.221356833950254e-06, "epoch": 4.073170731707317, "percentage": 40.73, "elapsed_time": "1:04:20", "remaining_time": "1:33:36"} -{"current_steps": 836, "total_steps": 2050, "loss": 0.1245, "lr": 3.21768773058579e-06, "epoch": 4.078048780487805, "percentage": 40.78, "elapsed_time": "1:04:23", "remaining_time": "1:33:30"} -{"current_steps": 837, "total_steps": 2050, "loss": 0.1342, "lr": 3.21401694172636e-06, "epoch": 4.082926829268293, "percentage": 40.83, "elapsed_time": "1:04:25", "remaining_time": "1:33:21"} -{"current_steps": 838, "total_steps": 2050, "loss": 0.0484, "lr": 3.2103444759928383e-06, "epoch": 4.087804878048781, "percentage": 40.88, "elapsed_time": "1:04:28", "remaining_time": "1:33:15"} -{"current_steps": 839, "total_steps": 2050, "loss": 0.0592, "lr": 3.2066703420100377e-06, "epoch": 4.092682926829268, "percentage": 40.93, "elapsed_time": "1:04:31", "remaining_time": "1:33:08"} -{"current_steps": 840, "total_steps": 2050, "loss": 0.2536, "lr": 3.2029945484066883e-06, "epoch": 4.097560975609756, "percentage": 40.98, "elapsed_time": "1:04:33", "remaining_time": "1:33:00"} -{"current_steps": 841, "total_steps": 2050, "loss": 0.1221, "lr": 3.1993171038154203e-06, "epoch": 4.102439024390244, "percentage": 41.02, "elapsed_time": "1:04:39", "remaining_time": "1:32:56"} -{"current_steps": 842, "total_steps": 2050, "loss": 0.1231, "lr": 3.1956380168727385e-06, "epoch": 4.107317073170732, "percentage": 41.07, "elapsed_time": "1:04:45", "remaining_time": "1:32:54"} -{"current_steps": 843, "total_steps": 2050, "loss": 0.2144, "lr": 3.191957296219007e-06, "epoch": 4.11219512195122, "percentage": 41.12, "elapsed_time": "1:04:52", "remaining_time": "1:32:53"} -{"current_steps": 844, "total_steps": 2050, "loss": 0.1026, "lr": 3.1882749504984247e-06, "epoch": 4.117073170731707, "percentage": 41.17, "elapsed_time": "1:04:58", "remaining_time": "1:32:50"} -{"current_steps": 845, "total_steps": 2050, "loss": 0.1124, "lr": 3.1845909883590076e-06, "epoch": 4.121951219512195, "percentage": 41.22, "elapsed_time": "1:05:01", "remaining_time": "1:32:43"} -{"current_steps": 846, "total_steps": 2050, "loss": 0.2804, "lr": 3.180905418452569e-06, "epoch": 4.126829268292683, "percentage": 41.27, "elapsed_time": "1:05:03", "remaining_time": "1:32:35"} -{"current_steps": 847, "total_steps": 2050, "loss": 0.1748, "lr": 3.1772182494346963e-06, "epoch": 4.131707317073171, "percentage": 41.32, "elapsed_time": "1:05:07", "remaining_time": "1:32:29"} -{"current_steps": 848, "total_steps": 2050, "loss": 0.1984, "lr": 3.1735294899647344e-06, "epoch": 4.136585365853659, "percentage": 41.37, "elapsed_time": "1:05:10", "remaining_time": "1:32:22"} -{"current_steps": 849, "total_steps": 2050, "loss": 0.1332, "lr": 3.169839148705762e-06, "epoch": 4.1414634146341465, "percentage": 41.41, "elapsed_time": "1:05:13", "remaining_time": "1:32:15"} -{"current_steps": 850, "total_steps": 2050, "loss": 0.4788, "lr": 3.1661472343245725e-06, "epoch": 4.146341463414634, "percentage": 41.46, "elapsed_time": "1:05:18", "remaining_time": "1:32:12"} -{"current_steps": 851, "total_steps": 2050, "loss": 0.2437, "lr": 3.162453755491655e-06, "epoch": 4.151219512195122, "percentage": 41.51, "elapsed_time": "1:05:23", "remaining_time": "1:32:08"} -{"current_steps": 852, "total_steps": 2050, "loss": 0.203, "lr": 3.158758720881171e-06, "epoch": 4.15609756097561, "percentage": 41.56, "elapsed_time": "1:05:24", "remaining_time": "1:31:58"} -{"current_steps": 853, "total_steps": 2050, "loss": 0.1462, "lr": 3.155062139170937e-06, "epoch": 4.160975609756098, "percentage": 41.61, "elapsed_time": "1:05:26", "remaining_time": "1:31:49"} -{"current_steps": 854, "total_steps": 2050, "loss": 0.0972, "lr": 3.1513640190424034e-06, "epoch": 4.1658536585365855, "percentage": 41.66, "elapsed_time": "1:05:26", "remaining_time": "1:31:39"} -{"current_steps": 855, "total_steps": 2050, "loss": 0.1092, "lr": 3.147664369180632e-06, "epoch": 4.170731707317073, "percentage": 41.71, "elapsed_time": "1:05:28", "remaining_time": "1:31:30"} -{"current_steps": 856, "total_steps": 2050, "loss": 0.2215, "lr": 3.143963198274278e-06, "epoch": 4.175609756097561, "percentage": 41.76, "elapsed_time": "1:05:29", "remaining_time": "1:31:21"} -{"current_steps": 857, "total_steps": 2050, "loss": 0.1771, "lr": 3.140260515015569e-06, "epoch": 4.180487804878049, "percentage": 41.8, "elapsed_time": "1:05:33", "remaining_time": "1:31:15"} -{"current_steps": 858, "total_steps": 2050, "loss": 0.1995, "lr": 3.136556328100284e-06, "epoch": 4.185365853658537, "percentage": 41.85, "elapsed_time": "1:05:36", "remaining_time": "1:31:08"} -{"current_steps": 859, "total_steps": 2050, "loss": 0.4048, "lr": 3.132850646227734e-06, "epoch": 4.190243902439025, "percentage": 41.9, "elapsed_time": "1:05:38", "remaining_time": "1:31:00"} -{"current_steps": 860, "total_steps": 2050, "loss": 0.1914, "lr": 3.12914347810074e-06, "epoch": 4.195121951219512, "percentage": 41.95, "elapsed_time": "1:05:41", "remaining_time": "1:30:54"} -{"current_steps": 861, "total_steps": 2050, "loss": 0.1579, "lr": 3.125434832425613e-06, "epoch": 4.2, "percentage": 42.0, "elapsed_time": "1:05:47", "remaining_time": "1:30:50"} -{"current_steps": 862, "total_steps": 2050, "loss": 0.1814, "lr": 3.121724717912138e-06, "epoch": 4.204878048780488, "percentage": 42.05, "elapsed_time": "1:05:50", "remaining_time": "1:30:44"} -{"current_steps": 863, "total_steps": 2050, "loss": 0.1481, "lr": 3.118013143273542e-06, "epoch": 4.209756097560976, "percentage": 42.1, "elapsed_time": "1:05:54", "remaining_time": "1:30:38"} -{"current_steps": 864, "total_steps": 2050, "loss": 0.113, "lr": 3.1143001172264893e-06, "epoch": 4.214634146341464, "percentage": 42.15, "elapsed_time": "1:05:59", "remaining_time": "1:30:35"} -{"current_steps": 865, "total_steps": 2050, "loss": 0.1405, "lr": 3.1105856484910474e-06, "epoch": 4.219512195121951, "percentage": 42.2, "elapsed_time": "1:06:03", "remaining_time": "1:30:29"} -{"current_steps": 866, "total_steps": 2050, "loss": 0.097, "lr": 3.1068697457906736e-06, "epoch": 4.224390243902439, "percentage": 42.24, "elapsed_time": "1:06:08", "remaining_time": "1:30:25"} -{"current_steps": 867, "total_steps": 2050, "loss": 0.2207, "lr": 3.1031524178521938e-06, "epoch": 4.229268292682927, "percentage": 42.29, "elapsed_time": "1:06:09", "remaining_time": "1:30:16"} -{"current_steps": 868, "total_steps": 2050, "loss": 0.0552, "lr": 3.0994336734057804e-06, "epoch": 4.234146341463415, "percentage": 42.34, "elapsed_time": "1:06:10", "remaining_time": "1:30:07"} -{"current_steps": 869, "total_steps": 2050, "loss": 0.1743, "lr": 3.0957135211849315e-06, "epoch": 4.239024390243903, "percentage": 42.39, "elapsed_time": "1:06:14", "remaining_time": "1:30:01"} -{"current_steps": 870, "total_steps": 2050, "loss": 0.1195, "lr": 3.0919919699264535e-06, "epoch": 4.2439024390243905, "percentage": 42.44, "elapsed_time": "1:06:17", "remaining_time": "1:29:54"} -{"current_steps": 871, "total_steps": 2050, "loss": 0.6174, "lr": 3.0882690283704355e-06, "epoch": 4.248780487804878, "percentage": 42.49, "elapsed_time": "1:06:20", "remaining_time": "1:29:48"} -{"current_steps": 872, "total_steps": 2050, "loss": 0.1359, "lr": 3.084544705260234e-06, "epoch": 4.253658536585366, "percentage": 42.54, "elapsed_time": "1:06:26", "remaining_time": "1:29:45"} -{"current_steps": 873, "total_steps": 2050, "loss": 0.0786, "lr": 3.080819009342451e-06, "epoch": 4.258536585365854, "percentage": 42.59, "elapsed_time": "1:06:29", "remaining_time": "1:29:38"} -{"current_steps": 874, "total_steps": 2050, "loss": 0.0677, "lr": 3.077091949366908e-06, "epoch": 4.263414634146342, "percentage": 42.63, "elapsed_time": "1:06:32", "remaining_time": "1:29:31"} -{"current_steps": 875, "total_steps": 2050, "loss": 0.1084, "lr": 3.073363534086636e-06, "epoch": 4.2682926829268295, "percentage": 42.68, "elapsed_time": "1:06:35", "remaining_time": "1:29:25"} -{"current_steps": 876, "total_steps": 2050, "loss": 0.0681, "lr": 3.0696337722578444e-06, "epoch": 4.273170731707317, "percentage": 42.73, "elapsed_time": "1:06:37", "remaining_time": "1:29:17"} -{"current_steps": 877, "total_steps": 2050, "loss": 0.2262, "lr": 3.0659026726399072e-06, "epoch": 4.278048780487805, "percentage": 42.78, "elapsed_time": "1:06:44", "remaining_time": "1:29:15"} -{"current_steps": 878, "total_steps": 2050, "loss": 0.2169, "lr": 3.0621702439953393e-06, "epoch": 4.282926829268293, "percentage": 42.83, "elapsed_time": "1:06:49", "remaining_time": "1:29:12"} -{"current_steps": 879, "total_steps": 2050, "loss": 0.0581, "lr": 3.0584364950897768e-06, "epoch": 4.287804878048781, "percentage": 42.88, "elapsed_time": "1:06:51", "remaining_time": "1:29:03"} -{"current_steps": 880, "total_steps": 2050, "loss": 0.1687, "lr": 3.0547014346919574e-06, "epoch": 4.2926829268292686, "percentage": 42.93, "elapsed_time": "1:06:56", "remaining_time": "1:29:00"} -{"current_steps": 881, "total_steps": 2050, "loss": 0.1362, "lr": 3.0509650715736977e-06, "epoch": 4.297560975609756, "percentage": 42.98, "elapsed_time": "1:06:59", "remaining_time": "1:28:52"} -{"current_steps": 882, "total_steps": 2050, "loss": 0.1865, "lr": 3.0472274145098744e-06, "epoch": 4.302439024390244, "percentage": 43.02, "elapsed_time": "1:07:02", "remaining_time": "1:28:47"} -{"current_steps": 883, "total_steps": 2050, "loss": 0.2385, "lr": 3.0434884722784026e-06, "epoch": 4.307317073170732, "percentage": 43.07, "elapsed_time": "1:07:04", "remaining_time": "1:28:38"} -{"current_steps": 884, "total_steps": 2050, "loss": 0.1004, "lr": 3.0397482536602168e-06, "epoch": 4.31219512195122, "percentage": 43.12, "elapsed_time": "1:07:06", "remaining_time": "1:28:30"} -{"current_steps": 885, "total_steps": 2050, "loss": 0.1469, "lr": 3.0360067674392475e-06, "epoch": 4.317073170731708, "percentage": 43.17, "elapsed_time": "1:07:09", "remaining_time": "1:28:24"} -{"current_steps": 886, "total_steps": 2050, "loss": 0.0829, "lr": 3.0322640224024024e-06, "epoch": 4.321951219512195, "percentage": 43.22, "elapsed_time": "1:07:12", "remaining_time": "1:28:17"} -{"current_steps": 887, "total_steps": 2050, "loss": 0.2256, "lr": 3.0285200273395478e-06, "epoch": 4.326829268292683, "percentage": 43.27, "elapsed_time": "1:07:14", "remaining_time": "1:28:09"} -{"current_steps": 888, "total_steps": 2050, "loss": 0.2402, "lr": 3.024774791043481e-06, "epoch": 4.331707317073171, "percentage": 43.32, "elapsed_time": "1:07:17", "remaining_time": "1:28:02"} -{"current_steps": 889, "total_steps": 2050, "loss": 0.2198, "lr": 3.021028322309921e-06, "epoch": 4.336585365853659, "percentage": 43.37, "elapsed_time": "1:07:21", "remaining_time": "1:27:58"} -{"current_steps": 890, "total_steps": 2050, "loss": 0.2304, "lr": 3.0172806299374734e-06, "epoch": 4.341463414634147, "percentage": 43.41, "elapsed_time": "1:07:23", "remaining_time": "1:27:50"} -{"current_steps": 891, "total_steps": 2050, "loss": 0.2864, "lr": 3.0135317227276247e-06, "epoch": 4.3463414634146345, "percentage": 43.46, "elapsed_time": "1:07:27", "remaining_time": "1:27:44"} -{"current_steps": 892, "total_steps": 2050, "loss": 0.2045, "lr": 3.0097816094847104e-06, "epoch": 4.351219512195122, "percentage": 43.51, "elapsed_time": "1:07:30", "remaining_time": "1:27:38"} -{"current_steps": 893, "total_steps": 2050, "loss": 0.0864, "lr": 3.0060302990158984e-06, "epoch": 4.35609756097561, "percentage": 43.56, "elapsed_time": "1:07:35", "remaining_time": "1:27:34"} -{"current_steps": 894, "total_steps": 2050, "loss": 0.076, "lr": 3.002277800131171e-06, "epoch": 4.360975609756098, "percentage": 43.61, "elapsed_time": "1:07:37", "remaining_time": "1:27:25"} -{"current_steps": 895, "total_steps": 2050, "loss": 0.1724, "lr": 2.998524121643298e-06, "epoch": 4.365853658536586, "percentage": 43.66, "elapsed_time": "1:07:40", "remaining_time": "1:27:19"} -{"current_steps": 896, "total_steps": 2050, "loss": 0.2, "lr": 2.994769272367822e-06, "epoch": 4.3707317073170735, "percentage": 43.71, "elapsed_time": "1:07:42", "remaining_time": "1:27:11"} -{"current_steps": 897, "total_steps": 2050, "loss": 0.0852, "lr": 2.991013261123035e-06, "epoch": 4.375609756097561, "percentage": 43.76, "elapsed_time": "1:07:46", "remaining_time": "1:27:07"} -{"current_steps": 898, "total_steps": 2050, "loss": 0.1449, "lr": 2.9872560967299554e-06, "epoch": 4.380487804878049, "percentage": 43.8, "elapsed_time": "1:07:49", "remaining_time": "1:27:00"} -{"current_steps": 899, "total_steps": 2050, "loss": 0.0659, "lr": 2.9834977880123132e-06, "epoch": 4.385365853658537, "percentage": 43.85, "elapsed_time": "1:07:51", "remaining_time": "1:26:53"} -{"current_steps": 900, "total_steps": 2050, "loss": 0.1114, "lr": 2.9797383437965243e-06, "epoch": 4.390243902439025, "percentage": 43.9, "elapsed_time": "1:07:57", "remaining_time": "1:26:50"} -{"current_steps": 901, "total_steps": 2050, "loss": 0.0822, "lr": 2.975977772911671e-06, "epoch": 4.3951219512195125, "percentage": 43.95, "elapsed_time": "1:08:01", "remaining_time": "1:26:44"} -{"current_steps": 902, "total_steps": 2050, "loss": 0.0858, "lr": 2.972216084189482e-06, "epoch": 4.4, "percentage": 44.0, "elapsed_time": "1:08:03", "remaining_time": "1:26:37"} -{"current_steps": 903, "total_steps": 2050, "loss": 0.1162, "lr": 2.9684532864643123e-06, "epoch": 4.404878048780488, "percentage": 44.05, "elapsed_time": "1:08:08", "remaining_time": "1:26:32"} -{"current_steps": 904, "total_steps": 2050, "loss": 0.0821, "lr": 2.964689388573118e-06, "epoch": 4.409756097560976, "percentage": 44.1, "elapsed_time": "1:08:09", "remaining_time": "1:26:24"} -{"current_steps": 905, "total_steps": 2050, "loss": 0.25, "lr": 2.9609243993554434e-06, "epoch": 4.414634146341464, "percentage": 44.15, "elapsed_time": "1:08:13", "remaining_time": "1:26:19"} -{"current_steps": 906, "total_steps": 2050, "loss": 0.0852, "lr": 2.9571583276533923e-06, "epoch": 4.419512195121952, "percentage": 44.2, "elapsed_time": "1:08:14", "remaining_time": "1:26:10"} -{"current_steps": 907, "total_steps": 2050, "loss": 0.5123, "lr": 2.9533911823116124e-06, "epoch": 4.424390243902439, "percentage": 44.24, "elapsed_time": "1:08:16", "remaining_time": "1:26:02"} -{"current_steps": 908, "total_steps": 2050, "loss": 0.1854, "lr": 2.9496229721772734e-06, "epoch": 4.429268292682927, "percentage": 44.29, "elapsed_time": "1:08:17", "remaining_time": "1:25:53"} -{"current_steps": 909, "total_steps": 2050, "loss": 0.1785, "lr": 2.9458537061000435e-06, "epoch": 4.434146341463415, "percentage": 44.34, "elapsed_time": "1:08:22", "remaining_time": "1:25:49"} -{"current_steps": 910, "total_steps": 2050, "loss": 0.1603, "lr": 2.9420833929320726e-06, "epoch": 4.439024390243903, "percentage": 44.39, "elapsed_time": "1:08:29", "remaining_time": "1:25:47"} -{"current_steps": 911, "total_steps": 2050, "loss": 0.3046, "lr": 2.93831204152797e-06, "epoch": 4.443902439024391, "percentage": 44.44, "elapsed_time": "1:08:31", "remaining_time": "1:25:39"} -{"current_steps": 912, "total_steps": 2050, "loss": 0.0631, "lr": 2.9345396607447807e-06, "epoch": 4.4487804878048784, "percentage": 44.49, "elapsed_time": "1:08:32", "remaining_time": "1:25:32"} -{"current_steps": 913, "total_steps": 2050, "loss": 0.125, "lr": 2.9307662594419704e-06, "epoch": 4.453658536585366, "percentage": 44.54, "elapsed_time": "1:08:37", "remaining_time": "1:25:27"} -{"current_steps": 914, "total_steps": 2050, "loss": 0.156, "lr": 2.9269918464814e-06, "epoch": 4.458536585365854, "percentage": 44.59, "elapsed_time": "1:08:41", "remaining_time": "1:25:22"} -{"current_steps": 915, "total_steps": 2050, "loss": 0.3334, "lr": 2.923216430727306e-06, "epoch": 4.463414634146342, "percentage": 44.63, "elapsed_time": "1:08:44", "remaining_time": "1:25:16"} -{"current_steps": 916, "total_steps": 2050, "loss": 0.2534, "lr": 2.9194400210462808e-06, "epoch": 4.46829268292683, "percentage": 44.68, "elapsed_time": "1:08:50", "remaining_time": "1:25:13"} -{"current_steps": 917, "total_steps": 2050, "loss": 0.352, "lr": 2.91566262630725e-06, "epoch": 4.473170731707317, "percentage": 44.73, "elapsed_time": "1:08:53", "remaining_time": "1:25:07"} -{"current_steps": 918, "total_steps": 2050, "loss": 0.1132, "lr": 2.9118842553814526e-06, "epoch": 4.478048780487805, "percentage": 44.78, "elapsed_time": "1:08:57", "remaining_time": "1:25:02"} -{"current_steps": 919, "total_steps": 2050, "loss": 0.086, "lr": 2.9081049171424223e-06, "epoch": 4.482926829268292, "percentage": 44.83, "elapsed_time": "1:08:59", "remaining_time": "1:24:54"} -{"current_steps": 920, "total_steps": 2050, "loss": 0.0693, "lr": 2.9043246204659624e-06, "epoch": 4.487804878048781, "percentage": 44.88, "elapsed_time": "1:09:03", "remaining_time": "1:24:49"} -{"current_steps": 921, "total_steps": 2050, "loss": 0.2463, "lr": 2.9005433742301274e-06, "epoch": 4.492682926829268, "percentage": 44.93, "elapsed_time": "1:09:09", "remaining_time": "1:24:46"} -{"current_steps": 922, "total_steps": 2050, "loss": 0.2275, "lr": 2.8967611873152037e-06, "epoch": 4.4975609756097565, "percentage": 44.98, "elapsed_time": "1:09:11", "remaining_time": "1:24:38"} -{"current_steps": 923, "total_steps": 2050, "loss": 0.0752, "lr": 2.892978068603683e-06, "epoch": 4.5024390243902435, "percentage": 45.02, "elapsed_time": "1:09:13", "remaining_time": "1:24:31"} -{"current_steps": 924, "total_steps": 2050, "loss": 0.1649, "lr": 2.889194026980249e-06, "epoch": 4.507317073170732, "percentage": 45.07, "elapsed_time": "1:09:17", "remaining_time": "1:24:26"} -{"current_steps": 925, "total_steps": 2050, "loss": 0.0437, "lr": 2.8854090713317514e-06, "epoch": 4.512195121951219, "percentage": 45.12, "elapsed_time": "1:09:19", "remaining_time": "1:24:18"} -{"current_steps": 926, "total_steps": 2050, "loss": 0.0747, "lr": 2.8816232105471864e-06, "epoch": 4.517073170731708, "percentage": 45.17, "elapsed_time": "1:09:22", "remaining_time": "1:24:12"} -{"current_steps": 927, "total_steps": 2050, "loss": 0.3884, "lr": 2.877836453517677e-06, "epoch": 4.521951219512195, "percentage": 45.22, "elapsed_time": "1:09:28", "remaining_time": "1:24:09"} -{"current_steps": 928, "total_steps": 2050, "loss": 0.2741, "lr": 2.8740488091364492e-06, "epoch": 4.526829268292683, "percentage": 45.27, "elapsed_time": "1:09:30", "remaining_time": "1:24:02"} -{"current_steps": 929, "total_steps": 2050, "loss": 0.364, "lr": 2.870260286298814e-06, "epoch": 4.53170731707317, "percentage": 45.32, "elapsed_time": "1:09:33", "remaining_time": "1:23:55"} -{"current_steps": 930, "total_steps": 2050, "loss": 0.1495, "lr": 2.866470893902147e-06, "epoch": 4.536585365853659, "percentage": 45.37, "elapsed_time": "1:09:35", "remaining_time": "1:23:48"} -{"current_steps": 931, "total_steps": 2050, "loss": 0.1403, "lr": 2.8626806408458626e-06, "epoch": 4.541463414634146, "percentage": 45.41, "elapsed_time": "1:09:41", "remaining_time": "1:23:45"} -{"current_steps": 932, "total_steps": 2050, "loss": 0.0946, "lr": 2.8588895360313983e-06, "epoch": 4.546341463414635, "percentage": 45.46, "elapsed_time": "1:09:45", "remaining_time": "1:23:40"} -{"current_steps": 933, "total_steps": 2050, "loss": 0.1851, "lr": 2.8550975883621935e-06, "epoch": 4.5512195121951216, "percentage": 45.51, "elapsed_time": "1:09:48", "remaining_time": "1:23:34"} -{"current_steps": 934, "total_steps": 2050, "loss": 0.178, "lr": 2.8513048067436644e-06, "epoch": 4.55609756097561, "percentage": 45.56, "elapsed_time": "1:09:51", "remaining_time": "1:23:28"} -{"current_steps": 935, "total_steps": 2050, "loss": 0.1131, "lr": 2.847511200083187e-06, "epoch": 4.560975609756097, "percentage": 45.61, "elapsed_time": "1:09:54", "remaining_time": "1:23:22"} -{"current_steps": 936, "total_steps": 2050, "loss": 0.1251, "lr": 2.843716777290074e-06, "epoch": 4.565853658536585, "percentage": 45.66, "elapsed_time": "1:09:58", "remaining_time": "1:23:17"} -{"current_steps": 937, "total_steps": 2050, "loss": 0.0946, "lr": 2.839921547275556e-06, "epoch": 4.570731707317073, "percentage": 45.71, "elapsed_time": "1:10:02", "remaining_time": "1:23:12"} -{"current_steps": 938, "total_steps": 2050, "loss": 0.1529, "lr": 2.836125518952759e-06, "epoch": 4.575609756097561, "percentage": 45.76, "elapsed_time": "1:10:04", "remaining_time": "1:23:04"} -{"current_steps": 939, "total_steps": 2050, "loss": 0.2511, "lr": 2.8323287012366845e-06, "epoch": 4.580487804878048, "percentage": 45.8, "elapsed_time": "1:10:09", "remaining_time": "1:22:59"} -{"current_steps": 940, "total_steps": 2050, "loss": 0.1474, "lr": 2.828531103044186e-06, "epoch": 4.585365853658536, "percentage": 45.85, "elapsed_time": "1:10:15", "remaining_time": "1:22:58"} -{"current_steps": 941, "total_steps": 2050, "loss": 0.2249, "lr": 2.8247327332939512e-06, "epoch": 4.590243902439024, "percentage": 45.9, "elapsed_time": "1:10:22", "remaining_time": "1:22:55"} -{"current_steps": 942, "total_steps": 2050, "loss": 0.2258, "lr": 2.82093360090648e-06, "epoch": 4.595121951219512, "percentage": 45.95, "elapsed_time": "1:10:27", "remaining_time": "1:22:52"} -{"current_steps": 943, "total_steps": 2050, "loss": 0.2235, "lr": 2.8171337148040636e-06, "epoch": 4.6, "percentage": 46.0, "elapsed_time": "1:10:31", "remaining_time": "1:22:47"} -{"current_steps": 944, "total_steps": 2050, "loss": 0.1562, "lr": 2.813333083910761e-06, "epoch": 4.6048780487804875, "percentage": 46.05, "elapsed_time": "1:10:34", "remaining_time": "1:22:41"} -{"current_steps": 945, "total_steps": 2050, "loss": 0.0625, "lr": 2.8095317171523835e-06, "epoch": 4.609756097560975, "percentage": 46.1, "elapsed_time": "1:10:37", "remaining_time": "1:22:35"} -{"current_steps": 946, "total_steps": 2050, "loss": 0.2205, "lr": 2.805729623456469e-06, "epoch": 4.614634146341463, "percentage": 46.15, "elapsed_time": "1:10:43", "remaining_time": "1:22:32"} -{"current_steps": 947, "total_steps": 2050, "loss": 0.3241, "lr": 2.8019268117522624e-06, "epoch": 4.619512195121951, "percentage": 46.2, "elapsed_time": "1:10:45", "remaining_time": "1:22:24"} -{"current_steps": 948, "total_steps": 2050, "loss": 0.1983, "lr": 2.798123290970695e-06, "epoch": 4.624390243902439, "percentage": 46.24, "elapsed_time": "1:10:50", "remaining_time": "1:22:21"} -{"current_steps": 949, "total_steps": 2050, "loss": 0.3391, "lr": 2.794319070044365e-06, "epoch": 4.6292682926829265, "percentage": 46.29, "elapsed_time": "1:10:58", "remaining_time": "1:22:20"} -{"current_steps": 950, "total_steps": 2050, "loss": 0.1329, "lr": 2.790514157907512e-06, "epoch": 4.634146341463414, "percentage": 46.34, "elapsed_time": "1:11:00", "remaining_time": "1:22:12"} -{"current_steps": 951, "total_steps": 2050, "loss": 0.141, "lr": 2.786708563496002e-06, "epoch": 4.639024390243902, "percentage": 46.39, "elapsed_time": "1:11:07", "remaining_time": "1:22:11"} -{"current_steps": 952, "total_steps": 2050, "loss": 0.2935, "lr": 2.782902295747299e-06, "epoch": 4.64390243902439, "percentage": 46.44, "elapsed_time": "1:11:10", "remaining_time": "1:22:05"} -{"current_steps": 953, "total_steps": 2050, "loss": 0.318, "lr": 2.7790953636004536e-06, "epoch": 4.648780487804878, "percentage": 46.49, "elapsed_time": "1:11:14", "remaining_time": "1:22:00"} -{"current_steps": 954, "total_steps": 2050, "loss": 0.3388, "lr": 2.775287775996074e-06, "epoch": 4.6536585365853655, "percentage": 46.54, "elapsed_time": "1:11:17", "remaining_time": "1:21:54"} -{"current_steps": 955, "total_steps": 2050, "loss": 0.0925, "lr": 2.7714795418763067e-06, "epoch": 4.658536585365853, "percentage": 46.59, "elapsed_time": "1:11:20", "remaining_time": "1:21:47"} -{"current_steps": 956, "total_steps": 2050, "loss": 0.2811, "lr": 2.7676706701848187e-06, "epoch": 4.663414634146341, "percentage": 46.63, "elapsed_time": "1:11:22", "remaining_time": "1:21:40"} -{"current_steps": 957, "total_steps": 2050, "loss": 0.311, "lr": 2.763861169866774e-06, "epoch": 4.668292682926829, "percentage": 46.68, "elapsed_time": "1:11:29", "remaining_time": "1:21:39"} -{"current_steps": 958, "total_steps": 2050, "loss": 0.0582, "lr": 2.7600510498688104e-06, "epoch": 4.673170731707317, "percentage": 46.73, "elapsed_time": "1:11:33", "remaining_time": "1:21:34"} -{"current_steps": 959, "total_steps": 2050, "loss": 0.1238, "lr": 2.7562403191390246e-06, "epoch": 4.678048780487805, "percentage": 46.78, "elapsed_time": "1:11:36", "remaining_time": "1:21:27"} -{"current_steps": 960, "total_steps": 2050, "loss": 0.1243, "lr": 2.7524289866269467e-06, "epoch": 4.682926829268292, "percentage": 46.83, "elapsed_time": "1:11:41", "remaining_time": "1:21:23"} -{"current_steps": 961, "total_steps": 2050, "loss": 0.1388, "lr": 2.748617061283518e-06, "epoch": 4.68780487804878, "percentage": 46.88, "elapsed_time": "1:11:48", "remaining_time": "1:21:21"} -{"current_steps": 962, "total_steps": 2050, "loss": 0.1144, "lr": 2.744804552061074e-06, "epoch": 4.692682926829268, "percentage": 46.93, "elapsed_time": "1:11:49", "remaining_time": "1:21:13"} -{"current_steps": 963, "total_steps": 2050, "loss": 0.2155, "lr": 2.740991467913321e-06, "epoch": 4.697560975609756, "percentage": 46.98, "elapsed_time": "1:11:52", "remaining_time": "1:21:07"} -{"current_steps": 964, "total_steps": 2050, "loss": 0.0983, "lr": 2.737177817795315e-06, "epoch": 4.702439024390244, "percentage": 47.02, "elapsed_time": "1:11:58", "remaining_time": "1:21:05"} -{"current_steps": 965, "total_steps": 2050, "loss": 0.1365, "lr": 2.7333636106634414e-06, "epoch": 4.7073170731707314, "percentage": 47.07, "elapsed_time": "1:12:01", "remaining_time": "1:20:59"} -{"current_steps": 966, "total_steps": 2050, "loss": 0.1977, "lr": 2.7295488554753957e-06, "epoch": 4.712195121951219, "percentage": 47.12, "elapsed_time": "1:12:03", "remaining_time": "1:20:51"} -{"current_steps": 967, "total_steps": 2050, "loss": 0.1311, "lr": 2.725733561190157e-06, "epoch": 4.717073170731707, "percentage": 47.17, "elapsed_time": "1:12:09", "remaining_time": "1:20:49"} -{"current_steps": 968, "total_steps": 2050, "loss": 0.2464, "lr": 2.721917736767973e-06, "epoch": 4.721951219512195, "percentage": 47.22, "elapsed_time": "1:12:13", "remaining_time": "1:20:43"} -{"current_steps": 969, "total_steps": 2050, "loss": 0.1088, "lr": 2.7181013911703357e-06, "epoch": 4.726829268292683, "percentage": 47.27, "elapsed_time": "1:12:19", "remaining_time": "1:20:40"} -{"current_steps": 970, "total_steps": 2050, "loss": 0.1492, "lr": 2.714284533359961e-06, "epoch": 4.7317073170731705, "percentage": 47.32, "elapsed_time": "1:12:25", "remaining_time": "1:20:38"} -{"current_steps": 971, "total_steps": 2050, "loss": 0.218, "lr": 2.710467172300768e-06, "epoch": 4.736585365853658, "percentage": 47.37, "elapsed_time": "1:12:29", "remaining_time": "1:20:32"} -{"current_steps": 972, "total_steps": 2050, "loss": 0.2199, "lr": 2.706649316957857e-06, "epoch": 4.741463414634146, "percentage": 47.41, "elapsed_time": "1:12:33", "remaining_time": "1:20:28"} -{"current_steps": 973, "total_steps": 2050, "loss": 0.0595, "lr": 2.7028309762974897e-06, "epoch": 4.746341463414634, "percentage": 47.46, "elapsed_time": "1:12:35", "remaining_time": "1:20:21"} -{"current_steps": 974, "total_steps": 2050, "loss": 0.1653, "lr": 2.699012159287069e-06, "epoch": 4.751219512195122, "percentage": 47.51, "elapsed_time": "1:12:37", "remaining_time": "1:20:14"} -{"current_steps": 975, "total_steps": 2050, "loss": 0.0681, "lr": 2.6951928748951125e-06, "epoch": 4.7560975609756095, "percentage": 47.56, "elapsed_time": "1:12:42", "remaining_time": "1:20:09"} -{"current_steps": 976, "total_steps": 2050, "loss": 0.2046, "lr": 2.69137313209124e-06, "epoch": 4.760975609756097, "percentage": 47.61, "elapsed_time": "1:12:44", "remaining_time": "1:20:02"} -{"current_steps": 977, "total_steps": 2050, "loss": 0.2255, "lr": 2.687552939846145e-06, "epoch": 4.765853658536585, "percentage": 47.66, "elapsed_time": "1:12:46", "remaining_time": "1:19:55"} -{"current_steps": 978, "total_steps": 2050, "loss": 0.0512, "lr": 2.6837323071315766e-06, "epoch": 4.770731707317073, "percentage": 47.71, "elapsed_time": "1:12:48", "remaining_time": "1:19:47"} -{"current_steps": 979, "total_steps": 2050, "loss": 0.162, "lr": 2.679911242920321e-06, "epoch": 4.775609756097561, "percentage": 47.76, "elapsed_time": "1:12:50", "remaining_time": "1:19:40"} -{"current_steps": 980, "total_steps": 2050, "loss": 0.398, "lr": 2.6760897561861742e-06, "epoch": 4.780487804878049, "percentage": 47.8, "elapsed_time": "1:12:51", "remaining_time": "1:19:33"} -{"current_steps": 981, "total_steps": 2050, "loss": 0.0507, "lr": 2.672267855903927e-06, "epoch": 4.785365853658536, "percentage": 47.85, "elapsed_time": "1:12:54", "remaining_time": "1:19:26"} -{"current_steps": 982, "total_steps": 2050, "loss": 0.2066, "lr": 2.6684455510493413e-06, "epoch": 4.790243902439024, "percentage": 47.9, "elapsed_time": "1:12:59", "remaining_time": "1:19:22"} -{"current_steps": 983, "total_steps": 2050, "loss": 0.2296, "lr": 2.6646228505991267e-06, "epoch": 4.795121951219512, "percentage": 47.95, "elapsed_time": "1:13:03", "remaining_time": "1:19:17"} -{"current_steps": 984, "total_steps": 2050, "loss": 0.14, "lr": 2.6607997635309246e-06, "epoch": 4.8, "percentage": 48.0, "elapsed_time": "1:13:07", "remaining_time": "1:19:13"} -{"current_steps": 985, "total_steps": 2050, "loss": 0.1583, "lr": 2.6569762988232838e-06, "epoch": 4.804878048780488, "percentage": 48.05, "elapsed_time": "1:13:11", "remaining_time": "1:19:07"} -{"current_steps": 986, "total_steps": 2050, "loss": 0.2619, "lr": 2.653152465455639e-06, "epoch": 4.809756097560975, "percentage": 48.1, "elapsed_time": "1:13:13", "remaining_time": "1:19:01"} -{"current_steps": 987, "total_steps": 2050, "loss": 0.3029, "lr": 2.6493282724082913e-06, "epoch": 4.814634146341463, "percentage": 48.15, "elapsed_time": "1:13:15", "remaining_time": "1:18:53"} -{"current_steps": 988, "total_steps": 2050, "loss": 0.095, "lr": 2.6455037286623864e-06, "epoch": 4.819512195121951, "percentage": 48.2, "elapsed_time": "1:13:17", "remaining_time": "1:18:46"} -{"current_steps": 989, "total_steps": 2050, "loss": 0.1232, "lr": 2.6416788431998935e-06, "epoch": 4.824390243902439, "percentage": 48.24, "elapsed_time": "1:13:21", "remaining_time": "1:18:42"} -{"current_steps": 990, "total_steps": 2050, "loss": 0.3671, "lr": 2.637853625003585e-06, "epoch": 4.829268292682927, "percentage": 48.29, "elapsed_time": "1:13:24", "remaining_time": "1:18:36"} -{"current_steps": 991, "total_steps": 2050, "loss": 0.194, "lr": 2.6340280830570142e-06, "epoch": 4.8341463414634145, "percentage": 48.34, "elapsed_time": "1:13:30", "remaining_time": "1:18:33"} -{"current_steps": 992, "total_steps": 2050, "loss": 0.1214, "lr": 2.6302022263444947e-06, "epoch": 4.839024390243902, "percentage": 48.39, "elapsed_time": "1:13:32", "remaining_time": "1:18:25"} -{"current_steps": 993, "total_steps": 2050, "loss": 0.311, "lr": 2.6263760638510793e-06, "epoch": 4.84390243902439, "percentage": 48.44, "elapsed_time": "1:13:35", "remaining_time": "1:18:19"} -{"current_steps": 994, "total_steps": 2050, "loss": 0.1853, "lr": 2.6225496045625394e-06, "epoch": 4.848780487804878, "percentage": 48.49, "elapsed_time": "1:13:38", "remaining_time": "1:18:13"} -{"current_steps": 995, "total_steps": 2050, "loss": 0.2088, "lr": 2.6187228574653428e-06, "epoch": 4.853658536585366, "percentage": 48.54, "elapsed_time": "1:13:39", "remaining_time": "1:18:06"} -{"current_steps": 996, "total_steps": 2050, "loss": 0.1439, "lr": 2.614895831546633e-06, "epoch": 4.8585365853658535, "percentage": 48.59, "elapsed_time": "1:13:46", "remaining_time": "1:18:04"} -{"current_steps": 997, "total_steps": 2050, "loss": 0.2823, "lr": 2.6110685357942096e-06, "epoch": 4.863414634146341, "percentage": 48.63, "elapsed_time": "1:13:49", "remaining_time": "1:17:58"} -{"current_steps": 998, "total_steps": 2050, "loss": 0.2963, "lr": 2.6072409791965048e-06, "epoch": 4.868292682926829, "percentage": 48.68, "elapsed_time": "1:13:50", "remaining_time": "1:17:50"} -{"current_steps": 999, "total_steps": 2050, "loss": 0.4127, "lr": 2.6034131707425638e-06, "epoch": 4.873170731707317, "percentage": 48.73, "elapsed_time": "1:13:54", "remaining_time": "1:17:45"} -{"current_steps": 1000, "total_steps": 2050, "loss": 0.1601, "lr": 2.5995851194220223e-06, "epoch": 4.878048780487805, "percentage": 48.78, "elapsed_time": "1:14:00", "remaining_time": "1:17:42"} -{"current_steps": 1001, "total_steps": 2050, "loss": 0.161, "lr": 2.595756834225089e-06, "epoch": 4.882926829268293, "percentage": 48.83, "elapsed_time": "1:14:06", "remaining_time": "1:17:39"} -{"current_steps": 1002, "total_steps": 2050, "loss": 0.1013, "lr": 2.5919283241425188e-06, "epoch": 4.88780487804878, "percentage": 48.88, "elapsed_time": "1:14:07", "remaining_time": "1:17:32"} -{"current_steps": 1003, "total_steps": 2050, "loss": 0.1177, "lr": 2.5880995981655965e-06, "epoch": 4.892682926829268, "percentage": 48.93, "elapsed_time": "1:14:15", "remaining_time": "1:17:30"} -{"current_steps": 1004, "total_steps": 2050, "loss": 0.0682, "lr": 2.584270665286113e-06, "epoch": 4.897560975609756, "percentage": 48.98, "elapsed_time": "1:14:16", "remaining_time": "1:17:23"} -{"current_steps": 1005, "total_steps": 2050, "loss": 0.1502, "lr": 2.580441534496346e-06, "epoch": 4.902439024390244, "percentage": 49.02, "elapsed_time": "1:14:19", "remaining_time": "1:17:17"} -{"current_steps": 1006, "total_steps": 2050, "loss": 0.1772, "lr": 2.576612214789039e-06, "epoch": 4.907317073170732, "percentage": 49.07, "elapsed_time": "1:14:23", "remaining_time": "1:17:11"} -{"current_steps": 1007, "total_steps": 2050, "loss": 0.2029, "lr": 2.5727827151573747e-06, "epoch": 4.912195121951219, "percentage": 49.12, "elapsed_time": "1:14:28", "remaining_time": "1:17:08"} -{"current_steps": 1008, "total_steps": 2050, "loss": 0.1269, "lr": 2.568953044594964e-06, "epoch": 4.917073170731707, "percentage": 49.17, "elapsed_time": "1:14:30", "remaining_time": "1:17:01"} -{"current_steps": 1009, "total_steps": 2050, "loss": 0.1311, "lr": 2.5651232120958157e-06, "epoch": 4.921951219512195, "percentage": 49.22, "elapsed_time": "1:14:34", "remaining_time": "1:16:56"} -{"current_steps": 1010, "total_steps": 2050, "loss": 0.1085, "lr": 2.56129322665432e-06, "epoch": 4.926829268292683, "percentage": 49.27, "elapsed_time": "1:14:35", "remaining_time": "1:16:48"} -{"current_steps": 1011, "total_steps": 2050, "loss": 0.0782, "lr": 2.5574630972652263e-06, "epoch": 4.931707317073171, "percentage": 49.32, "elapsed_time": "1:14:39", "remaining_time": "1:16:43"} -{"current_steps": 1012, "total_steps": 2050, "loss": 0.1391, "lr": 2.553632832923622e-06, "epoch": 4.9365853658536585, "percentage": 49.37, "elapsed_time": "1:14:41", "remaining_time": "1:16:36"} -{"current_steps": 1013, "total_steps": 2050, "loss": 0.3141, "lr": 2.5498024426249107e-06, "epoch": 4.941463414634146, "percentage": 49.41, "elapsed_time": "1:14:44", "remaining_time": "1:16:30"} -{"current_steps": 1014, "total_steps": 2050, "loss": 0.0679, "lr": 2.545971935364794e-06, "epoch": 4.946341463414634, "percentage": 49.46, "elapsed_time": "1:14:47", "remaining_time": "1:16:25"} -{"current_steps": 1015, "total_steps": 2050, "loss": 0.1382, "lr": 2.5421413201392443e-06, "epoch": 4.951219512195122, "percentage": 49.51, "elapsed_time": "1:14:51", "remaining_time": "1:16:19"} -{"current_steps": 1016, "total_steps": 2050, "loss": 0.112, "lr": 2.538310605944491e-06, "epoch": 4.95609756097561, "percentage": 49.56, "elapsed_time": "1:14:55", "remaining_time": "1:16:15"} -{"current_steps": 1017, "total_steps": 2050, "loss": 0.1261, "lr": 2.534479801776996e-06, "epoch": 4.9609756097560975, "percentage": 49.61, "elapsed_time": "1:14:57", "remaining_time": "1:16:08"} -{"current_steps": 1018, "total_steps": 2050, "loss": 0.2328, "lr": 2.53064891663343e-06, "epoch": 4.965853658536585, "percentage": 49.66, "elapsed_time": "1:15:01", "remaining_time": "1:16:03"} -{"current_steps": 1019, "total_steps": 2050, "loss": 0.193, "lr": 2.526817959510655e-06, "epoch": 4.970731707317073, "percentage": 49.71, "elapsed_time": "1:15:05", "remaining_time": "1:15:58"} -{"current_steps": 1020, "total_steps": 2050, "loss": 0.2444, "lr": 2.5229869394057038e-06, "epoch": 4.975609756097561, "percentage": 49.76, "elapsed_time": "1:15:09", "remaining_time": "1:15:53"} -{"current_steps": 1021, "total_steps": 2050, "loss": 0.1103, "lr": 2.5191558653157542e-06, "epoch": 4.980487804878049, "percentage": 49.8, "elapsed_time": "1:15:13", "remaining_time": "1:15:49"} -{"current_steps": 1022, "total_steps": 2050, "loss": 0.0553, "lr": 2.515324746238113e-06, "epoch": 4.985365853658537, "percentage": 49.85, "elapsed_time": "1:15:15", "remaining_time": "1:15:42"} -{"current_steps": 1023, "total_steps": 2050, "loss": 0.1686, "lr": 2.511493591170191e-06, "epoch": 4.990243902439024, "percentage": 49.9, "elapsed_time": "1:15:19", "remaining_time": "1:15:36"} -{"current_steps": 1024, "total_steps": 2050, "loss": 0.1208, "lr": 2.5076624091094846e-06, "epoch": 4.995121951219512, "percentage": 49.95, "elapsed_time": "1:15:22", "remaining_time": "1:15:31"} -{"current_steps": 1025, "total_steps": 2050, "loss": 0.1216, "lr": 2.503831209053554e-06, "epoch": 5.0, "percentage": 50.0, "elapsed_time": "1:15:28", "remaining_time": "1:15:28"} -{"current_steps": 1026, "total_steps": 2050, "loss": 0.0984, "lr": 2.5e-06, "epoch": 5.004878048780488, "percentage": 50.05, "elapsed_time": "1:21:16", "remaining_time": "1:21:07"} -{"current_steps": 1027, "total_steps": 2050, "loss": 0.1323, "lr": 2.4961687909464462e-06, "epoch": 5.009756097560976, "percentage": 50.1, "elapsed_time": "1:21:21", "remaining_time": "1:21:02"} -{"current_steps": 1028, "total_steps": 2050, "loss": 0.3516, "lr": 2.492337590890516e-06, "epoch": 5.014634146341463, "percentage": 50.15, "elapsed_time": "1:21:25", "remaining_time": "1:20:57"} -{"current_steps": 1029, "total_steps": 2050, "loss": 0.1931, "lr": 2.4885064088298097e-06, "epoch": 5.019512195121951, "percentage": 50.2, "elapsed_time": "1:21:30", "remaining_time": "1:20:52"} -{"current_steps": 1030, "total_steps": 2050, "loss": 0.0675, "lr": 2.4846752537618875e-06, "epoch": 5.024390243902439, "percentage": 50.24, "elapsed_time": "1:21:35", "remaining_time": "1:20:48"} -{"current_steps": 1031, "total_steps": 2050, "loss": 0.1643, "lr": 2.480844134684246e-06, "epoch": 5.029268292682927, "percentage": 50.29, "elapsed_time": "1:21:38", "remaining_time": "1:20:41"} -{"current_steps": 1032, "total_steps": 2050, "loss": 0.11, "lr": 2.4770130605942966e-06, "epoch": 5.034146341463415, "percentage": 50.34, "elapsed_time": "1:21:43", "remaining_time": "1:20:37"} -{"current_steps": 1033, "total_steps": 2050, "loss": 0.0614, "lr": 2.4731820404893457e-06, "epoch": 5.0390243902439025, "percentage": 50.39, "elapsed_time": "1:21:47", "remaining_time": "1:20:31"} -{"current_steps": 1034, "total_steps": 2050, "loss": 0.0954, "lr": 2.469351083366571e-06, "epoch": 5.04390243902439, "percentage": 50.44, "elapsed_time": "1:21:48", "remaining_time": "1:20:23"} -{"current_steps": 1035, "total_steps": 2050, "loss": 0.0275, "lr": 2.4655201982230044e-06, "epoch": 5.048780487804878, "percentage": 50.49, "elapsed_time": "1:21:51", "remaining_time": "1:20:16"} -{"current_steps": 1036, "total_steps": 2050, "loss": 0.0661, "lr": 2.4616893940555094e-06, "epoch": 5.053658536585366, "percentage": 50.54, "elapsed_time": "1:21:53", "remaining_time": "1:20:09"} -{"current_steps": 1037, "total_steps": 2050, "loss": 0.2976, "lr": 2.457858679860757e-06, "epoch": 5.058536585365854, "percentage": 50.59, "elapsed_time": "1:21:57", "remaining_time": "1:20:03"} -{"current_steps": 1038, "total_steps": 2050, "loss": 0.1216, "lr": 2.4540280646352072e-06, "epoch": 5.0634146341463415, "percentage": 50.63, "elapsed_time": "1:21:58", "remaining_time": "1:19:55"} -{"current_steps": 1039, "total_steps": 2050, "loss": 0.0877, "lr": 2.45019755737509e-06, "epoch": 5.068292682926829, "percentage": 50.68, "elapsed_time": "1:22:01", "remaining_time": "1:19:48"} -{"current_steps": 1040, "total_steps": 2050, "loss": 0.1661, "lr": 2.4463671670763787e-06, "epoch": 5.073170731707317, "percentage": 50.73, "elapsed_time": "1:22:04", "remaining_time": "1:19:42"} -{"current_steps": 1041, "total_steps": 2050, "loss": 0.211, "lr": 2.4425369027347746e-06, "epoch": 5.078048780487805, "percentage": 50.78, "elapsed_time": "1:22:10", "remaining_time": "1:19:38"} -{"current_steps": 1042, "total_steps": 2050, "loss": 0.093, "lr": 2.4387067733456804e-06, "epoch": 5.082926829268293, "percentage": 50.83, "elapsed_time": "1:22:13", "remaining_time": "1:19:32"} -{"current_steps": 1043, "total_steps": 2050, "loss": 0.0777, "lr": 2.4348767879041847e-06, "epoch": 5.087804878048781, "percentage": 50.88, "elapsed_time": "1:22:14", "remaining_time": "1:19:24"} -{"current_steps": 1044, "total_steps": 2050, "loss": 0.087, "lr": 2.4310469554050366e-06, "epoch": 5.092682926829268, "percentage": 50.93, "elapsed_time": "1:22:15", "remaining_time": "1:19:16"} -{"current_steps": 1045, "total_steps": 2050, "loss": 0.1105, "lr": 2.4272172848426257e-06, "epoch": 5.097560975609756, "percentage": 50.98, "elapsed_time": "1:22:20", "remaining_time": "1:19:11"} -{"current_steps": 1046, "total_steps": 2050, "loss": 0.0704, "lr": 2.423387785210962e-06, "epoch": 5.102439024390244, "percentage": 51.02, "elapsed_time": "1:22:25", "remaining_time": "1:19:07"} -{"current_steps": 1047, "total_steps": 2050, "loss": 0.2118, "lr": 2.4195584655036544e-06, "epoch": 5.107317073170732, "percentage": 51.07, "elapsed_time": "1:22:29", "remaining_time": "1:19:01"} -{"current_steps": 1048, "total_steps": 2050, "loss": 0.0664, "lr": 2.4157293347138877e-06, "epoch": 5.11219512195122, "percentage": 51.12, "elapsed_time": "1:22:36", "remaining_time": "1:18:58"} -{"current_steps": 1049, "total_steps": 2050, "loss": 0.1767, "lr": 2.4119004018344043e-06, "epoch": 5.117073170731707, "percentage": 51.17, "elapsed_time": "1:22:39", "remaining_time": "1:18:52"} -{"current_steps": 1050, "total_steps": 2050, "loss": 0.1288, "lr": 2.408071675857482e-06, "epoch": 5.121951219512195, "percentage": 51.22, "elapsed_time": "1:22:44", "remaining_time": "1:18:48"} -{"current_steps": 1051, "total_steps": 2050, "loss": 0.1724, "lr": 2.404243165774912e-06, "epoch": 5.126829268292683, "percentage": 51.27, "elapsed_time": "1:22:47", "remaining_time": "1:18:42"} -{"current_steps": 1052, "total_steps": 2050, "loss": 0.0382, "lr": 2.4004148805779785e-06, "epoch": 5.131707317073171, "percentage": 51.32, "elapsed_time": "1:22:51", "remaining_time": "1:18:36"} -{"current_steps": 1053, "total_steps": 2050, "loss": 0.0942, "lr": 2.3965868292574375e-06, "epoch": 5.136585365853659, "percentage": 51.37, "elapsed_time": "1:22:54", "remaining_time": "1:18:29"} -{"current_steps": 1054, "total_steps": 2050, "loss": 0.0819, "lr": 2.392759020803496e-06, "epoch": 5.1414634146341465, "percentage": 51.41, "elapsed_time": "1:22:56", "remaining_time": "1:18:22"} -{"current_steps": 1055, "total_steps": 2050, "loss": 0.0866, "lr": 2.3889314642057916e-06, "epoch": 5.146341463414634, "percentage": 51.46, "elapsed_time": "1:22:58", "remaining_time": "1:18:15"} -{"current_steps": 1056, "total_steps": 2050, "loss": 0.1557, "lr": 2.3851041684533677e-06, "epoch": 5.151219512195122, "percentage": 51.51, "elapsed_time": "1:23:05", "remaining_time": "1:18:12"} -{"current_steps": 1057, "total_steps": 2050, "loss": 0.0421, "lr": 2.381277142534658e-06, "epoch": 5.15609756097561, "percentage": 51.56, "elapsed_time": "1:23:12", "remaining_time": "1:18:10"} -{"current_steps": 1058, "total_steps": 2050, "loss": 0.0395, "lr": 2.3774503954374614e-06, "epoch": 5.160975609756098, "percentage": 51.61, "elapsed_time": "1:23:14", "remaining_time": "1:18:02"} -{"current_steps": 1059, "total_steps": 2050, "loss": 0.1869, "lr": 2.373623936148921e-06, "epoch": 5.1658536585365855, "percentage": 51.66, "elapsed_time": "1:23:20", "remaining_time": "1:17:59"} -{"current_steps": 1060, "total_steps": 2050, "loss": 0.1426, "lr": 2.369797773655506e-06, "epoch": 5.170731707317073, "percentage": 51.71, "elapsed_time": "1:23:21", "remaining_time": "1:17:51"} -{"current_steps": 1061, "total_steps": 2050, "loss": 0.0788, "lr": 2.3659719169429866e-06, "epoch": 5.175609756097561, "percentage": 51.76, "elapsed_time": "1:23:27", "remaining_time": "1:17:47"} -{"current_steps": 1062, "total_steps": 2050, "loss": 0.0449, "lr": 2.3621463749964153e-06, "epoch": 5.180487804878049, "percentage": 51.8, "elapsed_time": "1:23:30", "remaining_time": "1:17:41"} -{"current_steps": 1063, "total_steps": 2050, "loss": 0.0264, "lr": 2.3583211568001073e-06, "epoch": 5.185365853658537, "percentage": 51.85, "elapsed_time": "1:23:34", "remaining_time": "1:17:36"} -{"current_steps": 1064, "total_steps": 2050, "loss": 0.0507, "lr": 2.3544962713376144e-06, "epoch": 5.190243902439025, "percentage": 51.9, "elapsed_time": "1:23:40", "remaining_time": "1:17:32"} -{"current_steps": 1065, "total_steps": 2050, "loss": 0.0576, "lr": 2.3506717275917095e-06, "epoch": 5.195121951219512, "percentage": 51.95, "elapsed_time": "1:23:43", "remaining_time": "1:17:25"} -{"current_steps": 1066, "total_steps": 2050, "loss": 0.0523, "lr": 2.346847534544362e-06, "epoch": 5.2, "percentage": 52.0, "elapsed_time": "1:23:49", "remaining_time": "1:17:22"} -{"current_steps": 1067, "total_steps": 2050, "loss": 0.0847, "lr": 2.3430237011767166e-06, "epoch": 5.204878048780488, "percentage": 52.05, "elapsed_time": "1:23:52", "remaining_time": "1:17:16"} -{"current_steps": 1068, "total_steps": 2050, "loss": 0.0215, "lr": 2.3392002364690762e-06, "epoch": 5.209756097560976, "percentage": 52.1, "elapsed_time": "1:23:56", "remaining_time": "1:17:10"} -{"current_steps": 1069, "total_steps": 2050, "loss": 0.1018, "lr": 2.335377149400874e-06, "epoch": 5.214634146341464, "percentage": 52.15, "elapsed_time": "1:24:01", "remaining_time": "1:17:06"} -{"current_steps": 1070, "total_steps": 2050, "loss": 0.1485, "lr": 2.3315544489506596e-06, "epoch": 5.219512195121951, "percentage": 52.2, "elapsed_time": "1:24:03", "remaining_time": "1:16:59"} -{"current_steps": 1071, "total_steps": 2050, "loss": 0.111, "lr": 2.3277321440960733e-06, "epoch": 5.224390243902439, "percentage": 52.24, "elapsed_time": "1:24:06", "remaining_time": "1:16:53"} -{"current_steps": 1072, "total_steps": 2050, "loss": 0.0267, "lr": 2.323910243813826e-06, "epoch": 5.229268292682927, "percentage": 52.29, "elapsed_time": "1:24:10", "remaining_time": "1:16:47"} -{"current_steps": 1073, "total_steps": 2050, "loss": 0.153, "lr": 2.3200887570796798e-06, "epoch": 5.234146341463415, "percentage": 52.34, "elapsed_time": "1:24:14", "remaining_time": "1:16:42"} -{"current_steps": 1074, "total_steps": 2050, "loss": 0.0968, "lr": 2.316267692868424e-06, "epoch": 5.239024390243903, "percentage": 52.39, "elapsed_time": "1:24:21", "remaining_time": "1:16:39"} -{"current_steps": 1075, "total_steps": 2050, "loss": 0.0786, "lr": 2.312447060153856e-06, "epoch": 5.2439024390243905, "percentage": 52.44, "elapsed_time": "1:24:28", "remaining_time": "1:16:36"} -{"current_steps": 1076, "total_steps": 2050, "loss": 0.0677, "lr": 2.308626867908761e-06, "epoch": 5.248780487804878, "percentage": 52.49, "elapsed_time": "1:24:33", "remaining_time": "1:16:32"} -{"current_steps": 1077, "total_steps": 2050, "loss": 0.1059, "lr": 2.3048071251048884e-06, "epoch": 5.253658536585366, "percentage": 52.54, "elapsed_time": "1:24:36", "remaining_time": "1:16:26"} -{"current_steps": 1078, "total_steps": 2050, "loss": 0.1331, "lr": 2.300987840712932e-06, "epoch": 5.258536585365854, "percentage": 52.59, "elapsed_time": "1:24:39", "remaining_time": "1:16:19"} -{"current_steps": 1079, "total_steps": 2050, "loss": 0.169, "lr": 2.297169023702511e-06, "epoch": 5.263414634146342, "percentage": 52.63, "elapsed_time": "1:24:43", "remaining_time": "1:16:14"} -{"current_steps": 1080, "total_steps": 2050, "loss": 0.1349, "lr": 2.2933506830421436e-06, "epoch": 5.2682926829268295, "percentage": 52.68, "elapsed_time": "1:24:47", "remaining_time": "1:16:09"} -{"current_steps": 1081, "total_steps": 2050, "loss": 0.0191, "lr": 2.2895328276992325e-06, "epoch": 5.273170731707317, "percentage": 52.73, "elapsed_time": "1:24:49", "remaining_time": "1:16:01"} -{"current_steps": 1082, "total_steps": 2050, "loss": 0.1961, "lr": 2.28571546664004e-06, "epoch": 5.278048780487805, "percentage": 52.78, "elapsed_time": "1:24:56", "remaining_time": "1:15:59"} -{"current_steps": 1083, "total_steps": 2050, "loss": 0.02, "lr": 2.281898608829665e-06, "epoch": 5.282926829268293, "percentage": 52.83, "elapsed_time": "1:24:58", "remaining_time": "1:15:52"} -{"current_steps": 1084, "total_steps": 2050, "loss": 0.0763, "lr": 2.2780822632320273e-06, "epoch": 5.287804878048781, "percentage": 52.88, "elapsed_time": "1:25:05", "remaining_time": "1:15:49"} -{"current_steps": 1085, "total_steps": 2050, "loss": 0.0403, "lr": 2.2742664388098435e-06, "epoch": 5.2926829268292686, "percentage": 52.93, "elapsed_time": "1:25:11", "remaining_time": "1:15:45"} -{"current_steps": 1086, "total_steps": 2050, "loss": 0.0982, "lr": 2.270451144524605e-06, "epoch": 5.297560975609756, "percentage": 52.98, "elapsed_time": "1:25:17", "remaining_time": "1:15:42"} -{"current_steps": 1087, "total_steps": 2050, "loss": 0.09, "lr": 2.266636389336559e-06, "epoch": 5.302439024390244, "percentage": 53.02, "elapsed_time": "1:25:22", "remaining_time": "1:15:37"} -{"current_steps": 1088, "total_steps": 2050, "loss": 0.0267, "lr": 2.262822182204686e-06, "epoch": 5.307317073170732, "percentage": 53.07, "elapsed_time": "1:25:25", "remaining_time": "1:15:32"} -{"current_steps": 1089, "total_steps": 2050, "loss": 0.0295, "lr": 2.2590085320866798e-06, "epoch": 5.31219512195122, "percentage": 53.12, "elapsed_time": "1:25:27", "remaining_time": "1:15:24"} -{"current_steps": 1090, "total_steps": 2050, "loss": 0.0261, "lr": 2.255195447938927e-06, "epoch": 5.317073170731708, "percentage": 53.17, "elapsed_time": "1:25:30", "remaining_time": "1:15:18"} -{"current_steps": 1091, "total_steps": 2050, "loss": 0.0936, "lr": 2.251382938716482e-06, "epoch": 5.321951219512195, "percentage": 53.22, "elapsed_time": "1:25:36", "remaining_time": "1:15:14"} -{"current_steps": 1092, "total_steps": 2050, "loss": 0.0426, "lr": 2.2475710133730533e-06, "epoch": 5.326829268292683, "percentage": 53.27, "elapsed_time": "1:25:37", "remaining_time": "1:15:07"} -{"current_steps": 1093, "total_steps": 2050, "loss": 0.0799, "lr": 2.243759680860975e-06, "epoch": 5.331707317073171, "percentage": 53.32, "elapsed_time": "1:25:40", "remaining_time": "1:15:01"} -{"current_steps": 1094, "total_steps": 2050, "loss": 0.0906, "lr": 2.2399489501311896e-06, "epoch": 5.336585365853659, "percentage": 53.37, "elapsed_time": "1:25:47", "remaining_time": "1:14:57"} -{"current_steps": 1095, "total_steps": 2050, "loss": 0.2152, "lr": 2.2361388301332265e-06, "epoch": 5.341463414634147, "percentage": 53.41, "elapsed_time": "1:25:51", "remaining_time": "1:14:52"} -{"current_steps": 1096, "total_steps": 2050, "loss": 0.0359, "lr": 2.2323293298151817e-06, "epoch": 5.3463414634146345, "percentage": 53.46, "elapsed_time": "1:25:57", "remaining_time": "1:14:48"} -{"current_steps": 1097, "total_steps": 2050, "loss": 0.0368, "lr": 2.2285204581236937e-06, "epoch": 5.351219512195122, "percentage": 53.51, "elapsed_time": "1:25:59", "remaining_time": "1:14:42"} -{"current_steps": 1098, "total_steps": 2050, "loss": 0.0426, "lr": 2.2247122240039268e-06, "epoch": 5.35609756097561, "percentage": 53.56, "elapsed_time": "1:26:00", "remaining_time": "1:14:34"} -{"current_steps": 1099, "total_steps": 2050, "loss": 0.0223, "lr": 2.2209046363995464e-06, "epoch": 5.360975609756098, "percentage": 53.61, "elapsed_time": "1:26:02", "remaining_time": "1:14:27"} -{"current_steps": 1100, "total_steps": 2050, "loss": 0.1276, "lr": 2.217097704252701e-06, "epoch": 5.365853658536586, "percentage": 53.66, "elapsed_time": "1:26:05", "remaining_time": "1:14:21"} -{"current_steps": 1101, "total_steps": 2050, "loss": 0.0639, "lr": 2.2132914365039993e-06, "epoch": 5.3707317073170735, "percentage": 53.71, "elapsed_time": "1:26:12", "remaining_time": "1:14:18"} -{"current_steps": 1102, "total_steps": 2050, "loss": 0.0166, "lr": 2.2094858420924882e-06, "epoch": 5.375609756097561, "percentage": 53.76, "elapsed_time": "1:26:14", "remaining_time": "1:14:11"} -{"current_steps": 1103, "total_steps": 2050, "loss": 0.144, "lr": 2.205680929955635e-06, "epoch": 5.380487804878049, "percentage": 53.8, "elapsed_time": "1:26:16", "remaining_time": "1:14:04"} -{"current_steps": 1104, "total_steps": 2050, "loss": 0.1004, "lr": 2.201876709029305e-06, "epoch": 5.385365853658537, "percentage": 53.85, "elapsed_time": "1:26:22", "remaining_time": "1:14:00"} -{"current_steps": 1105, "total_steps": 2050, "loss": 0.0453, "lr": 2.198073188247738e-06, "epoch": 5.390243902439025, "percentage": 53.9, "elapsed_time": "1:26:24", "remaining_time": "1:13:54"} -{"current_steps": 1106, "total_steps": 2050, "loss": 0.0195, "lr": 2.1942703765435317e-06, "epoch": 5.3951219512195125, "percentage": 53.95, "elapsed_time": "1:26:26", "remaining_time": "1:13:46"} -{"current_steps": 1107, "total_steps": 2050, "loss": 0.1512, "lr": 2.190468282847617e-06, "epoch": 5.4, "percentage": 54.0, "elapsed_time": "1:26:29", "remaining_time": "1:13:41"} -{"current_steps": 1108, "total_steps": 2050, "loss": 0.2572, "lr": 2.186666916089239e-06, "epoch": 5.404878048780488, "percentage": 54.05, "elapsed_time": "1:26:32", "remaining_time": "1:13:34"} -{"current_steps": 1109, "total_steps": 2050, "loss": 0.0536, "lr": 2.1828662851959377e-06, "epoch": 5.409756097560976, "percentage": 54.1, "elapsed_time": "1:26:36", "remaining_time": "1:13:29"} -{"current_steps": 1110, "total_steps": 2050, "loss": 0.0778, "lr": 2.1790663990935203e-06, "epoch": 5.414634146341464, "percentage": 54.15, "elapsed_time": "1:26:39", "remaining_time": "1:13:23"} -{"current_steps": 1111, "total_steps": 2050, "loss": 0.0558, "lr": 2.1752672667060488e-06, "epoch": 5.419512195121952, "percentage": 54.2, "elapsed_time": "1:26:43", "remaining_time": "1:13:17"} -{"current_steps": 1112, "total_steps": 2050, "loss": 0.041, "lr": 2.1714688969558146e-06, "epoch": 5.424390243902439, "percentage": 54.24, "elapsed_time": "1:26:45", "remaining_time": "1:13:10"} -{"current_steps": 1113, "total_steps": 2050, "loss": 0.1644, "lr": 2.167671298763316e-06, "epoch": 5.429268292682927, "percentage": 54.29, "elapsed_time": "1:26:49", "remaining_time": "1:13:05"} -{"current_steps": 1114, "total_steps": 2050, "loss": 0.1587, "lr": 2.1638744810472414e-06, "epoch": 5.434146341463415, "percentage": 54.34, "elapsed_time": "1:26:52", "remaining_time": "1:13:00"} -{"current_steps": 1115, "total_steps": 2050, "loss": 0.0605, "lr": 2.1600784527244445e-06, "epoch": 5.439024390243903, "percentage": 54.39, "elapsed_time": "1:26:55", "remaining_time": "1:12:53"} -{"current_steps": 1116, "total_steps": 2050, "loss": 0.1897, "lr": 2.1562832227099266e-06, "epoch": 5.443902439024391, "percentage": 54.44, "elapsed_time": "1:26:58", "remaining_time": "1:12:47"} -{"current_steps": 1117, "total_steps": 2050, "loss": 0.1525, "lr": 2.152488799916814e-06, "epoch": 5.4487804878048784, "percentage": 54.49, "elapsed_time": "1:27:00", "remaining_time": "1:12:40"} -{"current_steps": 1118, "total_steps": 2050, "loss": 0.189, "lr": 2.148695193256336e-06, "epoch": 5.453658536585366, "percentage": 54.54, "elapsed_time": "1:27:02", "remaining_time": "1:12:33"} -{"current_steps": 1119, "total_steps": 2050, "loss": 0.095, "lr": 2.1449024116378064e-06, "epoch": 5.458536585365854, "percentage": 54.59, "elapsed_time": "1:27:06", "remaining_time": "1:12:28"} -{"current_steps": 1120, "total_steps": 2050, "loss": 0.0432, "lr": 2.1411104639686013e-06, "epoch": 5.463414634146342, "percentage": 54.63, "elapsed_time": "1:27:12", "remaining_time": "1:12:24"} -{"current_steps": 1121, "total_steps": 2050, "loss": 0.0954, "lr": 2.137319359154138e-06, "epoch": 5.46829268292683, "percentage": 54.68, "elapsed_time": "1:27:13", "remaining_time": "1:12:16"} -{"current_steps": 1122, "total_steps": 2050, "loss": 0.0362, "lr": 2.133529106097853e-06, "epoch": 5.473170731707317, "percentage": 54.73, "elapsed_time": "1:27:17", "remaining_time": "1:12:11"} -{"current_steps": 1123, "total_steps": 2050, "loss": 0.0875, "lr": 2.1297397137011862e-06, "epoch": 5.478048780487805, "percentage": 54.78, "elapsed_time": "1:27:20", "remaining_time": "1:12:05"} -{"current_steps": 1124, "total_steps": 2050, "loss": 0.0758, "lr": 2.125951190863551e-06, "epoch": 5.482926829268292, "percentage": 54.83, "elapsed_time": "1:27:22", "remaining_time": "1:11:58"} -{"current_steps": 1125, "total_steps": 2050, "loss": 0.0605, "lr": 2.1221635464823237e-06, "epoch": 5.487804878048781, "percentage": 54.88, "elapsed_time": "1:27:25", "remaining_time": "1:11:52"} -{"current_steps": 1126, "total_steps": 2050, "loss": 0.2403, "lr": 2.1183767894528135e-06, "epoch": 5.492682926829268, "percentage": 54.93, "elapsed_time": "1:27:29", "remaining_time": "1:11:48"} -{"current_steps": 1127, "total_steps": 2050, "loss": 0.0223, "lr": 2.114590928668249e-06, "epoch": 5.4975609756097565, "percentage": 54.98, "elapsed_time": "1:27:33", "remaining_time": "1:11:42"} -{"current_steps": 1128, "total_steps": 2050, "loss": 0.0617, "lr": 2.1108059730197517e-06, "epoch": 5.5024390243902435, "percentage": 55.02, "elapsed_time": "1:27:37", "remaining_time": "1:11:37"} -{"current_steps": 1129, "total_steps": 2050, "loss": 0.043, "lr": 2.1070219313963173e-06, "epoch": 5.507317073170732, "percentage": 55.07, "elapsed_time": "1:27:39", "remaining_time": "1:11:30"} -{"current_steps": 1130, "total_steps": 2050, "loss": 0.0595, "lr": 2.1032388126847967e-06, "epoch": 5.512195121951219, "percentage": 55.12, "elapsed_time": "1:27:45", "remaining_time": "1:11:26"} -{"current_steps": 1131, "total_steps": 2050, "loss": 0.0186, "lr": 2.099456625769872e-06, "epoch": 5.517073170731708, "percentage": 55.17, "elapsed_time": "1:27:47", "remaining_time": "1:11:20"} -{"current_steps": 1132, "total_steps": 2050, "loss": 0.0616, "lr": 2.0956753795340376e-06, "epoch": 5.521951219512195, "percentage": 55.22, "elapsed_time": "1:27:48", "remaining_time": "1:11:12"} -{"current_steps": 1133, "total_steps": 2050, "loss": 0.1895, "lr": 2.091895082857578e-06, "epoch": 5.526829268292683, "percentage": 55.27, "elapsed_time": "1:27:52", "remaining_time": "1:11:07"} -{"current_steps": 1134, "total_steps": 2050, "loss": 0.0484, "lr": 2.0881157446185474e-06, "epoch": 5.53170731707317, "percentage": 55.32, "elapsed_time": "1:27:54", "remaining_time": "1:11:00"} -{"current_steps": 1135, "total_steps": 2050, "loss": 0.037, "lr": 2.0843373736927506e-06, "epoch": 5.536585365853659, "percentage": 55.37, "elapsed_time": "1:27:55", "remaining_time": "1:10:52"} -{"current_steps": 1136, "total_steps": 2050, "loss": 0.0227, "lr": 2.08055997895372e-06, "epoch": 5.541463414634146, "percentage": 55.41, "elapsed_time": "1:27:59", "remaining_time": "1:10:47"} -{"current_steps": 1137, "total_steps": 2050, "loss": 0.0296, "lr": 2.0767835692726944e-06, "epoch": 5.546341463414635, "percentage": 55.46, "elapsed_time": "1:28:01", "remaining_time": "1:10:41"} -{"current_steps": 1138, "total_steps": 2050, "loss": 0.16, "lr": 2.0730081535186e-06, "epoch": 5.5512195121951216, "percentage": 55.51, "elapsed_time": "1:28:05", "remaining_time": "1:10:36"} -{"current_steps": 1139, "total_steps": 2050, "loss": 0.0725, "lr": 2.06923374055803e-06, "epoch": 5.55609756097561, "percentage": 55.56, "elapsed_time": "1:28:07", "remaining_time": "1:10:29"} -{"current_steps": 1140, "total_steps": 2050, "loss": 0.0198, "lr": 2.0654603392552193e-06, "epoch": 5.560975609756097, "percentage": 55.61, "elapsed_time": "1:28:09", "remaining_time": "1:10:22"} -{"current_steps": 1141, "total_steps": 2050, "loss": 0.1144, "lr": 2.0616879584720305e-06, "epoch": 5.565853658536585, "percentage": 55.66, "elapsed_time": "1:28:14", "remaining_time": "1:10:18"} -{"current_steps": 1142, "total_steps": 2050, "loss": 0.0491, "lr": 2.057916607067928e-06, "epoch": 5.570731707317073, "percentage": 55.71, "elapsed_time": "1:28:21", "remaining_time": "1:10:15"} -{"current_steps": 1143, "total_steps": 2050, "loss": 0.035, "lr": 2.054146293899957e-06, "epoch": 5.575609756097561, "percentage": 55.76, "elapsed_time": "1:28:25", "remaining_time": "1:10:09"} -{"current_steps": 1144, "total_steps": 2050, "loss": 0.0639, "lr": 2.0503770278227274e-06, "epoch": 5.580487804878048, "percentage": 55.8, "elapsed_time": "1:28:31", "remaining_time": "1:10:06"} -{"current_steps": 1145, "total_steps": 2050, "loss": 0.0258, "lr": 2.0466088176883876e-06, "epoch": 5.585365853658536, "percentage": 55.85, "elapsed_time": "1:28:33", "remaining_time": "1:09:59"} -{"current_steps": 1146, "total_steps": 2050, "loss": 0.0634, "lr": 2.042841672346608e-06, "epoch": 5.590243902439024, "percentage": 55.9, "elapsed_time": "1:28:35", "remaining_time": "1:09:52"} -{"current_steps": 1147, "total_steps": 2050, "loss": 0.0464, "lr": 2.039075600644557e-06, "epoch": 5.595121951219512, "percentage": 55.95, "elapsed_time": "1:28:40", "remaining_time": "1:09:48"} -{"current_steps": 1148, "total_steps": 2050, "loss": 0.0829, "lr": 2.0353106114268824e-06, "epoch": 5.6, "percentage": 56.0, "elapsed_time": "1:28:43", "remaining_time": "1:09:42"} -{"current_steps": 1149, "total_steps": 2050, "loss": 0.0321, "lr": 2.031546713535688e-06, "epoch": 5.6048780487804875, "percentage": 56.05, "elapsed_time": "1:28:46", "remaining_time": "1:09:36"} -{"current_steps": 1150, "total_steps": 2050, "loss": 0.05, "lr": 2.027783915810518e-06, "epoch": 5.609756097560975, "percentage": 56.1, "elapsed_time": "1:28:48", "remaining_time": "1:09:30"} -{"current_steps": 1151, "total_steps": 2050, "loss": 0.1984, "lr": 2.024022227088329e-06, "epoch": 5.614634146341463, "percentage": 56.15, "elapsed_time": "1:28:52", "remaining_time": "1:09:24"} -{"current_steps": 1152, "total_steps": 2050, "loss": 0.1673, "lr": 2.020261656203476e-06, "epoch": 5.619512195121951, "percentage": 56.2, "elapsed_time": "1:28:55", "remaining_time": "1:09:18"} -{"current_steps": 1153, "total_steps": 2050, "loss": 0.1106, "lr": 2.016502211987687e-06, "epoch": 5.624390243902439, "percentage": 56.24, "elapsed_time": "1:28:58", "remaining_time": "1:09:13"} -{"current_steps": 1154, "total_steps": 2050, "loss": 0.0374, "lr": 2.0127439032700446e-06, "epoch": 5.6292682926829265, "percentage": 56.29, "elapsed_time": "1:29:02", "remaining_time": "1:09:07"} -{"current_steps": 1155, "total_steps": 2050, "loss": 0.0674, "lr": 2.0089867388769664e-06, "epoch": 5.634146341463414, "percentage": 56.34, "elapsed_time": "1:29:05", "remaining_time": "1:09:01"} -{"current_steps": 1156, "total_steps": 2050, "loss": 0.0365, "lr": 2.0052307276321793e-06, "epoch": 5.639024390243902, "percentage": 56.39, "elapsed_time": "1:29:07", "remaining_time": "1:08:55"} -{"current_steps": 1157, "total_steps": 2050, "loss": 0.0758, "lr": 2.001475878356703e-06, "epoch": 5.64390243902439, "percentage": 56.44, "elapsed_time": "1:29:09", "remaining_time": "1:08:49"} -{"current_steps": 1158, "total_steps": 2050, "loss": 0.176, "lr": 1.99772219986883e-06, "epoch": 5.648780487804878, "percentage": 56.49, "elapsed_time": "1:29:12", "remaining_time": "1:08:42"} -{"current_steps": 1159, "total_steps": 2050, "loss": 0.0491, "lr": 1.9939697009841024e-06, "epoch": 5.6536585365853655, "percentage": 56.54, "elapsed_time": "1:29:16", "remaining_time": "1:08:37"} -{"current_steps": 1160, "total_steps": 2050, "loss": 0.0741, "lr": 1.990218390515291e-06, "epoch": 5.658536585365853, "percentage": 56.59, "elapsed_time": "1:29:19", "remaining_time": "1:08:31"} -{"current_steps": 1161, "total_steps": 2050, "loss": 0.0826, "lr": 1.9864682772723757e-06, "epoch": 5.663414634146341, "percentage": 56.63, "elapsed_time": "1:29:21", "remaining_time": "1:08:25"} -{"current_steps": 1162, "total_steps": 2050, "loss": 0.0378, "lr": 1.9827193700625274e-06, "epoch": 5.668292682926829, "percentage": 56.68, "elapsed_time": "1:29:27", "remaining_time": "1:08:21"} -{"current_steps": 1163, "total_steps": 2050, "loss": 0.2466, "lr": 1.978971677690081e-06, "epoch": 5.673170731707317, "percentage": 56.73, "elapsed_time": "1:29:30", "remaining_time": "1:08:15"} -{"current_steps": 1164, "total_steps": 2050, "loss": 0.0205, "lr": 1.97522520895652e-06, "epoch": 5.678048780487805, "percentage": 56.78, "elapsed_time": "1:29:31", "remaining_time": "1:08:08"} -{"current_steps": 1165, "total_steps": 2050, "loss": 0.0998, "lr": 1.971479972660454e-06, "epoch": 5.682926829268292, "percentage": 56.83, "elapsed_time": "1:29:37", "remaining_time": "1:08:05"} -{"current_steps": 1166, "total_steps": 2050, "loss": 0.0217, "lr": 1.967735977597598e-06, "epoch": 5.68780487804878, "percentage": 56.88, "elapsed_time": "1:29:39", "remaining_time": "1:07:58"} -{"current_steps": 1167, "total_steps": 2050, "loss": 0.048, "lr": 1.9639932325607538e-06, "epoch": 5.692682926829268, "percentage": 56.93, "elapsed_time": "1:29:45", "remaining_time": "1:07:54"} -{"current_steps": 1168, "total_steps": 2050, "loss": 0.0302, "lr": 1.9602517463397845e-06, "epoch": 5.697560975609756, "percentage": 56.98, "elapsed_time": "1:29:47", "remaining_time": "1:07:48"} -{"current_steps": 1169, "total_steps": 2050, "loss": 0.0724, "lr": 1.9565115277215978e-06, "epoch": 5.702439024390244, "percentage": 57.02, "elapsed_time": "1:29:53", "remaining_time": "1:07:44"} -{"current_steps": 1170, "total_steps": 2050, "loss": 0.0464, "lr": 1.952772585490127e-06, "epoch": 5.7073170731707314, "percentage": 57.07, "elapsed_time": "1:29:57", "remaining_time": "1:07:39"} -{"current_steps": 1171, "total_steps": 2050, "loss": 0.0239, "lr": 1.9490349284263036e-06, "epoch": 5.712195121951219, "percentage": 57.12, "elapsed_time": "1:30:00", "remaining_time": "1:07:33"} -{"current_steps": 1172, "total_steps": 2050, "loss": 0.0719, "lr": 1.9452985653080443e-06, "epoch": 5.717073170731707, "percentage": 57.17, "elapsed_time": "1:30:03", "remaining_time": "1:07:27"} -{"current_steps": 1173, "total_steps": 2050, "loss": 0.0408, "lr": 1.9415635049102245e-06, "epoch": 5.721951219512195, "percentage": 57.22, "elapsed_time": "1:30:05", "remaining_time": "1:07:21"} -{"current_steps": 1174, "total_steps": 2050, "loss": 0.2049, "lr": 1.937829756004662e-06, "epoch": 5.726829268292683, "percentage": 57.27, "elapsed_time": "1:30:06", "remaining_time": "1:07:14"} -{"current_steps": 1175, "total_steps": 2050, "loss": 0.0636, "lr": 1.9340973273600944e-06, "epoch": 5.7317073170731705, "percentage": 57.32, "elapsed_time": "1:30:08", "remaining_time": "1:07:07"} -{"current_steps": 1176, "total_steps": 2050, "loss": 0.1252, "lr": 1.930366227742157e-06, "epoch": 5.736585365853658, "percentage": 57.37, "elapsed_time": "1:30:11", "remaining_time": "1:07:01"} -{"current_steps": 1177, "total_steps": 2050, "loss": 0.0687, "lr": 1.9266364659133653e-06, "epoch": 5.741463414634146, "percentage": 57.41, "elapsed_time": "1:30:13", "remaining_time": "1:06:55"} -{"current_steps": 1178, "total_steps": 2050, "loss": 0.0333, "lr": 1.922908050633093e-06, "epoch": 5.746341463414634, "percentage": 57.46, "elapsed_time": "1:30:17", "remaining_time": "1:06:50"} -{"current_steps": 1179, "total_steps": 2050, "loss": 0.0792, "lr": 1.919180990657551e-06, "epoch": 5.751219512195122, "percentage": 57.51, "elapsed_time": "1:30:19", "remaining_time": "1:06:43"} -{"current_steps": 1180, "total_steps": 2050, "loss": 0.069, "lr": 1.9154552947397668e-06, "epoch": 5.7560975609756095, "percentage": 57.56, "elapsed_time": "1:30:25", "remaining_time": "1:06:40"} -{"current_steps": 1181, "total_steps": 2050, "loss": 0.115, "lr": 1.9117309716295658e-06, "epoch": 5.760975609756097, "percentage": 57.61, "elapsed_time": "1:30:27", "remaining_time": "1:06:33"} -{"current_steps": 1182, "total_steps": 2050, "loss": 0.0537, "lr": 1.9080080300735478e-06, "epoch": 5.765853658536585, "percentage": 57.66, "elapsed_time": "1:30:29", "remaining_time": "1:06:27"} -{"current_steps": 1183, "total_steps": 2050, "loss": 0.0817, "lr": 1.9042864788150695e-06, "epoch": 5.770731707317073, "percentage": 57.71, "elapsed_time": "1:30:34", "remaining_time": "1:06:22"} -{"current_steps": 1184, "total_steps": 2050, "loss": 0.0289, "lr": 1.9005663265942206e-06, "epoch": 5.775609756097561, "percentage": 57.76, "elapsed_time": "1:30:36", "remaining_time": "1:06:16"} -{"current_steps": 1185, "total_steps": 2050, "loss": 0.0357, "lr": 1.8968475821478066e-06, "epoch": 5.780487804878049, "percentage": 57.8, "elapsed_time": "1:30:39", "remaining_time": "1:06:10"} -{"current_steps": 1186, "total_steps": 2050, "loss": 0.0584, "lr": 1.8931302542093274e-06, "epoch": 5.785365853658536, "percentage": 57.85, "elapsed_time": "1:30:41", "remaining_time": "1:06:04"} -{"current_steps": 1187, "total_steps": 2050, "loss": 0.0324, "lr": 1.8894143515089539e-06, "epoch": 5.790243902439024, "percentage": 57.9, "elapsed_time": "1:30:44", "remaining_time": "1:05:58"} -{"current_steps": 1188, "total_steps": 2050, "loss": 0.0338, "lr": 1.8856998827735118e-06, "epoch": 5.795121951219512, "percentage": 57.95, "elapsed_time": "1:30:48", "remaining_time": "1:05:53"} -{"current_steps": 1189, "total_steps": 2050, "loss": 0.1706, "lr": 1.8819868567264588e-06, "epoch": 5.8, "percentage": 58.0, "elapsed_time": "1:30:50", "remaining_time": "1:05:46"} -{"current_steps": 1190, "total_steps": 2050, "loss": 0.0463, "lr": 1.8782752820878636e-06, "epoch": 5.804878048780488, "percentage": 58.05, "elapsed_time": "1:30:54", "remaining_time": "1:05:41"} -{"current_steps": 1191, "total_steps": 2050, "loss": 0.1188, "lr": 1.8745651675743876e-06, "epoch": 5.809756097560975, "percentage": 58.1, "elapsed_time": "1:30:55", "remaining_time": "1:05:35"} -{"current_steps": 1192, "total_steps": 2050, "loss": 0.0984, "lr": 1.870856521899261e-06, "epoch": 5.814634146341463, "percentage": 58.15, "elapsed_time": "1:30:59", "remaining_time": "1:05:29"} -{"current_steps": 1193, "total_steps": 2050, "loss": 0.0195, "lr": 1.867149353772267e-06, "epoch": 5.819512195121951, "percentage": 58.2, "elapsed_time": "1:31:00", "remaining_time": "1:05:22"} -{"current_steps": 1194, "total_steps": 2050, "loss": 0.0236, "lr": 1.863443671899717e-06, "epoch": 5.824390243902439, "percentage": 58.24, "elapsed_time": "1:31:03", "remaining_time": "1:05:16"} -{"current_steps": 1195, "total_steps": 2050, "loss": 0.1108, "lr": 1.8597394849844319e-06, "epoch": 5.829268292682927, "percentage": 58.29, "elapsed_time": "1:31:09", "remaining_time": "1:05:13"} -{"current_steps": 1196, "total_steps": 2050, "loss": 0.0388, "lr": 1.8560368017257229e-06, "epoch": 5.8341463414634145, "percentage": 58.34, "elapsed_time": "1:31:13", "remaining_time": "1:05:08"} -{"current_steps": 1197, "total_steps": 2050, "loss": 0.3098, "lr": 1.8523356308193696e-06, "epoch": 5.839024390243902, "percentage": 58.39, "elapsed_time": "1:31:16", "remaining_time": "1:05:02"} -{"current_steps": 1198, "total_steps": 2050, "loss": 0.0775, "lr": 1.8486359809575977e-06, "epoch": 5.84390243902439, "percentage": 58.44, "elapsed_time": "1:31:19", "remaining_time": "1:04:56"} -{"current_steps": 1199, "total_steps": 2050, "loss": 0.1222, "lr": 1.8449378608290638e-06, "epoch": 5.848780487804878, "percentage": 58.49, "elapsed_time": "1:31:22", "remaining_time": "1:04:51"} -{"current_steps": 1200, "total_steps": 2050, "loss": 0.1146, "lr": 1.8412412791188306e-06, "epoch": 5.853658536585366, "percentage": 58.54, "elapsed_time": "1:31:24", "remaining_time": "1:04:44"} -{"current_steps": 1201, "total_steps": 2050, "loss": 0.1113, "lr": 1.8375462445083464e-06, "epoch": 5.8585365853658535, "percentage": 58.59, "elapsed_time": "1:31:25", "remaining_time": "1:04:37"} -{"current_steps": 1202, "total_steps": 2050, "loss": 0.0416, "lr": 1.8338527656754285e-06, "epoch": 5.863414634146341, "percentage": 58.63, "elapsed_time": "1:31:27", "remaining_time": "1:04:31"} -{"current_steps": 1203, "total_steps": 2050, "loss": 0.0613, "lr": 1.830160851294239e-06, "epoch": 5.868292682926829, "percentage": 58.68, "elapsed_time": "1:31:30", "remaining_time": "1:04:25"} -{"current_steps": 1204, "total_steps": 2050, "loss": 0.197, "lr": 1.8264705100352662e-06, "epoch": 5.873170731707317, "percentage": 58.73, "elapsed_time": "1:31:33", "remaining_time": "1:04:20"} -{"current_steps": 1205, "total_steps": 2050, "loss": 0.0821, "lr": 1.8227817505653045e-06, "epoch": 5.878048780487805, "percentage": 58.78, "elapsed_time": "1:31:36", "remaining_time": "1:04:14"} -{"current_steps": 1206, "total_steps": 2050, "loss": 0.1246, "lr": 1.8190945815474323e-06, "epoch": 5.882926829268293, "percentage": 58.83, "elapsed_time": "1:31:39", "remaining_time": "1:04:08"} -{"current_steps": 1207, "total_steps": 2050, "loss": 0.0703, "lr": 1.8154090116409934e-06, "epoch": 5.88780487804878, "percentage": 58.88, "elapsed_time": "1:31:41", "remaining_time": "1:04:02"} -{"current_steps": 1208, "total_steps": 2050, "loss": 0.1078, "lr": 1.811725049501577e-06, "epoch": 5.892682926829268, "percentage": 58.93, "elapsed_time": "1:31:42", "remaining_time": "1:03:55"} -{"current_steps": 1209, "total_steps": 2050, "loss": 0.1648, "lr": 1.8080427037809941e-06, "epoch": 5.897560975609756, "percentage": 58.98, "elapsed_time": "1:31:45", "remaining_time": "1:03:49"} -{"current_steps": 1210, "total_steps": 2050, "loss": 0.061, "lr": 1.8043619831272623e-06, "epoch": 5.902439024390244, "percentage": 59.02, "elapsed_time": "1:31:48", "remaining_time": "1:03:44"} -{"current_steps": 1211, "total_steps": 2050, "loss": 0.1863, "lr": 1.8006828961845807e-06, "epoch": 5.907317073170732, "percentage": 59.07, "elapsed_time": "1:31:53", "remaining_time": "1:03:39"} -{"current_steps": 1212, "total_steps": 2050, "loss": 0.2387, "lr": 1.7970054515933124e-06, "epoch": 5.912195121951219, "percentage": 59.12, "elapsed_time": "1:31:54", "remaining_time": "1:03:33"} -{"current_steps": 1213, "total_steps": 2050, "loss": 0.2053, "lr": 1.793329657989964e-06, "epoch": 5.917073170731707, "percentage": 59.17, "elapsed_time": "1:31:57", "remaining_time": "1:03:27"} -{"current_steps": 1214, "total_steps": 2050, "loss": 0.026, "lr": 1.7896555240071627e-06, "epoch": 5.921951219512195, "percentage": 59.22, "elapsed_time": "1:32:01", "remaining_time": "1:03:22"} -{"current_steps": 1215, "total_steps": 2050, "loss": 0.0735, "lr": 1.7859830582736406e-06, "epoch": 5.926829268292683, "percentage": 59.27, "elapsed_time": "1:32:08", "remaining_time": "1:03:19"} -{"current_steps": 1216, "total_steps": 2050, "loss": 0.0586, "lr": 1.782312269414211e-06, "epoch": 5.931707317073171, "percentage": 59.32, "elapsed_time": "1:32:10", "remaining_time": "1:03:13"} -{"current_steps": 1217, "total_steps": 2050, "loss": 0.3086, "lr": 1.7786431660497474e-06, "epoch": 5.9365853658536585, "percentage": 59.37, "elapsed_time": "1:32:14", "remaining_time": "1:03:07"} -{"current_steps": 1218, "total_steps": 2050, "loss": 0.0978, "lr": 1.7749757567971678e-06, "epoch": 5.941463414634146, "percentage": 59.41, "elapsed_time": "1:32:17", "remaining_time": "1:03:02"} -{"current_steps": 1219, "total_steps": 2050, "loss": 0.0976, "lr": 1.7713100502694091e-06, "epoch": 5.946341463414634, "percentage": 59.46, "elapsed_time": "1:32:23", "remaining_time": "1:02:59"} -{"current_steps": 1220, "total_steps": 2050, "loss": 0.02, "lr": 1.7676460550754104e-06, "epoch": 5.951219512195122, "percentage": 59.51, "elapsed_time": "1:32:25", "remaining_time": "1:02:53"} -{"current_steps": 1221, "total_steps": 2050, "loss": 0.0741, "lr": 1.7639837798200923e-06, "epoch": 5.95609756097561, "percentage": 59.56, "elapsed_time": "1:32:27", "remaining_time": "1:02:46"} -{"current_steps": 1222, "total_steps": 2050, "loss": 0.0542, "lr": 1.7603232331043346e-06, "epoch": 5.9609756097560975, "percentage": 59.61, "elapsed_time": "1:32:32", "remaining_time": "1:02:42"} -{"current_steps": 1223, "total_steps": 2050, "loss": 0.3552, "lr": 1.7566644235249591e-06, "epoch": 5.965853658536585, "percentage": 59.66, "elapsed_time": "1:32:35", "remaining_time": "1:02:36"} -{"current_steps": 1224, "total_steps": 2050, "loss": 0.0405, "lr": 1.7530073596747072e-06, "epoch": 5.970731707317073, "percentage": 59.71, "elapsed_time": "1:32:41", "remaining_time": "1:02:33"} -{"current_steps": 1225, "total_steps": 2050, "loss": 0.0178, "lr": 1.74935205014222e-06, "epoch": 5.975609756097561, "percentage": 59.76, "elapsed_time": "1:32:43", "remaining_time": "1:02:26"} -{"current_steps": 1226, "total_steps": 2050, "loss": 0.0264, "lr": 1.7456985035120194e-06, "epoch": 5.980487804878049, "percentage": 59.8, "elapsed_time": "1:32:45", "remaining_time": "1:02:20"} -{"current_steps": 1227, "total_steps": 2050, "loss": 0.0555, "lr": 1.7420467283644877e-06, "epoch": 5.985365853658537, "percentage": 59.85, "elapsed_time": "1:32:48", "remaining_time": "1:02:15"} -{"current_steps": 1228, "total_steps": 2050, "loss": 0.0546, "lr": 1.738396733275844e-06, "epoch": 5.990243902439024, "percentage": 59.9, "elapsed_time": "1:32:51", "remaining_time": "1:02:09"} -{"current_steps": 1229, "total_steps": 2050, "loss": 0.1967, "lr": 1.7347485268181309e-06, "epoch": 5.995121951219512, "percentage": 59.95, "elapsed_time": "1:32:53", "remaining_time": "1:02:03"} -{"current_steps": 1230, "total_steps": 2050, "loss": 0.0491, "lr": 1.7311021175591868e-06, "epoch": 6.0, "percentage": 60.0, "elapsed_time": "1:32:59", "remaining_time": "1:01:59"} -{"current_steps": 1231, "total_steps": 2050, "loss": 0.0359, "lr": 1.7274575140626318e-06, "epoch": 6.004878048780488, "percentage": 60.05, "elapsed_time": "1:36:29", "remaining_time": "1:04:12"} -{"current_steps": 1232, "total_steps": 2050, "loss": 0.0585, "lr": 1.7238147248878444e-06, "epoch": 6.009756097560976, "percentage": 60.1, "elapsed_time": "1:36:33", "remaining_time": "1:04:06"} -{"current_steps": 1233, "total_steps": 2050, "loss": 0.0188, "lr": 1.7201737585899415e-06, "epoch": 6.014634146341463, "percentage": 60.15, "elapsed_time": "1:36:36", "remaining_time": "1:04:00"} -{"current_steps": 1234, "total_steps": 2050, "loss": 0.0484, "lr": 1.7165346237197594e-06, "epoch": 6.019512195121951, "percentage": 60.2, "elapsed_time": "1:36:40", "remaining_time": "1:03:55"} -{"current_steps": 1235, "total_steps": 2050, "loss": 0.0776, "lr": 1.7128973288238344e-06, "epoch": 6.024390243902439, "percentage": 60.24, "elapsed_time": "1:36:41", "remaining_time": "1:03:48"} -{"current_steps": 1236, "total_steps": 2050, "loss": 0.0338, "lr": 1.709261882444379e-06, "epoch": 6.029268292682927, "percentage": 60.29, "elapsed_time": "1:36:48", "remaining_time": "1:03:45"} -{"current_steps": 1237, "total_steps": 2050, "loss": 0.0385, "lr": 1.705628293119268e-06, "epoch": 6.034146341463415, "percentage": 60.34, "elapsed_time": "1:36:56", "remaining_time": "1:03:42"} -{"current_steps": 1238, "total_steps": 2050, "loss": 0.2601, "lr": 1.701996569382011e-06, "epoch": 6.0390243902439025, "percentage": 60.39, "elapsed_time": "1:36:57", "remaining_time": "1:03:35"} -{"current_steps": 1239, "total_steps": 2050, "loss": 0.034, "lr": 1.6983667197617386e-06, "epoch": 6.04390243902439, "percentage": 60.44, "elapsed_time": "1:37:02", "remaining_time": "1:03:30"} -{"current_steps": 1240, "total_steps": 2050, "loss": 0.0155, "lr": 1.6947387527831813e-06, "epoch": 6.048780487804878, "percentage": 60.49, "elapsed_time": "1:37:03", "remaining_time": "1:03:24"} -{"current_steps": 1241, "total_steps": 2050, "loss": 0.0078, "lr": 1.6911126769666442e-06, "epoch": 6.053658536585366, "percentage": 60.54, "elapsed_time": "1:37:09", "remaining_time": "1:03:20"} -{"current_steps": 1242, "total_steps": 2050, "loss": 0.1429, "lr": 1.6874885008279945e-06, "epoch": 6.058536585365854, "percentage": 60.59, "elapsed_time": "1:37:15", "remaining_time": "1:03:16"} -{"current_steps": 1243, "total_steps": 2050, "loss": 0.0123, "lr": 1.683866232878637e-06, "epoch": 6.0634146341463415, "percentage": 60.63, "elapsed_time": "1:37:16", "remaining_time": "1:03:09"} -{"current_steps": 1244, "total_steps": 2050, "loss": 0.0139, "lr": 1.6802458816254941e-06, "epoch": 6.068292682926829, "percentage": 60.68, "elapsed_time": "1:37:18", "remaining_time": "1:03:02"} -{"current_steps": 1245, "total_steps": 2050, "loss": 0.0312, "lr": 1.676627455570988e-06, "epoch": 6.073170731707317, "percentage": 60.73, "elapsed_time": "1:37:21", "remaining_time": "1:02:57"} -{"current_steps": 1246, "total_steps": 2050, "loss": 0.0464, "lr": 1.6730109632130199e-06, "epoch": 6.078048780487805, "percentage": 60.78, "elapsed_time": "1:37:26", "remaining_time": "1:02:52"} -{"current_steps": 1247, "total_steps": 2050, "loss": 0.0085, "lr": 1.6693964130449472e-06, "epoch": 6.082926829268293, "percentage": 60.83, "elapsed_time": "1:37:29", "remaining_time": "1:02:46"} -{"current_steps": 1248, "total_steps": 2050, "loss": 0.0482, "lr": 1.6657838135555696e-06, "epoch": 6.087804878048781, "percentage": 60.88, "elapsed_time": "1:37:32", "remaining_time": "1:02:40"} -{"current_steps": 1249, "total_steps": 2050, "loss": 0.0235, "lr": 1.6621731732291024e-06, "epoch": 6.092682926829268, "percentage": 60.93, "elapsed_time": "1:37:33", "remaining_time": "1:02:33"} -{"current_steps": 1250, "total_steps": 2050, "loss": 0.0455, "lr": 1.6585645005451623e-06, "epoch": 6.097560975609756, "percentage": 60.98, "elapsed_time": "1:37:38", "remaining_time": "1:02:29"} -{"current_steps": 1251, "total_steps": 2050, "loss": 0.0499, "lr": 1.6549578039787436e-06, "epoch": 6.102439024390244, "percentage": 61.02, "elapsed_time": "1:37:44", "remaining_time": "1:02:25"} -{"current_steps": 1252, "total_steps": 2050, "loss": 0.0118, "lr": 1.6513530920001998e-06, "epoch": 6.107317073170732, "percentage": 61.07, "elapsed_time": "1:37:46", "remaining_time": "1:02:19"} -{"current_steps": 1253, "total_steps": 2050, "loss": 0.0189, "lr": 1.6477503730752237e-06, "epoch": 6.11219512195122, "percentage": 61.12, "elapsed_time": "1:37:48", "remaining_time": "1:02:12"} -{"current_steps": 1254, "total_steps": 2050, "loss": 0.0492, "lr": 1.6441496556648278e-06, "epoch": 6.117073170731707, "percentage": 61.17, "elapsed_time": "1:37:50", "remaining_time": "1:02:06"} -{"current_steps": 1255, "total_steps": 2050, "loss": 0.1717, "lr": 1.6405509482253234e-06, "epoch": 6.121951219512195, "percentage": 61.22, "elapsed_time": "1:37:55", "remaining_time": "1:02:01"} -{"current_steps": 1256, "total_steps": 2050, "loss": 0.0194, "lr": 1.636954259208302e-06, "epoch": 6.126829268292683, "percentage": 61.27, "elapsed_time": "1:37:58", "remaining_time": "1:01:56"} -{"current_steps": 1257, "total_steps": 2050, "loss": 0.0334, "lr": 1.6333595970606143e-06, "epoch": 6.131707317073171, "percentage": 61.32, "elapsed_time": "1:38:04", "remaining_time": "1:01:52"} -{"current_steps": 1258, "total_steps": 2050, "loss": 0.0705, "lr": 1.62976697022435e-06, "epoch": 6.136585365853659, "percentage": 61.37, "elapsed_time": "1:38:06", "remaining_time": "1:01:45"} -{"current_steps": 1259, "total_steps": 2050, "loss": 0.0322, "lr": 1.6261763871368225e-06, "epoch": 6.1414634146341465, "percentage": 61.41, "elapsed_time": "1:38:07", "remaining_time": "1:01:39"} -{"current_steps": 1260, "total_steps": 2050, "loss": 0.0653, "lr": 1.6225878562305403e-06, "epoch": 6.146341463414634, "percentage": 61.46, "elapsed_time": "1:38:13", "remaining_time": "1:01:35"} -{"current_steps": 1261, "total_steps": 2050, "loss": 0.0557, "lr": 1.6190013859331958e-06, "epoch": 6.151219512195122, "percentage": 61.51, "elapsed_time": "1:38:16", "remaining_time": "1:01:29"} -{"current_steps": 1262, "total_steps": 2050, "loss": 0.0277, "lr": 1.6154169846676415e-06, "epoch": 6.15609756097561, "percentage": 61.56, "elapsed_time": "1:38:21", "remaining_time": "1:01:25"} -{"current_steps": 1263, "total_steps": 2050, "loss": 0.0305, "lr": 1.6118346608518698e-06, "epoch": 6.160975609756098, "percentage": 61.61, "elapsed_time": "1:38:24", "remaining_time": "1:01:19"} -{"current_steps": 1264, "total_steps": 2050, "loss": 0.0093, "lr": 1.6082544228989958e-06, "epoch": 6.1658536585365855, "percentage": 61.66, "elapsed_time": "1:38:25", "remaining_time": "1:01:12"} -{"current_steps": 1265, "total_steps": 2050, "loss": 0.0198, "lr": 1.6046762792172336e-06, "epoch": 6.170731707317073, "percentage": 61.71, "elapsed_time": "1:38:27", "remaining_time": "1:01:05"} -{"current_steps": 1266, "total_steps": 2050, "loss": 0.0673, "lr": 1.6011002382098806e-06, "epoch": 6.175609756097561, "percentage": 61.76, "elapsed_time": "1:38:32", "remaining_time": "1:01:01"} -{"current_steps": 1267, "total_steps": 2050, "loss": 0.0115, "lr": 1.5975263082752968e-06, "epoch": 6.180487804878049, "percentage": 61.8, "elapsed_time": "1:38:36", "remaining_time": "1:00:56"} -{"current_steps": 1268, "total_steps": 2050, "loss": 0.0529, "lr": 1.5939544978068816e-06, "epoch": 6.185365853658537, "percentage": 61.85, "elapsed_time": "1:38:42", "remaining_time": "1:00:52"} -{"current_steps": 1269, "total_steps": 2050, "loss": 0.0643, "lr": 1.590384815193059e-06, "epoch": 6.190243902439025, "percentage": 61.9, "elapsed_time": "1:38:47", "remaining_time": "1:00:48"} -{"current_steps": 1270, "total_steps": 2050, "loss": 0.064, "lr": 1.5868172688172559e-06, "epoch": 6.195121951219512, "percentage": 61.95, "elapsed_time": "1:38:50", "remaining_time": "1:00:42"} -{"current_steps": 1271, "total_steps": 2050, "loss": 0.0676, "lr": 1.5832518670578802e-06, "epoch": 6.2, "percentage": 62.0, "elapsed_time": "1:38:55", "remaining_time": "1:00:37"} -{"current_steps": 1272, "total_steps": 2050, "loss": 0.074, "lr": 1.5796886182883053e-06, "epoch": 6.204878048780488, "percentage": 62.05, "elapsed_time": "1:38:58", "remaining_time": "1:00:32"} -{"current_steps": 1273, "total_steps": 2050, "loss": 0.0311, "lr": 1.5761275308768476e-06, "epoch": 6.209756097560976, "percentage": 62.1, "elapsed_time": "1:39:02", "remaining_time": "1:00:27"} -{"current_steps": 1274, "total_steps": 2050, "loss": 0.0108, "lr": 1.5725686131867462e-06, "epoch": 6.214634146341464, "percentage": 62.15, "elapsed_time": "1:39:04", "remaining_time": "1:00:20"} -{"current_steps": 1275, "total_steps": 2050, "loss": 0.0464, "lr": 1.569011873576147e-06, "epoch": 6.219512195121951, "percentage": 62.2, "elapsed_time": "1:39:07", "remaining_time": "1:00:15"} -{"current_steps": 1276, "total_steps": 2050, "loss": 0.0221, "lr": 1.5654573203980782e-06, "epoch": 6.224390243902439, "percentage": 62.24, "elapsed_time": "1:39:10", "remaining_time": "1:00:09"} -{"current_steps": 1277, "total_steps": 2050, "loss": 0.0693, "lr": 1.5619049620004354e-06, "epoch": 6.229268292682927, "percentage": 62.29, "elapsed_time": "1:39:13", "remaining_time": "1:00:03"} -{"current_steps": 1278, "total_steps": 2050, "loss": 0.0198, "lr": 1.5583548067259584e-06, "epoch": 6.234146341463415, "percentage": 62.34, "elapsed_time": "1:39:19", "remaining_time": "0:59:59"} -{"current_steps": 1279, "total_steps": 2050, "loss": 0.0687, "lr": 1.5548068629122126e-06, "epoch": 6.239024390243903, "percentage": 62.39, "elapsed_time": "1:39:23", "remaining_time": "0:59:54"} -{"current_steps": 1280, "total_steps": 2050, "loss": 0.053, "lr": 1.5512611388915711e-06, "epoch": 6.2439024390243905, "percentage": 62.44, "elapsed_time": "1:39:26", "remaining_time": "0:59:49"} -{"current_steps": 1281, "total_steps": 2050, "loss": 0.2076, "lr": 1.5477176429911934e-06, "epoch": 6.248780487804878, "percentage": 62.49, "elapsed_time": "1:39:30", "remaining_time": "0:59:44"} -{"current_steps": 1282, "total_steps": 2050, "loss": 0.0108, "lr": 1.5441763835330048e-06, "epoch": 6.253658536585366, "percentage": 62.54, "elapsed_time": "1:39:32", "remaining_time": "0:59:37"} -{"current_steps": 1283, "total_steps": 2050, "loss": 0.0114, "lr": 1.5406373688336807e-06, "epoch": 6.258536585365854, "percentage": 62.59, "elapsed_time": "1:39:34", "remaining_time": "0:59:31"} -{"current_steps": 1284, "total_steps": 2050, "loss": 0.0209, "lr": 1.5371006072046225e-06, "epoch": 6.263414634146342, "percentage": 62.63, "elapsed_time": "1:39:37", "remaining_time": "0:59:26"} -{"current_steps": 1285, "total_steps": 2050, "loss": 0.0741, "lr": 1.5335661069519408e-06, "epoch": 6.2682926829268295, "percentage": 62.68, "elapsed_time": "1:39:40", "remaining_time": "0:59:20"} -{"current_steps": 1286, "total_steps": 2050, "loss": 0.0121, "lr": 1.5300338763764371e-06, "epoch": 6.273170731707317, "percentage": 62.73, "elapsed_time": "1:39:42", "remaining_time": "0:59:14"} -{"current_steps": 1287, "total_steps": 2050, "loss": 0.0226, "lr": 1.5265039237735804e-06, "epoch": 6.278048780487805, "percentage": 62.78, "elapsed_time": "1:39:44", "remaining_time": "0:59:07"} -{"current_steps": 1288, "total_steps": 2050, "loss": 0.0116, "lr": 1.5229762574334903e-06, "epoch": 6.282926829268293, "percentage": 62.83, "elapsed_time": "1:39:47", "remaining_time": "0:59:02"} -{"current_steps": 1289, "total_steps": 2050, "loss": 0.0775, "lr": 1.5194508856409181e-06, "epoch": 6.287804878048781, "percentage": 62.88, "elapsed_time": "1:39:49", "remaining_time": "0:58:56"} -{"current_steps": 1290, "total_steps": 2050, "loss": 0.0355, "lr": 1.515927816675225e-06, "epoch": 6.2926829268292686, "percentage": 62.93, "elapsed_time": "1:39:54", "remaining_time": "0:58:51"} -{"current_steps": 1291, "total_steps": 2050, "loss": 0.0127, "lr": 1.5124070588103648e-06, "epoch": 6.297560975609756, "percentage": 62.98, "elapsed_time": "1:39:56", "remaining_time": "0:58:45"} -{"current_steps": 1292, "total_steps": 2050, "loss": 0.0188, "lr": 1.5088886203148643e-06, "epoch": 6.302439024390244, "percentage": 63.02, "elapsed_time": "1:40:00", "remaining_time": "0:58:40"} -{"current_steps": 1293, "total_steps": 2050, "loss": 0.0845, "lr": 1.505372509451801e-06, "epoch": 6.307317073170732, "percentage": 63.07, "elapsed_time": "1:40:03", "remaining_time": "0:58:34"} -{"current_steps": 1294, "total_steps": 2050, "loss": 0.0265, "lr": 1.5018587344787888e-06, "epoch": 6.31219512195122, "percentage": 63.12, "elapsed_time": "1:40:06", "remaining_time": "0:58:29"} -{"current_steps": 1295, "total_steps": 2050, "loss": 0.0833, "lr": 1.498347303647953e-06, "epoch": 6.317073170731708, "percentage": 63.17, "elapsed_time": "1:40:09", "remaining_time": "0:58:23"} -{"current_steps": 1296, "total_steps": 2050, "loss": 0.0416, "lr": 1.4948382252059158e-06, "epoch": 6.321951219512195, "percentage": 63.22, "elapsed_time": "1:40:14", "remaining_time": "0:58:19"} -{"current_steps": 1297, "total_steps": 2050, "loss": 0.0614, "lr": 1.4913315073937742e-06, "epoch": 6.326829268292683, "percentage": 63.27, "elapsed_time": "1:40:17", "remaining_time": "0:58:13"} -{"current_steps": 1298, "total_steps": 2050, "loss": 0.0601, "lr": 1.4878271584470805e-06, "epoch": 6.331707317073171, "percentage": 63.32, "elapsed_time": "1:40:19", "remaining_time": "0:58:07"} -{"current_steps": 1299, "total_steps": 2050, "loss": 0.0189, "lr": 1.4843251865958242e-06, "epoch": 6.336585365853659, "percentage": 63.37, "elapsed_time": "1:40:22", "remaining_time": "0:58:02"} -{"current_steps": 1300, "total_steps": 2050, "loss": 0.038, "lr": 1.4808256000644128e-06, "epoch": 6.341463414634147, "percentage": 63.41, "elapsed_time": "1:40:25", "remaining_time": "0:57:56"} -{"current_steps": 1301, "total_steps": 2050, "loss": 0.041, "lr": 1.4773284070716504e-06, "epoch": 6.3463414634146345, "percentage": 63.46, "elapsed_time": "1:40:28", "remaining_time": "0:57:50"} -{"current_steps": 1302, "total_steps": 2050, "loss": 0.0227, "lr": 1.473833615830722e-06, "epoch": 6.351219512195122, "percentage": 63.51, "elapsed_time": "1:40:31", "remaining_time": "0:57:45"} -{"current_steps": 1303, "total_steps": 2050, "loss": 0.039, "lr": 1.4703412345491692e-06, "epoch": 6.35609756097561, "percentage": 63.56, "elapsed_time": "1:40:38", "remaining_time": "0:57:41"} -{"current_steps": 1304, "total_steps": 2050, "loss": 0.0431, "lr": 1.4668512714288763e-06, "epoch": 6.360975609756098, "percentage": 63.61, "elapsed_time": "1:40:44", "remaining_time": "0:57:37"} -{"current_steps": 1305, "total_steps": 2050, "loss": 0.013, "lr": 1.4633637346660478e-06, "epoch": 6.365853658536586, "percentage": 63.66, "elapsed_time": "1:40:48", "remaining_time": "0:57:33"} -{"current_steps": 1306, "total_steps": 2050, "loss": 0.0181, "lr": 1.4598786324511892e-06, "epoch": 6.3707317073170735, "percentage": 63.71, "elapsed_time": "1:40:50", "remaining_time": "0:57:26"} -{"current_steps": 1307, "total_steps": 2050, "loss": 0.0248, "lr": 1.456395972969089e-06, "epoch": 6.375609756097561, "percentage": 63.76, "elapsed_time": "1:40:57", "remaining_time": "0:57:23"} -{"current_steps": 1308, "total_steps": 2050, "loss": 0.0561, "lr": 1.4529157643987995e-06, "epoch": 6.380487804878049, "percentage": 63.8, "elapsed_time": "1:40:59", "remaining_time": "0:57:17"} -{"current_steps": 1309, "total_steps": 2050, "loss": 0.0593, "lr": 1.4494380149136162e-06, "epoch": 6.385365853658537, "percentage": 63.85, "elapsed_time": "1:41:03", "remaining_time": "0:57:12"} -{"current_steps": 1310, "total_steps": 2050, "loss": 0.0257, "lr": 1.4459627326810576e-06, "epoch": 6.390243902439025, "percentage": 63.9, "elapsed_time": "1:41:05", "remaining_time": "0:57:06"} -{"current_steps": 1311, "total_steps": 2050, "loss": 0.0223, "lr": 1.4424899258628533e-06, "epoch": 6.3951219512195125, "percentage": 63.95, "elapsed_time": "1:41:08", "remaining_time": "0:57:00"} -{"current_steps": 1312, "total_steps": 2050, "loss": 0.0112, "lr": 1.439019602614914e-06, "epoch": 6.4, "percentage": 64.0, "elapsed_time": "1:41:11", "remaining_time": "0:56:55"} -{"current_steps": 1313, "total_steps": 2050, "loss": 0.068, "lr": 1.4355517710873184e-06, "epoch": 6.404878048780488, "percentage": 64.05, "elapsed_time": "1:41:17", "remaining_time": "0:56:51"} -{"current_steps": 1314, "total_steps": 2050, "loss": 0.0825, "lr": 1.432086439424297e-06, "epoch": 6.409756097560976, "percentage": 64.1, "elapsed_time": "1:41:19", "remaining_time": "0:56:45"} -{"current_steps": 1315, "total_steps": 2050, "loss": 0.1812, "lr": 1.428623615764206e-06, "epoch": 6.414634146341464, "percentage": 64.15, "elapsed_time": "1:41:22", "remaining_time": "0:56:39"} -{"current_steps": 1316, "total_steps": 2050, "loss": 0.0207, "lr": 1.4251633082395117e-06, "epoch": 6.419512195121952, "percentage": 64.2, "elapsed_time": "1:41:27", "remaining_time": "0:56:35"} -{"current_steps": 1317, "total_steps": 2050, "loss": 0.0617, "lr": 1.4217055249767734e-06, "epoch": 6.424390243902439, "percentage": 64.24, "elapsed_time": "1:41:34", "remaining_time": "0:56:31"} -{"current_steps": 1318, "total_steps": 2050, "loss": 0.0137, "lr": 1.4182502740966203e-06, "epoch": 6.429268292682927, "percentage": 64.29, "elapsed_time": "1:41:35", "remaining_time": "0:56:25"} -{"current_steps": 1319, "total_steps": 2050, "loss": 0.0329, "lr": 1.4147975637137334e-06, "epoch": 6.434146341463415, "percentage": 64.34, "elapsed_time": "1:41:37", "remaining_time": "0:56:19"} -{"current_steps": 1320, "total_steps": 2050, "loss": 0.0487, "lr": 1.411347401936831e-06, "epoch": 6.439024390243903, "percentage": 64.39, "elapsed_time": "1:41:40", "remaining_time": "0:56:13"} -{"current_steps": 1321, "total_steps": 2050, "loss": 0.0582, "lr": 1.4078997968686425e-06, "epoch": 6.443902439024391, "percentage": 64.44, "elapsed_time": "1:41:43", "remaining_time": "0:56:08"} -{"current_steps": 1322, "total_steps": 2050, "loss": 0.0336, "lr": 1.404454756605893e-06, "epoch": 6.4487804878048784, "percentage": 64.49, "elapsed_time": "1:41:49", "remaining_time": "0:56:04"} -{"current_steps": 1323, "total_steps": 2050, "loss": 0.1372, "lr": 1.4010122892392872e-06, "epoch": 6.453658536585366, "percentage": 64.54, "elapsed_time": "1:41:55", "remaining_time": "0:56:00"} -{"current_steps": 1324, "total_steps": 2050, "loss": 0.0452, "lr": 1.3975724028534842e-06, "epoch": 6.458536585365854, "percentage": 64.59, "elapsed_time": "1:42:01", "remaining_time": "0:55:56"} -{"current_steps": 1325, "total_steps": 2050, "loss": 0.0431, "lr": 1.394135105527083e-06, "epoch": 6.463414634146342, "percentage": 64.63, "elapsed_time": "1:42:04", "remaining_time": "0:55:51"} -{"current_steps": 1326, "total_steps": 2050, "loss": 0.0242, "lr": 1.3907004053326006e-06, "epoch": 6.46829268292683, "percentage": 64.68, "elapsed_time": "1:42:07", "remaining_time": "0:55:45"} -{"current_steps": 1327, "total_steps": 2050, "loss": 0.0293, "lr": 1.387268310336458e-06, "epoch": 6.473170731707317, "percentage": 64.73, "elapsed_time": "1:42:09", "remaining_time": "0:55:39"} -{"current_steps": 1328, "total_steps": 2050, "loss": 0.0232, "lr": 1.3838388285989552e-06, "epoch": 6.478048780487805, "percentage": 64.78, "elapsed_time": "1:42:12", "remaining_time": "0:55:34"} -{"current_steps": 1329, "total_steps": 2050, "loss": 0.0256, "lr": 1.380411968174254e-06, "epoch": 6.482926829268292, "percentage": 64.83, "elapsed_time": "1:42:16", "remaining_time": "0:55:29"} -{"current_steps": 1330, "total_steps": 2050, "loss": 0.1285, "lr": 1.3769877371103635e-06, "epoch": 6.487804878048781, "percentage": 64.88, "elapsed_time": "1:42:19", "remaining_time": "0:55:23"} -{"current_steps": 1331, "total_steps": 2050, "loss": 0.1621, "lr": 1.373566143449115e-06, "epoch": 6.492682926829268, "percentage": 64.93, "elapsed_time": "1:42:25", "remaining_time": "0:55:19"} -{"current_steps": 1332, "total_steps": 2050, "loss": 0.0126, "lr": 1.3701471952261457e-06, "epoch": 6.4975609756097565, "percentage": 64.98, "elapsed_time": "1:42:30", "remaining_time": "0:55:15"} -{"current_steps": 1333, "total_steps": 2050, "loss": 0.0211, "lr": 1.3667309004708832e-06, "epoch": 6.5024390243902435, "percentage": 65.02, "elapsed_time": "1:42:31", "remaining_time": "0:55:09"} -{"current_steps": 1334, "total_steps": 2050, "loss": 0.062, "lr": 1.3633172672065195e-06, "epoch": 6.507317073170732, "percentage": 65.07, "elapsed_time": "1:42:33", "remaining_time": "0:55:02"} -{"current_steps": 1335, "total_steps": 2050, "loss": 0.0126, "lr": 1.359906303449997e-06, "epoch": 6.512195121951219, "percentage": 65.12, "elapsed_time": "1:42:36", "remaining_time": "0:54:57"} -{"current_steps": 1336, "total_steps": 2050, "loss": 0.0111, "lr": 1.3564980172119913e-06, "epoch": 6.517073170731708, "percentage": 65.17, "elapsed_time": "1:42:38", "remaining_time": "0:54:51"} -{"current_steps": 1337, "total_steps": 2050, "loss": 0.1024, "lr": 1.3530924164968873e-06, "epoch": 6.521951219512195, "percentage": 65.22, "elapsed_time": "1:42:42", "remaining_time": "0:54:46"} -{"current_steps": 1338, "total_steps": 2050, "loss": 0.0254, "lr": 1.3496895093027617e-06, "epoch": 6.526829268292683, "percentage": 65.27, "elapsed_time": "1:42:47", "remaining_time": "0:54:41"} -{"current_steps": 1339, "total_steps": 2050, "loss": 0.0188, "lr": 1.3462893036213706e-06, "epoch": 6.53170731707317, "percentage": 65.32, "elapsed_time": "1:42:50", "remaining_time": "0:54:36"} -{"current_steps": 1340, "total_steps": 2050, "loss": 0.0195, "lr": 1.3428918074381203e-06, "epoch": 6.536585365853659, "percentage": 65.37, "elapsed_time": "1:42:56", "remaining_time": "0:54:32"} -{"current_steps": 1341, "total_steps": 2050, "loss": 0.0317, "lr": 1.3394970287320553e-06, "epoch": 6.541463414634146, "percentage": 65.41, "elapsed_time": "1:43:00", "remaining_time": "0:54:27"} -{"current_steps": 1342, "total_steps": 2050, "loss": 0.0191, "lr": 1.3361049754758404e-06, "epoch": 6.546341463414635, "percentage": 65.46, "elapsed_time": "1:43:04", "remaining_time": "0:54:22"} -{"current_steps": 1343, "total_steps": 2050, "loss": 0.0079, "lr": 1.3327156556357369e-06, "epoch": 6.5512195121951216, "percentage": 65.51, "elapsed_time": "1:43:07", "remaining_time": "0:54:17"} -{"current_steps": 1344, "total_steps": 2050, "loss": 0.0345, "lr": 1.3293290771715875e-06, "epoch": 6.55609756097561, "percentage": 65.56, "elapsed_time": "1:43:09", "remaining_time": "0:54:11"} -{"current_steps": 1345, "total_steps": 2050, "loss": 0.0409, "lr": 1.3259452480367963e-06, "epoch": 6.560975609756097, "percentage": 65.61, "elapsed_time": "1:43:15", "remaining_time": "0:54:07"} -{"current_steps": 1346, "total_steps": 2050, "loss": 0.0494, "lr": 1.3225641761783126e-06, "epoch": 6.565853658536585, "percentage": 65.66, "elapsed_time": "1:43:21", "remaining_time": "0:54:03"} -{"current_steps": 1347, "total_steps": 2050, "loss": 0.0842, "lr": 1.3191858695366084e-06, "epoch": 6.570731707317073, "percentage": 65.71, "elapsed_time": "1:43:23", "remaining_time": "0:53:57"} -{"current_steps": 1348, "total_steps": 2050, "loss": 0.0399, "lr": 1.3158103360456603e-06, "epoch": 6.575609756097561, "percentage": 65.76, "elapsed_time": "1:43:30", "remaining_time": "0:53:54"} -{"current_steps": 1349, "total_steps": 2050, "loss": 0.0272, "lr": 1.3124375836329362e-06, "epoch": 6.580487804878048, "percentage": 65.8, "elapsed_time": "1:43:32", "remaining_time": "0:53:48"} -{"current_steps": 1350, "total_steps": 2050, "loss": 0.007, "lr": 1.3090676202193692e-06, "epoch": 6.585365853658536, "percentage": 65.85, "elapsed_time": "1:43:34", "remaining_time": "0:53:42"} -{"current_steps": 1351, "total_steps": 2050, "loss": 0.016, "lr": 1.3057004537193424e-06, "epoch": 6.590243902439024, "percentage": 65.9, "elapsed_time": "1:43:37", "remaining_time": "0:53:37"} -{"current_steps": 1352, "total_steps": 2050, "loss": 0.016, "lr": 1.302336092040673e-06, "epoch": 6.595121951219512, "percentage": 65.95, "elapsed_time": "1:43:40", "remaining_time": "0:53:31"} -{"current_steps": 1353, "total_steps": 2050, "loss": 0.0172, "lr": 1.298974543084589e-06, "epoch": 6.6, "percentage": 66.0, "elapsed_time": "1:43:43", "remaining_time": "0:53:26"} -{"current_steps": 1354, "total_steps": 2050, "loss": 0.0412, "lr": 1.2956158147457116e-06, "epoch": 6.6048780487804875, "percentage": 66.05, "elapsed_time": "1:43:47", "remaining_time": "0:53:21"} -{"current_steps": 1355, "total_steps": 2050, "loss": 0.0181, "lr": 1.2922599149120412e-06, "epoch": 6.609756097560975, "percentage": 66.1, "elapsed_time": "1:43:54", "remaining_time": "0:53:17"} -{"current_steps": 1356, "total_steps": 2050, "loss": 0.04, "lr": 1.2889068514649328e-06, "epoch": 6.614634146341463, "percentage": 66.15, "elapsed_time": "1:44:01", "remaining_time": "0:53:14"} -{"current_steps": 1357, "total_steps": 2050, "loss": 0.0108, "lr": 1.2855566322790796e-06, "epoch": 6.619512195121951, "percentage": 66.2, "elapsed_time": "1:44:04", "remaining_time": "0:53:09"} -{"current_steps": 1358, "total_steps": 2050, "loss": 0.0284, "lr": 1.2822092652224989e-06, "epoch": 6.624390243902439, "percentage": 66.24, "elapsed_time": "1:44:09", "remaining_time": "0:53:04"} -{"current_steps": 1359, "total_steps": 2050, "loss": 0.0128, "lr": 1.2788647581565048e-06, "epoch": 6.6292682926829265, "percentage": 66.29, "elapsed_time": "1:44:11", "remaining_time": "0:52:58"} -{"current_steps": 1360, "total_steps": 2050, "loss": 0.0184, "lr": 1.275523118935697e-06, "epoch": 6.634146341463414, "percentage": 66.34, "elapsed_time": "1:44:15", "remaining_time": "0:52:53"} -{"current_steps": 1361, "total_steps": 2050, "loss": 0.0313, "lr": 1.2721843554079418e-06, "epoch": 6.639024390243902, "percentage": 66.39, "elapsed_time": "1:44:18", "remaining_time": "0:52:48"} -{"current_steps": 1362, "total_steps": 2050, "loss": 0.1184, "lr": 1.2688484754143493e-06, "epoch": 6.64390243902439, "percentage": 66.44, "elapsed_time": "1:44:21", "remaining_time": "0:52:42"} -{"current_steps": 1363, "total_steps": 2050, "loss": 0.0353, "lr": 1.2655154867892577e-06, "epoch": 6.648780487804878, "percentage": 66.49, "elapsed_time": "1:44:23", "remaining_time": "0:52:36"} -{"current_steps": 1364, "total_steps": 2050, "loss": 0.0349, "lr": 1.2621853973602158e-06, "epoch": 6.6536585365853655, "percentage": 66.54, "elapsed_time": "1:44:24", "remaining_time": "0:52:30"} -{"current_steps": 1365, "total_steps": 2050, "loss": 0.0081, "lr": 1.2588582149479645e-06, "epoch": 6.658536585365853, "percentage": 66.59, "elapsed_time": "1:44:26", "remaining_time": "0:52:24"} -{"current_steps": 1366, "total_steps": 2050, "loss": 0.0279, "lr": 1.2555339473664151e-06, "epoch": 6.663414634146341, "percentage": 66.63, "elapsed_time": "1:44:29", "remaining_time": "0:52:19"} -{"current_steps": 1367, "total_steps": 2050, "loss": 0.0492, "lr": 1.2522126024226347e-06, "epoch": 6.668292682926829, "percentage": 66.68, "elapsed_time": "1:44:31", "remaining_time": "0:52:13"} -{"current_steps": 1368, "total_steps": 2050, "loss": 0.0084, "lr": 1.2488941879168278e-06, "epoch": 6.673170731707317, "percentage": 66.73, "elapsed_time": "1:44:33", "remaining_time": "0:52:07"} -{"current_steps": 1369, "total_steps": 2050, "loss": 0.0486, "lr": 1.2455787116423148e-06, "epoch": 6.678048780487805, "percentage": 66.78, "elapsed_time": "1:44:38", "remaining_time": "0:52:03"} -{"current_steps": 1370, "total_steps": 2050, "loss": 0.0319, "lr": 1.2422661813855158e-06, "epoch": 6.682926829268292, "percentage": 66.83, "elapsed_time": "1:44:45", "remaining_time": "0:51:59"} -{"current_steps": 1371, "total_steps": 2050, "loss": 0.016, "lr": 1.238956604925934e-06, "epoch": 6.68780487804878, "percentage": 66.88, "elapsed_time": "1:44:49", "remaining_time": "0:51:54"} -{"current_steps": 1372, "total_steps": 2050, "loss": 0.0557, "lr": 1.2356499900361333e-06, "epoch": 6.692682926829268, "percentage": 66.93, "elapsed_time": "1:44:51", "remaining_time": "0:51:48"} -{"current_steps": 1373, "total_steps": 2050, "loss": 0.0219, "lr": 1.2323463444817227e-06, "epoch": 6.697560975609756, "percentage": 66.98, "elapsed_time": "1:44:55", "remaining_time": "0:51:44"} -{"current_steps": 1374, "total_steps": 2050, "loss": 0.0849, "lr": 1.2290456760213405e-06, "epoch": 6.702439024390244, "percentage": 67.02, "elapsed_time": "1:44:58", "remaining_time": "0:51:38"} -{"current_steps": 1375, "total_steps": 2050, "loss": 0.0857, "lr": 1.2257479924066296e-06, "epoch": 6.7073170731707314, "percentage": 67.07, "elapsed_time": "1:45:05", "remaining_time": "0:51:35"} -{"current_steps": 1376, "total_steps": 2050, "loss": 0.0648, "lr": 1.2224533013822237e-06, "epoch": 6.712195121951219, "percentage": 67.12, "elapsed_time": "1:45:08", "remaining_time": "0:51:29"} -{"current_steps": 1377, "total_steps": 2050, "loss": 0.0426, "lr": 1.2191616106857312e-06, "epoch": 6.717073170731707, "percentage": 67.17, "elapsed_time": "1:45:11", "remaining_time": "0:51:24"} -{"current_steps": 1378, "total_steps": 2050, "loss": 0.0478, "lr": 1.2158729280477112e-06, "epoch": 6.721951219512195, "percentage": 67.22, "elapsed_time": "1:45:13", "remaining_time": "0:51:18"} -{"current_steps": 1379, "total_steps": 2050, "loss": 0.0273, "lr": 1.2125872611916578e-06, "epoch": 6.726829268292683, "percentage": 67.27, "elapsed_time": "1:45:17", "remaining_time": "0:51:14"} -{"current_steps": 1380, "total_steps": 2050, "loss": 0.0201, "lr": 1.2093046178339869e-06, "epoch": 6.7317073170731705, "percentage": 67.32, "elapsed_time": "1:45:20", "remaining_time": "0:51:08"} -{"current_steps": 1381, "total_steps": 2050, "loss": 0.0148, "lr": 1.206025005684009e-06, "epoch": 6.736585365853658, "percentage": 67.37, "elapsed_time": "1:45:24", "remaining_time": "0:51:03"} -{"current_steps": 1382, "total_steps": 2050, "loss": 0.0073, "lr": 1.202748432443918e-06, "epoch": 6.741463414634146, "percentage": 67.41, "elapsed_time": "1:45:26", "remaining_time": "0:50:57"} -{"current_steps": 1383, "total_steps": 2050, "loss": 0.0344, "lr": 1.1994749058087695e-06, "epoch": 6.746341463414634, "percentage": 67.46, "elapsed_time": "1:45:27", "remaining_time": "0:50:51"} -{"current_steps": 1384, "total_steps": 2050, "loss": 0.0837, "lr": 1.196204433466467e-06, "epoch": 6.751219512195122, "percentage": 67.51, "elapsed_time": "1:45:33", "remaining_time": "0:50:47"} -{"current_steps": 1385, "total_steps": 2050, "loss": 0.0425, "lr": 1.192937023097738e-06, "epoch": 6.7560975609756095, "percentage": 67.56, "elapsed_time": "1:45:36", "remaining_time": "0:50:42"} -{"current_steps": 1386, "total_steps": 2050, "loss": 0.0065, "lr": 1.1896726823761195e-06, "epoch": 6.760975609756097, "percentage": 67.61, "elapsed_time": "1:45:39", "remaining_time": "0:50:37"} -{"current_steps": 1387, "total_steps": 2050, "loss": 0.0133, "lr": 1.1864114189679413e-06, "epoch": 6.765853658536585, "percentage": 67.66, "elapsed_time": "1:45:42", "remaining_time": "0:50:31"} -{"current_steps": 1388, "total_steps": 2050, "loss": 0.0188, "lr": 1.183153240532304e-06, "epoch": 6.770731707317073, "percentage": 67.71, "elapsed_time": "1:45:43", "remaining_time": "0:50:25"} -{"current_steps": 1389, "total_steps": 2050, "loss": 0.0234, "lr": 1.179898154721063e-06, "epoch": 6.775609756097561, "percentage": 67.76, "elapsed_time": "1:45:48", "remaining_time": "0:50:21"} -{"current_steps": 1390, "total_steps": 2050, "loss": 0.0208, "lr": 1.1766461691788137e-06, "epoch": 6.780487804878049, "percentage": 67.8, "elapsed_time": "1:45:51", "remaining_time": "0:50:15"} -{"current_steps": 1391, "total_steps": 2050, "loss": 0.0728, "lr": 1.1733972915428665e-06, "epoch": 6.785365853658536, "percentage": 67.85, "elapsed_time": "1:45:54", "remaining_time": "0:50:10"} -{"current_steps": 1392, "total_steps": 2050, "loss": 0.0291, "lr": 1.1701515294432348e-06, "epoch": 6.790243902439024, "percentage": 67.9, "elapsed_time": "1:45:55", "remaining_time": "0:50:04"} -{"current_steps": 1393, "total_steps": 2050, "loss": 0.0988, "lr": 1.1669088905026156e-06, "epoch": 6.795121951219512, "percentage": 67.95, "elapsed_time": "1:45:59", "remaining_time": "0:49:59"} -{"current_steps": 1394, "total_steps": 2050, "loss": 0.0399, "lr": 1.163669382336371e-06, "epoch": 6.8, "percentage": 68.0, "elapsed_time": "1:46:02", "remaining_time": "0:49:53"} -{"current_steps": 1395, "total_steps": 2050, "loss": 0.0134, "lr": 1.160433012552508e-06, "epoch": 6.804878048780488, "percentage": 68.05, "elapsed_time": "1:46:03", "remaining_time": "0:49:47"} -{"current_steps": 1396, "total_steps": 2050, "loss": 0.0795, "lr": 1.1571997887516672e-06, "epoch": 6.809756097560975, "percentage": 68.1, "elapsed_time": "1:46:06", "remaining_time": "0:49:42"} -{"current_steps": 1397, "total_steps": 2050, "loss": 0.0329, "lr": 1.1539697185270982e-06, "epoch": 6.814634146341463, "percentage": 68.15, "elapsed_time": "1:46:08", "remaining_time": "0:49:36"} -{"current_steps": 1398, "total_steps": 2050, "loss": 0.0213, "lr": 1.1507428094646448e-06, "epoch": 6.819512195121951, "percentage": 68.2, "elapsed_time": "1:46:12", "remaining_time": "0:49:31"} -{"current_steps": 1399, "total_steps": 2050, "loss": 0.0172, "lr": 1.1475190691427255e-06, "epoch": 6.824390243902439, "percentage": 68.24, "elapsed_time": "1:46:18", "remaining_time": "0:49:27"} -{"current_steps": 1400, "total_steps": 2050, "loss": 0.0029, "lr": 1.1442985051323205e-06, "epoch": 6.829268292682927, "percentage": 68.29, "elapsed_time": "1:46:19", "remaining_time": "0:49:22"} -{"current_steps": 1401, "total_steps": 2050, "loss": 0.1638, "lr": 1.1410811249969475e-06, "epoch": 6.8341463414634145, "percentage": 68.34, "elapsed_time": "1:46:21", "remaining_time": "0:49:16"} -{"current_steps": 1402, "total_steps": 2050, "loss": 0.0779, "lr": 1.1378669362926468e-06, "epoch": 6.839024390243902, "percentage": 68.39, "elapsed_time": "1:46:27", "remaining_time": "0:49:12"} -{"current_steps": 1403, "total_steps": 2050, "loss": 0.0528, "lr": 1.1346559465679656e-06, "epoch": 6.84390243902439, "percentage": 68.44, "elapsed_time": "1:46:29", "remaining_time": "0:49:06"} -{"current_steps": 1404, "total_steps": 2050, "loss": 0.0057, "lr": 1.1314481633639374e-06, "epoch": 6.848780487804878, "percentage": 68.49, "elapsed_time": "1:46:30", "remaining_time": "0:49:00"} -{"current_steps": 1405, "total_steps": 2050, "loss": 0.1772, "lr": 1.1282435942140632e-06, "epoch": 6.853658536585366, "percentage": 68.54, "elapsed_time": "1:46:34", "remaining_time": "0:48:55"} -{"current_steps": 1406, "total_steps": 2050, "loss": 0.0176, "lr": 1.1250422466442992e-06, "epoch": 6.8585365853658535, "percentage": 68.59, "elapsed_time": "1:46:35", "remaining_time": "0:48:49"} -{"current_steps": 1407, "total_steps": 2050, "loss": 0.0184, "lr": 1.1218441281730334e-06, "epoch": 6.863414634146341, "percentage": 68.63, "elapsed_time": "1:46:38", "remaining_time": "0:48:44"} -{"current_steps": 1408, "total_steps": 2050, "loss": 0.0127, "lr": 1.1186492463110696e-06, "epoch": 6.868292682926829, "percentage": 68.68, "elapsed_time": "1:46:39", "remaining_time": "0:48:38"} -{"current_steps": 1409, "total_steps": 2050, "loss": 0.0094, "lr": 1.1154576085616135e-06, "epoch": 6.873170731707317, "percentage": 68.73, "elapsed_time": "1:46:42", "remaining_time": "0:48:32"} -{"current_steps": 1410, "total_steps": 2050, "loss": 0.0138, "lr": 1.1122692224202491e-06, "epoch": 6.878048780487805, "percentage": 68.78, "elapsed_time": "1:46:45", "remaining_time": "0:48:27"} -{"current_steps": 1411, "total_steps": 2050, "loss": 0.0821, "lr": 1.1090840953749253e-06, "epoch": 6.882926829268293, "percentage": 68.83, "elapsed_time": "1:46:49", "remaining_time": "0:48:22"} -{"current_steps": 1412, "total_steps": 2050, "loss": 0.0222, "lr": 1.1059022349059362e-06, "epoch": 6.88780487804878, "percentage": 68.88, "elapsed_time": "1:46:56", "remaining_time": "0:48:19"} -{"current_steps": 1413, "total_steps": 2050, "loss": 0.1183, "lr": 1.102723648485905e-06, "epoch": 6.892682926829268, "percentage": 68.93, "elapsed_time": "1:46:59", "remaining_time": "0:48:13"} -{"current_steps": 1414, "total_steps": 2050, "loss": 0.0528, "lr": 1.0995483435797643e-06, "epoch": 6.897560975609756, "percentage": 68.98, "elapsed_time": "1:47:01", "remaining_time": "0:48:08"} -{"current_steps": 1415, "total_steps": 2050, "loss": 0.0106, "lr": 1.0963763276447435e-06, "epoch": 6.902439024390244, "percentage": 69.02, "elapsed_time": "1:47:02", "remaining_time": "0:48:02"} -{"current_steps": 1416, "total_steps": 2050, "loss": 0.0454, "lr": 1.0932076081303442e-06, "epoch": 6.907317073170732, "percentage": 69.07, "elapsed_time": "1:47:06", "remaining_time": "0:47:57"} -{"current_steps": 1417, "total_steps": 2050, "loss": 0.022, "lr": 1.0900421924783272e-06, "epoch": 6.912195121951219, "percentage": 69.12, "elapsed_time": "1:47:11", "remaining_time": "0:47:53"} -{"current_steps": 1418, "total_steps": 2050, "loss": 0.0261, "lr": 1.0868800881226962e-06, "epoch": 6.917073170731707, "percentage": 69.17, "elapsed_time": "1:47:15", "remaining_time": "0:47:48"} -{"current_steps": 1419, "total_steps": 2050, "loss": 0.0257, "lr": 1.0837213024896764e-06, "epoch": 6.921951219512195, "percentage": 69.22, "elapsed_time": "1:47:16", "remaining_time": "0:47:42"} -{"current_steps": 1420, "total_steps": 2050, "loss": 0.087, "lr": 1.080565842997698e-06, "epoch": 6.926829268292683, "percentage": 69.27, "elapsed_time": "1:47:20", "remaining_time": "0:47:37"} -{"current_steps": 1421, "total_steps": 2050, "loss": 0.0147, "lr": 1.0774137170573826e-06, "epoch": 6.931707317073171, "percentage": 69.32, "elapsed_time": "1:47:26", "remaining_time": "0:47:33"} -{"current_steps": 1422, "total_steps": 2050, "loss": 0.1183, "lr": 1.074264932071521e-06, "epoch": 6.9365853658536585, "percentage": 69.37, "elapsed_time": "1:47:30", "remaining_time": "0:47:28"} -{"current_steps": 1423, "total_steps": 2050, "loss": 0.0186, "lr": 1.0711194954350568e-06, "epoch": 6.941463414634146, "percentage": 69.41, "elapsed_time": "1:47:32", "remaining_time": "0:47:23"} -{"current_steps": 1424, "total_steps": 2050, "loss": 0.0222, "lr": 1.0679774145350735e-06, "epoch": 6.946341463414634, "percentage": 69.46, "elapsed_time": "1:47:36", "remaining_time": "0:47:18"} -{"current_steps": 1425, "total_steps": 2050, "loss": 0.0824, "lr": 1.0648386967507703e-06, "epoch": 6.951219512195122, "percentage": 69.51, "elapsed_time": "1:47:40", "remaining_time": "0:47:13"} -{"current_steps": 1426, "total_steps": 2050, "loss": 0.0247, "lr": 1.0617033494534486e-06, "epoch": 6.95609756097561, "percentage": 69.56, "elapsed_time": "1:47:43", "remaining_time": "0:47:08"} -{"current_steps": 1427, "total_steps": 2050, "loss": 0.0142, "lr": 1.0585713800064964e-06, "epoch": 6.9609756097560975, "percentage": 69.61, "elapsed_time": "1:47:46", "remaining_time": "0:47:03"} -{"current_steps": 1428, "total_steps": 2050, "loss": 0.0681, "lr": 1.0554427957653663e-06, "epoch": 6.965853658536585, "percentage": 69.66, "elapsed_time": "1:47:49", "remaining_time": "0:46:58"} -{"current_steps": 1429, "total_steps": 2050, "loss": 0.0916, "lr": 1.0523176040775615e-06, "epoch": 6.970731707317073, "percentage": 69.71, "elapsed_time": "1:47:54", "remaining_time": "0:46:53"} -{"current_steps": 1430, "total_steps": 2050, "loss": 0.0611, "lr": 1.0491958122826173e-06, "epoch": 6.975609756097561, "percentage": 69.76, "elapsed_time": "1:48:00", "remaining_time": "0:46:49"} -{"current_steps": 1431, "total_steps": 2050, "loss": 0.0182, "lr": 1.0460774277120866e-06, "epoch": 6.980487804878049, "percentage": 69.8, "elapsed_time": "1:48:06", "remaining_time": "0:46:45"} -{"current_steps": 1432, "total_steps": 2050, "loss": 0.0084, "lr": 1.0429624576895177e-06, "epoch": 6.985365853658537, "percentage": 69.85, "elapsed_time": "1:48:07", "remaining_time": "0:46:39"} -{"current_steps": 1433, "total_steps": 2050, "loss": 0.0411, "lr": 1.03985090953044e-06, "epoch": 6.990243902439024, "percentage": 69.9, "elapsed_time": "1:48:09", "remaining_time": "0:46:34"} -{"current_steps": 1434, "total_steps": 2050, "loss": 0.0464, "lr": 1.0367427905423497e-06, "epoch": 6.995121951219512, "percentage": 69.95, "elapsed_time": "1:48:14", "remaining_time": "0:46:30"} -{"current_steps": 1435, "total_steps": 2050, "loss": 0.0124, "lr": 1.0336381080246858e-06, "epoch": 7.0, "percentage": 70.0, "elapsed_time": "1:48:18", "remaining_time": "0:46:24"} -{"current_steps": 1436, "total_steps": 2050, "loss": 0.0179, "lr": 1.0305368692688175e-06, "epoch": 7.004878048780488, "percentage": 70.05, "elapsed_time": "1:51:55", "remaining_time": "0:47:51"} -{"current_steps": 1437, "total_steps": 2050, "loss": 0.0119, "lr": 1.027439081558029e-06, "epoch": 7.009756097560976, "percentage": 70.1, "elapsed_time": "1:51:58", "remaining_time": "0:47:45"} -{"current_steps": 1438, "total_steps": 2050, "loss": 0.0278, "lr": 1.0243447521674967e-06, "epoch": 7.014634146341463, "percentage": 70.15, "elapsed_time": "1:52:00", "remaining_time": "0:47:40"} -{"current_steps": 1439, "total_steps": 2050, "loss": 0.1259, "lr": 1.021253888364276e-06, "epoch": 7.019512195121951, "percentage": 70.2, "elapsed_time": "1:52:03", "remaining_time": "0:47:34"} -{"current_steps": 1440, "total_steps": 2050, "loss": 0.0047, "lr": 1.018166497407284e-06, "epoch": 7.024390243902439, "percentage": 70.24, "elapsed_time": "1:52:06", "remaining_time": "0:47:29"} -{"current_steps": 1441, "total_steps": 2050, "loss": 0.0044, "lr": 1.0150825865472813e-06, "epoch": 7.029268292682927, "percentage": 70.29, "elapsed_time": "1:52:10", "remaining_time": "0:47:24"} -{"current_steps": 1442, "total_steps": 2050, "loss": 0.0044, "lr": 1.0120021630268542e-06, "epoch": 7.034146341463415, "percentage": 70.34, "elapsed_time": "1:52:12", "remaining_time": "0:47:18"} -{"current_steps": 1443, "total_steps": 2050, "loss": 0.0081, "lr": 1.0089252340804025e-06, "epoch": 7.0390243902439025, "percentage": 70.39, "elapsed_time": "1:52:18", "remaining_time": "0:47:14"} -{"current_steps": 1444, "total_steps": 2050, "loss": 0.0318, "lr": 1.0058518069341152e-06, "epoch": 7.04390243902439, "percentage": 70.44, "elapsed_time": "1:52:20", "remaining_time": "0:47:08"} -{"current_steps": 1445, "total_steps": 2050, "loss": 0.0067, "lr": 1.002781888805958e-06, "epoch": 7.048780487804878, "percentage": 70.49, "elapsed_time": "1:52:22", "remaining_time": "0:47:02"} -{"current_steps": 1446, "total_steps": 2050, "loss": 0.0064, "lr": 9.997154869056588e-07, "epoch": 7.053658536585366, "percentage": 70.54, "elapsed_time": "1:52:26", "remaining_time": "0:46:58"} -{"current_steps": 1447, "total_steps": 2050, "loss": 0.0057, "lr": 9.966526084346837e-07, "epoch": 7.058536585365854, "percentage": 70.59, "elapsed_time": "1:52:30", "remaining_time": "0:46:52"} -{"current_steps": 1448, "total_steps": 2050, "loss": 0.0365, "lr": 9.935932605862258e-07, "epoch": 7.0634146341463415, "percentage": 70.63, "elapsed_time": "1:52:32", "remaining_time": "0:46:47"} -{"current_steps": 1449, "total_steps": 2050, "loss": 0.0345, "lr": 9.905374505451853e-07, "epoch": 7.068292682926829, "percentage": 70.68, "elapsed_time": "1:52:36", "remaining_time": "0:46:42"} -{"current_steps": 1450, "total_steps": 2050, "loss": 0.0384, "lr": 9.874851854881565e-07, "epoch": 7.073170731707317, "percentage": 70.73, "elapsed_time": "1:52:39", "remaining_time": "0:46:36"} -{"current_steps": 1451, "total_steps": 2050, "loss": 0.0116, "lr": 9.844364725834058e-07, "epoch": 7.078048780487805, "percentage": 70.78, "elapsed_time": "1:52:41", "remaining_time": "0:46:31"} -{"current_steps": 1452, "total_steps": 2050, "loss": 0.0267, "lr": 9.813913189908571e-07, "epoch": 7.082926829268293, "percentage": 70.83, "elapsed_time": "1:52:44", "remaining_time": "0:46:26"} -{"current_steps": 1453, "total_steps": 2050, "loss": 0.0376, "lr": 9.783497318620783e-07, "epoch": 7.087804878048781, "percentage": 70.88, "elapsed_time": "1:52:48", "remaining_time": "0:46:21"} -{"current_steps": 1454, "total_steps": 2050, "loss": 0.0057, "lr": 9.75311718340258e-07, "epoch": 7.092682926829268, "percentage": 70.93, "elapsed_time": "1:52:51", "remaining_time": "0:46:15"} -{"current_steps": 1455, "total_steps": 2050, "loss": 0.0386, "lr": 9.722772855601927e-07, "epoch": 7.097560975609756, "percentage": 70.98, "elapsed_time": "1:52:56", "remaining_time": "0:46:11"} -{"current_steps": 1456, "total_steps": 2050, "loss": 0.006, "lr": 9.692464406482727e-07, "epoch": 7.102439024390244, "percentage": 71.02, "elapsed_time": "1:53:01", "remaining_time": "0:46:06"} -{"current_steps": 1457, "total_steps": 2050, "loss": 0.0066, "lr": 9.662191907224582e-07, "epoch": 7.107317073170732, "percentage": 71.07, "elapsed_time": "1:53:02", "remaining_time": "0:46:00"} -{"current_steps": 1458, "total_steps": 2050, "loss": 0.0201, "lr": 9.63195542892268e-07, "epoch": 7.11219512195122, "percentage": 71.12, "elapsed_time": "1:53:06", "remaining_time": "0:45:55"} -{"current_steps": 1459, "total_steps": 2050, "loss": 0.0112, "lr": 9.601755042587624e-07, "epoch": 7.117073170731707, "percentage": 71.17, "elapsed_time": "1:53:11", "remaining_time": "0:45:51"} -{"current_steps": 1460, "total_steps": 2050, "loss": 0.0066, "lr": 9.571590819145244e-07, "epoch": 7.121951219512195, "percentage": 71.22, "elapsed_time": "1:53:16", "remaining_time": "0:45:46"} -{"current_steps": 1461, "total_steps": 2050, "loss": 0.0136, "lr": 9.541462829436426e-07, "epoch": 7.126829268292683, "percentage": 71.27, "elapsed_time": "1:53:20", "remaining_time": "0:45:41"} -{"current_steps": 1462, "total_steps": 2050, "loss": 0.0228, "lr": 9.511371144217005e-07, "epoch": 7.131707317073171, "percentage": 71.32, "elapsed_time": "1:53:22", "remaining_time": "0:45:35"} -{"current_steps": 1463, "total_steps": 2050, "loss": 0.027, "lr": 9.481315834157512e-07, "epoch": 7.136585365853659, "percentage": 71.37, "elapsed_time": "1:53:24", "remaining_time": "0:45:30"} -{"current_steps": 1464, "total_steps": 2050, "loss": 0.0152, "lr": 9.451296969843058e-07, "epoch": 7.1414634146341465, "percentage": 71.41, "elapsed_time": "1:53:27", "remaining_time": "0:45:24"} -{"current_steps": 1465, "total_steps": 2050, "loss": 0.0071, "lr": 9.42131462177319e-07, "epoch": 7.146341463414634, "percentage": 71.46, "elapsed_time": "1:53:30", "remaining_time": "0:45:19"} -{"current_steps": 1466, "total_steps": 2050, "loss": 0.0054, "lr": 9.39136886036166e-07, "epoch": 7.151219512195122, "percentage": 71.51, "elapsed_time": "1:53:32", "remaining_time": "0:45:13"} -{"current_steps": 1467, "total_steps": 2050, "loss": 0.0067, "lr": 9.361459755936316e-07, "epoch": 7.15609756097561, "percentage": 71.56, "elapsed_time": "1:53:34", "remaining_time": "0:45:08"} -{"current_steps": 1468, "total_steps": 2050, "loss": 0.0105, "lr": 9.331587378738902e-07, "epoch": 7.160975609756098, "percentage": 71.61, "elapsed_time": "1:53:35", "remaining_time": "0:45:02"} -{"current_steps": 1469, "total_steps": 2050, "loss": 0.0331, "lr": 9.301751798924935e-07, "epoch": 7.1658536585365855, "percentage": 71.66, "elapsed_time": "1:53:39", "remaining_time": "0:44:57"} -{"current_steps": 1470, "total_steps": 2050, "loss": 0.0076, "lr": 9.27195308656349e-07, "epoch": 7.170731707317073, "percentage": 71.71, "elapsed_time": "1:53:42", "remaining_time": "0:44:51"} -{"current_steps": 1471, "total_steps": 2050, "loss": 0.0243, "lr": 9.24219131163705e-07, "epoch": 7.175609756097561, "percentage": 71.76, "elapsed_time": "1:53:43", "remaining_time": "0:44:45"} -{"current_steps": 1472, "total_steps": 2050, "loss": 0.0051, "lr": 9.212466544041385e-07, "epoch": 7.180487804878049, "percentage": 71.8, "elapsed_time": "1:53:45", "remaining_time": "0:44:40"} -{"current_steps": 1473, "total_steps": 2050, "loss": 0.0146, "lr": 9.182778853585325e-07, "epoch": 7.185365853658537, "percentage": 71.85, "elapsed_time": "1:53:46", "remaining_time": "0:44:34"} -{"current_steps": 1474, "total_steps": 2050, "loss": 0.0273, "lr": 9.153128309990622e-07, "epoch": 7.190243902439025, "percentage": 71.9, "elapsed_time": "1:53:50", "remaining_time": "0:44:29"} -{"current_steps": 1475, "total_steps": 2050, "loss": 0.0225, "lr": 9.123514982891813e-07, "epoch": 7.195121951219512, "percentage": 71.95, "elapsed_time": "1:53:57", "remaining_time": "0:44:25"} -{"current_steps": 1476, "total_steps": 2050, "loss": 0.0156, "lr": 9.093938941836012e-07, "epoch": 7.2, "percentage": 72.0, "elapsed_time": "1:54:03", "remaining_time": "0:44:21"} -{"current_steps": 1477, "total_steps": 2050, "loss": 0.0158, "lr": 9.064400256282757e-07, "epoch": 7.204878048780488, "percentage": 72.05, "elapsed_time": "1:54:06", "remaining_time": "0:44:16"} -{"current_steps": 1478, "total_steps": 2050, "loss": 0.0138, "lr": 9.034898995603894e-07, "epoch": 7.209756097560976, "percentage": 72.1, "elapsed_time": "1:54:11", "remaining_time": "0:44:11"} -{"current_steps": 1479, "total_steps": 2050, "loss": 0.0308, "lr": 9.00543522908334e-07, "epoch": 7.214634146341464, "percentage": 72.15, "elapsed_time": "1:54:15", "remaining_time": "0:44:06"} -{"current_steps": 1480, "total_steps": 2050, "loss": 0.006, "lr": 8.976009025916962e-07, "epoch": 7.219512195121951, "percentage": 72.2, "elapsed_time": "1:54:18", "remaining_time": "0:44:01"} -{"current_steps": 1481, "total_steps": 2050, "loss": 0.0121, "lr": 8.946620455212438e-07, "epoch": 7.224390243902439, "percentage": 72.24, "elapsed_time": "1:54:21", "remaining_time": "0:43:56"} -{"current_steps": 1482, "total_steps": 2050, "loss": 0.0424, "lr": 8.917269585989027e-07, "epoch": 7.229268292682927, "percentage": 72.29, "elapsed_time": "1:54:24", "remaining_time": "0:43:50"} -{"current_steps": 1483, "total_steps": 2050, "loss": 0.0189, "lr": 8.887956487177462e-07, "epoch": 7.234146341463415, "percentage": 72.34, "elapsed_time": "1:54:30", "remaining_time": "0:43:46"} -{"current_steps": 1484, "total_steps": 2050, "loss": 0.0118, "lr": 8.858681227619789e-07, "epoch": 7.239024390243903, "percentage": 72.39, "elapsed_time": "1:54:33", "remaining_time": "0:43:41"} -{"current_steps": 1485, "total_steps": 2050, "loss": 0.0043, "lr": 8.829443876069163e-07, "epoch": 7.2439024390243905, "percentage": 72.44, "elapsed_time": "1:54:38", "remaining_time": "0:43:36"} -{"current_steps": 1486, "total_steps": 2050, "loss": 0.0111, "lr": 8.800244501189722e-07, "epoch": 7.248780487804878, "percentage": 72.49, "elapsed_time": "1:54:41", "remaining_time": "0:43:31"} -{"current_steps": 1487, "total_steps": 2050, "loss": 0.0582, "lr": 8.771083171556407e-07, "epoch": 7.253658536585366, "percentage": 72.54, "elapsed_time": "1:54:47", "remaining_time": "0:43:27"} -{"current_steps": 1488, "total_steps": 2050, "loss": 0.0051, "lr": 8.741959955654833e-07, "epoch": 7.258536585365854, "percentage": 72.59, "elapsed_time": "1:54:52", "remaining_time": "0:43:23"} -{"current_steps": 1489, "total_steps": 2050, "loss": 0.0175, "lr": 8.712874921881082e-07, "epoch": 7.263414634146342, "percentage": 72.63, "elapsed_time": "1:54:59", "remaining_time": "0:43:19"} -{"current_steps": 1490, "total_steps": 2050, "loss": 0.0827, "lr": 8.683828138541559e-07, "epoch": 7.2682926829268295, "percentage": 72.68, "elapsed_time": "1:55:03", "remaining_time": "0:43:14"} -{"current_steps": 1491, "total_steps": 2050, "loss": 0.0347, "lr": 8.654819673852874e-07, "epoch": 7.273170731707317, "percentage": 72.73, "elapsed_time": "1:55:06", "remaining_time": "0:43:09"} -{"current_steps": 1492, "total_steps": 2050, "loss": 0.0055, "lr": 8.625849595941608e-07, "epoch": 7.278048780487805, "percentage": 72.78, "elapsed_time": "1:55:10", "remaining_time": "0:43:04"} -{"current_steps": 1493, "total_steps": 2050, "loss": 0.0043, "lr": 8.596917972844199e-07, "epoch": 7.282926829268293, "percentage": 72.83, "elapsed_time": "1:55:13", "remaining_time": "0:42:59"} -{"current_steps": 1494, "total_steps": 2050, "loss": 0.0176, "lr": 8.568024872506792e-07, "epoch": 7.287804878048781, "percentage": 72.88, "elapsed_time": "1:55:15", "remaining_time": "0:42:53"} -{"current_steps": 1495, "total_steps": 2050, "loss": 0.0025, "lr": 8.539170362785043e-07, "epoch": 7.2926829268292686, "percentage": 72.93, "elapsed_time": "1:55:18", "remaining_time": "0:42:48"} -{"current_steps": 1496, "total_steps": 2050, "loss": 0.0093, "lr": 8.510354511443975e-07, "epoch": 7.297560975609756, "percentage": 72.98, "elapsed_time": "1:55:24", "remaining_time": "0:42:44"} -{"current_steps": 1497, "total_steps": 2050, "loss": 0.0066, "lr": 8.48157738615784e-07, "epoch": 7.302439024390244, "percentage": 73.02, "elapsed_time": "1:55:27", "remaining_time": "0:42:39"} -{"current_steps": 1498, "total_steps": 2050, "loss": 0.0055, "lr": 8.452839054509926e-07, "epoch": 7.307317073170732, "percentage": 73.07, "elapsed_time": "1:55:32", "remaining_time": "0:42:34"} -{"current_steps": 1499, "total_steps": 2050, "loss": 0.0059, "lr": 8.42413958399241e-07, "epoch": 7.31219512195122, "percentage": 73.12, "elapsed_time": "1:55:35", "remaining_time": "0:42:29"} -{"current_steps": 1500, "total_steps": 2050, "loss": 0.0049, "lr": 8.39547904200623e-07, "epoch": 7.317073170731708, "percentage": 73.17, "elapsed_time": "1:55:37", "remaining_time": "0:42:23"} -{"current_steps": 1501, "total_steps": 2050, "loss": 0.0204, "lr": 8.366857495860869e-07, "epoch": 7.321951219512195, "percentage": 73.22, "elapsed_time": "1:55:41", "remaining_time": "0:42:19"} -{"current_steps": 1502, "total_steps": 2050, "loss": 0.0161, "lr": 8.338275012774247e-07, "epoch": 7.326829268292683, "percentage": 73.27, "elapsed_time": "1:55:45", "remaining_time": "0:42:13"} -{"current_steps": 1503, "total_steps": 2050, "loss": 0.0088, "lr": 8.309731659872522e-07, "epoch": 7.331707317073171, "percentage": 73.32, "elapsed_time": "1:55:48", "remaining_time": "0:42:08"} -{"current_steps": 1504, "total_steps": 2050, "loss": 0.0204, "lr": 8.281227504189992e-07, "epoch": 7.336585365853659, "percentage": 73.37, "elapsed_time": "1:55:53", "remaining_time": "0:42:04"} -{"current_steps": 1505, "total_steps": 2050, "loss": 0.0238, "lr": 8.252762612668869e-07, "epoch": 7.341463414634147, "percentage": 73.41, "elapsed_time": "1:56:00", "remaining_time": "0:42:00"} -{"current_steps": 1506, "total_steps": 2050, "loss": 0.0063, "lr": 8.224337052159154e-07, "epoch": 7.3463414634146345, "percentage": 73.46, "elapsed_time": "1:56:01", "remaining_time": "0:41:54"} -{"current_steps": 1507, "total_steps": 2050, "loss": 0.0123, "lr": 8.195950889418503e-07, "epoch": 7.351219512195122, "percentage": 73.51, "elapsed_time": "1:56:04", "remaining_time": "0:41:49"} -{"current_steps": 1508, "total_steps": 2050, "loss": 0.0296, "lr": 8.167604191112021e-07, "epoch": 7.35609756097561, "percentage": 73.56, "elapsed_time": "1:56:06", "remaining_time": "0:41:43"} -{"current_steps": 1509, "total_steps": 2050, "loss": 0.0148, "lr": 8.139297023812131e-07, "epoch": 7.360975609756098, "percentage": 73.61, "elapsed_time": "1:56:10", "remaining_time": "0:41:38"} -{"current_steps": 1510, "total_steps": 2050, "loss": 0.0057, "lr": 8.111029453998448e-07, "epoch": 7.365853658536586, "percentage": 73.66, "elapsed_time": "1:56:14", "remaining_time": "0:41:34"} -{"current_steps": 1511, "total_steps": 2050, "loss": 0.0098, "lr": 8.082801548057553e-07, "epoch": 7.3707317073170735, "percentage": 73.71, "elapsed_time": "1:56:16", "remaining_time": "0:41:28"} -{"current_steps": 1512, "total_steps": 2050, "loss": 0.007, "lr": 8.05461337228289e-07, "epoch": 7.375609756097561, "percentage": 73.76, "elapsed_time": "1:56:21", "remaining_time": "0:41:24"} -{"current_steps": 1513, "total_steps": 2050, "loss": 0.0058, "lr": 8.026464992874617e-07, "epoch": 7.380487804878049, "percentage": 73.8, "elapsed_time": "1:56:23", "remaining_time": "0:41:18"} -{"current_steps": 1514, "total_steps": 2050, "loss": 0.0011, "lr": 7.998356475939398e-07, "epoch": 7.385365853658537, "percentage": 73.85, "elapsed_time": "1:56:24", "remaining_time": "0:41:12"} -{"current_steps": 1515, "total_steps": 2050, "loss": 0.0041, "lr": 7.970287887490289e-07, "epoch": 7.390243902439025, "percentage": 73.9, "elapsed_time": "1:56:27", "remaining_time": "0:41:07"} -{"current_steps": 1516, "total_steps": 2050, "loss": 0.0027, "lr": 7.942259293446594e-07, "epoch": 7.3951219512195125, "percentage": 73.95, "elapsed_time": "1:56:30", "remaining_time": "0:41:02"} -{"current_steps": 1517, "total_steps": 2050, "loss": 0.006, "lr": 7.914270759633669e-07, "epoch": 7.4, "percentage": 74.0, "elapsed_time": "1:56:31", "remaining_time": "0:40:56"} -{"current_steps": 1518, "total_steps": 2050, "loss": 0.0066, "lr": 7.886322351782782e-07, "epoch": 7.404878048780488, "percentage": 74.05, "elapsed_time": "1:56:33", "remaining_time": "0:40:50"} -{"current_steps": 1519, "total_steps": 2050, "loss": 0.0133, "lr": 7.858414135530995e-07, "epoch": 7.409756097560976, "percentage": 74.1, "elapsed_time": "1:56:36", "remaining_time": "0:40:45"} -{"current_steps": 1520, "total_steps": 2050, "loss": 0.0092, "lr": 7.83054617642095e-07, "epoch": 7.414634146341464, "percentage": 74.15, "elapsed_time": "1:56:39", "remaining_time": "0:40:40"} -{"current_steps": 1521, "total_steps": 2050, "loss": 0.0113, "lr": 7.802718539900761e-07, "epoch": 7.419512195121952, "percentage": 74.2, "elapsed_time": "1:56:41", "remaining_time": "0:40:35"} -{"current_steps": 1522, "total_steps": 2050, "loss": 0.0045, "lr": 7.774931291323826e-07, "epoch": 7.424390243902439, "percentage": 74.24, "elapsed_time": "1:56:45", "remaining_time": "0:40:30"} -{"current_steps": 1523, "total_steps": 2050, "loss": 0.0692, "lr": 7.747184495948723e-07, "epoch": 7.429268292682927, "percentage": 74.29, "elapsed_time": "1:56:50", "remaining_time": "0:40:25"} -{"current_steps": 1524, "total_steps": 2050, "loss": 0.0462, "lr": 7.719478218939e-07, "epoch": 7.434146341463415, "percentage": 74.34, "elapsed_time": "1:56:54", "remaining_time": "0:40:20"} -{"current_steps": 1525, "total_steps": 2050, "loss": 0.008, "lr": 7.691812525363044e-07, "epoch": 7.439024390243903, "percentage": 74.39, "elapsed_time": "1:57:01", "remaining_time": "0:40:17"} -{"current_steps": 1526, "total_steps": 2050, "loss": 0.0042, "lr": 7.66418748019396e-07, "epoch": 7.443902439024391, "percentage": 74.44, "elapsed_time": "1:57:04", "remaining_time": "0:40:12"} -{"current_steps": 1527, "total_steps": 2050, "loss": 0.0033, "lr": 7.636603148309363e-07, "epoch": 7.4487804878048784, "percentage": 74.49, "elapsed_time": "1:57:07", "remaining_time": "0:40:06"} -{"current_steps": 1528, "total_steps": 2050, "loss": 0.0181, "lr": 7.609059594491253e-07, "epoch": 7.453658536585366, "percentage": 74.54, "elapsed_time": "1:57:13", "remaining_time": "0:40:02"} -{"current_steps": 1529, "total_steps": 2050, "loss": 0.1868, "lr": 7.581556883425886e-07, "epoch": 7.458536585365854, "percentage": 74.59, "elapsed_time": "1:57:17", "remaining_time": "0:39:57"} -{"current_steps": 1530, "total_steps": 2050, "loss": 0.0305, "lr": 7.55409507970358e-07, "epoch": 7.463414634146342, "percentage": 74.63, "elapsed_time": "1:57:23", "remaining_time": "0:39:53"} -{"current_steps": 1531, "total_steps": 2050, "loss": 0.0027, "lr": 7.526674247818569e-07, "epoch": 7.46829268292683, "percentage": 74.68, "elapsed_time": "1:57:26", "remaining_time": "0:39:48"} -{"current_steps": 1532, "total_steps": 2050, "loss": 0.019, "lr": 7.499294452168904e-07, "epoch": 7.473170731707317, "percentage": 74.73, "elapsed_time": "1:57:30", "remaining_time": "0:39:44"} -{"current_steps": 1533, "total_steps": 2050, "loss": 0.0101, "lr": 7.471955757056227e-07, "epoch": 7.478048780487805, "percentage": 74.78, "elapsed_time": "1:57:36", "remaining_time": "0:39:39"} -{"current_steps": 1534, "total_steps": 2050, "loss": 0.0066, "lr": 7.444658226685656e-07, "epoch": 7.482926829268292, "percentage": 74.83, "elapsed_time": "1:57:42", "remaining_time": "0:39:35"} -{"current_steps": 1535, "total_steps": 2050, "loss": 0.0139, "lr": 7.417401925165666e-07, "epoch": 7.487804878048781, "percentage": 74.88, "elapsed_time": "1:57:43", "remaining_time": "0:39:29"} -{"current_steps": 1536, "total_steps": 2050, "loss": 0.0053, "lr": 7.390186916507869e-07, "epoch": 7.492682926829268, "percentage": 74.93, "elapsed_time": "1:57:46", "remaining_time": "0:39:24"} -{"current_steps": 1537, "total_steps": 2050, "loss": 0.0031, "lr": 7.363013264626914e-07, "epoch": 7.4975609756097565, "percentage": 74.98, "elapsed_time": "1:57:48", "remaining_time": "0:39:19"} -{"current_steps": 1538, "total_steps": 2050, "loss": 0.0257, "lr": 7.335881033340334e-07, "epoch": 7.5024390243902435, "percentage": 75.02, "elapsed_time": "1:57:54", "remaining_time": "0:39:14"} -{"current_steps": 1539, "total_steps": 2050, "loss": 0.0092, "lr": 7.308790286368373e-07, "epoch": 7.507317073170732, "percentage": 75.07, "elapsed_time": "1:57:58", "remaining_time": "0:39:10"} -{"current_steps": 1540, "total_steps": 2050, "loss": 0.024, "lr": 7.281741087333846e-07, "epoch": 7.512195121951219, "percentage": 75.12, "elapsed_time": "1:58:00", "remaining_time": "0:39:04"} -{"current_steps": 1541, "total_steps": 2050, "loss": 0.0177, "lr": 7.254733499761993e-07, "epoch": 7.517073170731708, "percentage": 75.17, "elapsed_time": "1:58:06", "remaining_time": "0:39:00"} -{"current_steps": 1542, "total_steps": 2050, "loss": 0.0439, "lr": 7.22776758708035e-07, "epoch": 7.521951219512195, "percentage": 75.22, "elapsed_time": "1:58:11", "remaining_time": "0:38:56"} -{"current_steps": 1543, "total_steps": 2050, "loss": 0.0387, "lr": 7.200843412618555e-07, "epoch": 7.526829268292683, "percentage": 75.27, "elapsed_time": "1:58:14", "remaining_time": "0:38:51"} -{"current_steps": 1544, "total_steps": 2050, "loss": 0.0082, "lr": 7.173961039608227e-07, "epoch": 7.53170731707317, "percentage": 75.32, "elapsed_time": "1:58:20", "remaining_time": "0:38:47"} -{"current_steps": 1545, "total_steps": 2050, "loss": 0.0194, "lr": 7.147120531182828e-07, "epoch": 7.536585365853659, "percentage": 75.37, "elapsed_time": "1:58:24", "remaining_time": "0:38:42"} -{"current_steps": 1546, "total_steps": 2050, "loss": 0.006, "lr": 7.120321950377487e-07, "epoch": 7.541463414634146, "percentage": 75.41, "elapsed_time": "1:58:26", "remaining_time": "0:38:36"} -{"current_steps": 1547, "total_steps": 2050, "loss": 0.0104, "lr": 7.093565360128863e-07, "epoch": 7.546341463414635, "percentage": 75.46, "elapsed_time": "1:58:32", "remaining_time": "0:38:32"} -{"current_steps": 1548, "total_steps": 2050, "loss": 0.0173, "lr": 7.066850823275024e-07, "epoch": 7.5512195121951216, "percentage": 75.51, "elapsed_time": "1:58:36", "remaining_time": "0:38:27"} -{"current_steps": 1549, "total_steps": 2050, "loss": 0.0088, "lr": 7.040178402555245e-07, "epoch": 7.55609756097561, "percentage": 75.56, "elapsed_time": "1:58:37", "remaining_time": "0:38:22"} -{"current_steps": 1550, "total_steps": 2050, "loss": 0.0098, "lr": 7.013548160609901e-07, "epoch": 7.560975609756097, "percentage": 75.61, "elapsed_time": "1:58:42", "remaining_time": "0:38:17"} -{"current_steps": 1551, "total_steps": 2050, "loss": 0.0196, "lr": 6.986960159980327e-07, "epoch": 7.565853658536585, "percentage": 75.66, "elapsed_time": "1:58:46", "remaining_time": "0:38:12"} -{"current_steps": 1552, "total_steps": 2050, "loss": 0.021, "lr": 6.960414463108631e-07, "epoch": 7.570731707317073, "percentage": 75.71, "elapsed_time": "1:58:48", "remaining_time": "0:38:07"} -{"current_steps": 1553, "total_steps": 2050, "loss": 0.0076, "lr": 6.933911132337575e-07, "epoch": 7.575609756097561, "percentage": 75.76, "elapsed_time": "1:58:53", "remaining_time": "0:38:02"} -{"current_steps": 1554, "total_steps": 2050, "loss": 0.0204, "lr": 6.907450229910443e-07, "epoch": 7.580487804878048, "percentage": 75.8, "elapsed_time": "1:58:55", "remaining_time": "0:37:57"} -{"current_steps": 1555, "total_steps": 2050, "loss": 0.0046, "lr": 6.881031817970848e-07, "epoch": 7.585365853658536, "percentage": 75.85, "elapsed_time": "1:58:58", "remaining_time": "0:37:52"} -{"current_steps": 1556, "total_steps": 2050, "loss": 0.0566, "lr": 6.854655958562625e-07, "epoch": 7.590243902439024, "percentage": 75.9, "elapsed_time": "1:59:01", "remaining_time": "0:37:47"} -{"current_steps": 1557, "total_steps": 2050, "loss": 0.0048, "lr": 6.82832271362969e-07, "epoch": 7.595121951219512, "percentage": 75.95, "elapsed_time": "1:59:04", "remaining_time": "0:37:42"} -{"current_steps": 1558, "total_steps": 2050, "loss": 0.0351, "lr": 6.802032145015855e-07, "epoch": 7.6, "percentage": 76.0, "elapsed_time": "1:59:07", "remaining_time": "0:37:37"} -{"current_steps": 1559, "total_steps": 2050, "loss": 0.0171, "lr": 6.775784314464717e-07, "epoch": 7.6048780487804875, "percentage": 76.05, "elapsed_time": "1:59:09", "remaining_time": "0:37:31"} -{"current_steps": 1560, "total_steps": 2050, "loss": 0.01, "lr": 6.749579283619492e-07, "epoch": 7.609756097560975, "percentage": 76.1, "elapsed_time": "1:59:12", "remaining_time": "0:37:26"} -{"current_steps": 1561, "total_steps": 2050, "loss": 0.0162, "lr": 6.723417114022907e-07, "epoch": 7.614634146341463, "percentage": 76.15, "elapsed_time": "1:59:16", "remaining_time": "0:37:21"} -{"current_steps": 1562, "total_steps": 2050, "loss": 0.0237, "lr": 6.697297867117e-07, "epoch": 7.619512195121951, "percentage": 76.2, "elapsed_time": "1:59:18", "remaining_time": "0:37:16"} -{"current_steps": 1563, "total_steps": 2050, "loss": 0.0116, "lr": 6.671221604243014e-07, "epoch": 7.624390243902439, "percentage": 76.24, "elapsed_time": "1:59:24", "remaining_time": "0:37:12"} -{"current_steps": 1564, "total_steps": 2050, "loss": 0.0506, "lr": 6.645188386641257e-07, "epoch": 7.6292682926829265, "percentage": 76.29, "elapsed_time": "1:59:27", "remaining_time": "0:37:07"} -{"current_steps": 1565, "total_steps": 2050, "loss": 0.002, "lr": 6.61919827545093e-07, "epoch": 7.634146341463414, "percentage": 76.34, "elapsed_time": "1:59:28", "remaining_time": "0:37:01"} -{"current_steps": 1566, "total_steps": 2050, "loss": 0.0673, "lr": 6.593251331709993e-07, "epoch": 7.639024390243902, "percentage": 76.39, "elapsed_time": "1:59:31", "remaining_time": "0:36:56"} -{"current_steps": 1567, "total_steps": 2050, "loss": 0.063, "lr": 6.567347616355049e-07, "epoch": 7.64390243902439, "percentage": 76.44, "elapsed_time": "1:59:32", "remaining_time": "0:36:50"} -{"current_steps": 1568, "total_steps": 2050, "loss": 0.003, "lr": 6.541487190221163e-07, "epoch": 7.648780487804878, "percentage": 76.49, "elapsed_time": "1:59:36", "remaining_time": "0:36:45"} -{"current_steps": 1569, "total_steps": 2050, "loss": 0.0037, "lr": 6.515670114041725e-07, "epoch": 7.6536585365853655, "percentage": 76.54, "elapsed_time": "1:59:37", "remaining_time": "0:36:40"} -{"current_steps": 1570, "total_steps": 2050, "loss": 0.0043, "lr": 6.489896448448349e-07, "epoch": 7.658536585365853, "percentage": 76.59, "elapsed_time": "1:59:41", "remaining_time": "0:36:35"} -{"current_steps": 1571, "total_steps": 2050, "loss": 0.0144, "lr": 6.464166253970672e-07, "epoch": 7.663414634146341, "percentage": 76.63, "elapsed_time": "1:59:43", "remaining_time": "0:36:30"} -{"current_steps": 1572, "total_steps": 2050, "loss": 0.0029, "lr": 6.43847959103624e-07, "epoch": 7.668292682926829, "percentage": 76.68, "elapsed_time": "1:59:45", "remaining_time": "0:36:24"} -{"current_steps": 1573, "total_steps": 2050, "loss": 0.0144, "lr": 6.412836519970383e-07, "epoch": 7.673170731707317, "percentage": 76.73, "elapsed_time": "1:59:48", "remaining_time": "0:36:19"} -{"current_steps": 1574, "total_steps": 2050, "loss": 0.0026, "lr": 6.387237100996041e-07, "epoch": 7.678048780487805, "percentage": 76.78, "elapsed_time": "1:59:51", "remaining_time": "0:36:14"} -{"current_steps": 1575, "total_steps": 2050, "loss": 0.0093, "lr": 6.361681394233631e-07, "epoch": 7.682926829268292, "percentage": 76.83, "elapsed_time": "1:59:55", "remaining_time": "0:36:09"} -{"current_steps": 1576, "total_steps": 2050, "loss": 0.0081, "lr": 6.336169459700933e-07, "epoch": 7.68780487804878, "percentage": 76.88, "elapsed_time": "2:00:00", "remaining_time": "0:36:05"} -{"current_steps": 1577, "total_steps": 2050, "loss": 0.0054, "lr": 6.310701357312909e-07, "epoch": 7.692682926829268, "percentage": 76.93, "elapsed_time": "2:00:04", "remaining_time": "0:36:00"} -{"current_steps": 1578, "total_steps": 2050, "loss": 0.0051, "lr": 6.285277146881588e-07, "epoch": 7.697560975609756, "percentage": 76.98, "elapsed_time": "2:00:05", "remaining_time": "0:35:55"} -{"current_steps": 1579, "total_steps": 2050, "loss": 0.0118, "lr": 6.259896888115904e-07, "epoch": 7.702439024390244, "percentage": 77.02, "elapsed_time": "2:00:07", "remaining_time": "0:35:49"} -{"current_steps": 1580, "total_steps": 2050, "loss": 0.009, "lr": 6.234560640621606e-07, "epoch": 7.7073170731707314, "percentage": 77.07, "elapsed_time": "2:00:12", "remaining_time": "0:35:45"} -{"current_steps": 1581, "total_steps": 2050, "loss": 0.0015, "lr": 6.209268463901047e-07, "epoch": 7.712195121951219, "percentage": 77.12, "elapsed_time": "2:00:13", "remaining_time": "0:35:39"} -{"current_steps": 1582, "total_steps": 2050, "loss": 0.0051, "lr": 6.184020417353084e-07, "epoch": 7.717073170731707, "percentage": 77.17, "elapsed_time": "2:00:16", "remaining_time": "0:35:34"} -{"current_steps": 1583, "total_steps": 2050, "loss": 0.0383, "lr": 6.158816560272962e-07, "epoch": 7.721951219512195, "percentage": 77.22, "elapsed_time": "2:00:18", "remaining_time": "0:35:29"} -{"current_steps": 1584, "total_steps": 2050, "loss": 0.0422, "lr": 6.133656951852113e-07, "epoch": 7.726829268292683, "percentage": 77.27, "elapsed_time": "2:00:22", "remaining_time": "0:35:24"} -{"current_steps": 1585, "total_steps": 2050, "loss": 0.0082, "lr": 6.10854165117806e-07, "epoch": 7.7317073170731705, "percentage": 77.32, "elapsed_time": "2:00:27", "remaining_time": "0:35:20"} -{"current_steps": 1586, "total_steps": 2050, "loss": 0.0052, "lr": 6.083470717234285e-07, "epoch": 7.736585365853658, "percentage": 77.37, "elapsed_time": "2:00:29", "remaining_time": "0:35:15"} -{"current_steps": 1587, "total_steps": 2050, "loss": 0.0094, "lr": 6.058444208900061e-07, "epoch": 7.741463414634146, "percentage": 77.41, "elapsed_time": "2:00:34", "remaining_time": "0:35:10"} -{"current_steps": 1588, "total_steps": 2050, "loss": 0.0034, "lr": 6.033462184950317e-07, "epoch": 7.746341463414634, "percentage": 77.46, "elapsed_time": "2:00:36", "remaining_time": "0:35:05"} -{"current_steps": 1589, "total_steps": 2050, "loss": 0.0141, "lr": 6.008524704055535e-07, "epoch": 7.751219512195122, "percentage": 77.51, "elapsed_time": "2:00:41", "remaining_time": "0:35:00"} -{"current_steps": 1590, "total_steps": 2050, "loss": 0.0108, "lr": 5.983631824781572e-07, "epoch": 7.7560975609756095, "percentage": 77.56, "elapsed_time": "2:00:45", "remaining_time": "0:34:56"} -{"current_steps": 1591, "total_steps": 2050, "loss": 0.0075, "lr": 5.95878360558953e-07, "epoch": 7.760975609756097, "percentage": 77.61, "elapsed_time": "2:00:47", "remaining_time": "0:34:50"} -{"current_steps": 1592, "total_steps": 2050, "loss": 0.018, "lr": 5.933980104835652e-07, "epoch": 7.765853658536585, "percentage": 77.66, "elapsed_time": "2:00:51", "remaining_time": "0:34:46"} -{"current_steps": 1593, "total_steps": 2050, "loss": 0.0207, "lr": 5.909221380771132e-07, "epoch": 7.770731707317073, "percentage": 77.71, "elapsed_time": "2:00:53", "remaining_time": "0:34:40"} -{"current_steps": 1594, "total_steps": 2050, "loss": 0.0217, "lr": 5.884507491542024e-07, "epoch": 7.775609756097561, "percentage": 77.76, "elapsed_time": "2:00:56", "remaining_time": "0:34:35"} -{"current_steps": 1595, "total_steps": 2050, "loss": 0.0055, "lr": 5.859838495189068e-07, "epoch": 7.780487804878049, "percentage": 77.8, "elapsed_time": "2:00:57", "remaining_time": "0:34:30"} -{"current_steps": 1596, "total_steps": 2050, "loss": 0.0147, "lr": 5.835214449647602e-07, "epoch": 7.785365853658536, "percentage": 77.85, "elapsed_time": "2:00:59", "remaining_time": "0:34:25"} -{"current_steps": 1597, "total_steps": 2050, "loss": 0.0065, "lr": 5.810635412747373e-07, "epoch": 7.790243902439024, "percentage": 77.9, "elapsed_time": "2:01:01", "remaining_time": "0:34:19"} -{"current_steps": 1598, "total_steps": 2050, "loss": 0.0077, "lr": 5.786101442212422e-07, "epoch": 7.795121951219512, "percentage": 77.95, "elapsed_time": "2:01:08", "remaining_time": "0:34:16"} -{"current_steps": 1599, "total_steps": 2050, "loss": 0.0699, "lr": 5.761612595660979e-07, "epoch": 7.8, "percentage": 78.0, "elapsed_time": "2:01:13", "remaining_time": "0:34:11"} -{"current_steps": 1600, "total_steps": 2050, "loss": 0.0017, "lr": 5.737168930605272e-07, "epoch": 7.804878048780488, "percentage": 78.05, "elapsed_time": "2:01:14", "remaining_time": "0:34:06"} -{"current_steps": 1601, "total_steps": 2050, "loss": 0.0101, "lr": 5.712770504451426e-07, "epoch": 7.809756097560975, "percentage": 78.1, "elapsed_time": "2:01:20", "remaining_time": "0:34:01"} -{"current_steps": 1602, "total_steps": 2050, "loss": 0.0143, "lr": 5.688417374499336e-07, "epoch": 7.814634146341463, "percentage": 78.15, "elapsed_time": "2:01:22", "remaining_time": "0:33:56"} -{"current_steps": 1603, "total_steps": 2050, "loss": 0.0062, "lr": 5.664109597942504e-07, "epoch": 7.819512195121951, "percentage": 78.2, "elapsed_time": "2:01:24", "remaining_time": "0:33:51"} -{"current_steps": 1604, "total_steps": 2050, "loss": 0.1058, "lr": 5.639847231867917e-07, "epoch": 7.824390243902439, "percentage": 78.24, "elapsed_time": "2:01:28", "remaining_time": "0:33:46"} -{"current_steps": 1605, "total_steps": 2050, "loss": 0.0178, "lr": 5.61563033325594e-07, "epoch": 7.829268292682927, "percentage": 78.29, "elapsed_time": "2:01:31", "remaining_time": "0:33:41"} -{"current_steps": 1606, "total_steps": 2050, "loss": 0.0204, "lr": 5.591458958980123e-07, "epoch": 7.8341463414634145, "percentage": 78.34, "elapsed_time": "2:01:35", "remaining_time": "0:33:36"} -{"current_steps": 1607, "total_steps": 2050, "loss": 0.0201, "lr": 5.567333165807115e-07, "epoch": 7.839024390243902, "percentage": 78.39, "elapsed_time": "2:01:41", "remaining_time": "0:33:32"} -{"current_steps": 1608, "total_steps": 2050, "loss": 0.0077, "lr": 5.543253010396538e-07, "epoch": 7.84390243902439, "percentage": 78.44, "elapsed_time": "2:01:45", "remaining_time": "0:33:28"} -{"current_steps": 1609, "total_steps": 2050, "loss": 0.0176, "lr": 5.519218549300806e-07, "epoch": 7.848780487804878, "percentage": 78.49, "elapsed_time": "2:01:50", "remaining_time": "0:33:23"} -{"current_steps": 1610, "total_steps": 2050, "loss": 0.031, "lr": 5.495229838965021e-07, "epoch": 7.853658536585366, "percentage": 78.54, "elapsed_time": "2:01:53", "remaining_time": "0:33:18"} -{"current_steps": 1611, "total_steps": 2050, "loss": 0.0062, "lr": 5.471286935726866e-07, "epoch": 7.8585365853658535, "percentage": 78.59, "elapsed_time": "2:01:58", "remaining_time": "0:33:14"} -{"current_steps": 1612, "total_steps": 2050, "loss": 0.0615, "lr": 5.447389895816416e-07, "epoch": 7.863414634146341, "percentage": 78.63, "elapsed_time": "2:02:05", "remaining_time": "0:33:10"} -{"current_steps": 1613, "total_steps": 2050, "loss": 0.0377, "lr": 5.423538775356049e-07, "epoch": 7.868292682926829, "percentage": 78.68, "elapsed_time": "2:02:08", "remaining_time": "0:33:05"} -{"current_steps": 1614, "total_steps": 2050, "loss": 0.0122, "lr": 5.399733630360287e-07, "epoch": 7.873170731707317, "percentage": 78.73, "elapsed_time": "2:02:11", "remaining_time": "0:33:00"} -{"current_steps": 1615, "total_steps": 2050, "loss": 0.015, "lr": 5.375974516735713e-07, "epoch": 7.878048780487805, "percentage": 78.78, "elapsed_time": "2:02:15", "remaining_time": "0:32:55"} -{"current_steps": 1616, "total_steps": 2050, "loss": 0.0058, "lr": 5.352261490280767e-07, "epoch": 7.882926829268293, "percentage": 78.83, "elapsed_time": "2:02:16", "remaining_time": "0:32:50"} -{"current_steps": 1617, "total_steps": 2050, "loss": 0.0041, "lr": 5.328594606685661e-07, "epoch": 7.88780487804878, "percentage": 78.88, "elapsed_time": "2:02:20", "remaining_time": "0:32:45"} -{"current_steps": 1618, "total_steps": 2050, "loss": 0.0067, "lr": 5.304973921532264e-07, "epoch": 7.892682926829268, "percentage": 78.93, "elapsed_time": "2:02:24", "remaining_time": "0:32:40"} -{"current_steps": 1619, "total_steps": 2050, "loss": 0.0064, "lr": 5.281399490293923e-07, "epoch": 7.897560975609756, "percentage": 78.98, "elapsed_time": "2:02:26", "remaining_time": "0:32:35"} -{"current_steps": 1620, "total_steps": 2050, "loss": 0.0182, "lr": 5.257871368335357e-07, "epoch": 7.902439024390244, "percentage": 79.02, "elapsed_time": "2:02:29", "remaining_time": "0:32:30"} -{"current_steps": 1621, "total_steps": 2050, "loss": 0.0024, "lr": 5.234389610912552e-07, "epoch": 7.907317073170732, "percentage": 79.07, "elapsed_time": "2:02:31", "remaining_time": "0:32:25"} -{"current_steps": 1622, "total_steps": 2050, "loss": 0.02, "lr": 5.210954273172578e-07, "epoch": 7.912195121951219, "percentage": 79.12, "elapsed_time": "2:02:32", "remaining_time": "0:32:20"} -{"current_steps": 1623, "total_steps": 2050, "loss": 0.024, "lr": 5.187565410153497e-07, "epoch": 7.917073170731707, "percentage": 79.17, "elapsed_time": "2:02:39", "remaining_time": "0:32:16"} -{"current_steps": 1624, "total_steps": 2050, "loss": 0.0103, "lr": 5.164223076784239e-07, "epoch": 7.921951219512195, "percentage": 79.22, "elapsed_time": "2:02:41", "remaining_time": "0:32:10"} -{"current_steps": 1625, "total_steps": 2050, "loss": 0.0268, "lr": 5.14092732788444e-07, "epoch": 7.926829268292683, "percentage": 79.27, "elapsed_time": "2:02:44", "remaining_time": "0:32:05"} -{"current_steps": 1626, "total_steps": 2050, "loss": 0.0085, "lr": 5.117678218164337e-07, "epoch": 7.931707317073171, "percentage": 79.32, "elapsed_time": "2:02:51", "remaining_time": "0:32:02"} -{"current_steps": 1627, "total_steps": 2050, "loss": 0.006, "lr": 5.094475802224644e-07, "epoch": 7.9365853658536585, "percentage": 79.37, "elapsed_time": "2:02:53", "remaining_time": "0:31:57"} -{"current_steps": 1628, "total_steps": 2050, "loss": 0.003, "lr": 5.071320134556404e-07, "epoch": 7.941463414634146, "percentage": 79.41, "elapsed_time": "2:02:55", "remaining_time": "0:31:51"} -{"current_steps": 1629, "total_steps": 2050, "loss": 0.0064, "lr": 5.048211269540868e-07, "epoch": 7.946341463414634, "percentage": 79.46, "elapsed_time": "2:03:02", "remaining_time": "0:31:47"} -{"current_steps": 1630, "total_steps": 2050, "loss": 0.0082, "lr": 5.025149261449391e-07, "epoch": 7.951219512195122, "percentage": 79.51, "elapsed_time": "2:03:04", "remaining_time": "0:31:42"} -{"current_steps": 1631, "total_steps": 2050, "loss": 0.0202, "lr": 5.002134164443262e-07, "epoch": 7.95609756097561, "percentage": 79.56, "elapsed_time": "2:03:08", "remaining_time": "0:31:37"} -{"current_steps": 1632, "total_steps": 2050, "loss": 0.0033, "lr": 4.979166032573607e-07, "epoch": 7.9609756097560975, "percentage": 79.61, "elapsed_time": "2:03:09", "remaining_time": "0:31:32"} -{"current_steps": 1633, "total_steps": 2050, "loss": 0.052, "lr": 4.956244919781247e-07, "epoch": 7.965853658536585, "percentage": 79.66, "elapsed_time": "2:03:13", "remaining_time": "0:31:28"} -{"current_steps": 1634, "total_steps": 2050, "loss": 0.0049, "lr": 4.933370879896604e-07, "epoch": 7.970731707317073, "percentage": 79.71, "elapsed_time": "2:03:16", "remaining_time": "0:31:23"} -{"current_steps": 1635, "total_steps": 2050, "loss": 0.0102, "lr": 4.91054396663952e-07, "epoch": 7.975609756097561, "percentage": 79.76, "elapsed_time": "2:03:22", "remaining_time": "0:31:18"} -{"current_steps": 1636, "total_steps": 2050, "loss": 0.0112, "lr": 4.887764233619163e-07, "epoch": 7.980487804878049, "percentage": 79.8, "elapsed_time": "2:03:24", "remaining_time": "0:31:13"} -{"current_steps": 1637, "total_steps": 2050, "loss": 0.0032, "lr": 4.865031734333919e-07, "epoch": 7.985365853658537, "percentage": 79.85, "elapsed_time": "2:03:28", "remaining_time": "0:31:09"} -{"current_steps": 1638, "total_steps": 2050, "loss": 0.012, "lr": 4.842346522171226e-07, "epoch": 7.990243902439024, "percentage": 79.9, "elapsed_time": "2:03:30", "remaining_time": "0:31:03"} -{"current_steps": 1639, "total_steps": 2050, "loss": 0.0184, "lr": 4.819708650407467e-07, "epoch": 7.995121951219512, "percentage": 79.95, "elapsed_time": "2:03:34", "remaining_time": "0:30:59"} -{"current_steps": 1640, "total_steps": 2050, "loss": 0.0112, "lr": 4.797118172207863e-07, "epoch": 8.0, "percentage": 80.0, "elapsed_time": "2:03:41", "remaining_time": "0:30:55"} -{"current_steps": 1641, "total_steps": 2050, "loss": 0.0064, "lr": 4.774575140626317e-07, "epoch": 8.004878048780487, "percentage": 80.05, "elapsed_time": "2:12:58", "remaining_time": "0:33:08"} -{"current_steps": 1642, "total_steps": 2050, "loss": 0.003, "lr": 4.752079608605295e-07, "epoch": 8.009756097560976, "percentage": 80.1, "elapsed_time": "2:12:59", "remaining_time": "0:33:02"} -{"current_steps": 1643, "total_steps": 2050, "loss": 0.0063, "lr": 4.7296316289757366e-07, "epoch": 8.014634146341463, "percentage": 80.15, "elapsed_time": "2:13:00", "remaining_time": "0:32:56"} -{"current_steps": 1644, "total_steps": 2050, "loss": 0.0039, "lr": 4.7072312544568844e-07, "epoch": 8.019512195121951, "percentage": 80.2, "elapsed_time": "2:13:03", "remaining_time": "0:32:51"} -{"current_steps": 1645, "total_steps": 2050, "loss": 0.0028, "lr": 4.6848785376561733e-07, "epoch": 8.024390243902438, "percentage": 80.24, "elapsed_time": "2:13:09", "remaining_time": "0:32:47"} -{"current_steps": 1646, "total_steps": 2050, "loss": 0.0021, "lr": 4.6625735310691396e-07, "epoch": 8.029268292682927, "percentage": 80.29, "elapsed_time": "2:13:15", "remaining_time": "0:32:42"} -{"current_steps": 1647, "total_steps": 2050, "loss": 0.005, "lr": 4.6403162870792524e-07, "epoch": 8.034146341463414, "percentage": 80.34, "elapsed_time": "2:13:19", "remaining_time": "0:32:37"} -{"current_steps": 1648, "total_steps": 2050, "loss": 0.0042, "lr": 4.618106857957805e-07, "epoch": 8.039024390243902, "percentage": 80.39, "elapsed_time": "2:13:21", "remaining_time": "0:32:31"} -{"current_steps": 1649, "total_steps": 2050, "loss": 0.0014, "lr": 4.5959452958638213e-07, "epoch": 8.04390243902439, "percentage": 80.44, "elapsed_time": "2:13:23", "remaining_time": "0:32:26"} -{"current_steps": 1650, "total_steps": 2050, "loss": 0.0012, "lr": 4.573831652843888e-07, "epoch": 8.048780487804878, "percentage": 80.49, "elapsed_time": "2:13:24", "remaining_time": "0:32:20"} -{"current_steps": 1651, "total_steps": 2050, "loss": 0.0234, "lr": 4.55176598083206e-07, "epoch": 8.053658536585365, "percentage": 80.54, "elapsed_time": "2:13:29", "remaining_time": "0:32:15"} -{"current_steps": 1652, "total_steps": 2050, "loss": 0.0042, "lr": 4.5297483316497276e-07, "epoch": 8.058536585365854, "percentage": 80.59, "elapsed_time": "2:13:32", "remaining_time": "0:32:10"} -{"current_steps": 1653, "total_steps": 2050, "loss": 0.0085, "lr": 4.5077787570055097e-07, "epoch": 8.06341463414634, "percentage": 80.63, "elapsed_time": "2:13:34", "remaining_time": "0:32:04"} -{"current_steps": 1654, "total_steps": 2050, "loss": 0.0628, "lr": 4.4858573084951173e-07, "epoch": 8.06829268292683, "percentage": 80.68, "elapsed_time": "2:13:36", "remaining_time": "0:31:59"} -{"current_steps": 1655, "total_steps": 2050, "loss": 0.0026, "lr": 4.463984037601224e-07, "epoch": 8.073170731707316, "percentage": 80.73, "elapsed_time": "2:13:41", "remaining_time": "0:31:54"} -{"current_steps": 1656, "total_steps": 2050, "loss": 0.0021, "lr": 4.4421589956933827e-07, "epoch": 8.078048780487805, "percentage": 80.78, "elapsed_time": "2:13:45", "remaining_time": "0:31:49"} -{"current_steps": 1657, "total_steps": 2050, "loss": 0.0042, "lr": 4.420382234027859e-07, "epoch": 8.082926829268292, "percentage": 80.83, "elapsed_time": "2:13:49", "remaining_time": "0:31:44"} -{"current_steps": 1658, "total_steps": 2050, "loss": 0.0045, "lr": 4.398653803747532e-07, "epoch": 8.08780487804878, "percentage": 80.88, "elapsed_time": "2:13:53", "remaining_time": "0:31:39"} -{"current_steps": 1659, "total_steps": 2050, "loss": 0.0015, "lr": 4.3769737558817996e-07, "epoch": 8.092682926829267, "percentage": 80.93, "elapsed_time": "2:13:53", "remaining_time": "0:31:33"} -{"current_steps": 1660, "total_steps": 2050, "loss": 0.0124, "lr": 4.355342141346405e-07, "epoch": 8.097560975609756, "percentage": 80.98, "elapsed_time": "2:13:56", "remaining_time": "0:31:28"} -{"current_steps": 1661, "total_steps": 2050, "loss": 0.002, "lr": 4.3337590109433505e-07, "epoch": 8.102439024390243, "percentage": 81.02, "elapsed_time": "2:14:02", "remaining_time": "0:31:23"} -{"current_steps": 1662, "total_steps": 2050, "loss": 0.0013, "lr": 4.3122244153607914e-07, "epoch": 8.107317073170732, "percentage": 81.07, "elapsed_time": "2:14:03", "remaining_time": "0:31:17"} -{"current_steps": 1663, "total_steps": 2050, "loss": 0.0201, "lr": 4.2907384051728754e-07, "epoch": 8.112195121951219, "percentage": 81.12, "elapsed_time": "2:14:06", "remaining_time": "0:31:12"} -{"current_steps": 1664, "total_steps": 2050, "loss": 0.0039, "lr": 4.2693010308396566e-07, "epoch": 8.117073170731707, "percentage": 81.17, "elapsed_time": "2:14:09", "remaining_time": "0:31:07"} -{"current_steps": 1665, "total_steps": 2050, "loss": 0.0035, "lr": 4.247912342706975e-07, "epoch": 8.121951219512194, "percentage": 81.22, "elapsed_time": "2:14:12", "remaining_time": "0:31:01"} -{"current_steps": 1666, "total_steps": 2050, "loss": 0.0009, "lr": 4.22657239100632e-07, "epoch": 8.126829268292683, "percentage": 81.27, "elapsed_time": "2:14:15", "remaining_time": "0:30:56"} -{"current_steps": 1667, "total_steps": 2050, "loss": 0.0018, "lr": 4.2052812258547265e-07, "epoch": 8.13170731707317, "percentage": 81.32, "elapsed_time": "2:14:20", "remaining_time": "0:30:51"} -{"current_steps": 1668, "total_steps": 2050, "loss": 0.0021, "lr": 4.184038897254655e-07, "epoch": 8.136585365853659, "percentage": 81.37, "elapsed_time": "2:14:26", "remaining_time": "0:30:47"} -{"current_steps": 1669, "total_steps": 2050, "loss": 0.019, "lr": 4.1628454550938697e-07, "epoch": 8.141463414634146, "percentage": 81.41, "elapsed_time": "2:14:28", "remaining_time": "0:30:41"} -{"current_steps": 1670, "total_steps": 2050, "loss": 0.0144, "lr": 4.141700949145322e-07, "epoch": 8.146341463414634, "percentage": 81.46, "elapsed_time": "2:14:31", "remaining_time": "0:30:36"} -{"current_steps": 1671, "total_steps": 2050, "loss": 0.0088, "lr": 4.1206054290670537e-07, "epoch": 8.151219512195121, "percentage": 81.51, "elapsed_time": "2:14:34", "remaining_time": "0:30:31"} -{"current_steps": 1672, "total_steps": 2050, "loss": 0.0083, "lr": 4.0995589444020433e-07, "epoch": 8.15609756097561, "percentage": 81.56, "elapsed_time": "2:14:37", "remaining_time": "0:30:26"} -{"current_steps": 1673, "total_steps": 2050, "loss": 0.0027, "lr": 4.0785615445781106e-07, "epoch": 8.160975609756097, "percentage": 81.61, "elapsed_time": "2:14:38", "remaining_time": "0:30:20"} -{"current_steps": 1674, "total_steps": 2050, "loss": 0.0089, "lr": 4.057613278907818e-07, "epoch": 8.165853658536586, "percentage": 81.66, "elapsed_time": "2:14:41", "remaining_time": "0:30:15"} -{"current_steps": 1675, "total_steps": 2050, "loss": 0.0034, "lr": 4.036714196588318e-07, "epoch": 8.170731707317072, "percentage": 81.71, "elapsed_time": "2:14:49", "remaining_time": "0:30:11"} -{"current_steps": 1676, "total_steps": 2050, "loss": 0.0021, "lr": 4.015864346701251e-07, "epoch": 8.175609756097561, "percentage": 81.76, "elapsed_time": "2:14:50", "remaining_time": "0:30:05"} -{"current_steps": 1677, "total_steps": 2050, "loss": 0.0005, "lr": 3.99506377821266e-07, "epoch": 8.180487804878048, "percentage": 81.8, "elapsed_time": "2:14:51", "remaining_time": "0:29:59"} -{"current_steps": 1678, "total_steps": 2050, "loss": 0.003, "lr": 3.97431253997283e-07, "epoch": 8.185365853658537, "percentage": 81.85, "elapsed_time": "2:14:54", "remaining_time": "0:29:54"} -{"current_steps": 1679, "total_steps": 2050, "loss": 0.0028, "lr": 3.9536106807161857e-07, "epoch": 8.190243902439024, "percentage": 81.9, "elapsed_time": "2:14:57", "remaining_time": "0:29:49"} -{"current_steps": 1680, "total_steps": 2050, "loss": 0.0097, "lr": 3.932958249061214e-07, "epoch": 8.195121951219512, "percentage": 81.95, "elapsed_time": "2:15:00", "remaining_time": "0:29:44"} -{"current_steps": 1681, "total_steps": 2050, "loss": 0.004, "lr": 3.9123552935102976e-07, "epoch": 8.2, "percentage": 82.0, "elapsed_time": "2:15:04", "remaining_time": "0:29:38"} -{"current_steps": 1682, "total_steps": 2050, "loss": 0.0189, "lr": 3.891801862449629e-07, "epoch": 8.204878048780488, "percentage": 82.05, "elapsed_time": "2:15:07", "remaining_time": "0:29:33"} -{"current_steps": 1683, "total_steps": 2050, "loss": 0.0038, "lr": 3.8712980041490905e-07, "epoch": 8.209756097560975, "percentage": 82.1, "elapsed_time": "2:15:10", "remaining_time": "0:29:28"} -{"current_steps": 1684, "total_steps": 2050, "loss": 0.0038, "lr": 3.850843766762155e-07, "epoch": 8.214634146341464, "percentage": 82.15, "elapsed_time": "2:15:17", "remaining_time": "0:29:24"} -{"current_steps": 1685, "total_steps": 2050, "loss": 0.0008, "lr": 3.830439198325747e-07, "epoch": 8.21951219512195, "percentage": 82.2, "elapsed_time": "2:15:18", "remaining_time": "0:29:18"} -{"current_steps": 1686, "total_steps": 2050, "loss": 0.0013, "lr": 3.81008434676014e-07, "epoch": 8.22439024390244, "percentage": 82.24, "elapsed_time": "2:15:22", "remaining_time": "0:29:13"} -{"current_steps": 1687, "total_steps": 2050, "loss": 0.0016, "lr": 3.789779259868864e-07, "epoch": 8.229268292682926, "percentage": 82.29, "elapsed_time": "2:15:24", "remaining_time": "0:29:08"} -{"current_steps": 1688, "total_steps": 2050, "loss": 0.0045, "lr": 3.769523985338566e-07, "epoch": 8.234146341463415, "percentage": 82.34, "elapsed_time": "2:15:26", "remaining_time": "0:29:02"} -{"current_steps": 1689, "total_steps": 2050, "loss": 0.0057, "lr": 3.749318570738897e-07, "epoch": 8.239024390243902, "percentage": 82.39, "elapsed_time": "2:15:28", "remaining_time": "0:28:57"} -{"current_steps": 1690, "total_steps": 2050, "loss": 0.0026, "lr": 3.7291630635224397e-07, "epoch": 8.24390243902439, "percentage": 82.44, "elapsed_time": "2:15:32", "remaining_time": "0:28:52"} -{"current_steps": 1691, "total_steps": 2050, "loss": 0.0056, "lr": 3.709057511024541e-07, "epoch": 8.248780487804877, "percentage": 82.49, "elapsed_time": "2:15:37", "remaining_time": "0:28:47"} -{"current_steps": 1692, "total_steps": 2050, "loss": 0.0019, "lr": 3.689001960463243e-07, "epoch": 8.253658536585366, "percentage": 82.54, "elapsed_time": "2:15:39", "remaining_time": "0:28:42"} -{"current_steps": 1693, "total_steps": 2050, "loss": 0.003, "lr": 3.668996458939156e-07, "epoch": 8.258536585365853, "percentage": 82.59, "elapsed_time": "2:15:44", "remaining_time": "0:28:37"} -{"current_steps": 1694, "total_steps": 2050, "loss": 0.0031, "lr": 3.649041053435354e-07, "epoch": 8.263414634146342, "percentage": 82.63, "elapsed_time": "2:15:47", "remaining_time": "0:28:32"} -{"current_steps": 1695, "total_steps": 2050, "loss": 0.0012, "lr": 3.62913579081724e-07, "epoch": 8.268292682926829, "percentage": 82.68, "elapsed_time": "2:15:50", "remaining_time": "0:28:27"} -{"current_steps": 1696, "total_steps": 2050, "loss": 0.0067, "lr": 3.609280717832489e-07, "epoch": 8.273170731707317, "percentage": 82.73, "elapsed_time": "2:15:53", "remaining_time": "0:28:21"} -{"current_steps": 1697, "total_steps": 2050, "loss": 0.002, "lr": 3.5894758811108795e-07, "epoch": 8.278048780487804, "percentage": 82.78, "elapsed_time": "2:15:56", "remaining_time": "0:28:16"} -{"current_steps": 1698, "total_steps": 2050, "loss": 0.0008, "lr": 3.5697213271642164e-07, "epoch": 8.282926829268293, "percentage": 82.83, "elapsed_time": "2:16:00", "remaining_time": "0:28:11"} -{"current_steps": 1699, "total_steps": 2050, "loss": 0.0028, "lr": 3.5500171023862136e-07, "epoch": 8.28780487804878, "percentage": 82.88, "elapsed_time": "2:16:06", "remaining_time": "0:28:07"} -{"current_steps": 1700, "total_steps": 2050, "loss": 0.0032, "lr": 3.530363253052399e-07, "epoch": 8.292682926829269, "percentage": 82.93, "elapsed_time": "2:16:08", "remaining_time": "0:28:01"} -{"current_steps": 1701, "total_steps": 2050, "loss": 0.0068, "lr": 3.510759825319976e-07, "epoch": 8.297560975609755, "percentage": 82.98, "elapsed_time": "2:16:12", "remaining_time": "0:27:56"} -{"current_steps": 1702, "total_steps": 2050, "loss": 0.0017, "lr": 3.491206865227739e-07, "epoch": 8.302439024390244, "percentage": 83.02, "elapsed_time": "2:16:16", "remaining_time": "0:27:51"} -{"current_steps": 1703, "total_steps": 2050, "loss": 0.0398, "lr": 3.4717044186959676e-07, "epoch": 8.307317073170731, "percentage": 83.07, "elapsed_time": "2:16:22", "remaining_time": "0:27:47"} -{"current_steps": 1704, "total_steps": 2050, "loss": 0.0049, "lr": 3.452252531526301e-07, "epoch": 8.31219512195122, "percentage": 83.12, "elapsed_time": "2:16:25", "remaining_time": "0:27:41"} -{"current_steps": 1705, "total_steps": 2050, "loss": 0.0164, "lr": 3.432851249401628e-07, "epoch": 8.317073170731707, "percentage": 83.17, "elapsed_time": "2:16:30", "remaining_time": "0:27:37"} -{"current_steps": 1706, "total_steps": 2050, "loss": 0.0038, "lr": 3.413500617886023e-07, "epoch": 8.321951219512195, "percentage": 83.22, "elapsed_time": "2:16:37", "remaining_time": "0:27:32"} -{"current_steps": 1707, "total_steps": 2050, "loss": 0.0118, "lr": 3.394200682424578e-07, "epoch": 8.326829268292682, "percentage": 83.27, "elapsed_time": "2:16:40", "remaining_time": "0:27:27"} -{"current_steps": 1708, "total_steps": 2050, "loss": 0.006, "lr": 3.374951488343328e-07, "epoch": 8.331707317073171, "percentage": 83.32, "elapsed_time": "2:16:42", "remaining_time": "0:27:22"} -{"current_steps": 1709, "total_steps": 2050, "loss": 0.0011, "lr": 3.355753080849164e-07, "epoch": 8.336585365853658, "percentage": 83.37, "elapsed_time": "2:16:46", "remaining_time": "0:27:17"} -{"current_steps": 1710, "total_steps": 2050, "loss": 0.0024, "lr": 3.3366055050296776e-07, "epoch": 8.341463414634147, "percentage": 83.41, "elapsed_time": "2:16:51", "remaining_time": "0:27:12"} -{"current_steps": 1711, "total_steps": 2050, "loss": 0.0028, "lr": 3.3175088058530925e-07, "epoch": 8.346341463414634, "percentage": 83.46, "elapsed_time": "2:16:57", "remaining_time": "0:27:08"} -{"current_steps": 1712, "total_steps": 2050, "loss": 0.0109, "lr": 3.2984630281681556e-07, "epoch": 8.351219512195122, "percentage": 83.51, "elapsed_time": "2:16:58", "remaining_time": "0:27:02"} -{"current_steps": 1713, "total_steps": 2050, "loss": 0.0031, "lr": 3.2794682167040125e-07, "epoch": 8.35609756097561, "percentage": 83.56, "elapsed_time": "2:17:01", "remaining_time": "0:26:57"} -{"current_steps": 1714, "total_steps": 2050, "loss": 0.01, "lr": 3.2605244160701155e-07, "epoch": 8.360975609756098, "percentage": 83.61, "elapsed_time": "2:17:06", "remaining_time": "0:26:52"} -{"current_steps": 1715, "total_steps": 2050, "loss": 0.0042, "lr": 3.2416316707561316e-07, "epoch": 8.365853658536585, "percentage": 83.66, "elapsed_time": "2:17:11", "remaining_time": "0:26:47"} -{"current_steps": 1716, "total_steps": 2050, "loss": 0.0021, "lr": 3.2227900251318055e-07, "epoch": 8.370731707317074, "percentage": 83.71, "elapsed_time": "2:17:15", "remaining_time": "0:26:42"} -{"current_steps": 1717, "total_steps": 2050, "loss": 0.0031, "lr": 3.2039995234468854e-07, "epoch": 8.37560975609756, "percentage": 83.76, "elapsed_time": "2:17:16", "remaining_time": "0:26:37"} -{"current_steps": 1718, "total_steps": 2050, "loss": 0.0009, "lr": 3.1852602098309984e-07, "epoch": 8.38048780487805, "percentage": 83.8, "elapsed_time": "2:17:18", "remaining_time": "0:26:32"} -{"current_steps": 1719, "total_steps": 2050, "loss": 0.0047, "lr": 3.1665721282935683e-07, "epoch": 8.385365853658536, "percentage": 83.85, "elapsed_time": "2:17:25", "remaining_time": "0:26:27"} -{"current_steps": 1720, "total_steps": 2050, "loss": 0.0026, "lr": 3.147935322723694e-07, "epoch": 8.390243902439025, "percentage": 83.9, "elapsed_time": "2:17:31", "remaining_time": "0:26:23"} -{"current_steps": 1721, "total_steps": 2050, "loss": 0.0019, "lr": 3.1293498368900414e-07, "epoch": 8.395121951219512, "percentage": 83.95, "elapsed_time": "2:17:32", "remaining_time": "0:26:17"} -{"current_steps": 1722, "total_steps": 2050, "loss": 0.0009, "lr": 3.1108157144407765e-07, "epoch": 8.4, "percentage": 84.0, "elapsed_time": "2:17:33", "remaining_time": "0:26:12"} -{"current_steps": 1723, "total_steps": 2050, "loss": 0.0047, "lr": 3.092332998903416e-07, "epoch": 8.404878048780487, "percentage": 84.05, "elapsed_time": "2:17:36", "remaining_time": "0:26:06"} -{"current_steps": 1724, "total_steps": 2050, "loss": 0.0162, "lr": 3.073901733684748e-07, "epoch": 8.409756097560976, "percentage": 84.1, "elapsed_time": "2:17:41", "remaining_time": "0:26:02"} -{"current_steps": 1725, "total_steps": 2050, "loss": 0.0078, "lr": 3.055521962070751e-07, "epoch": 8.414634146341463, "percentage": 84.15, "elapsed_time": "2:17:46", "remaining_time": "0:25:57"} -{"current_steps": 1726, "total_steps": 2050, "loss": 0.0035, "lr": 3.0371937272264454e-07, "epoch": 8.419512195121952, "percentage": 84.2, "elapsed_time": "2:17:48", "remaining_time": "0:25:52"} -{"current_steps": 1727, "total_steps": 2050, "loss": 0.0011, "lr": 3.0189170721958234e-07, "epoch": 8.424390243902439, "percentage": 84.24, "elapsed_time": "2:17:51", "remaining_time": "0:25:47"} -{"current_steps": 1728, "total_steps": 2050, "loss": 0.0042, "lr": 3.000692039901756e-07, "epoch": 8.429268292682927, "percentage": 84.29, "elapsed_time": "2:17:55", "remaining_time": "0:25:41"} -{"current_steps": 1729, "total_steps": 2050, "loss": 0.0159, "lr": 2.982518673145862e-07, "epoch": 8.434146341463414, "percentage": 84.34, "elapsed_time": "2:18:01", "remaining_time": "0:25:37"} -{"current_steps": 1730, "total_steps": 2050, "loss": 0.0021, "lr": 2.9643970146084193e-07, "epoch": 8.439024390243903, "percentage": 84.39, "elapsed_time": "2:18:03", "remaining_time": "0:25:32"} -{"current_steps": 1731, "total_steps": 2050, "loss": 0.0124, "lr": 2.9463271068482955e-07, "epoch": 8.44390243902439, "percentage": 84.44, "elapsed_time": "2:18:05", "remaining_time": "0:25:26"} -{"current_steps": 1732, "total_steps": 2050, "loss": 0.0038, "lr": 2.928308992302792e-07, "epoch": 8.448780487804878, "percentage": 84.49, "elapsed_time": "2:18:11", "remaining_time": "0:25:22"} -{"current_steps": 1733, "total_steps": 2050, "loss": 0.0013, "lr": 2.9103427132875785e-07, "epoch": 8.453658536585365, "percentage": 84.54, "elapsed_time": "2:18:14", "remaining_time": "0:25:17"} -{"current_steps": 1734, "total_steps": 2050, "loss": 0.0151, "lr": 2.892428311996609e-07, "epoch": 8.458536585365854, "percentage": 84.59, "elapsed_time": "2:18:17", "remaining_time": "0:25:12"} -{"current_steps": 1735, "total_steps": 2050, "loss": 0.0037, "lr": 2.8745658305019824e-07, "epoch": 8.463414634146341, "percentage": 84.63, "elapsed_time": "2:18:21", "remaining_time": "0:25:07"} -{"current_steps": 1736, "total_steps": 2050, "loss": 0.0046, "lr": 2.856755310753867e-07, "epoch": 8.46829268292683, "percentage": 84.68, "elapsed_time": "2:18:28", "remaining_time": "0:25:02"} -{"current_steps": 1737, "total_steps": 2050, "loss": 0.0014, "lr": 2.8389967945803984e-07, "epoch": 8.473170731707317, "percentage": 84.73, "elapsed_time": "2:18:29", "remaining_time": "0:24:57"} -{"current_steps": 1738, "total_steps": 2050, "loss": 0.0036, "lr": 2.821290323687592e-07, "epoch": 8.478048780487805, "percentage": 84.78, "elapsed_time": "2:18:37", "remaining_time": "0:24:53"} -{"current_steps": 1739, "total_steps": 2050, "loss": 0.0016, "lr": 2.803635939659222e-07, "epoch": 8.482926829268292, "percentage": 84.83, "elapsed_time": "2:18:40", "remaining_time": "0:24:48"} -{"current_steps": 1740, "total_steps": 2050, "loss": 0.0052, "lr": 2.786033683956732e-07, "epoch": 8.487804878048781, "percentage": 84.88, "elapsed_time": "2:18:44", "remaining_time": "0:24:43"} -{"current_steps": 1741, "total_steps": 2050, "loss": 0.0999, "lr": 2.7684835979191664e-07, "epoch": 8.492682926829268, "percentage": 84.93, "elapsed_time": "2:18:47", "remaining_time": "0:24:37"} -{"current_steps": 1742, "total_steps": 2050, "loss": 0.0009, "lr": 2.7509857227630223e-07, "epoch": 8.497560975609757, "percentage": 84.98, "elapsed_time": "2:18:49", "remaining_time": "0:24:32"} -{"current_steps": 1743, "total_steps": 2050, "loss": 0.0026, "lr": 2.733540099582188e-07, "epoch": 8.502439024390243, "percentage": 85.02, "elapsed_time": "2:18:51", "remaining_time": "0:24:27"} -{"current_steps": 1744, "total_steps": 2050, "loss": 0.0094, "lr": 2.7161467693478493e-07, "epoch": 8.507317073170732, "percentage": 85.07, "elapsed_time": "2:18:53", "remaining_time": "0:24:22"} -{"current_steps": 1745, "total_steps": 2050, "loss": 0.006, "lr": 2.6988057729083613e-07, "epoch": 8.512195121951219, "percentage": 85.12, "elapsed_time": "2:18:58", "remaining_time": "0:24:17"} -{"current_steps": 1746, "total_steps": 2050, "loss": 0.0043, "lr": 2.681517150989185e-07, "epoch": 8.517073170731708, "percentage": 85.17, "elapsed_time": "2:19:03", "remaining_time": "0:24:12"} -{"current_steps": 1747, "total_steps": 2050, "loss": 0.0026, "lr": 2.664280944192782e-07, "epoch": 8.521951219512195, "percentage": 85.22, "elapsed_time": "2:19:09", "remaining_time": "0:24:08"} -{"current_steps": 1748, "total_steps": 2050, "loss": 0.0044, "lr": 2.64709719299851e-07, "epoch": 8.526829268292683, "percentage": 85.27, "elapsed_time": "2:19:15", "remaining_time": "0:24:03"} -{"current_steps": 1749, "total_steps": 2050, "loss": 0.0008, "lr": 2.6299659377625296e-07, "epoch": 8.53170731707317, "percentage": 85.32, "elapsed_time": "2:19:16", "remaining_time": "0:23:58"} -{"current_steps": 1750, "total_steps": 2050, "loss": 0.0324, "lr": 2.612887218717733e-07, "epoch": 8.536585365853659, "percentage": 85.37, "elapsed_time": "2:19:19", "remaining_time": "0:23:53"} -{"current_steps": 1751, "total_steps": 2050, "loss": 0.0028, "lr": 2.5958610759736133e-07, "epoch": 8.541463414634146, "percentage": 85.41, "elapsed_time": "2:19:25", "remaining_time": "0:23:48"} -{"current_steps": 1752, "total_steps": 2050, "loss": 0.0019, "lr": 2.5788875495161846e-07, "epoch": 8.546341463414635, "percentage": 85.46, "elapsed_time": "2:19:26", "remaining_time": "0:23:43"} -{"current_steps": 1753, "total_steps": 2050, "loss": 0.0028, "lr": 2.561966679207917e-07, "epoch": 8.551219512195122, "percentage": 85.51, "elapsed_time": "2:19:32", "remaining_time": "0:23:38"} -{"current_steps": 1754, "total_steps": 2050, "loss": 0.0266, "lr": 2.545098504787588e-07, "epoch": 8.55609756097561, "percentage": 85.56, "elapsed_time": "2:19:36", "remaining_time": "0:23:33"} -{"current_steps": 1755, "total_steps": 2050, "loss": 0.0009, "lr": 2.5282830658702323e-07, "epoch": 8.560975609756097, "percentage": 85.61, "elapsed_time": "2:19:37", "remaining_time": "0:23:28"} -{"current_steps": 1756, "total_steps": 2050, "loss": 0.0056, "lr": 2.511520401947032e-07, "epoch": 8.565853658536586, "percentage": 85.66, "elapsed_time": "2:19:43", "remaining_time": "0:23:23"} -{"current_steps": 1757, "total_steps": 2050, "loss": 0.0009, "lr": 2.494810552385232e-07, "epoch": 8.570731707317073, "percentage": 85.71, "elapsed_time": "2:19:45", "remaining_time": "0:23:18"} -{"current_steps": 1758, "total_steps": 2050, "loss": 0.0023, "lr": 2.47815355642804e-07, "epoch": 8.575609756097561, "percentage": 85.76, "elapsed_time": "2:19:48", "remaining_time": "0:23:13"} -{"current_steps": 1759, "total_steps": 2050, "loss": 0.0454, "lr": 2.461549453194523e-07, "epoch": 8.580487804878048, "percentage": 85.8, "elapsed_time": "2:19:53", "remaining_time": "0:23:08"} -{"current_steps": 1760, "total_steps": 2050, "loss": 0.0204, "lr": 2.444998281679553e-07, "epoch": 8.585365853658537, "percentage": 85.85, "elapsed_time": "2:19:56", "remaining_time": "0:23:03"} -{"current_steps": 1761, "total_steps": 2050, "loss": 0.0387, "lr": 2.428500080753676e-07, "epoch": 8.590243902439024, "percentage": 85.9, "elapsed_time": "2:19:59", "remaining_time": "0:22:58"} -{"current_steps": 1762, "total_steps": 2050, "loss": 0.0014, "lr": 2.412054889163035e-07, "epoch": 8.595121951219513, "percentage": 85.95, "elapsed_time": "2:20:01", "remaining_time": "0:22:53"} -{"current_steps": 1763, "total_steps": 2050, "loss": 0.0011, "lr": 2.3956627455292924e-07, "epoch": 8.6, "percentage": 86.0, "elapsed_time": "2:20:04", "remaining_time": "0:22:48"} -{"current_steps": 1764, "total_steps": 2050, "loss": 0.003, "lr": 2.3793236883495164e-07, "epoch": 8.604878048780488, "percentage": 86.05, "elapsed_time": "2:20:08", "remaining_time": "0:22:43"} -{"current_steps": 1765, "total_steps": 2050, "loss": 0.0032, "lr": 2.363037755996095e-07, "epoch": 8.609756097560975, "percentage": 86.1, "elapsed_time": "2:20:14", "remaining_time": "0:22:38"} -{"current_steps": 1766, "total_steps": 2050, "loss": 0.0037, "lr": 2.3468049867166747e-07, "epoch": 8.614634146341464, "percentage": 86.15, "elapsed_time": "2:20:16", "remaining_time": "0:22:33"} -{"current_steps": 1767, "total_steps": 2050, "loss": 0.0014, "lr": 2.3306254186340305e-07, "epoch": 8.61951219512195, "percentage": 86.2, "elapsed_time": "2:20:18", "remaining_time": "0:22:28"} -{"current_steps": 1768, "total_steps": 2050, "loss": 0.0125, "lr": 2.314499089745989e-07, "epoch": 8.62439024390244, "percentage": 86.24, "elapsed_time": "2:20:22", "remaining_time": "0:22:23"} -{"current_steps": 1769, "total_steps": 2050, "loss": 0.0855, "lr": 2.2984260379253693e-07, "epoch": 8.629268292682926, "percentage": 86.29, "elapsed_time": "2:20:26", "remaining_time": "0:22:18"} -{"current_steps": 1770, "total_steps": 2050, "loss": 0.0031, "lr": 2.2824063009198428e-07, "epoch": 8.634146341463415, "percentage": 86.34, "elapsed_time": "2:20:30", "remaining_time": "0:22:13"} -{"current_steps": 1771, "total_steps": 2050, "loss": 0.0056, "lr": 2.2664399163518786e-07, "epoch": 8.639024390243902, "percentage": 86.39, "elapsed_time": "2:20:34", "remaining_time": "0:22:08"} -{"current_steps": 1772, "total_steps": 2050, "loss": 0.022, "lr": 2.25052692171866e-07, "epoch": 8.64390243902439, "percentage": 86.44, "elapsed_time": "2:20:38", "remaining_time": "0:22:03"} -{"current_steps": 1773, "total_steps": 2050, "loss": 0.0025, "lr": 2.2346673543919645e-07, "epoch": 8.648780487804878, "percentage": 86.49, "elapsed_time": "2:20:39", "remaining_time": "0:21:58"} -{"current_steps": 1774, "total_steps": 2050, "loss": 0.0081, "lr": 2.2188612516181067e-07, "epoch": 8.653658536585366, "percentage": 86.54, "elapsed_time": "2:20:42", "remaining_time": "0:21:53"} -{"current_steps": 1775, "total_steps": 2050, "loss": 0.0015, "lr": 2.203108650517835e-07, "epoch": 8.658536585365853, "percentage": 86.59, "elapsed_time": "2:20:47", "remaining_time": "0:21:48"} -{"current_steps": 1776, "total_steps": 2050, "loss": 0.0023, "lr": 2.1874095880862505e-07, "epoch": 8.663414634146342, "percentage": 86.63, "elapsed_time": "2:20:48", "remaining_time": "0:21:43"} -{"current_steps": 1777, "total_steps": 2050, "loss": 0.002, "lr": 2.171764101192722e-07, "epoch": 8.668292682926829, "percentage": 86.68, "elapsed_time": "2:20:50", "remaining_time": "0:21:38"} -{"current_steps": 1778, "total_steps": 2050, "loss": 0.002, "lr": 2.1561722265807827e-07, "epoch": 8.673170731707318, "percentage": 86.73, "elapsed_time": "2:20:55", "remaining_time": "0:21:33"} -{"current_steps": 1779, "total_steps": 2050, "loss": 0.0015, "lr": 2.1406340008680748e-07, "epoch": 8.678048780487805, "percentage": 86.78, "elapsed_time": "2:20:59", "remaining_time": "0:21:28"} -{"current_steps": 1780, "total_steps": 2050, "loss": 0.0028, "lr": 2.1251494605462358e-07, "epoch": 8.682926829268293, "percentage": 86.83, "elapsed_time": "2:21:01", "remaining_time": "0:21:23"} -{"current_steps": 1781, "total_steps": 2050, "loss": 0.0008, "lr": 2.1097186419808151e-07, "epoch": 8.68780487804878, "percentage": 86.88, "elapsed_time": "2:21:02", "remaining_time": "0:21:18"} -{"current_steps": 1782, "total_steps": 2050, "loss": 0.0012, "lr": 2.094341581411216e-07, "epoch": 8.692682926829269, "percentage": 86.93, "elapsed_time": "2:21:08", "remaining_time": "0:21:13"} -{"current_steps": 1783, "total_steps": 2050, "loss": 0.0021, "lr": 2.0790183149505733e-07, "epoch": 8.697560975609756, "percentage": 86.98, "elapsed_time": "2:21:15", "remaining_time": "0:21:09"} -{"current_steps": 1784, "total_steps": 2050, "loss": 0.0028, "lr": 2.063748878585689e-07, "epoch": 8.702439024390245, "percentage": 87.02, "elapsed_time": "2:21:18", "remaining_time": "0:21:04"} -{"current_steps": 1785, "total_steps": 2050, "loss": 0.0018, "lr": 2.0485333081769588e-07, "epoch": 8.707317073170731, "percentage": 87.07, "elapsed_time": "2:21:19", "remaining_time": "0:20:58"} -{"current_steps": 1786, "total_steps": 2050, "loss": 0.0142, "lr": 2.0333716394582536e-07, "epoch": 8.71219512195122, "percentage": 87.12, "elapsed_time": "2:21:23", "remaining_time": "0:20:53"} -{"current_steps": 1787, "total_steps": 2050, "loss": 0.0135, "lr": 2.0182639080368634e-07, "epoch": 8.717073170731707, "percentage": 87.17, "elapsed_time": "2:21:27", "remaining_time": "0:20:49"} -{"current_steps": 1788, "total_steps": 2050, "loss": 0.0078, "lr": 2.003210149393417e-07, "epoch": 8.721951219512196, "percentage": 87.22, "elapsed_time": "2:21:30", "remaining_time": "0:20:44"} -{"current_steps": 1789, "total_steps": 2050, "loss": 0.0066, "lr": 1.9882103988817735e-07, "epoch": 8.726829268292683, "percentage": 87.27, "elapsed_time": "2:21:33", "remaining_time": "0:20:39"} -{"current_steps": 1790, "total_steps": 2050, "loss": 0.0051, "lr": 1.9732646917289545e-07, "epoch": 8.731707317073171, "percentage": 87.32, "elapsed_time": "2:21:39", "remaining_time": "0:20:34"} -{"current_steps": 1791, "total_steps": 2050, "loss": 0.001, "lr": 1.958373063035071e-07, "epoch": 8.736585365853658, "percentage": 87.37, "elapsed_time": "2:21:44", "remaining_time": "0:20:29"} -{"current_steps": 1792, "total_steps": 2050, "loss": 0.0057, "lr": 1.9435355477732205e-07, "epoch": 8.741463414634147, "percentage": 87.41, "elapsed_time": "2:21:49", "remaining_time": "0:20:25"} -{"current_steps": 1793, "total_steps": 2050, "loss": 0.0023, "lr": 1.928752180789417e-07, "epoch": 8.746341463414634, "percentage": 87.46, "elapsed_time": "2:21:55", "remaining_time": "0:20:20"} -{"current_steps": 1794, "total_steps": 2050, "loss": 0.0191, "lr": 1.9140229968025058e-07, "epoch": 8.751219512195123, "percentage": 87.51, "elapsed_time": "2:21:58", "remaining_time": "0:20:15"} -{"current_steps": 1795, "total_steps": 2050, "loss": 0.0114, "lr": 1.8993480304040912e-07, "epoch": 8.75609756097561, "percentage": 87.56, "elapsed_time": "2:22:02", "remaining_time": "0:20:10"} -{"current_steps": 1796, "total_steps": 2050, "loss": 0.0046, "lr": 1.8847273160584378e-07, "epoch": 8.760975609756098, "percentage": 87.61, "elapsed_time": "2:22:04", "remaining_time": "0:20:05"} -{"current_steps": 1797, "total_steps": 2050, "loss": 0.0014, "lr": 1.8701608881023957e-07, "epoch": 8.765853658536585, "percentage": 87.66, "elapsed_time": "2:22:06", "remaining_time": "0:20:00"} -{"current_steps": 1798, "total_steps": 2050, "loss": 0.0085, "lr": 1.855648780745342e-07, "epoch": 8.770731707317074, "percentage": 87.71, "elapsed_time": "2:22:11", "remaining_time": "0:19:55"} -{"current_steps": 1799, "total_steps": 2050, "loss": 0.0034, "lr": 1.8411910280690588e-07, "epoch": 8.77560975609756, "percentage": 87.76, "elapsed_time": "2:22:16", "remaining_time": "0:19:50"} -{"current_steps": 1800, "total_steps": 2050, "loss": 0.0119, "lr": 1.826787664027685e-07, "epoch": 8.78048780487805, "percentage": 87.8, "elapsed_time": "2:22:19", "remaining_time": "0:19:46"} -{"current_steps": 1801, "total_steps": 2050, "loss": 0.0059, "lr": 1.8124387224476347e-07, "epoch": 8.785365853658536, "percentage": 87.85, "elapsed_time": "2:22:26", "remaining_time": "0:19:41"} -{"current_steps": 1802, "total_steps": 2050, "loss": 0.0021, "lr": 1.7981442370274993e-07, "epoch": 8.790243902439025, "percentage": 87.9, "elapsed_time": "2:22:27", "remaining_time": "0:19:36"} -{"current_steps": 1803, "total_steps": 2050, "loss": 0.0085, "lr": 1.783904241337983e-07, "epoch": 8.795121951219512, "percentage": 87.95, "elapsed_time": "2:22:29", "remaining_time": "0:19:31"} -{"current_steps": 1804, "total_steps": 2050, "loss": 0.0037, "lr": 1.7697187688218291e-07, "epoch": 8.8, "percentage": 88.0, "elapsed_time": "2:22:33", "remaining_time": "0:19:26"} -{"current_steps": 1805, "total_steps": 2050, "loss": 0.0008, "lr": 1.7555878527937164e-07, "epoch": 8.804878048780488, "percentage": 88.05, "elapsed_time": "2:22:35", "remaining_time": "0:19:21"} -{"current_steps": 1806, "total_steps": 2050, "loss": 0.0092, "lr": 1.7415115264402065e-07, "epoch": 8.809756097560976, "percentage": 88.1, "elapsed_time": "2:22:40", "remaining_time": "0:19:16"} -{"current_steps": 1807, "total_steps": 2050, "loss": 0.0016, "lr": 1.727489822819664e-07, "epoch": 8.814634146341463, "percentage": 88.15, "elapsed_time": "2:22:42", "remaining_time": "0:19:11"} -{"current_steps": 1808, "total_steps": 2050, "loss": 0.0012, "lr": 1.7135227748621585e-07, "epoch": 8.819512195121952, "percentage": 88.2, "elapsed_time": "2:22:45", "remaining_time": "0:19:06"} -{"current_steps": 1809, "total_steps": 2050, "loss": 0.0126, "lr": 1.699610415369407e-07, "epoch": 8.824390243902439, "percentage": 88.24, "elapsed_time": "2:22:48", "remaining_time": "0:19:01"} -{"current_steps": 1810, "total_steps": 2050, "loss": 0.0086, "lr": 1.6857527770146876e-07, "epoch": 8.829268292682928, "percentage": 88.29, "elapsed_time": "2:22:51", "remaining_time": "0:18:56"} -{"current_steps": 1811, "total_steps": 2050, "loss": 0.0031, "lr": 1.6719498923427697e-07, "epoch": 8.834146341463414, "percentage": 88.34, "elapsed_time": "2:22:54", "remaining_time": "0:18:51"} -{"current_steps": 1812, "total_steps": 2050, "loss": 0.0083, "lr": 1.6582017937698287e-07, "epoch": 8.839024390243903, "percentage": 88.39, "elapsed_time": "2:22:57", "remaining_time": "0:18:46"} -{"current_steps": 1813, "total_steps": 2050, "loss": 0.002, "lr": 1.6445085135833732e-07, "epoch": 8.84390243902439, "percentage": 88.44, "elapsed_time": "2:22:59", "remaining_time": "0:18:41"} -{"current_steps": 1814, "total_steps": 2050, "loss": 0.0027, "lr": 1.6308700839421793e-07, "epoch": 8.848780487804879, "percentage": 88.49, "elapsed_time": "2:23:01", "remaining_time": "0:18:36"} -{"current_steps": 1815, "total_steps": 2050, "loss": 0.0028, "lr": 1.6172865368762004e-07, "epoch": 8.853658536585366, "percentage": 88.54, "elapsed_time": "2:23:03", "remaining_time": "0:18:31"} -{"current_steps": 1816, "total_steps": 2050, "loss": 0.0011, "lr": 1.6037579042864876e-07, "epoch": 8.858536585365854, "percentage": 88.59, "elapsed_time": "2:23:04", "remaining_time": "0:18:26"} -{"current_steps": 1817, "total_steps": 2050, "loss": 0.0082, "lr": 1.5902842179451482e-07, "epoch": 8.863414634146341, "percentage": 88.63, "elapsed_time": "2:23:08", "remaining_time": "0:18:21"} -{"current_steps": 1818, "total_steps": 2050, "loss": 0.0068, "lr": 1.576865509495229e-07, "epoch": 8.86829268292683, "percentage": 88.68, "elapsed_time": "2:23:11", "remaining_time": "0:18:16"} -{"current_steps": 1819, "total_steps": 2050, "loss": 0.0085, "lr": 1.5635018104506627e-07, "epoch": 8.873170731707317, "percentage": 88.73, "elapsed_time": "2:23:13", "remaining_time": "0:18:11"} -{"current_steps": 1820, "total_steps": 2050, "loss": 0.0062, "lr": 1.5501931521962055e-07, "epoch": 8.878048780487806, "percentage": 88.78, "elapsed_time": "2:23:20", "remaining_time": "0:18:06"} -{"current_steps": 1821, "total_steps": 2050, "loss": 0.0043, "lr": 1.5369395659873305e-07, "epoch": 8.882926829268293, "percentage": 88.83, "elapsed_time": "2:23:22", "remaining_time": "0:18:01"} -{"current_steps": 1822, "total_steps": 2050, "loss": 0.0042, "lr": 1.5237410829501864e-07, "epoch": 8.887804878048781, "percentage": 88.88, "elapsed_time": "2:23:25", "remaining_time": "0:17:56"} -{"current_steps": 1823, "total_steps": 2050, "loss": 0.0077, "lr": 1.510597734081512e-07, "epoch": 8.892682926829268, "percentage": 88.93, "elapsed_time": "2:23:28", "remaining_time": "0:17:51"} -{"current_steps": 1824, "total_steps": 2050, "loss": 0.0013, "lr": 1.497509550248555e-07, "epoch": 8.897560975609757, "percentage": 88.98, "elapsed_time": "2:23:30", "remaining_time": "0:17:46"} -{"current_steps": 1825, "total_steps": 2050, "loss": 0.0091, "lr": 1.4844765621890135e-07, "epoch": 8.902439024390244, "percentage": 89.02, "elapsed_time": "2:23:33", "remaining_time": "0:17:41"} -{"current_steps": 1826, "total_steps": 2050, "loss": 0.005, "lr": 1.471498800510962e-07, "epoch": 8.907317073170733, "percentage": 89.07, "elapsed_time": "2:23:35", "remaining_time": "0:17:36"} -{"current_steps": 1827, "total_steps": 2050, "loss": 0.0014, "lr": 1.4585762956927624e-07, "epoch": 8.91219512195122, "percentage": 89.12, "elapsed_time": "2:23:40", "remaining_time": "0:17:32"} -{"current_steps": 1828, "total_steps": 2050, "loss": 0.0063, "lr": 1.4457090780830185e-07, "epoch": 8.917073170731708, "percentage": 89.17, "elapsed_time": "2:23:44", "remaining_time": "0:17:27"} -{"current_steps": 1829, "total_steps": 2050, "loss": 0.0065, "lr": 1.432897177900483e-07, "epoch": 8.921951219512195, "percentage": 89.22, "elapsed_time": "2:23:46", "remaining_time": "0:17:22"} -{"current_steps": 1830, "total_steps": 2050, "loss": 0.0099, "lr": 1.4201406252340038e-07, "epoch": 8.926829268292684, "percentage": 89.27, "elapsed_time": "2:23:48", "remaining_time": "0:17:17"} -{"current_steps": 1831, "total_steps": 2050, "loss": 0.0042, "lr": 1.407439450042433e-07, "epoch": 8.93170731707317, "percentage": 89.32, "elapsed_time": "2:23:50", "remaining_time": "0:17:12"} -{"current_steps": 1832, "total_steps": 2050, "loss": 0.004, "lr": 1.3947936821545772e-07, "epoch": 8.93658536585366, "percentage": 89.37, "elapsed_time": "2:23:52", "remaining_time": "0:17:07"} -{"current_steps": 1833, "total_steps": 2050, "loss": 0.0009, "lr": 1.3822033512691209e-07, "epoch": 8.941463414634146, "percentage": 89.41, "elapsed_time": "2:23:53", "remaining_time": "0:17:02"} -{"current_steps": 1834, "total_steps": 2050, "loss": 0.0028, "lr": 1.369668486954545e-07, "epoch": 8.946341463414633, "percentage": 89.46, "elapsed_time": "2:23:56", "remaining_time": "0:16:57"} -{"current_steps": 1835, "total_steps": 2050, "loss": 0.001, "lr": 1.3571891186490687e-07, "epoch": 8.951219512195122, "percentage": 89.51, "elapsed_time": "2:24:00", "remaining_time": "0:16:52"} -{"current_steps": 1836, "total_steps": 2050, "loss": 0.0024, "lr": 1.3447652756605894e-07, "epoch": 8.95609756097561, "percentage": 89.56, "elapsed_time": "2:24:05", "remaining_time": "0:16:47"} -{"current_steps": 1837, "total_steps": 2050, "loss": 0.0015, "lr": 1.3323969871665897e-07, "epoch": 8.960975609756098, "percentage": 89.61, "elapsed_time": "2:24:09", "remaining_time": "0:16:42"} -{"current_steps": 1838, "total_steps": 2050, "loss": 0.0007, "lr": 1.3200842822140818e-07, "epoch": 8.965853658536584, "percentage": 89.66, "elapsed_time": "2:24:13", "remaining_time": "0:16:38"} -{"current_steps": 1839, "total_steps": 2050, "loss": 0.0018, "lr": 1.3078271897195572e-07, "epoch": 8.970731707317073, "percentage": 89.71, "elapsed_time": "2:24:18", "remaining_time": "0:16:33"} -{"current_steps": 1840, "total_steps": 2050, "loss": 0.0063, "lr": 1.2956257384688807e-07, "epoch": 8.975609756097562, "percentage": 89.76, "elapsed_time": "2:24:24", "remaining_time": "0:16:28"} -{"current_steps": 1841, "total_steps": 2050, "loss": 0.002, "lr": 1.283479957117248e-07, "epoch": 8.980487804878049, "percentage": 89.8, "elapsed_time": "2:24:29", "remaining_time": "0:16:24"} -{"current_steps": 1842, "total_steps": 2050, "loss": 0.0398, "lr": 1.2713898741891244e-07, "epoch": 8.985365853658536, "percentage": 89.85, "elapsed_time": "2:24:36", "remaining_time": "0:16:19"} -{"current_steps": 1843, "total_steps": 2050, "loss": 0.0004, "lr": 1.2593555180781591e-07, "epoch": 8.990243902439024, "percentage": 89.9, "elapsed_time": "2:24:37", "remaining_time": "0:16:14"} -{"current_steps": 1844, "total_steps": 2050, "loss": 0.0713, "lr": 1.2473769170471188e-07, "epoch": 8.995121951219513, "percentage": 89.95, "elapsed_time": "2:24:42", "remaining_time": "0:16:09"} -{"current_steps": 1845, "total_steps": 2050, "loss": 0.002, "lr": 1.2354540992278452e-07, "epoch": 9.0, "percentage": 90.0, "elapsed_time": "2:24:45", "remaining_time": "0:16:05"} -{"current_steps": 1846, "total_steps": 2050, "loss": 0.0004, "lr": 1.223587092621162e-07, "epoch": 9.004878048780487, "percentage": 90.05, "elapsed_time": "2:27:53", "remaining_time": "0:16:20"} -{"current_steps": 1847, "total_steps": 2050, "loss": 0.0791, "lr": 1.2117759250968225e-07, "epoch": 9.009756097560976, "percentage": 90.1, "elapsed_time": "2:27:57", "remaining_time": "0:16:15"} -{"current_steps": 1848, "total_steps": 2050, "loss": 0.0021, "lr": 1.2000206243934358e-07, "epoch": 9.014634146341463, "percentage": 90.15, "elapsed_time": "2:27:59", "remaining_time": "0:16:10"} -{"current_steps": 1849, "total_steps": 2050, "loss": 0.0014, "lr": 1.1883212181184212e-07, "epoch": 9.019512195121951, "percentage": 90.2, "elapsed_time": "2:28:03", "remaining_time": "0:16:05"} -{"current_steps": 1850, "total_steps": 2050, "loss": 0.003, "lr": 1.176677733747919e-07, "epoch": 9.024390243902438, "percentage": 90.24, "elapsed_time": "2:28:09", "remaining_time": "0:16:01"} -{"current_steps": 1851, "total_steps": 2050, "loss": 0.0009, "lr": 1.1650901986267365e-07, "epoch": 9.029268292682927, "percentage": 90.29, "elapsed_time": "2:28:16", "remaining_time": "0:15:56"} -{"current_steps": 1852, "total_steps": 2050, "loss": 0.001, "lr": 1.1535586399682885e-07, "epoch": 9.034146341463414, "percentage": 90.34, "elapsed_time": "2:28:19", "remaining_time": "0:15:51"} -{"current_steps": 1853, "total_steps": 2050, "loss": 0.0008, "lr": 1.1420830848545256e-07, "epoch": 9.039024390243902, "percentage": 90.39, "elapsed_time": "2:28:25", "remaining_time": "0:15:46"} -{"current_steps": 1854, "total_steps": 2050, "loss": 0.0086, "lr": 1.1306635602358673e-07, "epoch": 9.04390243902439, "percentage": 90.44, "elapsed_time": "2:28:32", "remaining_time": "0:15:42"} -{"current_steps": 1855, "total_steps": 2050, "loss": 0.0023, "lr": 1.1193000929311638e-07, "epoch": 9.048780487804878, "percentage": 90.49, "elapsed_time": "2:28:37", "remaining_time": "0:15:37"} -{"current_steps": 1856, "total_steps": 2050, "loss": 0.0235, "lr": 1.1079927096275978e-07, "epoch": 9.053658536585365, "percentage": 90.54, "elapsed_time": "2:28:39", "remaining_time": "0:15:32"} -{"current_steps": 1857, "total_steps": 2050, "loss": 0.0013, "lr": 1.0967414368806384e-07, "epoch": 9.058536585365854, "percentage": 90.59, "elapsed_time": "2:28:43", "remaining_time": "0:15:27"} -{"current_steps": 1858, "total_steps": 2050, "loss": 0.0005, "lr": 1.0855463011139905e-07, "epoch": 9.06341463414634, "percentage": 90.63, "elapsed_time": "2:28:45", "remaining_time": "0:15:22"} -{"current_steps": 1859, "total_steps": 2050, "loss": 0.0011, "lr": 1.0744073286195089e-07, "epoch": 9.06829268292683, "percentage": 90.68, "elapsed_time": "2:28:47", "remaining_time": "0:15:17"} -{"current_steps": 1860, "total_steps": 2050, "loss": 0.0015, "lr": 1.0633245455571511e-07, "epoch": 9.073170731707316, "percentage": 90.73, "elapsed_time": "2:28:51", "remaining_time": "0:15:12"} -{"current_steps": 1861, "total_steps": 2050, "loss": 0.001, "lr": 1.052297977954922e-07, "epoch": 9.078048780487805, "percentage": 90.78, "elapsed_time": "2:28:55", "remaining_time": "0:15:07"} -{"current_steps": 1862, "total_steps": 2050, "loss": 0.0005, "lr": 1.0413276517087956e-07, "epoch": 9.082926829268292, "percentage": 90.83, "elapsed_time": "2:28:56", "remaining_time": "0:15:02"} -{"current_steps": 1863, "total_steps": 2050, "loss": 0.002, "lr": 1.0304135925826603e-07, "epoch": 9.08780487804878, "percentage": 90.88, "elapsed_time": "2:29:00", "remaining_time": "0:14:57"} -{"current_steps": 1864, "total_steps": 2050, "loss": 0.0052, "lr": 1.0195558262082683e-07, "epoch": 9.092682926829267, "percentage": 90.93, "elapsed_time": "2:29:03", "remaining_time": "0:14:52"} -{"current_steps": 1865, "total_steps": 2050, "loss": 0.0009, "lr": 1.0087543780851666e-07, "epoch": 9.097560975609756, "percentage": 90.98, "elapsed_time": "2:29:05", "remaining_time": "0:14:47"} -{"current_steps": 1866, "total_steps": 2050, "loss": 0.0017, "lr": 9.98009273580633e-08, "epoch": 9.102439024390243, "percentage": 91.02, "elapsed_time": "2:29:09", "remaining_time": "0:14:42"} -{"current_steps": 1867, "total_steps": 2050, "loss": 0.0006, "lr": 9.87320537929623e-08, "epoch": 9.107317073170732, "percentage": 91.07, "elapsed_time": "2:29:10", "remaining_time": "0:14:37"} -{"current_steps": 1868, "total_steps": 2050, "loss": 0.0006, "lr": 9.766881962347208e-08, "epoch": 9.112195121951219, "percentage": 91.12, "elapsed_time": "2:29:13", "remaining_time": "0:14:32"} -{"current_steps": 1869, "total_steps": 2050, "loss": 0.0017, "lr": 9.661122734660521e-08, "epoch": 9.117073170731707, "percentage": 91.17, "elapsed_time": "2:29:15", "remaining_time": "0:14:27"} -{"current_steps": 1870, "total_steps": 2050, "loss": 0.0015, "lr": 9.555927944612492e-08, "epoch": 9.121951219512194, "percentage": 91.22, "elapsed_time": "2:29:17", "remaining_time": "0:14:22"} -{"current_steps": 1871, "total_steps": 2050, "loss": 0.0179, "lr": 9.451297839253915e-08, "epoch": 9.126829268292683, "percentage": 91.27, "elapsed_time": "2:29:24", "remaining_time": "0:14:17"} -{"current_steps": 1872, "total_steps": 2050, "loss": 0.0013, "lr": 9.34723266430937e-08, "epoch": 9.13170731707317, "percentage": 91.32, "elapsed_time": "2:29:27", "remaining_time": "0:14:12"} -{"current_steps": 1873, "total_steps": 2050, "loss": 0.0013, "lr": 9.243732664176636e-08, "epoch": 9.136585365853659, "percentage": 91.37, "elapsed_time": "2:29:28", "remaining_time": "0:14:07"} -{"current_steps": 1874, "total_steps": 2050, "loss": 0.0042, "lr": 9.140798081926277e-08, "epoch": 9.141463414634146, "percentage": 91.41, "elapsed_time": "2:29:33", "remaining_time": "0:14:02"} -{"current_steps": 1875, "total_steps": 2050, "loss": 0.0014, "lr": 9.03842915930095e-08, "epoch": 9.146341463414634, "percentage": 91.46, "elapsed_time": "2:29:37", "remaining_time": "0:13:57"} -{"current_steps": 1876, "total_steps": 2050, "loss": 0.0014, "lr": 8.936626136714754e-08, "epoch": 9.151219512195121, "percentage": 91.51, "elapsed_time": "2:29:41", "remaining_time": "0:13:53"} -{"current_steps": 1877, "total_steps": 2050, "loss": 0.0036, "lr": 8.835389253252918e-08, "epoch": 9.15609756097561, "percentage": 91.56, "elapsed_time": "2:29:46", "remaining_time": "0:13:48"} -{"current_steps": 1878, "total_steps": 2050, "loss": 0.0195, "lr": 8.734718746670978e-08, "epoch": 9.160975609756097, "percentage": 91.61, "elapsed_time": "2:29:54", "remaining_time": "0:13:43"} -{"current_steps": 1879, "total_steps": 2050, "loss": 0.0009, "lr": 8.634614853394341e-08, "epoch": 9.165853658536586, "percentage": 91.66, "elapsed_time": "2:29:59", "remaining_time": "0:13:39"} -{"current_steps": 1880, "total_steps": 2050, "loss": 0.0028, "lr": 8.53507780851781e-08, "epoch": 9.170731707317072, "percentage": 91.71, "elapsed_time": "2:30:01", "remaining_time": "0:13:33"} -{"current_steps": 1881, "total_steps": 2050, "loss": 0.0023, "lr": 8.436107845804842e-08, "epoch": 9.175609756097561, "percentage": 91.76, "elapsed_time": "2:30:03", "remaining_time": "0:13:28"} -{"current_steps": 1882, "total_steps": 2050, "loss": 0.0008, "lr": 8.33770519768709e-08, "epoch": 9.180487804878048, "percentage": 91.8, "elapsed_time": "2:30:08", "remaining_time": "0:13:24"} -{"current_steps": 1883, "total_steps": 2050, "loss": 0.0018, "lr": 8.239870095263974e-08, "epoch": 9.185365853658537, "percentage": 91.85, "elapsed_time": "2:30:10", "remaining_time": "0:13:19"} -{"current_steps": 1884, "total_steps": 2050, "loss": 0.0016, "lr": 8.142602768301921e-08, "epoch": 9.190243902439024, "percentage": 91.9, "elapsed_time": "2:30:12", "remaining_time": "0:13:14"} -{"current_steps": 1885, "total_steps": 2050, "loss": 0.0063, "lr": 8.045903445233982e-08, "epoch": 9.195121951219512, "percentage": 91.95, "elapsed_time": "2:30:17", "remaining_time": "0:13:09"} -{"current_steps": 1886, "total_steps": 2050, "loss": 0.0049, "lr": 7.949772353159191e-08, "epoch": 9.2, "percentage": 92.0, "elapsed_time": "2:30:22", "remaining_time": "0:13:04"} -{"current_steps": 1887, "total_steps": 2050, "loss": 0.0006, "lr": 7.854209717842231e-08, "epoch": 9.204878048780488, "percentage": 92.05, "elapsed_time": "2:30:24", "remaining_time": "0:12:59"} -{"current_steps": 1888, "total_steps": 2050, "loss": 0.0007, "lr": 7.759215763712579e-08, "epoch": 9.209756097560975, "percentage": 92.1, "elapsed_time": "2:30:28", "remaining_time": "0:12:54"} -{"current_steps": 1889, "total_steps": 2050, "loss": 0.0038, "lr": 7.664790713864223e-08, "epoch": 9.214634146341464, "percentage": 92.15, "elapsed_time": "2:30:34", "remaining_time": "0:12:50"} -{"current_steps": 1890, "total_steps": 2050, "loss": 0.0059, "lr": 7.57093479005519e-08, "epoch": 9.21951219512195, "percentage": 92.2, "elapsed_time": "2:30:37", "remaining_time": "0:12:45"} -{"current_steps": 1891, "total_steps": 2050, "loss": 0.0009, "lr": 7.477648212706746e-08, "epoch": 9.22439024390244, "percentage": 92.24, "elapsed_time": "2:30:39", "remaining_time": "0:12:40"} -{"current_steps": 1892, "total_steps": 2050, "loss": 0.0019, "lr": 7.384931200903084e-08, "epoch": 9.229268292682926, "percentage": 92.29, "elapsed_time": "2:30:46", "remaining_time": "0:12:35"} -{"current_steps": 1893, "total_steps": 2050, "loss": 0.0036, "lr": 7.29278397239086e-08, "epoch": 9.234146341463415, "percentage": 92.34, "elapsed_time": "2:30:49", "remaining_time": "0:12:30"} -{"current_steps": 1894, "total_steps": 2050, "loss": 0.0005, "lr": 7.20120674357852e-08, "epoch": 9.239024390243902, "percentage": 92.39, "elapsed_time": "2:30:51", "remaining_time": "0:12:25"} -{"current_steps": 1895, "total_steps": 2050, "loss": 0.0009, "lr": 7.110199729535805e-08, "epoch": 9.24390243902439, "percentage": 92.44, "elapsed_time": "2:30:55", "remaining_time": "0:12:20"} -{"current_steps": 1896, "total_steps": 2050, "loss": 0.0047, "lr": 7.019763143993441e-08, "epoch": 9.248780487804877, "percentage": 92.49, "elapsed_time": "2:31:01", "remaining_time": "0:12:16"} -{"current_steps": 1897, "total_steps": 2050, "loss": 0.0006, "lr": 6.929897199342395e-08, "epoch": 9.253658536585366, "percentage": 92.54, "elapsed_time": "2:31:04", "remaining_time": "0:12:11"} -{"current_steps": 1898, "total_steps": 2050, "loss": 0.0005, "lr": 6.840602106633425e-08, "epoch": 9.258536585365853, "percentage": 92.59, "elapsed_time": "2:31:05", "remaining_time": "0:12:05"} -{"current_steps": 1899, "total_steps": 2050, "loss": 0.001, "lr": 6.751878075576867e-08, "epoch": 9.263414634146342, "percentage": 92.63, "elapsed_time": "2:31:07", "remaining_time": "0:12:01"} -{"current_steps": 1900, "total_steps": 2050, "loss": 0.0046, "lr": 6.663725314541652e-08, "epoch": 9.268292682926829, "percentage": 92.68, "elapsed_time": "2:31:09", "remaining_time": "0:11:56"} -{"current_steps": 1901, "total_steps": 2050, "loss": 0.0009, "lr": 6.576144030555259e-08, "epoch": 9.273170731707317, "percentage": 92.73, "elapsed_time": "2:31:13", "remaining_time": "0:11:51"} -{"current_steps": 1902, "total_steps": 2050, "loss": 0.0038, "lr": 6.489134429302906e-08, "epoch": 9.278048780487804, "percentage": 92.78, "elapsed_time": "2:31:15", "remaining_time": "0:11:46"} -{"current_steps": 1903, "total_steps": 2050, "loss": 0.0007, "lr": 6.402696715127387e-08, "epoch": 9.282926829268293, "percentage": 92.83, "elapsed_time": "2:31:22", "remaining_time": "0:11:41"} -{"current_steps": 1904, "total_steps": 2050, "loss": 0.0005, "lr": 6.316831091028237e-08, "epoch": 9.28780487804878, "percentage": 92.88, "elapsed_time": "2:31:26", "remaining_time": "0:11:36"} -{"current_steps": 1905, "total_steps": 2050, "loss": 0.0004, "lr": 6.23153775866156e-08, "epoch": 9.292682926829269, "percentage": 92.93, "elapsed_time": "2:31:29", "remaining_time": "0:11:31"} -{"current_steps": 1906, "total_steps": 2050, "loss": 0.0027, "lr": 6.14681691833935e-08, "epoch": 9.297560975609755, "percentage": 92.98, "elapsed_time": "2:31:32", "remaining_time": "0:11:26"} -{"current_steps": 1907, "total_steps": 2050, "loss": 0.0006, "lr": 6.062668769029168e-08, "epoch": 9.302439024390244, "percentage": 93.02, "elapsed_time": "2:31:34", "remaining_time": "0:11:21"} -{"current_steps": 1908, "total_steps": 2050, "loss": 0.0033, "lr": 5.979093508353489e-08, "epoch": 9.307317073170731, "percentage": 93.07, "elapsed_time": "2:31:39", "remaining_time": "0:11:17"} -{"current_steps": 1909, "total_steps": 2050, "loss": 0.0143, "lr": 5.896091332589532e-08, "epoch": 9.31219512195122, "percentage": 93.12, "elapsed_time": "2:31:45", "remaining_time": "0:11:12"} -{"current_steps": 1910, "total_steps": 2050, "loss": 0.0006, "lr": 5.813662436668477e-08, "epoch": 9.317073170731707, "percentage": 93.17, "elapsed_time": "2:31:47", "remaining_time": "0:11:07"} -{"current_steps": 1911, "total_steps": 2050, "loss": 0.0009, "lr": 5.731807014175195e-08, "epoch": 9.321951219512195, "percentage": 93.22, "elapsed_time": "2:31:49", "remaining_time": "0:11:02"} -{"current_steps": 1912, "total_steps": 2050, "loss": 0.0004, "lr": 5.650525257347744e-08, "epoch": 9.326829268292682, "percentage": 93.27, "elapsed_time": "2:31:51", "remaining_time": "0:10:57"} -{"current_steps": 1913, "total_steps": 2050, "loss": 0.0012, "lr": 5.569817357076984e-08, "epoch": 9.331707317073171, "percentage": 93.32, "elapsed_time": "2:31:58", "remaining_time": "0:10:53"} -{"current_steps": 1914, "total_steps": 2050, "loss": 0.0039, "lr": 5.489683502905935e-08, "epoch": 9.336585365853658, "percentage": 93.37, "elapsed_time": "2:32:00", "remaining_time": "0:10:48"} -{"current_steps": 1915, "total_steps": 2050, "loss": 0.001, "lr": 5.410123883029639e-08, "epoch": 9.341463414634147, "percentage": 93.41, "elapsed_time": "2:32:06", "remaining_time": "0:10:43"} -{"current_steps": 1916, "total_steps": 2050, "loss": 0.0017, "lr": 5.3311386842944125e-08, "epoch": 9.346341463414634, "percentage": 93.46, "elapsed_time": "2:32:08", "remaining_time": "0:10:38"} -{"current_steps": 1917, "total_steps": 2050, "loss": 0.0021, "lr": 5.25272809219754e-08, "epoch": 9.351219512195122, "percentage": 93.51, "elapsed_time": "2:32:12", "remaining_time": "0:10:33"} -{"current_steps": 1918, "total_steps": 2050, "loss": 0.001, "lr": 5.17489229088694e-08, "epoch": 9.35609756097561, "percentage": 93.56, "elapsed_time": "2:32:14", "remaining_time": "0:10:28"} -{"current_steps": 1919, "total_steps": 2050, "loss": 0.002, "lr": 5.097631463160585e-08, "epoch": 9.360975609756098, "percentage": 93.61, "elapsed_time": "2:32:16", "remaining_time": "0:10:23"} -{"current_steps": 1920, "total_steps": 2050, "loss": 0.0007, "lr": 5.020945790466025e-08, "epoch": 9.365853658536585, "percentage": 93.66, "elapsed_time": "2:32:18", "remaining_time": "0:10:18"} -{"current_steps": 1921, "total_steps": 2050, "loss": 0.0005, "lr": 4.944835452900199e-08, "epoch": 9.370731707317074, "percentage": 93.71, "elapsed_time": "2:32:22", "remaining_time": "0:10:13"} -{"current_steps": 1922, "total_steps": 2050, "loss": 0.0084, "lr": 4.869300629208762e-08, "epoch": 9.37560975609756, "percentage": 93.76, "elapsed_time": "2:32:25", "remaining_time": "0:10:09"} -{"current_steps": 1923, "total_steps": 2050, "loss": 0.0024, "lr": 4.7943414967858426e-08, "epoch": 9.38048780487805, "percentage": 93.8, "elapsed_time": "2:32:28", "remaining_time": "0:10:04"} -{"current_steps": 1924, "total_steps": 2050, "loss": 0.0006, "lr": 4.7199582316734827e-08, "epoch": 9.385365853658536, "percentage": 93.85, "elapsed_time": "2:32:31", "remaining_time": "0:09:59"} -{"current_steps": 1925, "total_steps": 2050, "loss": 0.0012, "lr": 4.6461510085613616e-08, "epoch": 9.390243902439025, "percentage": 93.9, "elapsed_time": "2:32:33", "remaining_time": "0:09:54"} -{"current_steps": 1926, "total_steps": 2050, "loss": 0.0004, "lr": 4.5729200007862686e-08, "epoch": 9.395121951219512, "percentage": 93.95, "elapsed_time": "2:32:35", "remaining_time": "0:09:49"} -{"current_steps": 1927, "total_steps": 2050, "loss": 0.0016, "lr": 4.5002653803317975e-08, "epoch": 9.4, "percentage": 94.0, "elapsed_time": "2:32:41", "remaining_time": "0:09:44"} -{"current_steps": 1928, "total_steps": 2050, "loss": 0.001, "lr": 4.428187317827848e-08, "epoch": 9.404878048780487, "percentage": 94.05, "elapsed_time": "2:32:42", "remaining_time": "0:09:39"} -{"current_steps": 1929, "total_steps": 2050, "loss": 0.0016, "lr": 4.356685982550263e-08, "epoch": 9.409756097560976, "percentage": 94.1, "elapsed_time": "2:32:44", "remaining_time": "0:09:34"} -{"current_steps": 1930, "total_steps": 2050, "loss": 0.0004, "lr": 4.285761542420497e-08, "epoch": 9.414634146341463, "percentage": 94.15, "elapsed_time": "2:32:46", "remaining_time": "0:09:29"} -{"current_steps": 1931, "total_steps": 2050, "loss": 0.0007, "lr": 4.215414164005116e-08, "epoch": 9.419512195121952, "percentage": 94.2, "elapsed_time": "2:32:49", "remaining_time": "0:09:25"} -{"current_steps": 1932, "total_steps": 2050, "loss": 0.0012, "lr": 4.145644012515465e-08, "epoch": 9.424390243902439, "percentage": 94.24, "elapsed_time": "2:32:55", "remaining_time": "0:09:20"} -{"current_steps": 1933, "total_steps": 2050, "loss": 0.0024, "lr": 4.076451251807223e-08, "epoch": 9.429268292682927, "percentage": 94.29, "elapsed_time": "2:33:01", "remaining_time": "0:09:15"} -{"current_steps": 1934, "total_steps": 2050, "loss": 0.0005, "lr": 4.0078360443801535e-08, "epoch": 9.434146341463414, "percentage": 94.34, "elapsed_time": "2:33:05", "remaining_time": "0:09:10"} -{"current_steps": 1935, "total_steps": 2050, "loss": 0.0007, "lr": 3.9397985513775495e-08, "epoch": 9.439024390243903, "percentage": 94.39, "elapsed_time": "2:33:07", "remaining_time": "0:09:06"} -{"current_steps": 1936, "total_steps": 2050, "loss": 0.0006, "lr": 3.872338932585984e-08, "epoch": 9.44390243902439, "percentage": 94.44, "elapsed_time": "2:33:11", "remaining_time": "0:09:01"} -{"current_steps": 1937, "total_steps": 2050, "loss": 0.0012, "lr": 3.8054573464348655e-08, "epoch": 9.448780487804878, "percentage": 94.49, "elapsed_time": "2:33:16", "remaining_time": "0:08:56"} -{"current_steps": 1938, "total_steps": 2050, "loss": 0.0004, "lr": 3.739153949996105e-08, "epoch": 9.453658536585365, "percentage": 94.54, "elapsed_time": "2:33:18", "remaining_time": "0:08:51"} -{"current_steps": 1939, "total_steps": 2050, "loss": 0.0018, "lr": 3.6734288989836994e-08, "epoch": 9.458536585365854, "percentage": 94.59, "elapsed_time": "2:33:25", "remaining_time": "0:08:46"} -{"current_steps": 1940, "total_steps": 2050, "loss": 0.0005, "lr": 3.608282347753428e-08, "epoch": 9.463414634146341, "percentage": 94.63, "elapsed_time": "2:33:27", "remaining_time": "0:08:42"} -{"current_steps": 1941, "total_steps": 2050, "loss": 0.0014, "lr": 3.543714449302488e-08, "epoch": 9.46829268292683, "percentage": 94.68, "elapsed_time": "2:33:34", "remaining_time": "0:08:37"} -{"current_steps": 1942, "total_steps": 2050, "loss": 0.0011, "lr": 3.479725355268998e-08, "epoch": 9.473170731707317, "percentage": 94.73, "elapsed_time": "2:33:38", "remaining_time": "0:08:32"} -{"current_steps": 1943, "total_steps": 2050, "loss": 0.0019, "lr": 3.4163152159318866e-08, "epoch": 9.478048780487805, "percentage": 94.78, "elapsed_time": "2:33:41", "remaining_time": "0:08:27"} -{"current_steps": 1944, "total_steps": 2050, "loss": 0.0003, "lr": 3.353484180210337e-08, "epoch": 9.482926829268292, "percentage": 94.83, "elapsed_time": "2:33:43", "remaining_time": "0:08:22"} -{"current_steps": 1945, "total_steps": 2050, "loss": 0.0057, "lr": 3.291232395663424e-08, "epoch": 9.487804878048781, "percentage": 94.88, "elapsed_time": "2:33:47", "remaining_time": "0:08:18"} -{"current_steps": 1946, "total_steps": 2050, "loss": 0.0021, "lr": 3.229560008490007e-08, "epoch": 9.492682926829268, "percentage": 94.93, "elapsed_time": "2:33:51", "remaining_time": "0:08:13"} -{"current_steps": 1947, "total_steps": 2050, "loss": 0.0079, "lr": 3.168467163528116e-08, "epoch": 9.497560975609757, "percentage": 94.98, "elapsed_time": "2:33:53", "remaining_time": "0:08:08"} -{"current_steps": 1948, "total_steps": 2050, "loss": 0.0077, "lr": 3.1079540042547315e-08, "epoch": 9.502439024390243, "percentage": 95.02, "elapsed_time": "2:33:56", "remaining_time": "0:08:03"} -{"current_steps": 1949, "total_steps": 2050, "loss": 0.0004, "lr": 3.0480206727855066e-08, "epoch": 9.507317073170732, "percentage": 95.07, "elapsed_time": "2:33:58", "remaining_time": "0:07:58"} -{"current_steps": 1950, "total_steps": 2050, "loss": 0.0015, "lr": 2.988667309874294e-08, "epoch": 9.512195121951219, "percentage": 95.12, "elapsed_time": "2:34:02", "remaining_time": "0:07:53"} -{"current_steps": 1951, "total_steps": 2050, "loss": 0.0132, "lr": 2.9298940549128962e-08, "epoch": 9.517073170731708, "percentage": 95.17, "elapsed_time": "2:34:05", "remaining_time": "0:07:49"} -{"current_steps": 1952, "total_steps": 2050, "loss": 0.0019, "lr": 2.871701045930708e-08, "epoch": 9.521951219512195, "percentage": 95.22, "elapsed_time": "2:34:07", "remaining_time": "0:07:44"} -{"current_steps": 1953, "total_steps": 2050, "loss": 0.0024, "lr": 2.8140884195945184e-08, "epoch": 9.526829268292683, "percentage": 95.27, "elapsed_time": "2:34:09", "remaining_time": "0:07:39"} -{"current_steps": 1954, "total_steps": 2050, "loss": 0.001, "lr": 2.7570563112079564e-08, "epoch": 9.53170731707317, "percentage": 95.32, "elapsed_time": "2:34:16", "remaining_time": "0:07:34"} -{"current_steps": 1955, "total_steps": 2050, "loss": 0.0013, "lr": 2.700604854711353e-08, "epoch": 9.536585365853659, "percentage": 95.37, "elapsed_time": "2:34:22", "remaining_time": "0:07:30"} -{"current_steps": 1956, "total_steps": 2050, "loss": 0.0005, "lr": 2.6447341826814077e-08, "epoch": 9.541463414634146, "percentage": 95.41, "elapsed_time": "2:34:23", "remaining_time": "0:07:25"} -{"current_steps": 1957, "total_steps": 2050, "loss": 0.001, "lr": 2.5894444263307728e-08, "epoch": 9.546341463414635, "percentage": 95.46, "elapsed_time": "2:34:25", "remaining_time": "0:07:20"} -{"current_steps": 1958, "total_steps": 2050, "loss": 0.0103, "lr": 2.5347357155078577e-08, "epoch": 9.551219512195122, "percentage": 95.51, "elapsed_time": "2:34:27", "remaining_time": "0:07:15"} -{"current_steps": 1959, "total_steps": 2050, "loss": 0.0006, "lr": 2.4806081786964974e-08, "epoch": 9.55609756097561, "percentage": 95.56, "elapsed_time": "2:34:30", "remaining_time": "0:07:10"} -{"current_steps": 1960, "total_steps": 2050, "loss": 0.0019, "lr": 2.4270619430156183e-08, "epoch": 9.560975609756097, "percentage": 95.61, "elapsed_time": "2:34:32", "remaining_time": "0:07:05"} -{"current_steps": 1961, "total_steps": 2050, "loss": 0.0027, "lr": 2.3740971342189056e-08, "epoch": 9.565853658536586, "percentage": 95.66, "elapsed_time": "2:34:36", "remaining_time": "0:07:01"} -{"current_steps": 1962, "total_steps": 2050, "loss": 0.0133, "lr": 2.321713876694637e-08, "epoch": 9.570731707317073, "percentage": 95.71, "elapsed_time": "2:34:43", "remaining_time": "0:06:56"} -{"current_steps": 1963, "total_steps": 2050, "loss": 0.008, "lr": 2.269912293465293e-08, "epoch": 9.575609756097561, "percentage": 95.76, "elapsed_time": "2:34:45", "remaining_time": "0:06:51"} -{"current_steps": 1964, "total_steps": 2050, "loss": 0.0025, "lr": 2.2186925061872532e-08, "epoch": 9.580487804878048, "percentage": 95.8, "elapsed_time": "2:34:48", "remaining_time": "0:06:46"} -{"current_steps": 1965, "total_steps": 2050, "loss": 0.003, "lr": 2.1680546351506016e-08, "epoch": 9.585365853658537, "percentage": 95.85, "elapsed_time": "2:34:50", "remaining_time": "0:06:41"} -{"current_steps": 1966, "total_steps": 2050, "loss": 0.0036, "lr": 2.117998799278709e-08, "epoch": 9.590243902439024, "percentage": 95.9, "elapsed_time": "2:34:56", "remaining_time": "0:06:37"} -{"current_steps": 1967, "total_steps": 2050, "loss": 0.0006, "lr": 2.068525116128095e-08, "epoch": 9.595121951219513, "percentage": 95.95, "elapsed_time": "2:34:59", "remaining_time": "0:06:32"} -{"current_steps": 1968, "total_steps": 2050, "loss": 0.0659, "lr": 2.0196337018880962e-08, "epoch": 9.6, "percentage": 96.0, "elapsed_time": "2:35:02", "remaining_time": "0:06:27"} -{"current_steps": 1969, "total_steps": 2050, "loss": 0.0003, "lr": 1.9713246713805588e-08, "epoch": 9.604878048780488, "percentage": 96.05, "elapsed_time": "2:35:03", "remaining_time": "0:06:22"} -{"current_steps": 1970, "total_steps": 2050, "loss": 0.0013, "lr": 1.9235981380595625e-08, "epoch": 9.609756097560975, "percentage": 96.1, "elapsed_time": "2:35:06", "remaining_time": "0:06:17"} -{"current_steps": 1971, "total_steps": 2050, "loss": 0.0004, "lr": 1.876454214011253e-08, "epoch": 9.614634146341464, "percentage": 96.15, "elapsed_time": "2:35:08", "remaining_time": "0:06:13"} -{"current_steps": 1972, "total_steps": 2050, "loss": 0.001, "lr": 1.8298930099534817e-08, "epoch": 9.61951219512195, "percentage": 96.2, "elapsed_time": "2:35:12", "remaining_time": "0:06:08"} -{"current_steps": 1973, "total_steps": 2050, "loss": 0.0006, "lr": 1.783914635235584e-08, "epoch": 9.62439024390244, "percentage": 96.24, "elapsed_time": "2:35:16", "remaining_time": "0:06:03"} -{"current_steps": 1974, "total_steps": 2050, "loss": 0.0038, "lr": 1.738519197838101e-08, "epoch": 9.629268292682926, "percentage": 96.29, "elapsed_time": "2:35:21", "remaining_time": "0:05:58"} -{"current_steps": 1975, "total_steps": 2050, "loss": 0.0006, "lr": 1.6937068043725856e-08, "epoch": 9.634146341463415, "percentage": 96.34, "elapsed_time": "2:35:23", "remaining_time": "0:05:54"} -{"current_steps": 1976, "total_steps": 2050, "loss": 0.0026, "lr": 1.6494775600812418e-08, "epoch": 9.639024390243902, "percentage": 96.39, "elapsed_time": "2:35:26", "remaining_time": "0:05:49"} -{"current_steps": 1977, "total_steps": 2050, "loss": 0.003, "lr": 1.6058315688367852e-08, "epoch": 9.64390243902439, "percentage": 96.44, "elapsed_time": "2:35:29", "remaining_time": "0:05:44"} -{"current_steps": 1978, "total_steps": 2050, "loss": 0.0015, "lr": 1.5627689331421946e-08, "epoch": 9.648780487804878, "percentage": 96.49, "elapsed_time": "2:35:34", "remaining_time": "0:05:39"} -{"current_steps": 1979, "total_steps": 2050, "loss": 0.001, "lr": 1.520289754130322e-08, "epoch": 9.653658536585366, "percentage": 96.54, "elapsed_time": "2:35:37", "remaining_time": "0:05:35"} -{"current_steps": 1980, "total_steps": 2050, "loss": 0.0011, "lr": 1.478394131563865e-08, "epoch": 9.658536585365853, "percentage": 96.59, "elapsed_time": "2:35:40", "remaining_time": "0:05:30"} -{"current_steps": 1981, "total_steps": 2050, "loss": 0.0022, "lr": 1.4370821638350353e-08, "epoch": 9.663414634146342, "percentage": 96.63, "elapsed_time": "2:35:43", "remaining_time": "0:05:25"} -{"current_steps": 1982, "total_steps": 2050, "loss": 0.0006, "lr": 1.396353947965251e-08, "epoch": 9.668292682926829, "percentage": 96.68, "elapsed_time": "2:35:46", "remaining_time": "0:05:20"} -{"current_steps": 1983, "total_steps": 2050, "loss": 0.0003, "lr": 1.3562095796050279e-08, "epoch": 9.673170731707318, "percentage": 96.73, "elapsed_time": "2:35:47", "remaining_time": "0:05:15"} -{"current_steps": 1984, "total_steps": 2050, "loss": 0.001, "lr": 1.3166491530337555e-08, "epoch": 9.678048780487805, "percentage": 96.78, "elapsed_time": "2:35:48", "remaining_time": "0:05:10"} -{"current_steps": 1985, "total_steps": 2050, "loss": 0.0007, "lr": 1.2776727611593653e-08, "epoch": 9.682926829268293, "percentage": 96.83, "elapsed_time": "2:35:52", "remaining_time": "0:05:06"} -{"current_steps": 1986, "total_steps": 2050, "loss": 0.002, "lr": 1.2392804955181915e-08, "epoch": 9.68780487804878, "percentage": 96.88, "elapsed_time": "2:35:53", "remaining_time": "0:05:01"} -{"current_steps": 1987, "total_steps": 2050, "loss": 0.0016, "lr": 1.2014724462747763e-08, "epoch": 9.692682926829269, "percentage": 96.93, "elapsed_time": "2:35:57", "remaining_time": "0:04:56"} -{"current_steps": 1988, "total_steps": 2050, "loss": 0.0008, "lr": 1.1642487022215931e-08, "epoch": 9.697560975609756, "percentage": 96.98, "elapsed_time": "2:36:00", "remaining_time": "0:04:51"} -{"current_steps": 1989, "total_steps": 2050, "loss": 0.001, "lr": 1.1276093507788798e-08, "epoch": 9.702439024390245, "percentage": 97.02, "elapsed_time": "2:36:02", "remaining_time": "0:04:47"} -{"current_steps": 1990, "total_steps": 2050, "loss": 0.0022, "lr": 1.0915544779944164e-08, "epoch": 9.707317073170731, "percentage": 97.07, "elapsed_time": "2:36:08", "remaining_time": "0:04:42"} -{"current_steps": 1991, "total_steps": 2050, "loss": 0.0008, "lr": 1.0560841685433864e-08, "epoch": 9.71219512195122, "percentage": 97.12, "elapsed_time": "2:36:14", "remaining_time": "0:04:37"} -{"current_steps": 1992, "total_steps": 2050, "loss": 0.0011, "lr": 1.021198505728016e-08, "epoch": 9.717073170731707, "percentage": 97.17, "elapsed_time": "2:36:16", "remaining_time": "0:04:33"} -{"current_steps": 1993, "total_steps": 2050, "loss": 0.0012, "lr": 9.868975714775741e-09, "epoch": 9.721951219512196, "percentage": 97.22, "elapsed_time": "2:36:22", "remaining_time": "0:04:28"} -{"current_steps": 1994, "total_steps": 2050, "loss": 0.0008, "lr": 9.531814463480394e-09, "epoch": 9.726829268292683, "percentage": 97.27, "elapsed_time": "2:36:24", "remaining_time": "0:04:23"} -{"current_steps": 1995, "total_steps": 2050, "loss": 0.0034, "lr": 9.200502095220166e-09, "epoch": 9.731707317073171, "percentage": 97.32, "elapsed_time": "2:36:27", "remaining_time": "0:04:18"} -{"current_steps": 1996, "total_steps": 2050, "loss": 0.0008, "lr": 8.875039388084317e-09, "epoch": 9.736585365853658, "percentage": 97.37, "elapsed_time": "2:36:31", "remaining_time": "0:04:14"} -{"current_steps": 1997, "total_steps": 2050, "loss": 0.0005, "lr": 8.555427106424485e-09, "epoch": 9.741463414634147, "percentage": 97.41, "elapsed_time": "2:36:34", "remaining_time": "0:04:09"} -{"current_steps": 1998, "total_steps": 2050, "loss": 0.0314, "lr": 8.241666000852466e-09, "epoch": 9.746341463414634, "percentage": 97.46, "elapsed_time": "2:36:39", "remaining_time": "0:04:04"} -{"current_steps": 1999, "total_steps": 2050, "loss": 0.0004, "lr": 7.933756808238823e-09, "epoch": 9.751219512195123, "percentage": 97.51, "elapsed_time": "2:36:40", "remaining_time": "0:03:59"} -{"current_steps": 2000, "total_steps": 2050, "loss": 0.0022, "lr": 7.631700251710116e-09, "epoch": 9.75609756097561, "percentage": 97.56, "elapsed_time": "2:36:43", "remaining_time": "0:03:55"} -{"current_steps": 2001, "total_steps": 2050, "loss": 0.0024, "lr": 7.335497040648898e-09, "epoch": 9.760975609756098, "percentage": 97.61, "elapsed_time": "2:36:44", "remaining_time": "0:03:50"} -{"current_steps": 2002, "total_steps": 2050, "loss": 0.0796, "lr": 7.045147870690105e-09, "epoch": 9.765853658536585, "percentage": 97.66, "elapsed_time": "2:36:47", "remaining_time": "0:03:45"} -{"current_steps": 2003, "total_steps": 2050, "loss": 0.0012, "lr": 6.760653423721619e-09, "epoch": 9.770731707317074, "percentage": 97.71, "elapsed_time": "2:36:48", "remaining_time": "0:03:40"} -{"current_steps": 2004, "total_steps": 2050, "loss": 0.0059, "lr": 6.4820143678800964e-09, "epoch": 9.77560975609756, "percentage": 97.76, "elapsed_time": "2:36:51", "remaining_time": "0:03:36"} -{"current_steps": 2005, "total_steps": 2050, "loss": 0.0003, "lr": 6.209231357551526e-09, "epoch": 9.78048780487805, "percentage": 97.8, "elapsed_time": "2:36:52", "remaining_time": "0:03:31"} -{"current_steps": 2006, "total_steps": 2050, "loss": 0.0113, "lr": 5.942305033369289e-09, "epoch": 9.785365853658536, "percentage": 97.85, "elapsed_time": "2:36:55", "remaining_time": "0:03:26"} -{"current_steps": 2007, "total_steps": 2050, "loss": 0.0003, "lr": 5.681236022211378e-09, "epoch": 9.790243902439025, "percentage": 97.9, "elapsed_time": "2:36:58", "remaining_time": "0:03:21"} -{"current_steps": 2008, "total_steps": 2050, "loss": 0.0021, "lr": 5.426024937200402e-09, "epoch": 9.795121951219512, "percentage": 97.95, "elapsed_time": "2:37:05", "remaining_time": "0:03:17"} -{"current_steps": 2009, "total_steps": 2050, "loss": 0.0004, "lr": 5.176672377701364e-09, "epoch": 9.8, "percentage": 98.0, "elapsed_time": "2:37:08", "remaining_time": "0:03:12"} -{"current_steps": 2010, "total_steps": 2050, "loss": 0.0006, "lr": 4.933178929321103e-09, "epoch": 9.804878048780488, "percentage": 98.05, "elapsed_time": "2:37:13", "remaining_time": "0:03:07"} -{"current_steps": 2011, "total_steps": 2050, "loss": 0.0047, "lr": 4.695545163905524e-09, "epoch": 9.809756097560976, "percentage": 98.1, "elapsed_time": "2:37:19", "remaining_time": "0:03:03"} -{"current_steps": 2012, "total_steps": 2050, "loss": 0.0013, "lr": 4.463771639539038e-09, "epoch": 9.814634146341463, "percentage": 98.15, "elapsed_time": "2:37:21", "remaining_time": "0:02:58"} -{"current_steps": 2013, "total_steps": 2050, "loss": 0.0025, "lr": 4.237858900543734e-09, "epoch": 9.819512195121952, "percentage": 98.2, "elapsed_time": "2:37:27", "remaining_time": "0:02:53"} -{"current_steps": 2014, "total_steps": 2050, "loss": 0.0045, "lr": 4.017807477477154e-09, "epoch": 9.824390243902439, "percentage": 98.24, "elapsed_time": "2:37:29", "remaining_time": "0:02:48"} -{"current_steps": 2015, "total_steps": 2050, "loss": 0.0017, "lr": 3.803617887132016e-09, "epoch": 9.829268292682928, "percentage": 98.29, "elapsed_time": "2:37:33", "remaining_time": "0:02:44"} -{"current_steps": 2016, "total_steps": 2050, "loss": 0.0043, "lr": 3.5952906325339988e-09, "epoch": 9.834146341463414, "percentage": 98.34, "elapsed_time": "2:37:39", "remaining_time": "0:02:39"} -{"current_steps": 2017, "total_steps": 2050, "loss": 0.0025, "lr": 3.3928262029411794e-09, "epoch": 9.839024390243903, "percentage": 98.39, "elapsed_time": "2:37:42", "remaining_time": "0:02:34"} -{"current_steps": 2018, "total_steps": 2050, "loss": 0.0025, "lr": 3.196225073842929e-09, "epoch": 9.84390243902439, "percentage": 98.44, "elapsed_time": "2:37:46", "remaining_time": "0:02:30"} -{"current_steps": 2019, "total_steps": 2050, "loss": 0.0005, "lr": 3.005487706958243e-09, "epoch": 9.848780487804879, "percentage": 98.49, "elapsed_time": "2:37:50", "remaining_time": "0:02:25"} -{"current_steps": 2020, "total_steps": 2050, "loss": 0.0026, "lr": 2.8206145502354678e-09, "epoch": 9.853658536585366, "percentage": 98.54, "elapsed_time": "2:37:55", "remaining_time": "0:02:20"} -{"current_steps": 2021, "total_steps": 2050, "loss": 0.0101, "lr": 2.641606037850353e-09, "epoch": 9.858536585365854, "percentage": 98.59, "elapsed_time": "2:37:58", "remaining_time": "0:02:16"} -{"current_steps": 2022, "total_steps": 2050, "loss": 0.0022, "lr": 2.468462590205778e-09, "epoch": 9.863414634146341, "percentage": 98.63, "elapsed_time": "2:38:03", "remaining_time": "0:02:11"} -{"current_steps": 2023, "total_steps": 2050, "loss": 0.0006, "lr": 2.3011846139306404e-09, "epoch": 9.86829268292683, "percentage": 98.68, "elapsed_time": "2:38:04", "remaining_time": "0:02:06"} -{"current_steps": 2024, "total_steps": 2050, "loss": 0.0018, "lr": 2.13977250187819e-09, "epoch": 9.873170731707317, "percentage": 98.73, "elapsed_time": "2:38:06", "remaining_time": "0:02:01"} -{"current_steps": 2025, "total_steps": 2050, "loss": 0.0031, "lr": 1.9842266331260296e-09, "epoch": 9.878048780487806, "percentage": 98.78, "elapsed_time": "2:38:10", "remaining_time": "0:01:57"} -{"current_steps": 2026, "total_steps": 2050, "loss": 0.0013, "lr": 1.834547372975004e-09, "epoch": 9.882926829268293, "percentage": 98.83, "elapsed_time": "2:38:13", "remaining_time": "0:01:52"} -{"current_steps": 2027, "total_steps": 2050, "loss": 0.0024, "lr": 1.6907350729478133e-09, "epoch": 9.887804878048781, "percentage": 98.88, "elapsed_time": "2:38:16", "remaining_time": "0:01:47"} -{"current_steps": 2028, "total_steps": 2050, "loss": 0.004, "lr": 1.5527900707887344e-09, "epoch": 9.892682926829268, "percentage": 98.93, "elapsed_time": "2:38:20", "remaining_time": "0:01:43"} -{"current_steps": 2029, "total_steps": 2050, "loss": 0.0097, "lr": 1.4207126904625114e-09, "epoch": 9.897560975609757, "percentage": 98.98, "elapsed_time": "2:38:24", "remaining_time": "0:01:38"} -{"current_steps": 2030, "total_steps": 2050, "loss": 0.0005, "lr": 1.2945032421540771e-09, "epoch": 9.902439024390244, "percentage": 99.02, "elapsed_time": "2:38:26", "remaining_time": "0:01:33"} -{"current_steps": 2031, "total_steps": 2050, "loss": 0.0041, "lr": 1.1741620222671667e-09, "epoch": 9.907317073170733, "percentage": 99.07, "elapsed_time": "2:38:29", "remaining_time": "0:01:28"} -{"current_steps": 2032, "total_steps": 2050, "loss": 0.0007, "lr": 1.0596893134240394e-09, "epoch": 9.91219512195122, "percentage": 99.12, "elapsed_time": "2:38:32", "remaining_time": "0:01:24"} -{"current_steps": 2033, "total_steps": 2050, "loss": 0.007, "lr": 9.51085384464645e-10, "epoch": 9.917073170731708, "percentage": 99.17, "elapsed_time": "2:38:35", "remaining_time": "0:01:19"} -{"current_steps": 2034, "total_steps": 2050, "loss": 0.0018, "lr": 8.48350490446348e-10, "epoch": 9.921951219512195, "percentage": 99.22, "elapsed_time": "2:38:37", "remaining_time": "0:01:14"} -{"current_steps": 2035, "total_steps": 2050, "loss": 0.0028, "lr": 7.514848726422608e-10, "epoch": 9.926829268292684, "percentage": 99.27, "elapsed_time": "2:38:40", "remaining_time": "0:01:10"} -{"current_steps": 2036, "total_steps": 2050, "loss": 0.0009, "lr": 6.604887585426323e-10, "epoch": 9.93170731707317, "percentage": 99.32, "elapsed_time": "2:38:45", "remaining_time": "0:01:05"} -{"current_steps": 2037, "total_steps": 2050, "loss": 0.0049, "lr": 5.753623618520721e-10, "epoch": 9.93658536585366, "percentage": 99.37, "elapsed_time": "2:38:49", "remaining_time": "0:01:00"} -{"current_steps": 2038, "total_steps": 2050, "loss": 0.0044, "lr": 4.961058824909382e-10, "epoch": 9.941463414634146, "percentage": 99.41, "elapsed_time": "2:38:54", "remaining_time": "0:00:56"} -{"current_steps": 2039, "total_steps": 2050, "loss": 0.0005, "lr": 4.2271950659311665e-10, "epoch": 9.946341463414633, "percentage": 99.46, "elapsed_time": "2:38:56", "remaining_time": "0:00:51"} -{"current_steps": 2040, "total_steps": 2050, "loss": 0.0012, "lr": 3.5520340650768705e-10, "epoch": 9.951219512195122, "percentage": 99.51, "elapsed_time": "2:38:58", "remaining_time": "0:00:46"} -{"current_steps": 2041, "total_steps": 2050, "loss": 0.0007, "lr": 2.9355774079614653e-10, "epoch": 9.95609756097561, "percentage": 99.56, "elapsed_time": "2:38:59", "remaining_time": "0:00:42"} -{"current_steps": 2042, "total_steps": 2050, "loss": 0.0003, "lr": 2.377826542343531e-10, "epoch": 9.960975609756098, "percentage": 99.61, "elapsed_time": "2:39:01", "remaining_time": "0:00:37"} -{"current_steps": 2043, "total_steps": 2050, "loss": 0.0005, "lr": 1.8787827781002743e-10, "epoch": 9.965853658536584, "percentage": 99.66, "elapsed_time": "2:39:06", "remaining_time": "0:00:32"} -{"current_steps": 2044, "total_steps": 2050, "loss": 0.0013, "lr": 1.4384472872414067e-10, "epoch": 9.970731707317073, "percentage": 99.71, "elapsed_time": "2:39:11", "remaining_time": "0:00:28"} -{"current_steps": 2045, "total_steps": 2050, "loss": 0.0006, "lr": 1.056821103900818e-10, "epoch": 9.975609756097562, "percentage": 99.76, "elapsed_time": "2:39:15", "remaining_time": "0:00:23"} -{"current_steps": 2046, "total_steps": 2050, "loss": 0.0011, "lr": 7.339051243254735e-11, "epoch": 9.980487804878049, "percentage": 99.8, "elapsed_time": "2:39:20", "remaining_time": "0:00:18"} -{"current_steps": 2047, "total_steps": 2050, "loss": 0.0202, "lr": 4.697001068892926e-11, "epoch": 9.985365853658536, "percentage": 99.85, "elapsed_time": "2:39:24", "remaining_time": "0:00:14"} -{"current_steps": 2048, "total_steps": 2050, "loss": 0.0005, "lr": 2.642066720792702e-11, "epoch": 9.990243902439024, "percentage": 99.9, "elapsed_time": "2:39:26", "remaining_time": "0:00:09"} -{"current_steps": 2049, "total_steps": 2050, "loss": 0.0017, "lr": 1.1742530249547745e-11, "epoch": 9.995121951219513, "percentage": 99.95, "elapsed_time": "2:39:29", "remaining_time": "0:00:04"} -{"current_steps": 2050, "total_steps": 2050, "loss": 0.0002, "lr": 2.9356342859387933e-12, "epoch": 10.0, "percentage": 100.0, "elapsed_time": "2:39:30", "remaining_time": "0:00:00"} -{"current_steps": 2050, "total_steps": 2050, "epoch": 10.0, "percentage": 100.0, "elapsed_time": "2:42:39", "remaining_time": "0:00:00"} diff --git a/metallama3_8b/limo/trainer_state.json b/metallama3_8b/limo/trainer_state.json deleted file mode 100644 index 40e32942f3322bd97537ed99e948d7fb148757f3..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo/trainer_state.json +++ /dev/null @@ -1,14393 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 10.0, - "eval_steps": 500, - "global_step": 2050, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.004878048780487805, - "grad_norm": 27.79998016357422, - "learning_rate": 5e-06, - "loss": 1.4179, - "step": 1 - }, - { - "epoch": 0.00975609756097561, - "grad_norm": 4.086409091949463, - "learning_rate": 4.999997064365715e-06, - "loss": 1.1405, - "step": 2 - }, - { - "epoch": 0.014634146341463415, - "grad_norm": 4.499151229858398, - "learning_rate": 4.999988257469751e-06, - "loss": 0.8682, - "step": 3 - }, - { - "epoch": 0.01951219512195122, - "grad_norm": 4.555822849273682, - "learning_rate": 4.999973579332793e-06, - "loss": 0.9961, - "step": 4 - }, - { - "epoch": 0.024390243902439025, - "grad_norm": 5.6235246658325195, - "learning_rate": 4.999953029989312e-06, - "loss": 1.0173, - "step": 5 - }, - { - "epoch": 0.02926829268292683, - "grad_norm": 3.9943182468414307, - "learning_rate": 4.999926609487568e-06, - "loss": 1.1083, - "step": 6 - }, - { - "epoch": 0.03414634146341464, - "grad_norm": 5.685941219329834, - "learning_rate": 4.9998943178896106e-06, - "loss": 1.1109, - "step": 7 - }, - { - "epoch": 0.03902439024390244, - "grad_norm": 15.914257049560547, - "learning_rate": 4.999856155271276e-06, - "loss": 1.821, - "step": 8 - }, - { - "epoch": 0.04390243902439024, - "grad_norm": 4.147185325622559, - "learning_rate": 4.999812121722191e-06, - "loss": 1.0417, - "step": 9 - }, - { - "epoch": 0.04878048780487805, - "grad_norm": 11.123332977294922, - "learning_rate": 4.999762217345766e-06, - "loss": 1.5672, - "step": 10 - }, - { - "epoch": 0.05365853658536585, - "grad_norm": 2.842331886291504, - "learning_rate": 4.999706442259205e-06, - "loss": 0.7297, - "step": 11 - }, - { - "epoch": 0.05853658536585366, - "grad_norm": 37.685062408447266, - "learning_rate": 4.999644796593492e-06, - "loss": 0.9112, - "step": 12 - }, - { - "epoch": 0.06341463414634146, - "grad_norm": 11.214252471923828, - "learning_rate": 4.999577280493407e-06, - "loss": 0.7854, - "step": 13 - }, - { - "epoch": 0.06829268292682927, - "grad_norm": 5.10387659072876, - "learning_rate": 4.99950389411751e-06, - "loss": 1.1317, - "step": 14 - }, - { - "epoch": 0.07317073170731707, - "grad_norm": 3.685403347015381, - "learning_rate": 4.999424637638148e-06, - "loss": 0.7864, - "step": 15 - }, - { - "epoch": 0.07804878048780488, - "grad_norm": 2.9567184448242188, - "learning_rate": 4.999339511241458e-06, - "loss": 0.8494, - "step": 16 - }, - { - "epoch": 0.08292682926829269, - "grad_norm": 11.396956443786621, - "learning_rate": 4.9992485151273584e-06, - "loss": 1.2189, - "step": 17 - }, - { - "epoch": 0.08780487804878048, - "grad_norm": 7.007385730743408, - "learning_rate": 4.999151649509554e-06, - "loss": 1.0532, - "step": 18 - }, - { - "epoch": 0.09268292682926829, - "grad_norm": 3.4347329139709473, - "learning_rate": 4.9990489146155356e-06, - "loss": 1.088, - "step": 19 - }, - { - "epoch": 0.0975609756097561, - "grad_norm": 3.1865031719207764, - "learning_rate": 4.9989403106865765e-06, - "loss": 1.0414, - "step": 20 - }, - { - "epoch": 0.1024390243902439, - "grad_norm": 3.4605791568756104, - "learning_rate": 4.9988258379777334e-06, - "loss": 0.8878, - "step": 21 - }, - { - "epoch": 0.1073170731707317, - "grad_norm": 2.860478639602661, - "learning_rate": 4.998705496757846e-06, - "loss": 0.9151, - "step": 22 - }, - { - "epoch": 0.11219512195121951, - "grad_norm": 9.101946830749512, - "learning_rate": 4.998579287309538e-06, - "loss": 1.4304, - "step": 23 - }, - { - "epoch": 0.11707317073170732, - "grad_norm": 24.21122169494629, - "learning_rate": 4.998447209929211e-06, - "loss": 1.0858, - "step": 24 - }, - { - "epoch": 0.12195121951219512, - "grad_norm": 3.286980152130127, - "learning_rate": 4.998309264927053e-06, - "loss": 0.6571, - "step": 25 - }, - { - "epoch": 0.12682926829268293, - "grad_norm": 4.0232062339782715, - "learning_rate": 4.998165452627025e-06, - "loss": 0.8493, - "step": 26 - }, - { - "epoch": 0.13170731707317074, - "grad_norm": 3.7688663005828857, - "learning_rate": 4.998015773366874e-06, - "loss": 0.9224, - "step": 27 - }, - { - "epoch": 0.13658536585365855, - "grad_norm": 2.9382026195526123, - "learning_rate": 4.997860227498122e-06, - "loss": 0.7588, - "step": 28 - }, - { - "epoch": 0.14146341463414633, - "grad_norm": 4.327457904815674, - "learning_rate": 4.99769881538607e-06, - "loss": 1.1817, - "step": 29 - }, - { - "epoch": 0.14634146341463414, - "grad_norm": 3.47487735748291, - "learning_rate": 4.997531537409794e-06, - "loss": 1.0737, - "step": 30 - }, - { - "epoch": 0.15121951219512195, - "grad_norm": 3.0616214275360107, - "learning_rate": 4.99735839396215e-06, - "loss": 0.7899, - "step": 31 - }, - { - "epoch": 0.15609756097560976, - "grad_norm": 3.065070152282715, - "learning_rate": 4.9971793854497655e-06, - "loss": 0.7745, - "step": 32 - }, - { - "epoch": 0.16097560975609757, - "grad_norm": 3.5202279090881348, - "learning_rate": 4.996994512293042e-06, - "loss": 0.984, - "step": 33 - }, - { - "epoch": 0.16585365853658537, - "grad_norm": 3.421769142150879, - "learning_rate": 4.996803774926157e-06, - "loss": 0.8235, - "step": 34 - }, - { - "epoch": 0.17073170731707318, - "grad_norm": 4.6582207679748535, - "learning_rate": 4.996607173797059e-06, - "loss": 1.3227, - "step": 35 - }, - { - "epoch": 0.17560975609756097, - "grad_norm": 2.9829282760620117, - "learning_rate": 4.996404709367466e-06, - "loss": 0.8854, - "step": 36 - }, - { - "epoch": 0.18048780487804877, - "grad_norm": 2.5982632637023926, - "learning_rate": 4.996196382112868e-06, - "loss": 0.6786, - "step": 37 - }, - { - "epoch": 0.18536585365853658, - "grad_norm": 2.9807393550872803, - "learning_rate": 4.9959821925225235e-06, - "loss": 0.9344, - "step": 38 - }, - { - "epoch": 0.1902439024390244, - "grad_norm": 2.7364351749420166, - "learning_rate": 4.995762141099456e-06, - "loss": 0.814, - "step": 39 - }, - { - "epoch": 0.1951219512195122, - "grad_norm": 3.4324638843536377, - "learning_rate": 4.995536228360461e-06, - "loss": 1.0276, - "step": 40 - }, - { - "epoch": 0.2, - "grad_norm": 2.911834716796875, - "learning_rate": 4.995304454836095e-06, - "loss": 0.9291, - "step": 41 - }, - { - "epoch": 0.2048780487804878, - "grad_norm": 3.0294723510742188, - "learning_rate": 4.9950668210706795e-06, - "loss": 0.8145, - "step": 42 - }, - { - "epoch": 0.2097560975609756, - "grad_norm": 4.681829452514648, - "learning_rate": 4.994823327622299e-06, - "loss": 0.8779, - "step": 43 - }, - { - "epoch": 0.2146341463414634, - "grad_norm": 3.643914222717285, - "learning_rate": 4.9945739750628e-06, - "loss": 0.8196, - "step": 44 - }, - { - "epoch": 0.21951219512195122, - "grad_norm": 2.7542076110839844, - "learning_rate": 4.994318763977789e-06, - "loss": 0.8443, - "step": 45 - }, - { - "epoch": 0.22439024390243903, - "grad_norm": 6.873605728149414, - "learning_rate": 4.994057694966632e-06, - "loss": 1.0328, - "step": 46 - }, - { - "epoch": 0.22926829268292684, - "grad_norm": 3.11810040473938, - "learning_rate": 4.993790768642449e-06, - "loss": 1.0673, - "step": 47 - }, - { - "epoch": 0.23414634146341465, - "grad_norm": 4.360548496246338, - "learning_rate": 4.99351798563212e-06, - "loss": 1.3198, - "step": 48 - }, - { - "epoch": 0.23902439024390243, - "grad_norm": 2.6894314289093018, - "learning_rate": 4.993239346576278e-06, - "loss": 0.8743, - "step": 49 - }, - { - "epoch": 0.24390243902439024, - "grad_norm": 3.2640421390533447, - "learning_rate": 4.99295485212931e-06, - "loss": 1.109, - "step": 50 - }, - { - "epoch": 0.24878048780487805, - "grad_norm": 3.1565866470336914, - "learning_rate": 4.992664502959351e-06, - "loss": 0.9291, - "step": 51 - }, - { - "epoch": 0.25365853658536586, - "grad_norm": 3.4829447269439697, - "learning_rate": 4.99236829974829e-06, - "loss": 0.8159, - "step": 52 - }, - { - "epoch": 0.25853658536585367, - "grad_norm": 2.7535626888275146, - "learning_rate": 4.992066243191762e-06, - "loss": 1.0359, - "step": 53 - }, - { - "epoch": 0.2634146341463415, - "grad_norm": 2.482935905456543, - "learning_rate": 4.991758333999148e-06, - "loss": 0.8091, - "step": 54 - }, - { - "epoch": 0.2682926829268293, - "grad_norm": 2.917445659637451, - "learning_rate": 4.991444572893575e-06, - "loss": 0.6925, - "step": 55 - }, - { - "epoch": 0.2731707317073171, - "grad_norm": 2.9802236557006836, - "learning_rate": 4.991124960611916e-06, - "loss": 0.6329, - "step": 56 - }, - { - "epoch": 0.2780487804878049, - "grad_norm": 2.9677224159240723, - "learning_rate": 4.99079949790478e-06, - "loss": 0.8069, - "step": 57 - }, - { - "epoch": 0.28292682926829266, - "grad_norm": 2.8304293155670166, - "learning_rate": 4.99046818553652e-06, - "loss": 0.8682, - "step": 58 - }, - { - "epoch": 0.28780487804878047, - "grad_norm": 5.253443717956543, - "learning_rate": 4.9901310242852246e-06, - "loss": 1.1069, - "step": 59 - }, - { - "epoch": 0.2926829268292683, - "grad_norm": 3.686016082763672, - "learning_rate": 4.9897880149427206e-06, - "loss": 0.9465, - "step": 60 - }, - { - "epoch": 0.2975609756097561, - "grad_norm": 3.6372263431549072, - "learning_rate": 4.989439158314566e-06, - "loss": 0.9738, - "step": 61 - }, - { - "epoch": 0.3024390243902439, - "grad_norm": 3.0756819248199463, - "learning_rate": 4.989084455220056e-06, - "loss": 0.6417, - "step": 62 - }, - { - "epoch": 0.3073170731707317, - "grad_norm": 3.379222869873047, - "learning_rate": 4.988723906492212e-06, - "loss": 1.0092, - "step": 63 - }, - { - "epoch": 0.3121951219512195, - "grad_norm": 3.4571032524108887, - "learning_rate": 4.988357512977785e-06, - "loss": 0.6691, - "step": 64 - }, - { - "epoch": 0.3170731707317073, - "grad_norm": 3.1982104778289795, - "learning_rate": 4.987985275537252e-06, - "loss": 0.6651, - "step": 65 - }, - { - "epoch": 0.32195121951219513, - "grad_norm": 2.9723124504089355, - "learning_rate": 4.9876071950448185e-06, - "loss": 0.9227, - "step": 66 - }, - { - "epoch": 0.32682926829268294, - "grad_norm": 2.5521399974823, - "learning_rate": 4.987223272388407e-06, - "loss": 0.6664, - "step": 67 - }, - { - "epoch": 0.33170731707317075, - "grad_norm": 2.8934121131896973, - "learning_rate": 4.986833508469663e-06, - "loss": 0.997, - "step": 68 - }, - { - "epoch": 0.33658536585365856, - "grad_norm": 4.7546586990356445, - "learning_rate": 4.98643790420395e-06, - "loss": 0.8551, - "step": 69 - }, - { - "epoch": 0.34146341463414637, - "grad_norm": 3.091616153717041, - "learning_rate": 4.986036460520348e-06, - "loss": 0.8874, - "step": 70 - }, - { - "epoch": 0.3463414634146341, - "grad_norm": 4.1724677085876465, - "learning_rate": 4.98562917836165e-06, - "loss": 1.1393, - "step": 71 - }, - { - "epoch": 0.35121951219512193, - "grad_norm": 2.6568572521209717, - "learning_rate": 4.985216058684362e-06, - "loss": 0.6379, - "step": 72 - }, - { - "epoch": 0.35609756097560974, - "grad_norm": 2.396416187286377, - "learning_rate": 4.984797102458697e-06, - "loss": 1.0292, - "step": 73 - }, - { - "epoch": 0.36097560975609755, - "grad_norm": 3.0667319297790527, - "learning_rate": 4.984372310668579e-06, - "loss": 0.7048, - "step": 74 - }, - { - "epoch": 0.36585365853658536, - "grad_norm": 2.4820518493652344, - "learning_rate": 4.983941684311633e-06, - "loss": 1.2353, - "step": 75 - }, - { - "epoch": 0.37073170731707317, - "grad_norm": 4.062836647033691, - "learning_rate": 4.983505224399188e-06, - "loss": 0.8933, - "step": 76 - }, - { - "epoch": 0.375609756097561, - "grad_norm": 2.4480767250061035, - "learning_rate": 4.983062931956275e-06, - "loss": 0.8221, - "step": 77 - }, - { - "epoch": 0.3804878048780488, - "grad_norm": 3.134138822555542, - "learning_rate": 4.9826148080216195e-06, - "loss": 0.8899, - "step": 78 - }, - { - "epoch": 0.3853658536585366, - "grad_norm": 2.8165836334228516, - "learning_rate": 4.9821608536476445e-06, - "loss": 1.2451, - "step": 79 - }, - { - "epoch": 0.3902439024390244, - "grad_norm": 3.734433650970459, - "learning_rate": 4.981701069900465e-06, - "loss": 0.8536, - "step": 80 - }, - { - "epoch": 0.3951219512195122, - "grad_norm": 2.853421449661255, - "learning_rate": 4.9812354578598876e-06, - "loss": 0.7857, - "step": 81 - }, - { - "epoch": 0.4, - "grad_norm": 2.541687250137329, - "learning_rate": 4.980764018619405e-06, - "loss": 0.8332, - "step": 82 - }, - { - "epoch": 0.40487804878048783, - "grad_norm": 4.405911445617676, - "learning_rate": 4.980286753286196e-06, - "loss": 0.9927, - "step": 83 - }, - { - "epoch": 0.4097560975609756, - "grad_norm": 3.3034985065460205, - "learning_rate": 4.97980366298112e-06, - "loss": 0.8161, - "step": 84 - }, - { - "epoch": 0.4146341463414634, - "grad_norm": 2.6678085327148438, - "learning_rate": 4.97931474883872e-06, - "loss": 0.8017, - "step": 85 - }, - { - "epoch": 0.4195121951219512, - "grad_norm": 2.58524227142334, - "learning_rate": 4.978820012007213e-06, - "loss": 0.8811, - "step": 86 - }, - { - "epoch": 0.424390243902439, - "grad_norm": 2.482597827911377, - "learning_rate": 4.978319453648495e-06, - "loss": 0.9461, - "step": 87 - }, - { - "epoch": 0.4292682926829268, - "grad_norm": 2.5731301307678223, - "learning_rate": 4.977813074938128e-06, - "loss": 0.8835, - "step": 88 - }, - { - "epoch": 0.43414634146341463, - "grad_norm": 2.7914488315582275, - "learning_rate": 4.977300877065347e-06, - "loss": 0.8466, - "step": 89 - }, - { - "epoch": 0.43902439024390244, - "grad_norm": 2.416043758392334, - "learning_rate": 4.976782861233053e-06, - "loss": 0.7132, - "step": 90 - }, - { - "epoch": 0.44390243902439025, - "grad_norm": 3.7616264820098877, - "learning_rate": 4.976259028657812e-06, - "loss": 0.7639, - "step": 91 - }, - { - "epoch": 0.44878048780487806, - "grad_norm": 2.6081621646881104, - "learning_rate": 4.975729380569845e-06, - "loss": 0.8055, - "step": 92 - }, - { - "epoch": 0.45365853658536587, - "grad_norm": 3.3343570232391357, - "learning_rate": 4.975193918213035e-06, - "loss": 0.6042, - "step": 93 - }, - { - "epoch": 0.4585365853658537, - "grad_norm": 2.517544746398926, - "learning_rate": 4.974652642844921e-06, - "loss": 0.7672, - "step": 94 - }, - { - "epoch": 0.4634146341463415, - "grad_norm": 4.173468589782715, - "learning_rate": 4.974105555736693e-06, - "loss": 1.0682, - "step": 95 - }, - { - "epoch": 0.4682926829268293, - "grad_norm": 2.8422317504882812, - "learning_rate": 4.973552658173186e-06, - "loss": 0.7841, - "step": 96 - }, - { - "epoch": 0.47317073170731705, - "grad_norm": 5.042182445526123, - "learning_rate": 4.972993951452887e-06, - "loss": 0.8851, - "step": 97 - }, - { - "epoch": 0.47804878048780486, - "grad_norm": 5.977590560913086, - "learning_rate": 4.9724294368879214e-06, - "loss": 0.9059, - "step": 98 - }, - { - "epoch": 0.48292682926829267, - "grad_norm": 4.227641582489014, - "learning_rate": 4.971859115804055e-06, - "loss": 1.0152, - "step": 99 - }, - { - "epoch": 0.4878048780487805, - "grad_norm": 3.180952548980713, - "learning_rate": 4.9712829895406935e-06, - "loss": 0.8092, - "step": 100 - }, - { - "epoch": 0.4926829268292683, - "grad_norm": 11.220394134521484, - "learning_rate": 4.970701059450872e-06, - "loss": 0.8239, - "step": 101 - }, - { - "epoch": 0.4975609756097561, - "grad_norm": 2.346975088119507, - "learning_rate": 4.970113326901258e-06, - "loss": 0.9283, - "step": 102 - }, - { - "epoch": 0.5024390243902439, - "grad_norm": 2.9470982551574707, - "learning_rate": 4.9695197932721455e-06, - "loss": 0.9429, - "step": 103 - }, - { - "epoch": 0.5073170731707317, - "grad_norm": 3.6048219203948975, - "learning_rate": 4.968920459957453e-06, - "loss": 0.9231, - "step": 104 - }, - { - "epoch": 0.5121951219512195, - "grad_norm": 2.8181886672973633, - "learning_rate": 4.968315328364719e-06, - "loss": 1.0005, - "step": 105 - }, - { - "epoch": 0.5170731707317073, - "grad_norm": 3.114147424697876, - "learning_rate": 4.9677043999151e-06, - "loss": 1.1326, - "step": 106 - }, - { - "epoch": 0.5219512195121951, - "grad_norm": 2.965885639190674, - "learning_rate": 4.967087676043366e-06, - "loss": 0.541, - "step": 107 - }, - { - "epoch": 0.526829268292683, - "grad_norm": 3.098677635192871, - "learning_rate": 4.966465158197897e-06, - "loss": 0.9473, - "step": 108 - }, - { - "epoch": 0.5317073170731708, - "grad_norm": 2.8640191555023193, - "learning_rate": 4.965836847840681e-06, - "loss": 0.6678, - "step": 109 - }, - { - "epoch": 0.5365853658536586, - "grad_norm": 3.0950934886932373, - "learning_rate": 4.96520274644731e-06, - "loss": 0.9251, - "step": 110 - }, - { - "epoch": 0.5414634146341464, - "grad_norm": 2.99444317817688, - "learning_rate": 4.964562855506976e-06, - "loss": 0.7807, - "step": 111 - }, - { - "epoch": 0.5463414634146342, - "grad_norm": 2.348639726638794, - "learning_rate": 4.963917176522466e-06, - "loss": 0.6395, - "step": 112 - }, - { - "epoch": 0.551219512195122, - "grad_norm": 3.5988354682922363, - "learning_rate": 4.963265711010164e-06, - "loss": 1.0658, - "step": 113 - }, - { - "epoch": 0.5560975609756098, - "grad_norm": 3.3423564434051514, - "learning_rate": 4.9626084605000395e-06, - "loss": 0.8974, - "step": 114 - }, - { - "epoch": 0.5609756097560976, - "grad_norm": 2.8353331089019775, - "learning_rate": 4.961945426535652e-06, - "loss": 0.6144, - "step": 115 - }, - { - "epoch": 0.5658536585365853, - "grad_norm": 2.752387046813965, - "learning_rate": 4.961276610674141e-06, - "loss": 0.9083, - "step": 116 - }, - { - "epoch": 0.5707317073170731, - "grad_norm": 2.2654404640197754, - "learning_rate": 4.960602014486225e-06, - "loss": 1.0101, - "step": 117 - }, - { - "epoch": 0.5756097560975609, - "grad_norm": 3.344377040863037, - "learning_rate": 4.959921639556199e-06, - "loss": 0.8391, - "step": 118 - }, - { - "epoch": 0.5804878048780487, - "grad_norm": 3.1620500087738037, - "learning_rate": 4.959235487481928e-06, - "loss": 1.0431, - "step": 119 - }, - { - "epoch": 0.5853658536585366, - "grad_norm": 2.857048273086548, - "learning_rate": 4.958543559874846e-06, - "loss": 0.5864, - "step": 120 - }, - { - "epoch": 0.5902439024390244, - "grad_norm": 3.1736063957214355, - "learning_rate": 4.9578458583599495e-06, - "loss": 0.7868, - "step": 121 - }, - { - "epoch": 0.5951219512195122, - "grad_norm": 3.5520827770233154, - "learning_rate": 4.957142384575795e-06, - "loss": 0.7901, - "step": 122 - }, - { - "epoch": 0.6, - "grad_norm": 3.265103578567505, - "learning_rate": 4.956433140174498e-06, - "loss": 0.9067, - "step": 123 - }, - { - "epoch": 0.6048780487804878, - "grad_norm": 3.1181187629699707, - "learning_rate": 4.9557181268217225e-06, - "loss": 0.8971, - "step": 124 - }, - { - "epoch": 0.6097560975609756, - "grad_norm": 2.4123694896698, - "learning_rate": 4.954997346196683e-06, - "loss": 1.2123, - "step": 125 - }, - { - "epoch": 0.6146341463414634, - "grad_norm": 2.9646875858306885, - "learning_rate": 4.954270799992138e-06, - "loss": 0.7696, - "step": 126 - }, - { - "epoch": 0.6195121951219512, - "grad_norm": 2.7457995414733887, - "learning_rate": 4.953538489914387e-06, - "loss": 0.7919, - "step": 127 - }, - { - "epoch": 0.624390243902439, - "grad_norm": 5.096850395202637, - "learning_rate": 4.9528004176832654e-06, - "loss": 0.6494, - "step": 128 - }, - { - "epoch": 0.6292682926829268, - "grad_norm": 3.124955177307129, - "learning_rate": 4.952056585032142e-06, - "loss": 1.0546, - "step": 129 - }, - { - "epoch": 0.6341463414634146, - "grad_norm": 2.4860167503356934, - "learning_rate": 4.951306993707913e-06, - "loss": 0.7907, - "step": 130 - }, - { - "epoch": 0.6390243902439025, - "grad_norm": 2.3380239009857178, - "learning_rate": 4.950551645470998e-06, - "loss": 0.7433, - "step": 131 - }, - { - "epoch": 0.6439024390243903, - "grad_norm": 2.8945236206054688, - "learning_rate": 4.9497905420953406e-06, - "loss": 0.7682, - "step": 132 - }, - { - "epoch": 0.6487804878048781, - "grad_norm": 3.429776430130005, - "learning_rate": 4.949023685368395e-06, - "loss": 0.8411, - "step": 133 - }, - { - "epoch": 0.6536585365853659, - "grad_norm": 2.8853516578674316, - "learning_rate": 4.948251077091131e-06, - "loss": 1.0792, - "step": 134 - }, - { - "epoch": 0.6585365853658537, - "grad_norm": 2.145598888397217, - "learning_rate": 4.947472719078025e-06, - "loss": 0.8033, - "step": 135 - }, - { - "epoch": 0.6634146341463415, - "grad_norm": 2.5064377784729004, - "learning_rate": 4.9466886131570565e-06, - "loss": 0.939, - "step": 136 - }, - { - "epoch": 0.6682926829268293, - "grad_norm": 2.5700225830078125, - "learning_rate": 4.945898761169704e-06, - "loss": 1.0418, - "step": 137 - }, - { - "epoch": 0.6731707317073171, - "grad_norm": 2.3390917778015137, - "learning_rate": 4.945103164970941e-06, - "loss": 0.6158, - "step": 138 - }, - { - "epoch": 0.6780487804878049, - "grad_norm": 2.1538751125335693, - "learning_rate": 4.9443018264292304e-06, - "loss": 0.6995, - "step": 139 - }, - { - "epoch": 0.6829268292682927, - "grad_norm": 5.255710601806641, - "learning_rate": 4.9434947474265225e-06, - "loss": 1.0382, - "step": 140 - }, - { - "epoch": 0.6878048780487804, - "grad_norm": 2.5547356605529785, - "learning_rate": 4.942681929858249e-06, - "loss": 1.037, - "step": 141 - }, - { - "epoch": 0.6926829268292682, - "grad_norm": 2.613280773162842, - "learning_rate": 4.941863375633315e-06, - "loss": 0.9071, - "step": 142 - }, - { - "epoch": 0.697560975609756, - "grad_norm": 2.9957327842712402, - "learning_rate": 4.9410390866741056e-06, - "loss": 0.7908, - "step": 143 - }, - { - "epoch": 0.7024390243902439, - "grad_norm": 2.410107374191284, - "learning_rate": 4.9402090649164655e-06, - "loss": 0.7739, - "step": 144 - }, - { - "epoch": 0.7073170731707317, - "grad_norm": 2.352013349533081, - "learning_rate": 4.9393733123097085e-06, - "loss": 0.939, - "step": 145 - }, - { - "epoch": 0.7121951219512195, - "grad_norm": 2.5164194107055664, - "learning_rate": 4.9385318308166065e-06, - "loss": 0.8729, - "step": 146 - }, - { - "epoch": 0.7170731707317073, - "grad_norm": 4.213881015777588, - "learning_rate": 4.937684622413385e-06, - "loss": 0.6124, - "step": 147 - }, - { - "epoch": 0.7219512195121951, - "grad_norm": 2.7950191497802734, - "learning_rate": 4.9368316890897185e-06, - "loss": 0.975, - "step": 148 - }, - { - "epoch": 0.7268292682926829, - "grad_norm": 2.8618874549865723, - "learning_rate": 4.9359730328487264e-06, - "loss": 0.5832, - "step": 149 - }, - { - "epoch": 0.7317073170731707, - "grad_norm": 2.6943812370300293, - "learning_rate": 4.935108655706972e-06, - "loss": 0.8124, - "step": 150 - }, - { - "epoch": 0.7365853658536585, - "grad_norm": 3.2164082527160645, - "learning_rate": 4.934238559694448e-06, - "loss": 1.1446, - "step": 151 - }, - { - "epoch": 0.7414634146341463, - "grad_norm": 3.05002498626709, - "learning_rate": 4.9333627468545845e-06, - "loss": 0.7884, - "step": 152 - }, - { - "epoch": 0.7463414634146341, - "grad_norm": 2.863351583480835, - "learning_rate": 4.932481219244231e-06, - "loss": 0.7918, - "step": 153 - }, - { - "epoch": 0.751219512195122, - "grad_norm": 2.4947102069854736, - "learning_rate": 4.931593978933666e-06, - "loss": 0.775, - "step": 154 - }, - { - "epoch": 0.7560975609756098, - "grad_norm": 2.918886184692383, - "learning_rate": 4.930701028006577e-06, - "loss": 0.993, - "step": 155 - }, - { - "epoch": 0.7609756097560976, - "grad_norm": 2.835956573486328, - "learning_rate": 4.929802368560066e-06, - "loss": 0.7911, - "step": 156 - }, - { - "epoch": 0.7658536585365854, - "grad_norm": 3.3073575496673584, - "learning_rate": 4.928898002704642e-06, - "loss": 0.9346, - "step": 157 - }, - { - "epoch": 0.7707317073170732, - "grad_norm": 3.086146354675293, - "learning_rate": 4.927987932564215e-06, - "loss": 0.817, - "step": 158 - }, - { - "epoch": 0.775609756097561, - "grad_norm": 2.5419743061065674, - "learning_rate": 4.927072160276092e-06, - "loss": 0.7918, - "step": 159 - }, - { - "epoch": 0.7804878048780488, - "grad_norm": 3.984297275543213, - "learning_rate": 4.926150687990969e-06, - "loss": 0.7153, - "step": 160 - }, - { - "epoch": 0.7853658536585366, - "grad_norm": 2.4703335762023926, - "learning_rate": 4.925223517872934e-06, - "loss": 0.8982, - "step": 161 - }, - { - "epoch": 0.7902439024390244, - "grad_norm": 2.81785249710083, - "learning_rate": 4.9242906520994484e-06, - "loss": 0.9839, - "step": 162 - }, - { - "epoch": 0.7951219512195122, - "grad_norm": 2.3304924964904785, - "learning_rate": 4.923352092861358e-06, - "loss": 0.8406, - "step": 163 - }, - { - "epoch": 0.8, - "grad_norm": 2.339498519897461, - "learning_rate": 4.922407842362875e-06, - "loss": 0.6602, - "step": 164 - }, - { - "epoch": 0.8048780487804879, - "grad_norm": 3.488255262374878, - "learning_rate": 4.921457902821578e-06, - "loss": 0.9779, - "step": 165 - }, - { - "epoch": 0.8097560975609757, - "grad_norm": 2.8528945446014404, - "learning_rate": 4.920502276468408e-06, - "loss": 0.8821, - "step": 166 - }, - { - "epoch": 0.8146341463414634, - "grad_norm": 3.4649784564971924, - "learning_rate": 4.9195409655476605e-06, - "loss": 0.7539, - "step": 167 - }, - { - "epoch": 0.8195121951219512, - "grad_norm": 2.3109042644500732, - "learning_rate": 4.918573972316982e-06, - "loss": 0.9807, - "step": 168 - }, - { - "epoch": 0.824390243902439, - "grad_norm": 2.678666353225708, - "learning_rate": 4.917601299047361e-06, - "loss": 0.8318, - "step": 169 - }, - { - "epoch": 0.8292682926829268, - "grad_norm": 2.730614185333252, - "learning_rate": 4.916622948023129e-06, - "loss": 0.7816, - "step": 170 - }, - { - "epoch": 0.8341463414634146, - "grad_norm": 2.9835665225982666, - "learning_rate": 4.915638921541952e-06, - "loss": 0.6633, - "step": 171 - }, - { - "epoch": 0.8390243902439024, - "grad_norm": 3.31217360496521, - "learning_rate": 4.914649221914822e-06, - "loss": 0.9296, - "step": 172 - }, - { - "epoch": 0.8439024390243902, - "grad_norm": 2.9021658897399902, - "learning_rate": 4.913653851466057e-06, - "loss": 0.6864, - "step": 173 - }, - { - "epoch": 0.848780487804878, - "grad_norm": 3.3672914505004883, - "learning_rate": 4.912652812533291e-06, - "loss": 0.8599, - "step": 174 - }, - { - "epoch": 0.8536585365853658, - "grad_norm": 2.4871644973754883, - "learning_rate": 4.911646107467472e-06, - "loss": 0.8949, - "step": 175 - }, - { - "epoch": 0.8585365853658536, - "grad_norm": 2.728022813796997, - "learning_rate": 4.9106337386328524e-06, - "loss": 0.9758, - "step": 176 - }, - { - "epoch": 0.8634146341463415, - "grad_norm": 2.704252243041992, - "learning_rate": 4.909615708406991e-06, - "loss": 0.8954, - "step": 177 - }, - { - "epoch": 0.8682926829268293, - "grad_norm": 2.4002223014831543, - "learning_rate": 4.908592019180738e-06, - "loss": 0.7157, - "step": 178 - }, - { - "epoch": 0.8731707317073171, - "grad_norm": 2.1927788257598877, - "learning_rate": 4.907562673358234e-06, - "loss": 0.6358, - "step": 179 - }, - { - "epoch": 0.8780487804878049, - "grad_norm": 2.458500623703003, - "learning_rate": 4.906527673356907e-06, - "loss": 0.6685, - "step": 180 - }, - { - "epoch": 0.8829268292682927, - "grad_norm": 2.5924787521362305, - "learning_rate": 4.905487021607462e-06, - "loss": 0.5686, - "step": 181 - }, - { - "epoch": 0.8878048780487805, - "grad_norm": 3.0923380851745605, - "learning_rate": 4.904440720553876e-06, - "loss": 0.8538, - "step": 182 - }, - { - "epoch": 0.8926829268292683, - "grad_norm": 2.8001527786254883, - "learning_rate": 4.903388772653396e-06, - "loss": 0.8292, - "step": 183 - }, - { - "epoch": 0.8975609756097561, - "grad_norm": 2.4344072341918945, - "learning_rate": 4.902331180376529e-06, - "loss": 0.7946, - "step": 184 - }, - { - "epoch": 0.9024390243902439, - "grad_norm": 2.6313226222991943, - "learning_rate": 4.901267946207038e-06, - "loss": 0.9269, - "step": 185 - }, - { - "epoch": 0.9073170731707317, - "grad_norm": 2.4776692390441895, - "learning_rate": 4.900199072641937e-06, - "loss": 0.7433, - "step": 186 - }, - { - "epoch": 0.9121951219512195, - "grad_norm": 2.339869260787964, - "learning_rate": 4.899124562191484e-06, - "loss": 0.6577, - "step": 187 - }, - { - "epoch": 0.9170731707317074, - "grad_norm": 3.076890468597412, - "learning_rate": 4.8980444173791735e-06, - "loss": 0.5989, - "step": 188 - }, - { - "epoch": 0.9219512195121952, - "grad_norm": 2.83957839012146, - "learning_rate": 4.896958640741735e-06, - "loss": 0.9364, - "step": 189 - }, - { - "epoch": 0.926829268292683, - "grad_norm": 2.770867347717285, - "learning_rate": 4.895867234829121e-06, - "loss": 1.0328, - "step": 190 - }, - { - "epoch": 0.9317073170731708, - "grad_norm": 2.7819619178771973, - "learning_rate": 4.894770202204509e-06, - "loss": 0.772, - "step": 191 - }, - { - "epoch": 0.9365853658536586, - "grad_norm": 3.925703763961792, - "learning_rate": 4.893667545444285e-06, - "loss": 0.8128, - "step": 192 - }, - { - "epoch": 0.9414634146341463, - "grad_norm": 3.034944534301758, - "learning_rate": 4.8925592671380495e-06, - "loss": 0.7418, - "step": 193 - }, - { - "epoch": 0.9463414634146341, - "grad_norm": 2.3350143432617188, - "learning_rate": 4.891445369888601e-06, - "loss": 0.5979, - "step": 194 - }, - { - "epoch": 0.9512195121951219, - "grad_norm": 2.6433160305023193, - "learning_rate": 4.890325856311936e-06, - "loss": 0.9664, - "step": 195 - }, - { - "epoch": 0.9560975609756097, - "grad_norm": 2.715142011642456, - "learning_rate": 4.889200729037241e-06, - "loss": 0.8482, - "step": 196 - }, - { - "epoch": 0.9609756097560975, - "grad_norm": 2.6157352924346924, - "learning_rate": 4.888069990706884e-06, - "loss": 0.7173, - "step": 197 - }, - { - "epoch": 0.9658536585365853, - "grad_norm": 3.7308952808380127, - "learning_rate": 4.886933643976414e-06, - "loss": 0.5433, - "step": 198 - }, - { - "epoch": 0.9707317073170731, - "grad_norm": 3.1134045124053955, - "learning_rate": 4.885791691514548e-06, - "loss": 0.5997, - "step": 199 - }, - { - "epoch": 0.975609756097561, - "grad_norm": 2.421365976333618, - "learning_rate": 4.884644136003172e-06, - "loss": 0.6477, - "step": 200 - }, - { - "epoch": 0.9804878048780488, - "grad_norm": 2.8676180839538574, - "learning_rate": 4.883490980137327e-06, - "loss": 1.3465, - "step": 201 - }, - { - "epoch": 0.9853658536585366, - "grad_norm": 2.236189603805542, - "learning_rate": 4.882332226625208e-06, - "loss": 0.7533, - "step": 202 - }, - { - "epoch": 0.9902439024390244, - "grad_norm": 2.2514970302581787, - "learning_rate": 4.881167878188158e-06, - "loss": 0.8555, - "step": 203 - }, - { - "epoch": 0.9951219512195122, - "grad_norm": 2.6856095790863037, - "learning_rate": 4.8799979375606565e-06, - "loss": 0.7634, - "step": 204 - }, - { - "epoch": 1.0, - "grad_norm": 2.5563852787017822, - "learning_rate": 4.878822407490319e-06, - "loss": 0.66, - "step": 205 - }, - { - "epoch": 1.0048780487804878, - "grad_norm": 4.7092814445495605, - "learning_rate": 4.8776412907378845e-06, - "loss": 0.7429, - "step": 206 - }, - { - "epoch": 1.0097560975609756, - "grad_norm": 2.9133448600769043, - "learning_rate": 4.876454590077216e-06, - "loss": 0.5735, - "step": 207 - }, - { - "epoch": 1.0146341463414634, - "grad_norm": 2.7012641429901123, - "learning_rate": 4.875262308295289e-06, - "loss": 0.8065, - "step": 208 - }, - { - "epoch": 1.0195121951219512, - "grad_norm": 3.703998327255249, - "learning_rate": 4.874064448192185e-06, - "loss": 0.7148, - "step": 209 - }, - { - "epoch": 1.024390243902439, - "grad_norm": 3.044930934906006, - "learning_rate": 4.872861012581088e-06, - "loss": 0.5606, - "step": 210 - }, - { - "epoch": 1.0292682926829269, - "grad_norm": 3.661381244659424, - "learning_rate": 4.871652004288275e-06, - "loss": 0.6492, - "step": 211 - }, - { - "epoch": 1.0341463414634147, - "grad_norm": 3.18344783782959, - "learning_rate": 4.870437426153113e-06, - "loss": 0.633, - "step": 212 - }, - { - "epoch": 1.0390243902439025, - "grad_norm": 4.596707820892334, - "learning_rate": 4.869217281028045e-06, - "loss": 0.842, - "step": 213 - }, - { - "epoch": 1.0439024390243903, - "grad_norm": 4.116331577301025, - "learning_rate": 4.867991571778592e-06, - "loss": 0.8371, - "step": 214 - }, - { - "epoch": 1.048780487804878, - "grad_norm": 3.152939558029175, - "learning_rate": 4.866760301283342e-06, - "loss": 0.4728, - "step": 215 - }, - { - "epoch": 1.053658536585366, - "grad_norm": 2.8732805252075195, - "learning_rate": 4.865523472433942e-06, - "loss": 0.651, - "step": 216 - }, - { - "epoch": 1.0585365853658537, - "grad_norm": 2.967480421066284, - "learning_rate": 4.8642810881350935e-06, - "loss": 0.6361, - "step": 217 - }, - { - "epoch": 1.0634146341463415, - "grad_norm": 2.816798210144043, - "learning_rate": 4.863033151304546e-06, - "loss": 0.6206, - "step": 218 - }, - { - "epoch": 1.0682926829268293, - "grad_norm": 3.168349027633667, - "learning_rate": 4.861779664873088e-06, - "loss": 0.7782, - "step": 219 - }, - { - "epoch": 1.0731707317073171, - "grad_norm": 3.7496471405029297, - "learning_rate": 4.8605206317845425e-06, - "loss": 0.8504, - "step": 220 - }, - { - "epoch": 1.078048780487805, - "grad_norm": 2.7087056636810303, - "learning_rate": 4.859256054995758e-06, - "loss": 0.7771, - "step": 221 - }, - { - "epoch": 1.0829268292682928, - "grad_norm": 2.803703546524048, - "learning_rate": 4.8579859374766e-06, - "loss": 0.4308, - "step": 222 - }, - { - "epoch": 1.0878048780487806, - "grad_norm": 2.4199142456054688, - "learning_rate": 4.856710282209952e-06, - "loss": 0.3739, - "step": 223 - }, - { - "epoch": 1.0926829268292684, - "grad_norm": 2.384037494659424, - "learning_rate": 4.855429092191698e-06, - "loss": 0.6548, - "step": 224 - }, - { - "epoch": 1.0975609756097562, - "grad_norm": 3.0230021476745605, - "learning_rate": 4.854142370430725e-06, - "loss": 0.6932, - "step": 225 - }, - { - "epoch": 1.102439024390244, - "grad_norm": 3.0248661041259766, - "learning_rate": 4.8528501199489045e-06, - "loss": 0.6491, - "step": 226 - }, - { - "epoch": 1.1073170731707318, - "grad_norm": 4.046666145324707, - "learning_rate": 4.851552343781099e-06, - "loss": 0.7946, - "step": 227 - }, - { - "epoch": 1.1121951219512196, - "grad_norm": 2.8751168251037598, - "learning_rate": 4.850249044975145e-06, - "loss": 0.7629, - "step": 228 - }, - { - "epoch": 1.1170731707317074, - "grad_norm": 2.8649816513061523, - "learning_rate": 4.848940226591849e-06, - "loss": 0.9114, - "step": 229 - }, - { - "epoch": 1.1219512195121952, - "grad_norm": 3.2590744495391846, - "learning_rate": 4.847625891704982e-06, - "loss": 0.535, - "step": 230 - }, - { - "epoch": 1.126829268292683, - "grad_norm": 3.230659008026123, - "learning_rate": 4.846306043401268e-06, - "loss": 0.7134, - "step": 231 - }, - { - "epoch": 1.1317073170731708, - "grad_norm": 3.5220088958740234, - "learning_rate": 4.844980684780381e-06, - "loss": 0.5375, - "step": 232 - }, - { - "epoch": 1.1365853658536587, - "grad_norm": 3.074052095413208, - "learning_rate": 4.8436498189549345e-06, - "loss": 0.5486, - "step": 233 - }, - { - "epoch": 1.1414634146341462, - "grad_norm": 2.511216163635254, - "learning_rate": 4.842313449050477e-06, - "loss": 0.5203, - "step": 234 - }, - { - "epoch": 1.146341463414634, - "grad_norm": 2.6082136631011963, - "learning_rate": 4.840971578205486e-06, - "loss": 0.4978, - "step": 235 - }, - { - "epoch": 1.1512195121951219, - "grad_norm": 2.4481778144836426, - "learning_rate": 4.839624209571352e-06, - "loss": 0.348, - "step": 236 - }, - { - "epoch": 1.1560975609756097, - "grad_norm": 2.7532148361206055, - "learning_rate": 4.838271346312381e-06, - "loss": 0.8068, - "step": 237 - }, - { - "epoch": 1.1609756097560975, - "grad_norm": 2.6562349796295166, - "learning_rate": 4.836912991605782e-06, - "loss": 0.8823, - "step": 238 - }, - { - "epoch": 1.1658536585365853, - "grad_norm": 3.032168388366699, - "learning_rate": 4.835549148641663e-06, - "loss": 0.501, - "step": 239 - }, - { - "epoch": 1.170731707317073, - "grad_norm": 3.4816956520080566, - "learning_rate": 4.834179820623018e-06, - "loss": 0.6406, - "step": 240 - }, - { - "epoch": 1.175609756097561, - "grad_norm": 2.480642318725586, - "learning_rate": 4.832805010765724e-06, - "loss": 0.537, - "step": 241 - }, - { - "epoch": 1.1804878048780487, - "grad_norm": 2.7662222385406494, - "learning_rate": 4.831424722298531e-06, - "loss": 0.6464, - "step": 242 - }, - { - "epoch": 1.1853658536585365, - "grad_norm": 3.2929866313934326, - "learning_rate": 4.830038958463061e-06, - "loss": 0.6888, - "step": 243 - }, - { - "epoch": 1.1902439024390243, - "grad_norm": 5.094089031219482, - "learning_rate": 4.828647722513785e-06, - "loss": 0.8342, - "step": 244 - }, - { - "epoch": 1.1951219512195121, - "grad_norm": 3.6679818630218506, - "learning_rate": 4.827251017718034e-06, - "loss": 0.7849, - "step": 245 - }, - { - "epoch": 1.2, - "grad_norm": 3.97290301322937, - "learning_rate": 4.8258488473559794e-06, - "loss": 0.7995, - "step": 246 - }, - { - "epoch": 1.2048780487804878, - "grad_norm": 3.3555023670196533, - "learning_rate": 4.824441214720629e-06, - "loss": 0.8718, - "step": 247 - }, - { - "epoch": 1.2097560975609756, - "grad_norm": 2.309361219406128, - "learning_rate": 4.823028123117818e-06, - "loss": 0.3731, - "step": 248 - }, - { - "epoch": 1.2146341463414634, - "grad_norm": 2.607269763946533, - "learning_rate": 4.8216095758662015e-06, - "loss": 0.7321, - "step": 249 - }, - { - "epoch": 1.2195121951219512, - "grad_norm": 2.5667428970336914, - "learning_rate": 4.82018557629725e-06, - "loss": 0.7561, - "step": 250 - }, - { - "epoch": 1.224390243902439, - "grad_norm": 2.7664871215820312, - "learning_rate": 4.8187561277552376e-06, - "loss": 0.638, - "step": 251 - }, - { - "epoch": 1.2292682926829268, - "grad_norm": 2.2880401611328125, - "learning_rate": 4.817321233597232e-06, - "loss": 0.6996, - "step": 252 - }, - { - "epoch": 1.2341463414634146, - "grad_norm": 2.7615559101104736, - "learning_rate": 4.815880897193095e-06, - "loss": 0.5432, - "step": 253 - }, - { - "epoch": 1.2390243902439024, - "grad_norm": 2.9052155017852783, - "learning_rate": 4.814435121925466e-06, - "loss": 0.781, - "step": 254 - }, - { - "epoch": 1.2439024390243902, - "grad_norm": 3.2035205364227295, - "learning_rate": 4.812983911189761e-06, - "loss": 0.6884, - "step": 255 - }, - { - "epoch": 1.248780487804878, - "grad_norm": 2.8139917850494385, - "learning_rate": 4.811527268394157e-06, - "loss": 0.4984, - "step": 256 - }, - { - "epoch": 1.2536585365853659, - "grad_norm": 2.849602699279785, - "learning_rate": 4.810065196959591e-06, - "loss": 0.553, - "step": 257 - }, - { - "epoch": 1.2585365853658537, - "grad_norm": 2.8745057582855225, - "learning_rate": 4.8085977003197496e-06, - "loss": 0.7955, - "step": 258 - }, - { - "epoch": 1.2634146341463415, - "grad_norm": 3.4053122997283936, - "learning_rate": 4.807124781921059e-06, - "loss": 0.9715, - "step": 259 - }, - { - "epoch": 1.2682926829268293, - "grad_norm": 3.1741702556610107, - "learning_rate": 4.805646445222679e-06, - "loss": 0.6306, - "step": 260 - }, - { - "epoch": 1.273170731707317, - "grad_norm": 2.5348331928253174, - "learning_rate": 4.804162693696494e-06, - "loss": 0.5192, - "step": 261 - }, - { - "epoch": 1.278048780487805, - "grad_norm": 3.2491304874420166, - "learning_rate": 4.802673530827105e-06, - "loss": 0.5369, - "step": 262 - }, - { - "epoch": 1.2829268292682927, - "grad_norm": 2.670273780822754, - "learning_rate": 4.801178960111823e-06, - "loss": 0.5864, - "step": 263 - }, - { - "epoch": 1.2878048780487805, - "grad_norm": 2.5655579566955566, - "learning_rate": 4.799678985060658e-06, - "loss": 0.7864, - "step": 264 - }, - { - "epoch": 1.2926829268292683, - "grad_norm": 2.6352531909942627, - "learning_rate": 4.798173609196314e-06, - "loss": 0.8198, - "step": 265 - }, - { - "epoch": 1.2975609756097561, - "grad_norm": 3.028343677520752, - "learning_rate": 4.796662836054176e-06, - "loss": 0.4621, - "step": 266 - }, - { - "epoch": 1.302439024390244, - "grad_norm": 2.757690191268921, - "learning_rate": 4.795146669182304e-06, - "loss": 0.6237, - "step": 267 - }, - { - "epoch": 1.3073170731707318, - "grad_norm": 2.564842462539673, - "learning_rate": 4.793625112141431e-06, - "loss": 0.4981, - "step": 268 - }, - { - "epoch": 1.3121951219512196, - "grad_norm": 2.69234299659729, - "learning_rate": 4.792098168504943e-06, - "loss": 0.5384, - "step": 269 - }, - { - "epoch": 1.3170731707317074, - "grad_norm": 2.794144868850708, - "learning_rate": 4.790565841858879e-06, - "loss": 0.5535, - "step": 270 - }, - { - "epoch": 1.3219512195121952, - "grad_norm": 2.850296974182129, - "learning_rate": 4.789028135801919e-06, - "loss": 0.7492, - "step": 271 - }, - { - "epoch": 1.326829268292683, - "grad_norm": 3.287806987762451, - "learning_rate": 4.787485053945377e-06, - "loss": 0.8367, - "step": 272 - }, - { - "epoch": 1.3317073170731708, - "grad_norm": 2.479343891143799, - "learning_rate": 4.785936599913193e-06, - "loss": 0.6875, - "step": 273 - }, - { - "epoch": 1.3365853658536586, - "grad_norm": 3.171198844909668, - "learning_rate": 4.784382777341922e-06, - "loss": 0.733, - "step": 274 - }, - { - "epoch": 1.3414634146341464, - "grad_norm": 2.866610050201416, - "learning_rate": 4.782823589880729e-06, - "loss": 0.9719, - "step": 275 - }, - { - "epoch": 1.346341463414634, - "grad_norm": 2.3714404106140137, - "learning_rate": 4.7812590411913755e-06, - "loss": 0.6979, - "step": 276 - }, - { - "epoch": 1.3512195121951218, - "grad_norm": 2.3838706016540527, - "learning_rate": 4.779689134948217e-06, - "loss": 0.9697, - "step": 277 - }, - { - "epoch": 1.3560975609756096, - "grad_norm": 3.2992005348205566, - "learning_rate": 4.77811387483819e-06, - "loss": 0.4799, - "step": 278 - }, - { - "epoch": 1.3609756097560974, - "grad_norm": 3.403024435043335, - "learning_rate": 4.776533264560804e-06, - "loss": 0.7478, - "step": 279 - }, - { - "epoch": 1.3658536585365852, - "grad_norm": 2.669820785522461, - "learning_rate": 4.774947307828134e-06, - "loss": 0.8622, - "step": 280 - }, - { - "epoch": 1.370731707317073, - "grad_norm": 2.4695041179656982, - "learning_rate": 4.773356008364812e-06, - "loss": 0.5792, - "step": 281 - }, - { - "epoch": 1.3756097560975609, - "grad_norm": 3.1744325160980225, - "learning_rate": 4.771759369908017e-06, - "loss": 0.4368, - "step": 282 - }, - { - "epoch": 1.3804878048780487, - "grad_norm": 2.8564929962158203, - "learning_rate": 4.7701573962074635e-06, - "loss": 0.6337, - "step": 283 - }, - { - "epoch": 1.3853658536585365, - "grad_norm": 2.4109890460968018, - "learning_rate": 4.7685500910254015e-06, - "loss": 0.5042, - "step": 284 - }, - { - "epoch": 1.3902439024390243, - "grad_norm": 2.389765977859497, - "learning_rate": 4.766937458136598e-06, - "loss": 0.7427, - "step": 285 - }, - { - "epoch": 1.395121951219512, - "grad_norm": 2.412153720855713, - "learning_rate": 4.765319501328332e-06, - "loss": 0.6956, - "step": 286 - }, - { - "epoch": 1.4, - "grad_norm": 2.6756227016448975, - "learning_rate": 4.763696224400391e-06, - "loss": 0.5152, - "step": 287 - }, - { - "epoch": 1.4048780487804877, - "grad_norm": 2.4644389152526855, - "learning_rate": 4.762067631165049e-06, - "loss": 0.5583, - "step": 288 - }, - { - "epoch": 1.4097560975609755, - "grad_norm": 2.6496896743774414, - "learning_rate": 4.760433725447071e-06, - "loss": 0.6824, - "step": 289 - }, - { - "epoch": 1.4146341463414633, - "grad_norm": 2.9843268394470215, - "learning_rate": 4.758794511083697e-06, - "loss": 0.7914, - "step": 290 - }, - { - "epoch": 1.4195121951219511, - "grad_norm": 3.639101266860962, - "learning_rate": 4.757149991924633e-06, - "loss": 0.6827, - "step": 291 - }, - { - "epoch": 1.424390243902439, - "grad_norm": 3.2047319412231445, - "learning_rate": 4.755500171832045e-06, - "loss": 0.5908, - "step": 292 - }, - { - "epoch": 1.4292682926829268, - "grad_norm": 2.463202953338623, - "learning_rate": 4.753845054680548e-06, - "loss": 0.6469, - "step": 293 - }, - { - "epoch": 1.4341463414634146, - "grad_norm": 2.711195945739746, - "learning_rate": 4.752184644357197e-06, - "loss": 0.5412, - "step": 294 - }, - { - "epoch": 1.4390243902439024, - "grad_norm": 2.239082098007202, - "learning_rate": 4.750518944761477e-06, - "loss": 0.5324, - "step": 295 - }, - { - "epoch": 1.4439024390243902, - "grad_norm": 2.711050271987915, - "learning_rate": 4.748847959805297e-06, - "loss": 0.5317, - "step": 296 - }, - { - "epoch": 1.448780487804878, - "grad_norm": 2.4389946460723877, - "learning_rate": 4.7471716934129774e-06, - "loss": 0.5199, - "step": 297 - }, - { - "epoch": 1.4536585365853658, - "grad_norm": 2.6532390117645264, - "learning_rate": 4.745490149521242e-06, - "loss": 0.4874, - "step": 298 - }, - { - "epoch": 1.4585365853658536, - "grad_norm": 2.2970616817474365, - "learning_rate": 4.743803332079209e-06, - "loss": 0.5416, - "step": 299 - }, - { - "epoch": 1.4634146341463414, - "grad_norm": 2.4206762313842773, - "learning_rate": 4.742111245048382e-06, - "loss": 0.5628, - "step": 300 - }, - { - "epoch": 1.4682926829268292, - "grad_norm": 2.7086844444274902, - "learning_rate": 4.740413892402639e-06, - "loss": 0.5847, - "step": 301 - }, - { - "epoch": 1.473170731707317, - "grad_norm": 2.848602771759033, - "learning_rate": 4.738711278128228e-06, - "loss": 0.5889, - "step": 302 - }, - { - "epoch": 1.4780487804878049, - "grad_norm": 3.5257909297943115, - "learning_rate": 4.7370034062237476e-06, - "loss": 0.3917, - "step": 303 - }, - { - "epoch": 1.4829268292682927, - "grad_norm": 6.47664213180542, - "learning_rate": 4.73529028070015e-06, - "loss": 0.5592, - "step": 304 - }, - { - "epoch": 1.4878048780487805, - "grad_norm": 2.8833930492401123, - "learning_rate": 4.733571905580723e-06, - "loss": 0.843, - "step": 305 - }, - { - "epoch": 1.4926829268292683, - "grad_norm": 2.9924156665802, - "learning_rate": 4.731848284901082e-06, - "loss": 0.7041, - "step": 306 - }, - { - "epoch": 1.497560975609756, - "grad_norm": 2.9858405590057373, - "learning_rate": 4.730119422709165e-06, - "loss": 0.4914, - "step": 307 - }, - { - "epoch": 1.502439024390244, - "grad_norm": 3.4032366275787354, - "learning_rate": 4.728385323065215e-06, - "loss": 0.644, - "step": 308 - }, - { - "epoch": 1.5073170731707317, - "grad_norm": 2.86360502243042, - "learning_rate": 4.7266459900417815e-06, - "loss": 0.5335, - "step": 309 - }, - { - "epoch": 1.5121951219512195, - "grad_norm": 3.183012008666992, - "learning_rate": 4.724901427723698e-06, - "loss": 0.8275, - "step": 310 - }, - { - "epoch": 1.5170731707317073, - "grad_norm": 3.4128706455230713, - "learning_rate": 4.723151640208084e-06, - "loss": 0.4091, - "step": 311 - }, - { - "epoch": 1.5219512195121951, - "grad_norm": 2.765897512435913, - "learning_rate": 4.721396631604327e-06, - "loss": 0.4414, - "step": 312 - }, - { - "epoch": 1.526829268292683, - "grad_norm": 3.2348268032073975, - "learning_rate": 4.7196364060340785e-06, - "loss": 0.5423, - "step": 313 - }, - { - "epoch": 1.5317073170731708, - "grad_norm": 2.7270045280456543, - "learning_rate": 4.7178709676312416e-06, - "loss": 0.8072, - "step": 314 - }, - { - "epoch": 1.5365853658536586, - "grad_norm": 2.525298833847046, - "learning_rate": 4.716100320541961e-06, - "loss": 1.0254, - "step": 315 - }, - { - "epoch": 1.5414634146341464, - "grad_norm": 2.371321678161621, - "learning_rate": 4.714324468924614e-06, - "loss": 0.6541, - "step": 316 - }, - { - "epoch": 1.5463414634146342, - "grad_norm": 3.0820438861846924, - "learning_rate": 4.712543416949803e-06, - "loss": 0.7519, - "step": 317 - }, - { - "epoch": 1.551219512195122, - "grad_norm": 2.710369348526001, - "learning_rate": 4.71075716880034e-06, - "loss": 0.7232, - "step": 318 - }, - { - "epoch": 1.5560975609756098, - "grad_norm": 2.4568352699279785, - "learning_rate": 4.708965728671243e-06, - "loss": 0.8059, - "step": 319 - }, - { - "epoch": 1.5609756097560976, - "grad_norm": 2.7511191368103027, - "learning_rate": 4.7071691007697214e-06, - "loss": 0.6579, - "step": 320 - }, - { - "epoch": 1.5658536585365854, - "grad_norm": 2.6519858837127686, - "learning_rate": 4.705367289315172e-06, - "loss": 0.6989, - "step": 321 - }, - { - "epoch": 1.5707317073170732, - "grad_norm": 2.763019323348999, - "learning_rate": 4.703560298539158e-06, - "loss": 0.4916, - "step": 322 - }, - { - "epoch": 1.575609756097561, - "grad_norm": 2.6480252742767334, - "learning_rate": 4.701748132685415e-06, - "loss": 0.5076, - "step": 323 - }, - { - "epoch": 1.5804878048780489, - "grad_norm": 2.4289543628692627, - "learning_rate": 4.699930796009825e-06, - "loss": 0.559, - "step": 324 - }, - { - "epoch": 1.5853658536585367, - "grad_norm": 4.0515899658203125, - "learning_rate": 4.698108292780418e-06, - "loss": 0.7388, - "step": 325 - }, - { - "epoch": 1.5902439024390245, - "grad_norm": 2.5959129333496094, - "learning_rate": 4.696280627277356e-06, - "loss": 0.5469, - "step": 326 - }, - { - "epoch": 1.5951219512195123, - "grad_norm": 2.3453526496887207, - "learning_rate": 4.6944478037929255e-06, - "loss": 0.5494, - "step": 327 - }, - { - "epoch": 1.6, - "grad_norm": 3.7527170181274414, - "learning_rate": 4.692609826631525e-06, - "loss": 0.7536, - "step": 328 - }, - { - "epoch": 1.604878048780488, - "grad_norm": 3.423588275909424, - "learning_rate": 4.690766700109659e-06, - "loss": 0.4586, - "step": 329 - }, - { - "epoch": 1.6097560975609757, - "grad_norm": 2.620429754257202, - "learning_rate": 4.6889184285559234e-06, - "loss": 0.4799, - "step": 330 - }, - { - "epoch": 1.6146341463414635, - "grad_norm": 6.416718006134033, - "learning_rate": 4.687065016310996e-06, - "loss": 0.7502, - "step": 331 - }, - { - "epoch": 1.6195121951219513, - "grad_norm": 2.7324717044830322, - "learning_rate": 4.685206467727631e-06, - "loss": 0.5923, - "step": 332 - }, - { - "epoch": 1.6243902439024391, - "grad_norm": 2.582935333251953, - "learning_rate": 4.683342787170644e-06, - "loss": 0.5619, - "step": 333 - }, - { - "epoch": 1.629268292682927, - "grad_norm": 2.8339877128601074, - "learning_rate": 4.6814739790169006e-06, - "loss": 0.55, - "step": 334 - }, - { - "epoch": 1.6341463414634148, - "grad_norm": 2.733982563018799, - "learning_rate": 4.679600047655313e-06, - "loss": 0.7243, - "step": 335 - }, - { - "epoch": 1.6390243902439026, - "grad_norm": 3.192747116088867, - "learning_rate": 4.6777209974868194e-06, - "loss": 1.132, - "step": 336 - }, - { - "epoch": 1.6439024390243904, - "grad_norm": 2.5185582637786865, - "learning_rate": 4.675836832924387e-06, - "loss": 0.55, - "step": 337 - }, - { - "epoch": 1.6487804878048782, - "grad_norm": 2.7306225299835205, - "learning_rate": 4.673947558392989e-06, - "loss": 0.4418, - "step": 338 - }, - { - "epoch": 1.653658536585366, - "grad_norm": 2.7026166915893555, - "learning_rate": 4.6720531783296e-06, - "loss": 0.5897, - "step": 339 - }, - { - "epoch": 1.6585365853658538, - "grad_norm": 2.5981674194335938, - "learning_rate": 4.670153697183185e-06, - "loss": 0.5889, - "step": 340 - }, - { - "epoch": 1.6634146341463416, - "grad_norm": 3.0985405445098877, - "learning_rate": 4.668249119414692e-06, - "loss": 0.5607, - "step": 341 - }, - { - "epoch": 1.6682926829268294, - "grad_norm": 2.7609124183654785, - "learning_rate": 4.666339449497033e-06, - "loss": 0.6284, - "step": 342 - }, - { - "epoch": 1.6731707317073172, - "grad_norm": 3.186077356338501, - "learning_rate": 4.664424691915084e-06, - "loss": 0.5751, - "step": 343 - }, - { - "epoch": 1.678048780487805, - "grad_norm": 3.644227981567383, - "learning_rate": 4.6625048511656675e-06, - "loss": 0.586, - "step": 344 - }, - { - "epoch": 1.6829268292682928, - "grad_norm": 3.196373462677002, - "learning_rate": 4.660579931757543e-06, - "loss": 0.5086, - "step": 345 - }, - { - "epoch": 1.6878048780487804, - "grad_norm": 2.7773900032043457, - "learning_rate": 4.6586499382113985e-06, - "loss": 0.5934, - "step": 346 - }, - { - "epoch": 1.6926829268292682, - "grad_norm": 2.3397631645202637, - "learning_rate": 4.6567148750598375e-06, - "loss": 0.7654, - "step": 347 - }, - { - "epoch": 1.697560975609756, - "grad_norm": 2.5567805767059326, - "learning_rate": 4.6547747468473705e-06, - "loss": 0.8908, - "step": 348 - }, - { - "epoch": 1.7024390243902439, - "grad_norm": 2.9218900203704834, - "learning_rate": 4.652829558130404e-06, - "loss": 0.4383, - "step": 349 - }, - { - "epoch": 1.7073170731707317, - "grad_norm": 2.962965250015259, - "learning_rate": 4.6508793134772265e-06, - "loss": 0.6031, - "step": 350 - }, - { - "epoch": 1.7121951219512195, - "grad_norm": 2.487739324569702, - "learning_rate": 4.648924017468003e-06, - "loss": 0.533, - "step": 351 - }, - { - "epoch": 1.7170731707317073, - "grad_norm": 2.769474506378174, - "learning_rate": 4.646963674694761e-06, - "loss": 0.8125, - "step": 352 - }, - { - "epoch": 1.721951219512195, - "grad_norm": 2.678243398666382, - "learning_rate": 4.64499828976138e-06, - "loss": 0.386, - "step": 353 - }, - { - "epoch": 1.726829268292683, - "grad_norm": 3.2764477729797363, - "learning_rate": 4.64302786728358e-06, - "loss": 0.4792, - "step": 354 - }, - { - "epoch": 1.7317073170731707, - "grad_norm": 2.6092708110809326, - "learning_rate": 4.641052411888913e-06, - "loss": 0.5031, - "step": 355 - }, - { - "epoch": 1.7365853658536585, - "grad_norm": 3.4002952575683594, - "learning_rate": 4.6390719282167515e-06, - "loss": 0.4726, - "step": 356 - }, - { - "epoch": 1.7414634146341463, - "grad_norm": 2.7558157444000244, - "learning_rate": 4.637086420918276e-06, - "loss": 0.7794, - "step": 357 - }, - { - "epoch": 1.7463414634146341, - "grad_norm": 2.239021062850952, - "learning_rate": 4.635095894656465e-06, - "loss": 0.6202, - "step": 358 - }, - { - "epoch": 1.751219512195122, - "grad_norm": 2.0502119064331055, - "learning_rate": 4.633100354106085e-06, - "loss": 0.3743, - "step": 359 - }, - { - "epoch": 1.7560975609756098, - "grad_norm": 2.842203140258789, - "learning_rate": 4.631099803953677e-06, - "loss": 0.8143, - "step": 360 - }, - { - "epoch": 1.7609756097560976, - "grad_norm": 2.8408772945404053, - "learning_rate": 4.629094248897546e-06, - "loss": 0.4986, - "step": 361 - }, - { - "epoch": 1.7658536585365854, - "grad_norm": 2.755530595779419, - "learning_rate": 4.627083693647757e-06, - "loss": 0.5833, - "step": 362 - }, - { - "epoch": 1.7707317073170732, - "grad_norm": 2.717116355895996, - "learning_rate": 4.625068142926111e-06, - "loss": 0.885, - "step": 363 - }, - { - "epoch": 1.775609756097561, - "grad_norm": 2.2784435749053955, - "learning_rate": 4.623047601466144e-06, - "loss": 0.7351, - "step": 364 - }, - { - "epoch": 1.7804878048780488, - "grad_norm": 2.3133914470672607, - "learning_rate": 4.621022074013114e-06, - "loss": 0.6426, - "step": 365 - }, - { - "epoch": 1.7853658536585366, - "grad_norm": 3.13562273979187, - "learning_rate": 4.618991565323987e-06, - "loss": 0.5588, - "step": 366 - }, - { - "epoch": 1.7902439024390244, - "grad_norm": 2.458186388015747, - "learning_rate": 4.616956080167426e-06, - "loss": 0.5424, - "step": 367 - }, - { - "epoch": 1.7951219512195122, - "grad_norm": 2.4780080318450928, - "learning_rate": 4.614915623323786e-06, - "loss": 0.8664, - "step": 368 - }, - { - "epoch": 1.8, - "grad_norm": 2.623966932296753, - "learning_rate": 4.612870199585092e-06, - "loss": 0.4495, - "step": 369 - }, - { - "epoch": 1.8048780487804879, - "grad_norm": 2.7326242923736572, - "learning_rate": 4.610819813755038e-06, - "loss": 0.5099, - "step": 370 - }, - { - "epoch": 1.8097560975609757, - "grad_norm": 2.951014757156372, - "learning_rate": 4.608764470648971e-06, - "loss": 0.4322, - "step": 371 - }, - { - "epoch": 1.8146341463414632, - "grad_norm": 2.869870185852051, - "learning_rate": 4.606704175093879e-06, - "loss": 0.4744, - "step": 372 - }, - { - "epoch": 1.819512195121951, - "grad_norm": 2.686054229736328, - "learning_rate": 4.604638931928383e-06, - "loss": 0.797, - "step": 373 - }, - { - "epoch": 1.8243902439024389, - "grad_norm": 2.6421749591827393, - "learning_rate": 4.602568746002718e-06, - "loss": 0.4904, - "step": 374 - }, - { - "epoch": 1.8292682926829267, - "grad_norm": 2.949144124984741, - "learning_rate": 4.600493622178734e-06, - "loss": 0.8682, - "step": 375 - }, - { - "epoch": 1.8341463414634145, - "grad_norm": 2.554733991622925, - "learning_rate": 4.598413565329876e-06, - "loss": 0.5426, - "step": 376 - }, - { - "epoch": 1.8390243902439023, - "grad_norm": 2.3334367275238037, - "learning_rate": 4.596328580341169e-06, - "loss": 0.5628, - "step": 377 - }, - { - "epoch": 1.84390243902439, - "grad_norm": 2.577664613723755, - "learning_rate": 4.5942386721092195e-06, - "loss": 0.7073, - "step": 378 - }, - { - "epoch": 1.848780487804878, - "grad_norm": 3.1247141361236572, - "learning_rate": 4.592143845542189e-06, - "loss": 0.6526, - "step": 379 - }, - { - "epoch": 1.8536585365853657, - "grad_norm": 2.7015256881713867, - "learning_rate": 4.590044105559797e-06, - "loss": 0.8377, - "step": 380 - }, - { - "epoch": 1.8585365853658535, - "grad_norm": 2.573819398880005, - "learning_rate": 4.587939457093296e-06, - "loss": 0.5485, - "step": 381 - }, - { - "epoch": 1.8634146341463413, - "grad_norm": 2.8607687950134277, - "learning_rate": 4.585829905085468e-06, - "loss": 0.6065, - "step": 382 - }, - { - "epoch": 1.8682926829268292, - "grad_norm": 2.526625394821167, - "learning_rate": 4.5837154544906135e-06, - "loss": 0.7812, - "step": 383 - }, - { - "epoch": 1.873170731707317, - "grad_norm": 2.4161314964294434, - "learning_rate": 4.581596110274535e-06, - "loss": 0.7061, - "step": 384 - }, - { - "epoch": 1.8780487804878048, - "grad_norm": 2.34195876121521, - "learning_rate": 4.579471877414527e-06, - "loss": 0.9446, - "step": 385 - }, - { - "epoch": 1.8829268292682926, - "grad_norm": 3.7710156440734863, - "learning_rate": 4.577342760899368e-06, - "loss": 0.78, - "step": 386 - }, - { - "epoch": 1.8878048780487804, - "grad_norm": 2.5192313194274902, - "learning_rate": 4.575208765729302e-06, - "loss": 0.5205, - "step": 387 - }, - { - "epoch": 1.8926829268292682, - "grad_norm": 2.467484951019287, - "learning_rate": 4.573069896916035e-06, - "loss": 0.7827, - "step": 388 - }, - { - "epoch": 1.897560975609756, - "grad_norm": 2.640676259994507, - "learning_rate": 4.5709261594827125e-06, - "loss": 0.6512, - "step": 389 - }, - { - "epoch": 1.9024390243902438, - "grad_norm": 2.976623296737671, - "learning_rate": 4.568777558463922e-06, - "loss": 0.5548, - "step": 390 - }, - { - "epoch": 1.9073170731707316, - "grad_norm": 2.289722442626953, - "learning_rate": 4.566624098905665e-06, - "loss": 0.7038, - "step": 391 - }, - { - "epoch": 1.9121951219512194, - "grad_norm": 2.9512040615081787, - "learning_rate": 4.564465785865359e-06, - "loss": 0.5416, - "step": 392 - }, - { - "epoch": 1.9170731707317072, - "grad_norm": 2.394874095916748, - "learning_rate": 4.56230262441182e-06, - "loss": 0.4068, - "step": 393 - }, - { - "epoch": 1.921951219512195, - "grad_norm": 6.885486602783203, - "learning_rate": 4.560134619625247e-06, - "loss": 0.6197, - "step": 394 - }, - { - "epoch": 1.9268292682926829, - "grad_norm": 2.311272144317627, - "learning_rate": 4.5579617765972155e-06, - "loss": 0.5692, - "step": 395 - }, - { - "epoch": 1.9317073170731707, - "grad_norm": 2.4662933349609375, - "learning_rate": 4.555784100430662e-06, - "loss": 0.4836, - "step": 396 - }, - { - "epoch": 1.9365853658536585, - "grad_norm": 2.602741241455078, - "learning_rate": 4.553601596239877e-06, - "loss": 0.4594, - "step": 397 - }, - { - "epoch": 1.9414634146341463, - "grad_norm": 3.443909168243408, - "learning_rate": 4.551414269150489e-06, - "loss": 0.6053, - "step": 398 - }, - { - "epoch": 1.946341463414634, - "grad_norm": 2.5391502380371094, - "learning_rate": 4.54922212429945e-06, - "loss": 0.5133, - "step": 399 - }, - { - "epoch": 1.951219512195122, - "grad_norm": 2.7105700969696045, - "learning_rate": 4.547025166835027e-06, - "loss": 0.6984, - "step": 400 - }, - { - "epoch": 1.9560975609756097, - "grad_norm": 2.6098098754882812, - "learning_rate": 4.544823401916794e-06, - "loss": 0.7944, - "step": 401 - }, - { - "epoch": 1.9609756097560975, - "grad_norm": 2.7527425289154053, - "learning_rate": 4.542616834715612e-06, - "loss": 0.639, - "step": 402 - }, - { - "epoch": 1.9658536585365853, - "grad_norm": 2.760303258895874, - "learning_rate": 4.540405470413618e-06, - "loss": 0.4229, - "step": 403 - }, - { - "epoch": 1.9707317073170731, - "grad_norm": 2.4989006519317627, - "learning_rate": 4.53818931420422e-06, - "loss": 0.7482, - "step": 404 - }, - { - "epoch": 1.975609756097561, - "grad_norm": 2.3687169551849365, - "learning_rate": 4.535968371292076e-06, - "loss": 0.6146, - "step": 405 - }, - { - "epoch": 1.9804878048780488, - "grad_norm": 2.4285244941711426, - "learning_rate": 4.533742646893086e-06, - "loss": 0.6964, - "step": 406 - }, - { - "epoch": 1.9853658536585366, - "grad_norm": 2.337266206741333, - "learning_rate": 4.531512146234383e-06, - "loss": 0.6248, - "step": 407 - }, - { - "epoch": 1.9902439024390244, - "grad_norm": 2.704972743988037, - "learning_rate": 4.529276874554312e-06, - "loss": 0.8715, - "step": 408 - }, - { - "epoch": 1.9951219512195122, - "grad_norm": 2.2151944637298584, - "learning_rate": 4.527036837102426e-06, - "loss": 0.4945, - "step": 409 - }, - { - "epoch": 2.0, - "grad_norm": 2.691330671310425, - "learning_rate": 4.524792039139471e-06, - "loss": 0.7085, - "step": 410 - }, - { - "epoch": 2.004878048780488, - "grad_norm": 2.9423086643218994, - "learning_rate": 4.522542485937369e-06, - "loss": 0.3178, - "step": 411 - }, - { - "epoch": 2.0097560975609756, - "grad_norm": 2.860677719116211, - "learning_rate": 4.520288182779214e-06, - "loss": 0.5092, - "step": 412 - }, - { - "epoch": 2.0146341463414634, - "grad_norm": 2.7503843307495117, - "learning_rate": 4.518029134959253e-06, - "loss": 0.314, - "step": 413 - }, - { - "epoch": 2.0195121951219512, - "grad_norm": 4.541809558868408, - "learning_rate": 4.515765347782878e-06, - "loss": 0.5287, - "step": 414 - }, - { - "epoch": 2.024390243902439, - "grad_norm": 9.126826286315918, - "learning_rate": 4.5134968265666085e-06, - "loss": 0.8221, - "step": 415 - }, - { - "epoch": 2.029268292682927, - "grad_norm": 4.4358229637146, - "learning_rate": 4.511223576638084e-06, - "loss": 0.5402, - "step": 416 - }, - { - "epoch": 2.0341463414634147, - "grad_norm": 3.1090731620788574, - "learning_rate": 4.508945603336049e-06, - "loss": 0.617, - "step": 417 - }, - { - "epoch": 2.0390243902439025, - "grad_norm": 2.6933369636535645, - "learning_rate": 4.50666291201034e-06, - "loss": 0.3541, - "step": 418 - }, - { - "epoch": 2.0439024390243903, - "grad_norm": 5.898099899291992, - "learning_rate": 4.504375508021876e-06, - "loss": 0.4842, - "step": 419 - }, - { - "epoch": 2.048780487804878, - "grad_norm": 2.950939178466797, - "learning_rate": 4.50208339674264e-06, - "loss": 0.6168, - "step": 420 - }, - { - "epoch": 2.053658536585366, - "grad_norm": 3.2513322830200195, - "learning_rate": 4.499786583555675e-06, - "loss": 0.6425, - "step": 421 - }, - { - "epoch": 2.0585365853658537, - "grad_norm": 2.911562442779541, - "learning_rate": 4.497485073855061e-06, - "loss": 0.364, - "step": 422 - }, - { - "epoch": 2.0634146341463415, - "grad_norm": 4.2179274559021, - "learning_rate": 4.495178873045913e-06, - "loss": 0.3687, - "step": 423 - }, - { - "epoch": 2.0682926829268293, - "grad_norm": 3.2010395526885986, - "learning_rate": 4.4928679865443605e-06, - "loss": 0.4068, - "step": 424 - }, - { - "epoch": 2.073170731707317, - "grad_norm": 3.2425589561462402, - "learning_rate": 4.4905524197775366e-06, - "loss": 0.4759, - "step": 425 - }, - { - "epoch": 2.078048780487805, - "grad_norm": 2.9252519607543945, - "learning_rate": 4.4882321781835666e-06, - "loss": 0.4197, - "step": 426 - }, - { - "epoch": 2.0829268292682928, - "grad_norm": 2.7859911918640137, - "learning_rate": 4.4859072672115565e-06, - "loss": 0.2294, - "step": 427 - }, - { - "epoch": 2.0878048780487806, - "grad_norm": 3.138796091079712, - "learning_rate": 4.483577692321577e-06, - "loss": 0.7572, - "step": 428 - }, - { - "epoch": 2.0926829268292684, - "grad_norm": 3.1447339057922363, - "learning_rate": 4.481243458984651e-06, - "loss": 0.4035, - "step": 429 - }, - { - "epoch": 2.097560975609756, - "grad_norm": 3.1876862049102783, - "learning_rate": 4.478904572682743e-06, - "loss": 0.5776, - "step": 430 - }, - { - "epoch": 2.102439024390244, - "grad_norm": 2.934257745742798, - "learning_rate": 4.476561038908745e-06, - "loss": 0.4005, - "step": 431 - }, - { - "epoch": 2.107317073170732, - "grad_norm": 2.904954433441162, - "learning_rate": 4.474212863166464e-06, - "loss": 0.5689, - "step": 432 - }, - { - "epoch": 2.1121951219512196, - "grad_norm": 3.6023731231689453, - "learning_rate": 4.471860050970608e-06, - "loss": 0.5068, - "step": 433 - }, - { - "epoch": 2.1170731707317074, - "grad_norm": 4.073422431945801, - "learning_rate": 4.469502607846774e-06, - "loss": 0.8349, - "step": 434 - }, - { - "epoch": 2.1219512195121952, - "grad_norm": 2.813789129257202, - "learning_rate": 4.467140539331434e-06, - "loss": 0.3641, - "step": 435 - }, - { - "epoch": 2.126829268292683, - "grad_norm": 3.874516248703003, - "learning_rate": 4.464773850971924e-06, - "loss": 0.222, - "step": 436 - }, - { - "epoch": 2.131707317073171, - "grad_norm": 3.1221084594726562, - "learning_rate": 4.46240254832643e-06, - "loss": 0.3799, - "step": 437 - }, - { - "epoch": 2.1365853658536587, - "grad_norm": 3.298933267593384, - "learning_rate": 4.460026636963971e-06, - "loss": 0.4759, - "step": 438 - }, - { - "epoch": 2.1414634146341465, - "grad_norm": 2.456233024597168, - "learning_rate": 4.4576461224643965e-06, - "loss": 0.384, - "step": 439 - }, - { - "epoch": 2.1463414634146343, - "grad_norm": 2.8427460193634033, - "learning_rate": 4.455261010418359e-06, - "loss": 0.391, - "step": 440 - }, - { - "epoch": 2.151219512195122, - "grad_norm": 3.0267624855041504, - "learning_rate": 4.452871306427314e-06, - "loss": 0.6177, - "step": 441 - }, - { - "epoch": 2.15609756097561, - "grad_norm": 3.437302827835083, - "learning_rate": 4.450477016103498e-06, - "loss": 0.5143, - "step": 442 - }, - { - "epoch": 2.1609756097560977, - "grad_norm": 3.152210235595703, - "learning_rate": 4.4480781450699205e-06, - "loss": 0.3783, - "step": 443 - }, - { - "epoch": 2.1658536585365855, - "grad_norm": 3.507753372192383, - "learning_rate": 4.4456746989603464e-06, - "loss": 0.3574, - "step": 444 - }, - { - "epoch": 2.1707317073170733, - "grad_norm": 2.8855366706848145, - "learning_rate": 4.443266683419289e-06, - "loss": 0.5088, - "step": 445 - }, - { - "epoch": 2.175609756097561, - "grad_norm": 2.7776072025299072, - "learning_rate": 4.440854104101988e-06, - "loss": 0.3773, - "step": 446 - }, - { - "epoch": 2.180487804878049, - "grad_norm": 3.019484281539917, - "learning_rate": 4.438436966674406e-06, - "loss": 0.5002, - "step": 447 - }, - { - "epoch": 2.1853658536585368, - "grad_norm": 3.6962451934814453, - "learning_rate": 4.436015276813208e-06, - "loss": 0.4601, - "step": 448 - }, - { - "epoch": 2.1902439024390246, - "grad_norm": 3.1288888454437256, - "learning_rate": 4.4335890402057505e-06, - "loss": 0.5422, - "step": 449 - }, - { - "epoch": 2.1951219512195124, - "grad_norm": 3.7083234786987305, - "learning_rate": 4.431158262550067e-06, - "loss": 0.4684, - "step": 450 - }, - { - "epoch": 2.2, - "grad_norm": 3.1714789867401123, - "learning_rate": 4.428722949554858e-06, - "loss": 0.2528, - "step": 451 - }, - { - "epoch": 2.204878048780488, - "grad_norm": 3.0773637294769287, - "learning_rate": 4.426283106939474e-06, - "loss": 0.4061, - "step": 452 - }, - { - "epoch": 2.209756097560976, - "grad_norm": 2.604093551635742, - "learning_rate": 4.423838740433903e-06, - "loss": 0.4779, - "step": 453 - }, - { - "epoch": 2.2146341463414636, - "grad_norm": 2.9293880462646484, - "learning_rate": 4.4213898557787586e-06, - "loss": 0.233, - "step": 454 - }, - { - "epoch": 2.2195121951219514, - "grad_norm": 2.9195125102996826, - "learning_rate": 4.4189364587252636e-06, - "loss": 0.7756, - "step": 455 - }, - { - "epoch": 2.2243902439024392, - "grad_norm": 3.2263920307159424, - "learning_rate": 4.416478555035241e-06, - "loss": 0.2806, - "step": 456 - }, - { - "epoch": 2.229268292682927, - "grad_norm": 2.8109211921691895, - "learning_rate": 4.4140161504810935e-06, - "loss": 0.3923, - "step": 457 - }, - { - "epoch": 2.234146341463415, - "grad_norm": 2.645853281021118, - "learning_rate": 4.4115492508457986e-06, - "loss": 0.289, - "step": 458 - }, - { - "epoch": 2.2390243902439027, - "grad_norm": 3.3712451457977295, - "learning_rate": 4.409077861922887e-06, - "loss": 0.5053, - "step": 459 - }, - { - "epoch": 2.2439024390243905, - "grad_norm": 2.6892387866973877, - "learning_rate": 4.406601989516435e-06, - "loss": 0.3363, - "step": 460 - }, - { - "epoch": 2.2487804878048783, - "grad_norm": 2.3195693492889404, - "learning_rate": 4.404121639441047e-06, - "loss": 0.2367, - "step": 461 - }, - { - "epoch": 2.253658536585366, - "grad_norm": 3.0115339756011963, - "learning_rate": 4.401636817521843e-06, - "loss": 0.4942, - "step": 462 - }, - { - "epoch": 2.258536585365854, - "grad_norm": 2.9528865814208984, - "learning_rate": 4.399147529594447e-06, - "loss": 0.3328, - "step": 463 - }, - { - "epoch": 2.2634146341463417, - "grad_norm": 3.110799551010132, - "learning_rate": 4.3966537815049686e-06, - "loss": 0.3917, - "step": 464 - }, - { - "epoch": 2.2682926829268295, - "grad_norm": 3.2973792552948, - "learning_rate": 4.394155579109994e-06, - "loss": 0.5203, - "step": 465 - }, - { - "epoch": 2.2731707317073173, - "grad_norm": 4.7184038162231445, - "learning_rate": 4.391652928276572e-06, - "loss": 0.729, - "step": 466 - }, - { - "epoch": 2.278048780487805, - "grad_norm": 3.1992053985595703, - "learning_rate": 4.389145834882195e-06, - "loss": 0.4822, - "step": 467 - }, - { - "epoch": 2.2829268292682925, - "grad_norm": 4.320055961608887, - "learning_rate": 4.386634304814789e-06, - "loss": 0.3962, - "step": 468 - }, - { - "epoch": 2.2878048780487803, - "grad_norm": 3.704524517059326, - "learning_rate": 4.384118343972704e-06, - "loss": 0.5996, - "step": 469 - }, - { - "epoch": 2.292682926829268, - "grad_norm": 2.8172974586486816, - "learning_rate": 4.381597958264692e-06, - "loss": 0.6328, - "step": 470 - }, - { - "epoch": 2.297560975609756, - "grad_norm": 2.7418763637542725, - "learning_rate": 4.379073153609896e-06, - "loss": 0.6254, - "step": 471 - }, - { - "epoch": 2.3024390243902437, - "grad_norm": 5.364504337310791, - "learning_rate": 4.37654393593784e-06, - "loss": 0.6793, - "step": 472 - }, - { - "epoch": 2.3073170731707315, - "grad_norm": 2.935291290283203, - "learning_rate": 4.3740103111884096e-06, - "loss": 0.4161, - "step": 473 - }, - { - "epoch": 2.3121951219512193, - "grad_norm": 3.085155963897705, - "learning_rate": 4.371472285311842e-06, - "loss": 0.3329, - "step": 474 - }, - { - "epoch": 2.317073170731707, - "grad_norm": 2.2218778133392334, - "learning_rate": 4.368929864268709e-06, - "loss": 0.2687, - "step": 475 - }, - { - "epoch": 2.321951219512195, - "grad_norm": 3.3985276222229004, - "learning_rate": 4.366383054029907e-06, - "loss": 0.5934, - "step": 476 - }, - { - "epoch": 2.3268292682926828, - "grad_norm": 3.0726048946380615, - "learning_rate": 4.363831860576638e-06, - "loss": 0.5033, - "step": 477 - }, - { - "epoch": 2.3317073170731706, - "grad_norm": 2.728628635406494, - "learning_rate": 4.361276289900396e-06, - "loss": 0.4492, - "step": 478 - }, - { - "epoch": 2.3365853658536584, - "grad_norm": 3.1294424533843994, - "learning_rate": 4.358716348002962e-06, - "loss": 0.619, - "step": 479 - }, - { - "epoch": 2.341463414634146, - "grad_norm": 3.5564961433410645, - "learning_rate": 4.356152040896376e-06, - "loss": 0.4018, - "step": 480 - }, - { - "epoch": 2.346341463414634, - "grad_norm": 2.9329910278320312, - "learning_rate": 4.3535833746029335e-06, - "loss": 0.3062, - "step": 481 - }, - { - "epoch": 2.351219512195122, - "grad_norm": 3.744480848312378, - "learning_rate": 4.351010355155165e-06, - "loss": 0.3387, - "step": 482 - }, - { - "epoch": 2.3560975609756096, - "grad_norm": 2.537912130355835, - "learning_rate": 4.348432988595828e-06, - "loss": 0.3103, - "step": 483 - }, - { - "epoch": 2.3609756097560974, - "grad_norm": 3.232128858566284, - "learning_rate": 4.345851280977885e-06, - "loss": 0.6782, - "step": 484 - }, - { - "epoch": 2.3658536585365852, - "grad_norm": 3.601463794708252, - "learning_rate": 4.343265238364496e-06, - "loss": 0.3195, - "step": 485 - }, - { - "epoch": 2.370731707317073, - "grad_norm": 4.05529260635376, - "learning_rate": 4.340674866829001e-06, - "loss": 0.4639, - "step": 486 - }, - { - "epoch": 2.375609756097561, - "grad_norm": 4.128161430358887, - "learning_rate": 4.338080172454908e-06, - "loss": 0.7229, - "step": 487 - }, - { - "epoch": 2.3804878048780487, - "grad_norm": 2.665430784225464, - "learning_rate": 4.335481161335875e-06, - "loss": 0.4334, - "step": 488 - }, - { - "epoch": 2.3853658536585365, - "grad_norm": 3.777899742126465, - "learning_rate": 4.332877839575699e-06, - "loss": 0.3409, - "step": 489 - }, - { - "epoch": 2.3902439024390243, - "grad_norm": 2.9942116737365723, - "learning_rate": 4.330270213288301e-06, - "loss": 0.5221, - "step": 490 - }, - { - "epoch": 2.395121951219512, - "grad_norm": 3.518601417541504, - "learning_rate": 4.32765828859771e-06, - "loss": 0.7078, - "step": 491 - }, - { - "epoch": 2.4, - "grad_norm": 3.452350378036499, - "learning_rate": 4.325042071638051e-06, - "loss": 0.5902, - "step": 492 - }, - { - "epoch": 2.4048780487804877, - "grad_norm": 3.072655200958252, - "learning_rate": 4.322421568553529e-06, - "loss": 0.3746, - "step": 493 - }, - { - "epoch": 2.4097560975609755, - "grad_norm": 2.8621394634246826, - "learning_rate": 4.319796785498416e-06, - "loss": 0.3474, - "step": 494 - }, - { - "epoch": 2.4146341463414633, - "grad_norm": 3.3891537189483643, - "learning_rate": 4.317167728637032e-06, - "loss": 0.5171, - "step": 495 - }, - { - "epoch": 2.419512195121951, - "grad_norm": 2.505720376968384, - "learning_rate": 4.314534404143738e-06, - "loss": 0.4263, - "step": 496 - }, - { - "epoch": 2.424390243902439, - "grad_norm": 2.6280455589294434, - "learning_rate": 4.3118968182029155e-06, - "loss": 0.5072, - "step": 497 - }, - { - "epoch": 2.4292682926829268, - "grad_norm": 2.703711748123169, - "learning_rate": 4.3092549770089566e-06, - "loss": 0.2742, - "step": 498 - }, - { - "epoch": 2.4341463414634146, - "grad_norm": 3.0358169078826904, - "learning_rate": 4.306608886766243e-06, - "loss": 0.4814, - "step": 499 - }, - { - "epoch": 2.4390243902439024, - "grad_norm": 3.263326406478882, - "learning_rate": 4.303958553689137e-06, - "loss": 0.4188, - "step": 500 - }, - { - "epoch": 2.44390243902439, - "grad_norm": 2.833951950073242, - "learning_rate": 4.3013039840019675e-06, - "loss": 0.6436, - "step": 501 - }, - { - "epoch": 2.448780487804878, - "grad_norm": 3.6790921688079834, - "learning_rate": 4.2986451839390105e-06, - "loss": 0.2862, - "step": 502 - }, - { - "epoch": 2.453658536585366, - "grad_norm": 2.7376418113708496, - "learning_rate": 4.295982159744476e-06, - "loss": 0.4926, - "step": 503 - }, - { - "epoch": 2.4585365853658536, - "grad_norm": 3.575244665145874, - "learning_rate": 4.293314917672498e-06, - "loss": 0.5717, - "step": 504 - }, - { - "epoch": 2.4634146341463414, - "grad_norm": 2.8722269535064697, - "learning_rate": 4.290643463987114e-06, - "loss": 0.2707, - "step": 505 - }, - { - "epoch": 2.4682926829268292, - "grad_norm": 2.8118090629577637, - "learning_rate": 4.287967804962252e-06, - "loss": 0.347, - "step": 506 - }, - { - "epoch": 2.473170731707317, - "grad_norm": 3.345698356628418, - "learning_rate": 4.285287946881718e-06, - "loss": 0.2103, - "step": 507 - }, - { - "epoch": 2.478048780487805, - "grad_norm": 3.0156590938568115, - "learning_rate": 4.282603896039178e-06, - "loss": 0.6405, - "step": 508 - }, - { - "epoch": 2.4829268292682927, - "grad_norm": 3.102205753326416, - "learning_rate": 4.279915658738145e-06, - "loss": 0.4027, - "step": 509 - }, - { - "epoch": 2.4878048780487805, - "grad_norm": 2.8665261268615723, - "learning_rate": 4.277223241291966e-06, - "loss": 0.6503, - "step": 510 - }, - { - "epoch": 2.4926829268292683, - "grad_norm": 2.5396728515625, - "learning_rate": 4.274526650023801e-06, - "loss": 0.5006, - "step": 511 - }, - { - "epoch": 2.497560975609756, - "grad_norm": 3.4846577644348145, - "learning_rate": 4.271825891266617e-06, - "loss": 0.479, - "step": 512 - }, - { - "epoch": 2.502439024390244, - "grad_norm": 4.5995612144470215, - "learning_rate": 4.269120971363164e-06, - "loss": 0.6667, - "step": 513 - }, - { - "epoch": 2.5073170731707317, - "grad_norm": 3.2117559909820557, - "learning_rate": 4.266411896665967e-06, - "loss": 0.2977, - "step": 514 - }, - { - "epoch": 2.5121951219512195, - "grad_norm": 2.798161268234253, - "learning_rate": 4.263698673537309e-06, - "loss": 0.3912, - "step": 515 - }, - { - "epoch": 2.5170731707317073, - "grad_norm": 3.593287944793701, - "learning_rate": 4.260981308349214e-06, - "loss": 0.615, - "step": 516 - }, - { - "epoch": 2.521951219512195, - "grad_norm": 3.06075119972229, - "learning_rate": 4.258259807483434e-06, - "loss": 0.4559, - "step": 517 - }, - { - "epoch": 2.526829268292683, - "grad_norm": 2.893202543258667, - "learning_rate": 4.255534177331435e-06, - "loss": 0.4993, - "step": 518 - }, - { - "epoch": 2.5317073170731708, - "grad_norm": 3.613308906555176, - "learning_rate": 4.252804424294378e-06, - "loss": 0.4581, - "step": 519 - }, - { - "epoch": 2.5365853658536586, - "grad_norm": 3.1191842555999756, - "learning_rate": 4.25007055478311e-06, - "loss": 0.5403, - "step": 520 - }, - { - "epoch": 2.5414634146341464, - "grad_norm": 3.653355836868286, - "learning_rate": 4.247332575218144e-06, - "loss": 0.3658, - "step": 521 - }, - { - "epoch": 2.546341463414634, - "grad_norm": 3.1386306285858154, - "learning_rate": 4.244590492029643e-06, - "loss": 0.6342, - "step": 522 - }, - { - "epoch": 2.551219512195122, - "grad_norm": 3.0894742012023926, - "learning_rate": 4.241844311657411e-06, - "loss": 0.3411, - "step": 523 - }, - { - "epoch": 2.55609756097561, - "grad_norm": 3.205916404724121, - "learning_rate": 4.239094040550875e-06, - "loss": 0.2829, - "step": 524 - }, - { - "epoch": 2.5609756097560976, - "grad_norm": 2.378857374191284, - "learning_rate": 4.236339685169065e-06, - "loss": 0.4749, - "step": 525 - }, - { - "epoch": 2.5658536585365854, - "grad_norm": 3.8657875061035156, - "learning_rate": 4.233581251980604e-06, - "loss": 0.2485, - "step": 526 - }, - { - "epoch": 2.5707317073170732, - "grad_norm": 3.565807580947876, - "learning_rate": 4.230818747463696e-06, - "loss": 0.4488, - "step": 527 - }, - { - "epoch": 2.575609756097561, - "grad_norm": 2.6909685134887695, - "learning_rate": 4.228052178106101e-06, - "loss": 0.4495, - "step": 528 - }, - { - "epoch": 2.580487804878049, - "grad_norm": 2.937680244445801, - "learning_rate": 4.2252815504051285e-06, - "loss": 0.2396, - "step": 529 - }, - { - "epoch": 2.5853658536585367, - "grad_norm": 5.55731201171875, - "learning_rate": 4.222506870867618e-06, - "loss": 0.6784, - "step": 530 - }, - { - "epoch": 2.5902439024390245, - "grad_norm": 2.7388782501220703, - "learning_rate": 4.2197281460099245e-06, - "loss": 0.5543, - "step": 531 - }, - { - "epoch": 2.5951219512195123, - "grad_norm": 3.311134099960327, - "learning_rate": 4.216945382357905e-06, - "loss": 0.5281, - "step": 532 - }, - { - "epoch": 2.6, - "grad_norm": 3.511232376098633, - "learning_rate": 4.214158586446901e-06, - "loss": 0.8019, - "step": 533 - }, - { - "epoch": 2.604878048780488, - "grad_norm": 4.416641712188721, - "learning_rate": 4.211367764821722e-06, - "loss": 0.7769, - "step": 534 - }, - { - "epoch": 2.6097560975609757, - "grad_norm": 2.9849908351898193, - "learning_rate": 4.208572924036634e-06, - "loss": 0.4077, - "step": 535 - }, - { - "epoch": 2.6146341463414635, - "grad_norm": 2.8512160778045654, - "learning_rate": 4.2057740706553415e-06, - "loss": 0.433, - "step": 536 - }, - { - "epoch": 2.6195121951219513, - "grad_norm": 2.6729629039764404, - "learning_rate": 4.202971211250971e-06, - "loss": 0.5957, - "step": 537 - }, - { - "epoch": 2.624390243902439, - "grad_norm": 2.4570281505584717, - "learning_rate": 4.200164352406061e-06, - "loss": 0.3013, - "step": 538 - }, - { - "epoch": 2.629268292682927, - "grad_norm": 3.3771679401397705, - "learning_rate": 4.197353500712539e-06, - "loss": 0.5646, - "step": 539 - }, - { - "epoch": 2.6341463414634148, - "grad_norm": 3.163496494293213, - "learning_rate": 4.1945386627717115e-06, - "loss": 0.4529, - "step": 540 - }, - { - "epoch": 2.6390243902439026, - "grad_norm": 8.32056713104248, - "learning_rate": 4.191719845194246e-06, - "loss": 0.6076, - "step": 541 - }, - { - "epoch": 2.6439024390243904, - "grad_norm": 2.7657363414764404, - "learning_rate": 4.188897054600156e-06, - "loss": 0.4855, - "step": 542 - }, - { - "epoch": 2.648780487804878, - "grad_norm": 3.299283504486084, - "learning_rate": 4.186070297618787e-06, - "loss": 0.5836, - "step": 543 - }, - { - "epoch": 2.653658536585366, - "grad_norm": 2.3928205966949463, - "learning_rate": 4.183239580888799e-06, - "loss": 0.6266, - "step": 544 - }, - { - "epoch": 2.658536585365854, - "grad_norm": 3.395251750946045, - "learning_rate": 4.18040491105815e-06, - "loss": 0.429, - "step": 545 - }, - { - "epoch": 2.6634146341463416, - "grad_norm": 2.690936803817749, - "learning_rate": 4.177566294784085e-06, - "loss": 0.391, - "step": 546 - }, - { - "epoch": 2.6682926829268294, - "grad_norm": 3.7687628269195557, - "learning_rate": 4.174723738733114e-06, - "loss": 0.6548, - "step": 547 - }, - { - "epoch": 2.6731707317073172, - "grad_norm": 2.7884976863861084, - "learning_rate": 4.171877249581001e-06, - "loss": 0.5188, - "step": 548 - }, - { - "epoch": 2.678048780487805, - "grad_norm": 3.0811641216278076, - "learning_rate": 4.169026834012748e-06, - "loss": 0.3494, - "step": 549 - }, - { - "epoch": 2.682926829268293, - "grad_norm": 3.090078592300415, - "learning_rate": 4.166172498722577e-06, - "loss": 0.3621, - "step": 550 - }, - { - "epoch": 2.68780487804878, - "grad_norm": 3.925424098968506, - "learning_rate": 4.163314250413913e-06, - "loss": 0.7187, - "step": 551 - }, - { - "epoch": 2.692682926829268, - "grad_norm": 3.3590312004089355, - "learning_rate": 4.160452095799378e-06, - "loss": 0.428, - "step": 552 - }, - { - "epoch": 2.697560975609756, - "grad_norm": 3.08093523979187, - "learning_rate": 4.157586041600759e-06, - "loss": 0.202, - "step": 553 - }, - { - "epoch": 2.7024390243902436, - "grad_norm": 2.9391448497772217, - "learning_rate": 4.154716094549008e-06, - "loss": 0.5238, - "step": 554 - }, - { - "epoch": 2.7073170731707314, - "grad_norm": 2.9869461059570312, - "learning_rate": 4.151842261384217e-06, - "loss": 0.3073, - "step": 555 - }, - { - "epoch": 2.7121951219512193, - "grad_norm": 3.8973608016967773, - "learning_rate": 4.148964548855603e-06, - "loss": 0.8435, - "step": 556 - }, - { - "epoch": 2.717073170731707, - "grad_norm": 2.3596479892730713, - "learning_rate": 4.146082963721496e-06, - "loss": 0.2562, - "step": 557 - }, - { - "epoch": 2.721951219512195, - "grad_norm": 3.4964873790740967, - "learning_rate": 4.143197512749322e-06, - "loss": 1.0144, - "step": 558 - }, - { - "epoch": 2.7268292682926827, - "grad_norm": 2.8925280570983887, - "learning_rate": 4.140308202715581e-06, - "loss": 0.7581, - "step": 559 - }, - { - "epoch": 2.7317073170731705, - "grad_norm": 2.622724771499634, - "learning_rate": 4.13741504040584e-06, - "loss": 0.3114, - "step": 560 - }, - { - "epoch": 2.7365853658536583, - "grad_norm": 3.775834321975708, - "learning_rate": 4.134518032614713e-06, - "loss": 0.4384, - "step": 561 - }, - { - "epoch": 2.741463414634146, - "grad_norm": 2.691236972808838, - "learning_rate": 4.1316171861458445e-06, - "loss": 0.3141, - "step": 562 - }, - { - "epoch": 2.746341463414634, - "grad_norm": 3.059152841567993, - "learning_rate": 4.128712507811893e-06, - "loss": 0.5777, - "step": 563 - }, - { - "epoch": 2.7512195121951217, - "grad_norm": 2.867432117462158, - "learning_rate": 4.125804004434517e-06, - "loss": 0.5542, - "step": 564 - }, - { - "epoch": 2.7560975609756095, - "grad_norm": 2.796438694000244, - "learning_rate": 4.12289168284436e-06, - "loss": 0.3442, - "step": 565 - }, - { - "epoch": 2.7609756097560973, - "grad_norm": 3.052199125289917, - "learning_rate": 4.119975549881029e-06, - "loss": 0.4754, - "step": 566 - }, - { - "epoch": 2.765853658536585, - "grad_norm": 2.5463602542877197, - "learning_rate": 4.1170556123930846e-06, - "loss": 0.2988, - "step": 567 - }, - { - "epoch": 2.770731707317073, - "grad_norm": 3.003124475479126, - "learning_rate": 4.114131877238021e-06, - "loss": 0.4642, - "step": 568 - }, - { - "epoch": 2.7756097560975608, - "grad_norm": 2.4988298416137695, - "learning_rate": 4.111204351282254e-06, - "loss": 0.3493, - "step": 569 - }, - { - "epoch": 2.7804878048780486, - "grad_norm": 2.7403693199157715, - "learning_rate": 4.108273041401098e-06, - "loss": 0.4007, - "step": 570 - }, - { - "epoch": 2.7853658536585364, - "grad_norm": 4.101940155029297, - "learning_rate": 4.105337954478756e-06, - "loss": 0.7815, - "step": 571 - }, - { - "epoch": 2.790243902439024, - "grad_norm": 3.229969024658203, - "learning_rate": 4.102399097408304e-06, - "loss": 0.6099, - "step": 572 - }, - { - "epoch": 2.795121951219512, - "grad_norm": 3.234693765640259, - "learning_rate": 4.099456477091667e-06, - "loss": 0.2478, - "step": 573 - }, - { - "epoch": 2.8, - "grad_norm": 2.9824702739715576, - "learning_rate": 4.096510100439611e-06, - "loss": 0.6403, - "step": 574 - }, - { - "epoch": 2.8048780487804876, - "grad_norm": 2.8012478351593018, - "learning_rate": 4.093559974371725e-06, - "loss": 0.2509, - "step": 575 - }, - { - "epoch": 2.8097560975609754, - "grad_norm": 2.915400743484497, - "learning_rate": 4.0906061058164e-06, - "loss": 0.7552, - "step": 576 - }, - { - "epoch": 2.8146341463414632, - "grad_norm": 3.467665672302246, - "learning_rate": 4.087648501710819e-06, - "loss": 0.3146, - "step": 577 - }, - { - "epoch": 2.819512195121951, - "grad_norm": 3.1628401279449463, - "learning_rate": 4.084687169000938e-06, - "loss": 0.507, - "step": 578 - }, - { - "epoch": 2.824390243902439, - "grad_norm": 2.4069066047668457, - "learning_rate": 4.081722114641469e-06, - "loss": 0.4116, - "step": 579 - }, - { - "epoch": 2.8292682926829267, - "grad_norm": 3.698174238204956, - "learning_rate": 4.0787533455958626e-06, - "loss": 0.2264, - "step": 580 - }, - { - "epoch": 2.8341463414634145, - "grad_norm": 3.0896191596984863, - "learning_rate": 4.075780868836296e-06, - "loss": 0.3197, - "step": 581 - }, - { - "epoch": 2.8390243902439023, - "grad_norm": 3.098562240600586, - "learning_rate": 4.072804691343653e-06, - "loss": 0.4045, - "step": 582 - }, - { - "epoch": 2.84390243902439, - "grad_norm": 3.9232118129730225, - "learning_rate": 4.069824820107507e-06, - "loss": 0.9564, - "step": 583 - }, - { - "epoch": 2.848780487804878, - "grad_norm": 2.7176268100738525, - "learning_rate": 4.06684126212611e-06, - "loss": 0.2703, - "step": 584 - }, - { - "epoch": 2.8536585365853657, - "grad_norm": 2.4905827045440674, - "learning_rate": 4.063854024406369e-06, - "loss": 0.4828, - "step": 585 - }, - { - "epoch": 2.8585365853658535, - "grad_norm": 2.848784923553467, - "learning_rate": 4.060863113963835e-06, - "loss": 0.4131, - "step": 586 - }, - { - "epoch": 2.8634146341463413, - "grad_norm": 2.599665403366089, - "learning_rate": 4.057868537822683e-06, - "loss": 0.4464, - "step": 587 - }, - { - "epoch": 2.868292682926829, - "grad_norm": 3.1770827770233154, - "learning_rate": 4.054870303015695e-06, - "loss": 0.2825, - "step": 588 - }, - { - "epoch": 2.873170731707317, - "grad_norm": 3.18332839012146, - "learning_rate": 4.05186841658425e-06, - "loss": 0.4438, - "step": 589 - }, - { - "epoch": 2.8780487804878048, - "grad_norm": 2.7485718727111816, - "learning_rate": 4.048862885578301e-06, - "loss": 0.4817, - "step": 590 - }, - { - "epoch": 2.8829268292682926, - "grad_norm": 2.9712934494018555, - "learning_rate": 4.045853717056358e-06, - "loss": 0.5157, - "step": 591 - }, - { - "epoch": 2.8878048780487804, - "grad_norm": 2.246858835220337, - "learning_rate": 4.0428409180854775e-06, - "loss": 0.4029, - "step": 592 - }, - { - "epoch": 2.892682926829268, - "grad_norm": 2.683434247970581, - "learning_rate": 4.039824495741238e-06, - "loss": 0.3796, - "step": 593 - }, - { - "epoch": 2.897560975609756, - "grad_norm": 2.6297569274902344, - "learning_rate": 4.036804457107733e-06, - "loss": 0.4467, - "step": 594 - }, - { - "epoch": 2.902439024390244, - "grad_norm": 5.318776607513428, - "learning_rate": 4.0337808092775435e-06, - "loss": 0.7007, - "step": 595 - }, - { - "epoch": 2.9073170731707316, - "grad_norm": 3.069889783859253, - "learning_rate": 4.030753559351728e-06, - "loss": 0.3219, - "step": 596 - }, - { - "epoch": 2.9121951219512194, - "grad_norm": 1.9730123281478882, - "learning_rate": 4.027722714439808e-06, - "loss": 0.3038, - "step": 597 - }, - { - "epoch": 2.9170731707317072, - "grad_norm": 3.7959916591644287, - "learning_rate": 4.024688281659743e-06, - "loss": 0.7768, - "step": 598 - }, - { - "epoch": 2.921951219512195, - "grad_norm": 3.900886297225952, - "learning_rate": 4.021650268137924e-06, - "loss": 0.4667, - "step": 599 - }, - { - "epoch": 2.926829268292683, - "grad_norm": 2.6155691146850586, - "learning_rate": 4.018608681009143e-06, - "loss": 0.3852, - "step": 600 - }, - { - "epoch": 2.9317073170731707, - "grad_norm": 3.2715704441070557, - "learning_rate": 4.015563527416596e-06, - "loss": 0.4804, - "step": 601 - }, - { - "epoch": 2.9365853658536585, - "grad_norm": 3.001425266265869, - "learning_rate": 4.012514814511844e-06, - "loss": 0.4152, - "step": 602 - }, - { - "epoch": 2.9414634146341463, - "grad_norm": 2.685360908508301, - "learning_rate": 4.009462549454816e-06, - "loss": 0.5029, - "step": 603 - }, - { - "epoch": 2.946341463414634, - "grad_norm": 3.4670183658599854, - "learning_rate": 4.006406739413775e-06, - "loss": 0.4857, - "step": 604 - }, - { - "epoch": 2.951219512195122, - "grad_norm": 3.0613298416137695, - "learning_rate": 4.003347391565317e-06, - "loss": 0.4449, - "step": 605 - }, - { - "epoch": 2.9560975609756097, - "grad_norm": 3.207186698913574, - "learning_rate": 4.000284513094342e-06, - "loss": 0.4808, - "step": 606 - }, - { - "epoch": 2.9609756097560975, - "grad_norm": 2.910578727722168, - "learning_rate": 3.997218111194042e-06, - "loss": 0.4395, - "step": 607 - }, - { - "epoch": 2.9658536585365853, - "grad_norm": 2.581918954849243, - "learning_rate": 3.994148193065886e-06, - "loss": 0.3264, - "step": 608 - }, - { - "epoch": 2.970731707317073, - "grad_norm": 2.6517748832702637, - "learning_rate": 3.991074765919598e-06, - "loss": 0.3285, - "step": 609 - }, - { - "epoch": 2.975609756097561, - "grad_norm": 3.509756088256836, - "learning_rate": 3.987997836973147e-06, - "loss": 0.3638, - "step": 610 - }, - { - "epoch": 2.9804878048780488, - "grad_norm": 2.7382352352142334, - "learning_rate": 3.984917413452721e-06, - "loss": 0.3853, - "step": 611 - }, - { - "epoch": 2.9853658536585366, - "grad_norm": 3.998974323272705, - "learning_rate": 3.981833502592717e-06, - "loss": 0.6411, - "step": 612 - }, - { - "epoch": 2.9902439024390244, - "grad_norm": 3.305126428604126, - "learning_rate": 3.978746111635725e-06, - "loss": 0.2759, - "step": 613 - }, - { - "epoch": 2.995121951219512, - "grad_norm": 3.137300968170166, - "learning_rate": 3.9756552478325045e-06, - "loss": 0.4566, - "step": 614 - }, - { - "epoch": 3.0, - "grad_norm": 2.617291212081909, - "learning_rate": 3.972560918441972e-06, - "loss": 0.2221, - "step": 615 - }, - { - "epoch": 3.004878048780488, - "grad_norm": 2.787429094314575, - "learning_rate": 3.969463130731183e-06, - "loss": 0.2403, - "step": 616 - }, - { - "epoch": 3.0097560975609756, - "grad_norm": 3.0412075519561768, - "learning_rate": 3.966361891975316e-06, - "loss": 0.2635, - "step": 617 - }, - { - "epoch": 3.0146341463414634, - "grad_norm": 2.9949851036071777, - "learning_rate": 3.963257209457652e-06, - "loss": 0.3294, - "step": 618 - }, - { - "epoch": 3.0195121951219512, - "grad_norm": 3.0510809421539307, - "learning_rate": 3.960149090469561e-06, - "loss": 0.1338, - "step": 619 - }, - { - "epoch": 3.024390243902439, - "grad_norm": 3.669482707977295, - "learning_rate": 3.957037542310484e-06, - "loss": 0.1469, - "step": 620 - }, - { - "epoch": 3.029268292682927, - "grad_norm": 4.677116870880127, - "learning_rate": 3.953922572287915e-06, - "loss": 0.2788, - "step": 621 - }, - { - "epoch": 3.0341463414634147, - "grad_norm": 4.33144474029541, - "learning_rate": 3.950804187717384e-06, - "loss": 0.4521, - "step": 622 - }, - { - "epoch": 3.0390243902439025, - "grad_norm": 3.466639757156372, - "learning_rate": 3.947682395922439e-06, - "loss": 0.5113, - "step": 623 - }, - { - "epoch": 3.0439024390243903, - "grad_norm": 3.2332122325897217, - "learning_rate": 3.9445572042346346e-06, - "loss": 0.0968, - "step": 624 - }, - { - "epoch": 3.048780487804878, - "grad_norm": 2.6108055114746094, - "learning_rate": 3.941428619993505e-06, - "loss": 0.2462, - "step": 625 - }, - { - "epoch": 3.053658536585366, - "grad_norm": 3.2512595653533936, - "learning_rate": 3.938296650546552e-06, - "loss": 0.1782, - "step": 626 - }, - { - "epoch": 3.0585365853658537, - "grad_norm": 3.4350366592407227, - "learning_rate": 3.935161303249231e-06, - "loss": 0.2955, - "step": 627 - }, - { - "epoch": 3.0634146341463415, - "grad_norm": 3.42012619972229, - "learning_rate": 3.932022585464928e-06, - "loss": 0.3259, - "step": 628 - }, - { - "epoch": 3.0682926829268293, - "grad_norm": 3.458043336868286, - "learning_rate": 3.928880504564943e-06, - "loss": 0.2306, - "step": 629 - }, - { - "epoch": 3.073170731707317, - "grad_norm": 2.646616220474243, - "learning_rate": 3.92573506792848e-06, - "loss": 0.2197, - "step": 630 - }, - { - "epoch": 3.078048780487805, - "grad_norm": 3.5558857917785645, - "learning_rate": 3.9225862829426184e-06, - "loss": 0.1607, - "step": 631 - }, - { - "epoch": 3.0829268292682928, - "grad_norm": 3.6011338233947754, - "learning_rate": 3.919434157002303e-06, - "loss": 0.3087, - "step": 632 - }, - { - "epoch": 3.0878048780487806, - "grad_norm": 2.339879035949707, - "learning_rate": 3.916278697510325e-06, - "loss": 0.2213, - "step": 633 - }, - { - "epoch": 3.0926829268292684, - "grad_norm": 3.268162488937378, - "learning_rate": 3.913119911877305e-06, - "loss": 0.318, - "step": 634 - }, - { - "epoch": 3.097560975609756, - "grad_norm": 4.062571048736572, - "learning_rate": 3.909957807521674e-06, - "loss": 0.1757, - "step": 635 - }, - { - "epoch": 3.102439024390244, - "grad_norm": 2.997659683227539, - "learning_rate": 3.906792391869657e-06, - "loss": 0.2391, - "step": 636 - }, - { - "epoch": 3.107317073170732, - "grad_norm": 3.7037394046783447, - "learning_rate": 3.903623672355258e-06, - "loss": 0.2548, - "step": 637 - }, - { - "epoch": 3.1121951219512196, - "grad_norm": 3.110579252243042, - "learning_rate": 3.900451656420237e-06, - "loss": 0.2389, - "step": 638 - }, - { - "epoch": 3.1170731707317074, - "grad_norm": 3.3332321643829346, - "learning_rate": 3.897276351514097e-06, - "loss": 0.1371, - "step": 639 - }, - { - "epoch": 3.1219512195121952, - "grad_norm": 3.8275935649871826, - "learning_rate": 3.894097765094065e-06, - "loss": 0.3363, - "step": 640 - }, - { - "epoch": 3.126829268292683, - "grad_norm": 2.3731374740600586, - "learning_rate": 3.890915904625075e-06, - "loss": 0.1314, - "step": 641 - }, - { - "epoch": 3.131707317073171, - "grad_norm": 3.1511282920837402, - "learning_rate": 3.887730777579751e-06, - "loss": 0.3563, - "step": 642 - }, - { - "epoch": 3.1365853658536587, - "grad_norm": 4.2254862785339355, - "learning_rate": 3.884542391438387e-06, - "loss": 0.5053, - "step": 643 - }, - { - "epoch": 3.1414634146341465, - "grad_norm": 4.579670429229736, - "learning_rate": 3.88135075368893e-06, - "loss": 0.6259, - "step": 644 - }, - { - "epoch": 3.1463414634146343, - "grad_norm": 3.2102746963500977, - "learning_rate": 3.878155871826968e-06, - "loss": 0.2599, - "step": 645 - }, - { - "epoch": 3.151219512195122, - "grad_norm": 2.5569686889648438, - "learning_rate": 3.874957753355701e-06, - "loss": 0.2075, - "step": 646 - }, - { - "epoch": 3.15609756097561, - "grad_norm": 3.588925838470459, - "learning_rate": 3.8717564057859365e-06, - "loss": 0.4577, - "step": 647 - }, - { - "epoch": 3.1609756097560977, - "grad_norm": 3.6163878440856934, - "learning_rate": 3.868551836636063e-06, - "loss": 0.4023, - "step": 648 - }, - { - "epoch": 3.1658536585365855, - "grad_norm": 3.8688390254974365, - "learning_rate": 3.865344053432035e-06, - "loss": 0.1669, - "step": 649 - }, - { - "epoch": 3.1707317073170733, - "grad_norm": 3.419734001159668, - "learning_rate": 3.862133063707353e-06, - "loss": 0.2766, - "step": 650 - }, - { - "epoch": 3.175609756097561, - "grad_norm": 2.9860243797302246, - "learning_rate": 3.858918875003053e-06, - "loss": 0.1788, - "step": 651 - }, - { - "epoch": 3.180487804878049, - "grad_norm": 3.0619022846221924, - "learning_rate": 3.855701494867679e-06, - "loss": 0.224, - "step": 652 - }, - { - "epoch": 3.1853658536585368, - "grad_norm": 3.3668978214263916, - "learning_rate": 3.852480930857275e-06, - "loss": 0.4029, - "step": 653 - }, - { - "epoch": 3.1902439024390246, - "grad_norm": 3.543147563934326, - "learning_rate": 3.849257190535356e-06, - "loss": 0.2096, - "step": 654 - }, - { - "epoch": 3.1951219512195124, - "grad_norm": 3.793619155883789, - "learning_rate": 3.846030281472902e-06, - "loss": 0.5574, - "step": 655 - }, - { - "epoch": 3.2, - "grad_norm": 3.021289110183716, - "learning_rate": 3.842800211248333e-06, - "loss": 0.2233, - "step": 656 - }, - { - "epoch": 3.204878048780488, - "grad_norm": 4.582934856414795, - "learning_rate": 3.839566987447492e-06, - "loss": 0.3871, - "step": 657 - }, - { - "epoch": 3.209756097560976, - "grad_norm": 2.996340274810791, - "learning_rate": 3.8363306176636296e-06, - "loss": 0.4325, - "step": 658 - }, - { - "epoch": 3.2146341463414636, - "grad_norm": 3.3190877437591553, - "learning_rate": 3.833091109497384e-06, - "loss": 0.5321, - "step": 659 - }, - { - "epoch": 3.2195121951219514, - "grad_norm": 3.2532856464385986, - "learning_rate": 3.829848470556765e-06, - "loss": 0.1359, - "step": 660 - }, - { - "epoch": 3.2243902439024392, - "grad_norm": 2.7875044345855713, - "learning_rate": 3.8266027084571335e-06, - "loss": 0.3145, - "step": 661 - }, - { - "epoch": 3.229268292682927, - "grad_norm": 3.748253583908081, - "learning_rate": 3.823353830821187e-06, - "loss": 0.1252, - "step": 662 - }, - { - "epoch": 3.234146341463415, - "grad_norm": 2.858293294906616, - "learning_rate": 3.820101845278937e-06, - "loss": 0.2589, - "step": 663 - }, - { - "epoch": 3.2390243902439027, - "grad_norm": 3.7470967769622803, - "learning_rate": 3.816846759467696e-06, - "loss": 0.2594, - "step": 664 - }, - { - "epoch": 3.2439024390243905, - "grad_norm": 3.676196813583374, - "learning_rate": 3.8135885810320587e-06, - "loss": 0.2998, - "step": 665 - }, - { - "epoch": 3.2487804878048783, - "grad_norm": 3.0943140983581543, - "learning_rate": 3.810327317623881e-06, - "loss": 0.2238, - "step": 666 - }, - { - "epoch": 3.253658536585366, - "grad_norm": 3.5907349586486816, - "learning_rate": 3.8070629769022628e-06, - "loss": 0.3381, - "step": 667 - }, - { - "epoch": 3.258536585365854, - "grad_norm": 3.1195285320281982, - "learning_rate": 3.8037955665335335e-06, - "loss": 0.2407, - "step": 668 - }, - { - "epoch": 3.2634146341463417, - "grad_norm": 3.422292947769165, - "learning_rate": 3.800525094191231e-06, - "loss": 0.2957, - "step": 669 - }, - { - "epoch": 3.2682926829268295, - "grad_norm": 2.5264663696289062, - "learning_rate": 3.797251567556083e-06, - "loss": 0.2493, - "step": 670 - }, - { - "epoch": 3.2731707317073173, - "grad_norm": 3.350219964981079, - "learning_rate": 3.793974994315991e-06, - "loss": 0.1186, - "step": 671 - }, - { - "epoch": 3.278048780487805, - "grad_norm": 4.175906181335449, - "learning_rate": 3.790695382166013e-06, - "loss": 0.3453, - "step": 672 - }, - { - "epoch": 3.2829268292682925, - "grad_norm": 3.006072521209717, - "learning_rate": 3.7874127388083415e-06, - "loss": 0.1981, - "step": 673 - }, - { - "epoch": 3.2878048780487803, - "grad_norm": 3.368561029434204, - "learning_rate": 3.7841270719522895e-06, - "loss": 0.2934, - "step": 674 - }, - { - "epoch": 3.292682926829268, - "grad_norm": 4.374331951141357, - "learning_rate": 3.7808383893142692e-06, - "loss": 0.1359, - "step": 675 - }, - { - "epoch": 3.297560975609756, - "grad_norm": 3.297102451324463, - "learning_rate": 3.7775466986177763e-06, - "loss": 0.2498, - "step": 676 - }, - { - "epoch": 3.3024390243902437, - "grad_norm": 2.8914761543273926, - "learning_rate": 3.774252007593371e-06, - "loss": 0.1308, - "step": 677 - }, - { - "epoch": 3.3073170731707315, - "grad_norm": 3.1550722122192383, - "learning_rate": 3.7709543239786593e-06, - "loss": 0.3915, - "step": 678 - }, - { - "epoch": 3.3121951219512193, - "grad_norm": 3.2302658557891846, - "learning_rate": 3.767653655518277e-06, - "loss": 0.2558, - "step": 679 - }, - { - "epoch": 3.317073170731707, - "grad_norm": 4.4321770668029785, - "learning_rate": 3.7643500099638673e-06, - "loss": 0.1988, - "step": 680 - }, - { - "epoch": 3.321951219512195, - "grad_norm": 2.970566749572754, - "learning_rate": 3.7610433950740667e-06, - "loss": 0.4908, - "step": 681 - }, - { - "epoch": 3.3268292682926828, - "grad_norm": 3.5516228675842285, - "learning_rate": 3.757733818614485e-06, - "loss": 0.304, - "step": 682 - }, - { - "epoch": 3.3317073170731706, - "grad_norm": 2.7555387020111084, - "learning_rate": 3.7544212883576856e-06, - "loss": 0.2533, - "step": 683 - }, - { - "epoch": 3.3365853658536584, - "grad_norm": 3.61226749420166, - "learning_rate": 3.751105812083172e-06, - "loss": 0.1771, - "step": 684 - }, - { - "epoch": 3.341463414634146, - "grad_norm": 3.0466206073760986, - "learning_rate": 3.7477873975773655e-06, - "loss": 0.4213, - "step": 685 - }, - { - "epoch": 3.346341463414634, - "grad_norm": 3.6091527938842773, - "learning_rate": 3.7444660526335853e-06, - "loss": 0.3808, - "step": 686 - }, - { - "epoch": 3.351219512195122, - "grad_norm": 3.8443002700805664, - "learning_rate": 3.741141785052036e-06, - "loss": 0.6438, - "step": 687 - }, - { - "epoch": 3.3560975609756096, - "grad_norm": 3.845909833908081, - "learning_rate": 3.737814602639784e-06, - "loss": 0.3686, - "step": 688 - }, - { - "epoch": 3.3609756097560974, - "grad_norm": 2.904892921447754, - "learning_rate": 3.7344845132107427e-06, - "loss": 0.2934, - "step": 689 - }, - { - "epoch": 3.3658536585365852, - "grad_norm": 3.4766387939453125, - "learning_rate": 3.731151524585651e-06, - "loss": 0.3299, - "step": 690 - }, - { - "epoch": 3.370731707317073, - "grad_norm": 4.236767768859863, - "learning_rate": 3.7278156445920584e-06, - "loss": 0.6303, - "step": 691 - }, - { - "epoch": 3.375609756097561, - "grad_norm": 3.1122591495513916, - "learning_rate": 3.724476881064303e-06, - "loss": 0.2432, - "step": 692 - }, - { - "epoch": 3.3804878048780487, - "grad_norm": 3.0971457958221436, - "learning_rate": 3.721135241843496e-06, - "loss": 0.3131, - "step": 693 - }, - { - "epoch": 3.3853658536585365, - "grad_norm": 3.9365804195404053, - "learning_rate": 3.7177907347775016e-06, - "loss": 0.3372, - "step": 694 - }, - { - "epoch": 3.3902439024390243, - "grad_norm": 3.760373115539551, - "learning_rate": 3.71444336772092e-06, - "loss": 0.5055, - "step": 695 - }, - { - "epoch": 3.395121951219512, - "grad_norm": 4.360848426818848, - "learning_rate": 3.711093148535068e-06, - "loss": 0.6183, - "step": 696 - }, - { - "epoch": 3.4, - "grad_norm": 3.7713537216186523, - "learning_rate": 3.707740085087959e-06, - "loss": 0.1568, - "step": 697 - }, - { - "epoch": 3.4048780487804877, - "grad_norm": 3.8532230854034424, - "learning_rate": 3.7043841852542884e-06, - "loss": 0.2826, - "step": 698 - }, - { - "epoch": 3.4097560975609755, - "grad_norm": 3.0548605918884277, - "learning_rate": 3.701025456915411e-06, - "loss": 0.1918, - "step": 699 - }, - { - "epoch": 3.4146341463414633, - "grad_norm": 3.2431821823120117, - "learning_rate": 3.697663907959327e-06, - "loss": 0.2493, - "step": 700 - }, - { - "epoch": 3.419512195121951, - "grad_norm": 3.7301864624023438, - "learning_rate": 3.6942995462806574e-06, - "loss": 0.4913, - "step": 701 - }, - { - "epoch": 3.424390243902439, - "grad_norm": 2.5468900203704834, - "learning_rate": 3.6909323797806314e-06, - "loss": 0.1788, - "step": 702 - }, - { - "epoch": 3.4292682926829268, - "grad_norm": 3.3719515800476074, - "learning_rate": 3.6875624163670635e-06, - "loss": 0.4162, - "step": 703 - }, - { - "epoch": 3.4341463414634146, - "grad_norm": 3.528010368347168, - "learning_rate": 3.6841896639543394e-06, - "loss": 0.1924, - "step": 704 - }, - { - "epoch": 3.4390243902439024, - "grad_norm": 3.3636631965637207, - "learning_rate": 3.6808141304633924e-06, - "loss": 0.3177, - "step": 705 - }, - { - "epoch": 3.44390243902439, - "grad_norm": 3.418705463409424, - "learning_rate": 3.6774358238216878e-06, - "loss": 0.2301, - "step": 706 - }, - { - "epoch": 3.448780487804878, - "grad_norm": 4.720373630523682, - "learning_rate": 3.6740547519632048e-06, - "loss": 0.1894, - "step": 707 - }, - { - "epoch": 3.453658536585366, - "grad_norm": 2.9635703563690186, - "learning_rate": 3.670670922828414e-06, - "loss": 0.2642, - "step": 708 - }, - { - "epoch": 3.4585365853658536, - "grad_norm": 4.934754371643066, - "learning_rate": 3.667284344364264e-06, - "loss": 0.2275, - "step": 709 - }, - { - "epoch": 3.4634146341463414, - "grad_norm": 3.090585231781006, - "learning_rate": 3.6638950245241604e-06, - "loss": 0.4447, - "step": 710 - }, - { - "epoch": 3.4682926829268292, - "grad_norm": 4.360495090484619, - "learning_rate": 3.660502971267945e-06, - "loss": 0.2415, - "step": 711 - }, - { - "epoch": 3.473170731707317, - "grad_norm": 3.4893476963043213, - "learning_rate": 3.65710819256188e-06, - "loss": 0.0921, - "step": 712 - }, - { - "epoch": 3.478048780487805, - "grad_norm": 3.2423770427703857, - "learning_rate": 3.65371069637863e-06, - "loss": 0.2371, - "step": 713 - }, - { - "epoch": 3.4829268292682927, - "grad_norm": 3.0775890350341797, - "learning_rate": 3.650310490697238e-06, - "loss": 0.4026, - "step": 714 - }, - { - "epoch": 3.4878048780487805, - "grad_norm": 3.906625270843506, - "learning_rate": 3.646907583503114e-06, - "loss": 0.4312, - "step": 715 - }, - { - "epoch": 3.4926829268292683, - "grad_norm": 3.2140414714813232, - "learning_rate": 3.6435019827880093e-06, - "loss": 0.2309, - "step": 716 - }, - { - "epoch": 3.497560975609756, - "grad_norm": 3.048523426055908, - "learning_rate": 3.640093696550003e-06, - "loss": 0.296, - "step": 717 - }, - { - "epoch": 3.502439024390244, - "grad_norm": 2.9669039249420166, - "learning_rate": 3.6366827327934817e-06, - "loss": 0.2723, - "step": 718 - }, - { - "epoch": 3.5073170731707317, - "grad_norm": 3.6941726207733154, - "learning_rate": 3.6332690995291176e-06, - "loss": 0.3797, - "step": 719 - }, - { - "epoch": 3.5121951219512195, - "grad_norm": 5.135766506195068, - "learning_rate": 3.6298528047738545e-06, - "loss": 0.9868, - "step": 720 - }, - { - "epoch": 3.5170731707317073, - "grad_norm": 3.2021052837371826, - "learning_rate": 3.626433856550886e-06, - "loss": 0.4069, - "step": 721 - }, - { - "epoch": 3.521951219512195, - "grad_norm": 3.094444513320923, - "learning_rate": 3.623012262889637e-06, - "loss": 0.3368, - "step": 722 - }, - { - "epoch": 3.526829268292683, - "grad_norm": 3.609285354614258, - "learning_rate": 3.6195880318257465e-06, - "loss": 0.3972, - "step": 723 - }, - { - "epoch": 3.5317073170731708, - "grad_norm": 4.236501216888428, - "learning_rate": 3.616161171401046e-06, - "loss": 0.52, - "step": 724 - }, - { - "epoch": 3.5365853658536586, - "grad_norm": 3.504526376724243, - "learning_rate": 3.612731689663542e-06, - "loss": 0.23, - "step": 725 - }, - { - "epoch": 3.5414634146341464, - "grad_norm": 3.233591079711914, - "learning_rate": 3.6092995946673996e-06, - "loss": 0.4151, - "step": 726 - }, - { - "epoch": 3.546341463414634, - "grad_norm": 3.6701886653900146, - "learning_rate": 3.605864894472918e-06, - "loss": 0.2798, - "step": 727 - }, - { - "epoch": 3.551219512195122, - "grad_norm": 3.8713181018829346, - "learning_rate": 3.602427597146516e-06, - "loss": 0.4336, - "step": 728 - }, - { - "epoch": 3.55609756097561, - "grad_norm": 5.49612283706665, - "learning_rate": 3.5989877107607134e-06, - "loss": 0.4803, - "step": 729 - }, - { - "epoch": 3.5609756097560976, - "grad_norm": 3.771005392074585, - "learning_rate": 3.5955452433941075e-06, - "loss": 0.3698, - "step": 730 - }, - { - "epoch": 3.5658536585365854, - "grad_norm": 2.970822334289551, - "learning_rate": 3.5921002031313586e-06, - "loss": 0.2373, - "step": 731 - }, - { - "epoch": 3.5707317073170732, - "grad_norm": 3.517249584197998, - "learning_rate": 3.58865259806317e-06, - "loss": 0.1908, - "step": 732 - }, - { - "epoch": 3.575609756097561, - "grad_norm": 3.6825428009033203, - "learning_rate": 3.585202436286267e-06, - "loss": 0.3993, - "step": 733 - }, - { - "epoch": 3.580487804878049, - "grad_norm": 3.387479066848755, - "learning_rate": 3.581749725903381e-06, - "loss": 0.4237, - "step": 734 - }, - { - "epoch": 3.5853658536585367, - "grad_norm": 3.5004806518554688, - "learning_rate": 3.5782944750232274e-06, - "loss": 0.3011, - "step": 735 - }, - { - "epoch": 3.5902439024390245, - "grad_norm": 3.461731433868408, - "learning_rate": 3.574836691760489e-06, - "loss": 0.0896, - "step": 736 - }, - { - "epoch": 3.5951219512195123, - "grad_norm": 3.9598381519317627, - "learning_rate": 3.571376384235795e-06, - "loss": 0.2751, - "step": 737 - }, - { - "epoch": 3.6, - "grad_norm": 4.053933143615723, - "learning_rate": 3.5679135605757035e-06, - "loss": 0.2086, - "step": 738 - }, - { - "epoch": 3.604878048780488, - "grad_norm": 2.9683544635772705, - "learning_rate": 3.564448228912682e-06, - "loss": 0.1659, - "step": 739 - }, - { - "epoch": 3.6097560975609757, - "grad_norm": 3.6598448753356934, - "learning_rate": 3.5609803973850877e-06, - "loss": 0.2469, - "step": 740 - }, - { - "epoch": 3.6146341463414635, - "grad_norm": 3.449335813522339, - "learning_rate": 3.557510074137147e-06, - "loss": 0.375, - "step": 741 - }, - { - "epoch": 3.6195121951219513, - "grad_norm": 2.7666923999786377, - "learning_rate": 3.554037267318942e-06, - "loss": 0.3133, - "step": 742 - }, - { - "epoch": 3.624390243902439, - "grad_norm": 2.8951869010925293, - "learning_rate": 3.5505619850863847e-06, - "loss": 0.2243, - "step": 743 - }, - { - "epoch": 3.629268292682927, - "grad_norm": 3.477747678756714, - "learning_rate": 3.5470842356012007e-06, - "loss": 0.1321, - "step": 744 - }, - { - "epoch": 3.6341463414634148, - "grad_norm": 3.810480833053589, - "learning_rate": 3.5436040270309113e-06, - "loss": 0.361, - "step": 745 - }, - { - "epoch": 3.6390243902439026, - "grad_norm": 3.0730793476104736, - "learning_rate": 3.540121367548811e-06, - "loss": 0.1523, - "step": 746 - }, - { - "epoch": 3.6439024390243904, - "grad_norm": 3.6878390312194824, - "learning_rate": 3.5366362653339524e-06, - "loss": 0.4898, - "step": 747 - }, - { - "epoch": 3.648780487804878, - "grad_norm": 3.6432242393493652, - "learning_rate": 3.533148728571124e-06, - "loss": 0.1397, - "step": 748 - }, - { - "epoch": 3.653658536585366, - "grad_norm": 3.7047760486602783, - "learning_rate": 3.5296587654508317e-06, - "loss": 0.323, - "step": 749 - }, - { - "epoch": 3.658536585365854, - "grad_norm": 3.777132749557495, - "learning_rate": 3.526166384169279e-06, - "loss": 0.5577, - "step": 750 - }, - { - "epoch": 3.6634146341463416, - "grad_norm": 3.7970924377441406, - "learning_rate": 3.5226715929283507e-06, - "loss": 0.245, - "step": 751 - }, - { - "epoch": 3.6682926829268294, - "grad_norm": 2.8203537464141846, - "learning_rate": 3.519174399935588e-06, - "loss": 0.1619, - "step": 752 - }, - { - "epoch": 3.6731707317073172, - "grad_norm": 3.4040987491607666, - "learning_rate": 3.5156748134041767e-06, - "loss": 0.1047, - "step": 753 - }, - { - "epoch": 3.678048780487805, - "grad_norm": 3.927960157394409, - "learning_rate": 3.5121728415529203e-06, - "loss": 0.5713, - "step": 754 - }, - { - "epoch": 3.682926829268293, - "grad_norm": 3.3833277225494385, - "learning_rate": 3.5086684926062266e-06, - "loss": 0.2174, - "step": 755 - }, - { - "epoch": 3.68780487804878, - "grad_norm": 3.989307403564453, - "learning_rate": 3.505161774794085e-06, - "loss": 0.285, - "step": 756 - }, - { - "epoch": 3.692682926829268, - "grad_norm": 2.742429494857788, - "learning_rate": 3.5016526963520474e-06, - "loss": 0.1602, - "step": 757 - }, - { - "epoch": 3.697560975609756, - "grad_norm": 3.7082698345184326, - "learning_rate": 3.498141265521212e-06, - "loss": 0.666, - "step": 758 - }, - { - "epoch": 3.7024390243902436, - "grad_norm": 3.033196210861206, - "learning_rate": 3.4946274905481997e-06, - "loss": 0.2024, - "step": 759 - }, - { - "epoch": 3.7073170731707314, - "grad_norm": 3.7145371437072754, - "learning_rate": 3.4911113796851364e-06, - "loss": 0.2719, - "step": 760 - }, - { - "epoch": 3.7121951219512193, - "grad_norm": 3.580298900604248, - "learning_rate": 3.487592941189636e-06, - "loss": 0.1537, - "step": 761 - }, - { - "epoch": 3.717073170731707, - "grad_norm": 4.753757953643799, - "learning_rate": 3.484072183324776e-06, - "loss": 0.6149, - "step": 762 - }, - { - "epoch": 3.721951219512195, - "grad_norm": 3.5575687885284424, - "learning_rate": 3.4805491143590823e-06, - "loss": 0.4241, - "step": 763 - }, - { - "epoch": 3.7268292682926827, - "grad_norm": 3.215224266052246, - "learning_rate": 3.4770237425665103e-06, - "loss": 0.3037, - "step": 764 - }, - { - "epoch": 3.7317073170731705, - "grad_norm": 2.9899685382843018, - "learning_rate": 3.4734960762264204e-06, - "loss": 0.4854, - "step": 765 - }, - { - "epoch": 3.7365853658536583, - "grad_norm": 3.5880227088928223, - "learning_rate": 3.469966123623563e-06, - "loss": 0.3849, - "step": 766 - }, - { - "epoch": 3.741463414634146, - "grad_norm": 3.472750186920166, - "learning_rate": 3.46643389304806e-06, - "loss": 0.3159, - "step": 767 - }, - { - "epoch": 3.746341463414634, - "grad_norm": 4.355650901794434, - "learning_rate": 3.4628993927953786e-06, - "loss": 0.7527, - "step": 768 - }, - { - "epoch": 3.7512195121951217, - "grad_norm": 2.94575834274292, - "learning_rate": 3.45936263116632e-06, - "loss": 0.1716, - "step": 769 - }, - { - "epoch": 3.7560975609756095, - "grad_norm": 2.991525173187256, - "learning_rate": 3.4558236164669957e-06, - "loss": 0.2061, - "step": 770 - }, - { - "epoch": 3.7609756097560973, - "grad_norm": 3.134000301361084, - "learning_rate": 3.4522823570088073e-06, - "loss": 0.1338, - "step": 771 - }, - { - "epoch": 3.765853658536585, - "grad_norm": 3.722140312194824, - "learning_rate": 3.4487388611084295e-06, - "loss": 0.2615, - "step": 772 - }, - { - "epoch": 3.770731707317073, - "grad_norm": 3.7941153049468994, - "learning_rate": 3.445193137087788e-06, - "loss": 0.1401, - "step": 773 - }, - { - "epoch": 3.7756097560975608, - "grad_norm": 2.872941732406616, - "learning_rate": 3.4416451932740424e-06, - "loss": 0.2934, - "step": 774 - }, - { - "epoch": 3.7804878048780486, - "grad_norm": 4.5019941329956055, - "learning_rate": 3.4380950379995652e-06, - "loss": 0.4579, - "step": 775 - }, - { - "epoch": 3.7853658536585364, - "grad_norm": 2.682884931564331, - "learning_rate": 3.434542679601922e-06, - "loss": 0.2979, - "step": 776 - }, - { - "epoch": 3.790243902439024, - "grad_norm": 3.3044273853302, - "learning_rate": 3.4309881264238538e-06, - "loss": 0.1196, - "step": 777 - }, - { - "epoch": 3.795121951219512, - "grad_norm": 3.102760076522827, - "learning_rate": 3.4274313868132547e-06, - "loss": 0.2026, - "step": 778 - }, - { - "epoch": 3.8, - "grad_norm": 3.3304500579833984, - "learning_rate": 3.4238724691231534e-06, - "loss": 0.2135, - "step": 779 - }, - { - "epoch": 3.8048780487804876, - "grad_norm": 3.295119047164917, - "learning_rate": 3.4203113817116955e-06, - "loss": 0.4418, - "step": 780 - }, - { - "epoch": 3.8097560975609754, - "grad_norm": 3.6655640602111816, - "learning_rate": 3.4167481329421204e-06, - "loss": 0.203, - "step": 781 - }, - { - "epoch": 3.8146341463414632, - "grad_norm": 3.387830972671509, - "learning_rate": 3.4131827311827447e-06, - "loss": 0.3225, - "step": 782 - }, - { - "epoch": 3.819512195121951, - "grad_norm": 2.621633529663086, - "learning_rate": 3.4096151848069416e-06, - "loss": 0.1704, - "step": 783 - }, - { - "epoch": 3.824390243902439, - "grad_norm": 2.974344491958618, - "learning_rate": 3.4060455021931195e-06, - "loss": 0.2785, - "step": 784 - }, - { - "epoch": 3.8292682926829267, - "grad_norm": 3.452131748199463, - "learning_rate": 3.402473691724704e-06, - "loss": 0.223, - "step": 785 - }, - { - "epoch": 3.8341463414634145, - "grad_norm": 2.6373705863952637, - "learning_rate": 3.39889976179012e-06, - "loss": 0.2368, - "step": 786 - }, - { - "epoch": 3.8390243902439023, - "grad_norm": 2.863184928894043, - "learning_rate": 3.3953237207827673e-06, - "loss": 0.3294, - "step": 787 - }, - { - "epoch": 3.84390243902439, - "grad_norm": 5.104704856872559, - "learning_rate": 3.391745577101005e-06, - "loss": 0.5431, - "step": 788 - }, - { - "epoch": 3.848780487804878, - "grad_norm": 3.951310634613037, - "learning_rate": 3.3881653391481306e-06, - "loss": 0.2546, - "step": 789 - }, - { - "epoch": 3.8536585365853657, - "grad_norm": 3.9903225898742676, - "learning_rate": 3.384583015332359e-06, - "loss": 0.3293, - "step": 790 - }, - { - "epoch": 3.8585365853658535, - "grad_norm": 3.3149220943450928, - "learning_rate": 3.380998614066805e-06, - "loss": 0.1861, - "step": 791 - }, - { - "epoch": 3.8634146341463413, - "grad_norm": 3.6755223274230957, - "learning_rate": 3.3774121437694606e-06, - "loss": 0.2498, - "step": 792 - }, - { - "epoch": 3.868292682926829, - "grad_norm": 3.192918300628662, - "learning_rate": 3.3738236128631786e-06, - "loss": 0.1525, - "step": 793 - }, - { - "epoch": 3.873170731707317, - "grad_norm": 3.5358777046203613, - "learning_rate": 3.3702330297756503e-06, - "loss": 0.3622, - "step": 794 - }, - { - "epoch": 3.8780487804878048, - "grad_norm": 3.619878053665161, - "learning_rate": 3.366640402939387e-06, - "loss": 0.1051, - "step": 795 - }, - { - "epoch": 3.8829268292682926, - "grad_norm": 7.085352420806885, - "learning_rate": 3.363045740791698e-06, - "loss": 0.4606, - "step": 796 - }, - { - "epoch": 3.8878048780487804, - "grad_norm": 2.523165464401245, - "learning_rate": 3.3594490517746774e-06, - "loss": 0.2267, - "step": 797 - }, - { - "epoch": 3.892682926829268, - "grad_norm": 2.7026922702789307, - "learning_rate": 3.3558503443351733e-06, - "loss": 0.2792, - "step": 798 - }, - { - "epoch": 3.897560975609756, - "grad_norm": 2.9232428073883057, - "learning_rate": 3.352249626924777e-06, - "loss": 0.2579, - "step": 799 - }, - { - "epoch": 3.902439024390244, - "grad_norm": 4.760788440704346, - "learning_rate": 3.348646907999801e-06, - "loss": 0.6983, - "step": 800 - }, - { - "epoch": 3.9073170731707316, - "grad_norm": 3.198249578475952, - "learning_rate": 3.345042196021257e-06, - "loss": 0.3265, - "step": 801 - }, - { - "epoch": 3.9121951219512194, - "grad_norm": 4.069286823272705, - "learning_rate": 3.3414354994548385e-06, - "loss": 0.497, - "step": 802 - }, - { - "epoch": 3.9170731707317072, - "grad_norm": 3.4435410499572754, - "learning_rate": 3.337826826770898e-06, - "loss": 0.2812, - "step": 803 - }, - { - "epoch": 3.921951219512195, - "grad_norm": 3.9805212020874023, - "learning_rate": 3.3342161864444312e-06, - "loss": 0.2277, - "step": 804 - }, - { - "epoch": 3.926829268292683, - "grad_norm": 3.348925828933716, - "learning_rate": 3.3306035869550534e-06, - "loss": 0.1614, - "step": 805 - }, - { - "epoch": 3.9317073170731707, - "grad_norm": 4.7613701820373535, - "learning_rate": 3.326989036786981e-06, - "loss": 0.3269, - "step": 806 - }, - { - "epoch": 3.9365853658536585, - "grad_norm": 3.807502508163452, - "learning_rate": 3.3233725444290126e-06, - "loss": 0.2619, - "step": 807 - }, - { - "epoch": 3.9414634146341463, - "grad_norm": 3.2690203189849854, - "learning_rate": 3.3197541183745065e-06, - "loss": 0.4334, - "step": 808 - }, - { - "epoch": 3.946341463414634, - "grad_norm": 3.396993398666382, - "learning_rate": 3.3161337671213634e-06, - "loss": 0.2738, - "step": 809 - }, - { - "epoch": 3.951219512195122, - "grad_norm": 3.086669921875, - "learning_rate": 3.312511499172006e-06, - "loss": 0.1597, - "step": 810 - }, - { - "epoch": 3.9560975609756097, - "grad_norm": 3.5688745975494385, - "learning_rate": 3.3088873230333562e-06, - "loss": 0.3195, - "step": 811 - }, - { - "epoch": 3.9609756097560975, - "grad_norm": 3.4843621253967285, - "learning_rate": 3.3052612472168193e-06, - "loss": 0.1865, - "step": 812 - }, - { - "epoch": 3.9658536585365853, - "grad_norm": 2.8479580879211426, - "learning_rate": 3.3016332802382618e-06, - "loss": 0.3108, - "step": 813 - }, - { - "epoch": 3.970731707317073, - "grad_norm": 3.3241543769836426, - "learning_rate": 3.2980034306179897e-06, - "loss": 0.2099, - "step": 814 - }, - { - "epoch": 3.975609756097561, - "grad_norm": 2.817675828933716, - "learning_rate": 3.294371706880733e-06, - "loss": 0.3073, - "step": 815 - }, - { - "epoch": 3.9804878048780488, - "grad_norm": 2.9535388946533203, - "learning_rate": 3.290738117555622e-06, - "loss": 0.2024, - "step": 816 - }, - { - "epoch": 3.9853658536585366, - "grad_norm": 5.021281719207764, - "learning_rate": 3.2871026711761666e-06, - "loss": 0.508, - "step": 817 - }, - { - "epoch": 3.9902439024390244, - "grad_norm": 3.3377649784088135, - "learning_rate": 3.2834653762802414e-06, - "loss": 0.2116, - "step": 818 - }, - { - "epoch": 3.995121951219512, - "grad_norm": 4.412073135375977, - "learning_rate": 3.2798262414100594e-06, - "loss": 0.2177, - "step": 819 - }, - { - "epoch": 4.0, - "grad_norm": 3.174323797225952, - "learning_rate": 3.2761852751121566e-06, - "loss": 0.1737, - "step": 820 - }, - { - "epoch": 4.004878048780488, - "grad_norm": 2.921494960784912, - "learning_rate": 3.272542485937369e-06, - "loss": 0.2569, - "step": 821 - }, - { - "epoch": 4.009756097560976, - "grad_norm": 2.693495512008667, - "learning_rate": 3.2688978824408136e-06, - "loss": 0.1621, - "step": 822 - }, - { - "epoch": 4.014634146341463, - "grad_norm": 2.705796718597412, - "learning_rate": 3.2652514731818698e-06, - "loss": 0.1121, - "step": 823 - }, - { - "epoch": 4.019512195121951, - "grad_norm": 3.2621448040008545, - "learning_rate": 3.2616032667241564e-06, - "loss": 0.0835, - "step": 824 - }, - { - "epoch": 4.024390243902439, - "grad_norm": 3.6205084323883057, - "learning_rate": 3.257953271635513e-06, - "loss": 0.3731, - "step": 825 - }, - { - "epoch": 4.029268292682927, - "grad_norm": 3.2600371837615967, - "learning_rate": 3.2543014964879814e-06, - "loss": 0.1051, - "step": 826 - }, - { - "epoch": 4.034146341463415, - "grad_norm": 3.865178346633911, - "learning_rate": 3.250647949857781e-06, - "loss": 0.0916, - "step": 827 - }, - { - "epoch": 4.0390243902439025, - "grad_norm": 6.9700927734375, - "learning_rate": 3.2469926403252932e-06, - "loss": 0.4037, - "step": 828 - }, - { - "epoch": 4.04390243902439, - "grad_norm": 3.658712148666382, - "learning_rate": 3.2433355764750417e-06, - "loss": 0.0523, - "step": 829 - }, - { - "epoch": 4.048780487804878, - "grad_norm": 4.911301612854004, - "learning_rate": 3.2396767668956656e-06, - "loss": 0.2616, - "step": 830 - }, - { - "epoch": 4.053658536585366, - "grad_norm": 5.019360542297363, - "learning_rate": 3.2360162201799085e-06, - "loss": 0.195, - "step": 831 - }, - { - "epoch": 4.058536585365854, - "grad_norm": 3.493767261505127, - "learning_rate": 3.2323539449245906e-06, - "loss": 0.1245, - "step": 832 - }, - { - "epoch": 4.0634146341463415, - "grad_norm": 4.246248722076416, - "learning_rate": 3.2286899497305917e-06, - "loss": 0.1147, - "step": 833 - }, - { - "epoch": 4.068292682926829, - "grad_norm": 2.993704319000244, - "learning_rate": 3.2250242432028335e-06, - "loss": 0.2189, - "step": 834 - }, - { - "epoch": 4.073170731707317, - "grad_norm": 4.695023059844971, - "learning_rate": 3.221356833950254e-06, - "loss": 0.4685, - "step": 835 - }, - { - "epoch": 4.078048780487805, - "grad_norm": 2.777644634246826, - "learning_rate": 3.21768773058579e-06, - "loss": 0.1245, - "step": 836 - }, - { - "epoch": 4.082926829268293, - "grad_norm": 3.3545901775360107, - "learning_rate": 3.21401694172636e-06, - "loss": 0.1342, - "step": 837 - }, - { - "epoch": 4.087804878048781, - "grad_norm": 2.2222652435302734, - "learning_rate": 3.2103444759928383e-06, - "loss": 0.0484, - "step": 838 - }, - { - "epoch": 4.092682926829268, - "grad_norm": 2.580345630645752, - "learning_rate": 3.2066703420100377e-06, - "loss": 0.0592, - "step": 839 - }, - { - "epoch": 4.097560975609756, - "grad_norm": 3.8652923107147217, - "learning_rate": 3.2029945484066883e-06, - "loss": 0.2536, - "step": 840 - }, - { - "epoch": 4.102439024390244, - "grad_norm": 3.0441582202911377, - "learning_rate": 3.1993171038154203e-06, - "loss": 0.1221, - "step": 841 - }, - { - "epoch": 4.107317073170732, - "grad_norm": 2.2795114517211914, - "learning_rate": 3.1956380168727385e-06, - "loss": 0.1231, - "step": 842 - }, - { - "epoch": 4.11219512195122, - "grad_norm": 3.701009750366211, - "learning_rate": 3.191957296219007e-06, - "loss": 0.2144, - "step": 843 - }, - { - "epoch": 4.117073170731707, - "grad_norm": 3.452637195587158, - "learning_rate": 3.1882749504984247e-06, - "loss": 0.1026, - "step": 844 - }, - { - "epoch": 4.121951219512195, - "grad_norm": 2.4208810329437256, - "learning_rate": 3.1845909883590076e-06, - "loss": 0.1124, - "step": 845 - }, - { - "epoch": 4.126829268292683, - "grad_norm": 4.353063583374023, - "learning_rate": 3.180905418452569e-06, - "loss": 0.2804, - "step": 846 - }, - { - "epoch": 4.131707317073171, - "grad_norm": 3.1151084899902344, - "learning_rate": 3.1772182494346963e-06, - "loss": 0.1748, - "step": 847 - }, - { - "epoch": 4.136585365853659, - "grad_norm": 3.457940101623535, - "learning_rate": 3.1735294899647344e-06, - "loss": 0.1984, - "step": 848 - }, - { - "epoch": 4.1414634146341465, - "grad_norm": 3.3556935787200928, - "learning_rate": 3.169839148705762e-06, - "loss": 0.1332, - "step": 849 - }, - { - "epoch": 4.146341463414634, - "grad_norm": 3.5510823726654053, - "learning_rate": 3.1661472343245725e-06, - "loss": 0.4788, - "step": 850 - }, - { - "epoch": 4.151219512195122, - "grad_norm": 4.036712646484375, - "learning_rate": 3.162453755491655e-06, - "loss": 0.2437, - "step": 851 - }, - { - "epoch": 4.15609756097561, - "grad_norm": 4.417062282562256, - "learning_rate": 3.158758720881171e-06, - "loss": 0.203, - "step": 852 - }, - { - "epoch": 4.160975609756098, - "grad_norm": 3.920558214187622, - "learning_rate": 3.155062139170937e-06, - "loss": 0.1462, - "step": 853 - }, - { - "epoch": 4.1658536585365855, - "grad_norm": 6.472081661224365, - "learning_rate": 3.1513640190424034e-06, - "loss": 0.0972, - "step": 854 - }, - { - "epoch": 4.170731707317073, - "grad_norm": 3.975947141647339, - "learning_rate": 3.147664369180632e-06, - "loss": 0.1092, - "step": 855 - }, - { - "epoch": 4.175609756097561, - "grad_norm": 4.977376937866211, - "learning_rate": 3.143963198274278e-06, - "loss": 0.2215, - "step": 856 - }, - { - "epoch": 4.180487804878049, - "grad_norm": 3.595460891723633, - "learning_rate": 3.140260515015569e-06, - "loss": 0.1771, - "step": 857 - }, - { - "epoch": 4.185365853658537, - "grad_norm": 3.1085658073425293, - "learning_rate": 3.136556328100284e-06, - "loss": 0.1995, - "step": 858 - }, - { - "epoch": 4.190243902439025, - "grad_norm": 4.355626583099365, - "learning_rate": 3.132850646227734e-06, - "loss": 0.4048, - "step": 859 - }, - { - "epoch": 4.195121951219512, - "grad_norm": 3.8079614639282227, - "learning_rate": 3.12914347810074e-06, - "loss": 0.1914, - "step": 860 - }, - { - "epoch": 4.2, - "grad_norm": 3.725804328918457, - "learning_rate": 3.125434832425613e-06, - "loss": 0.1579, - "step": 861 - }, - { - "epoch": 4.204878048780488, - "grad_norm": 2.974649667739868, - "learning_rate": 3.121724717912138e-06, - "loss": 0.1814, - "step": 862 - }, - { - "epoch": 4.209756097560976, - "grad_norm": 3.6391279697418213, - "learning_rate": 3.118013143273542e-06, - "loss": 0.1481, - "step": 863 - }, - { - "epoch": 4.214634146341464, - "grad_norm": 3.216643810272217, - "learning_rate": 3.1143001172264893e-06, - "loss": 0.113, - "step": 864 - }, - { - "epoch": 4.219512195121951, - "grad_norm": 3.605855941772461, - "learning_rate": 3.1105856484910474e-06, - "loss": 0.1405, - "step": 865 - }, - { - "epoch": 4.224390243902439, - "grad_norm": 2.7186765670776367, - "learning_rate": 3.1068697457906736e-06, - "loss": 0.097, - "step": 866 - }, - { - "epoch": 4.229268292682927, - "grad_norm": 3.980973243713379, - "learning_rate": 3.1031524178521938e-06, - "loss": 0.2207, - "step": 867 - }, - { - "epoch": 4.234146341463415, - "grad_norm": 3.4623806476593018, - "learning_rate": 3.0994336734057804e-06, - "loss": 0.0552, - "step": 868 - }, - { - "epoch": 4.239024390243903, - "grad_norm": 3.7556748390197754, - "learning_rate": 3.0957135211849315e-06, - "loss": 0.1743, - "step": 869 - }, - { - "epoch": 4.2439024390243905, - "grad_norm": 3.3547914028167725, - "learning_rate": 3.0919919699264535e-06, - "loss": 0.1195, - "step": 870 - }, - { - "epoch": 4.248780487804878, - "grad_norm": 4.392014503479004, - "learning_rate": 3.0882690283704355e-06, - "loss": 0.6174, - "step": 871 - }, - { - "epoch": 4.253658536585366, - "grad_norm": 2.7031409740448, - "learning_rate": 3.084544705260234e-06, - "loss": 0.1359, - "step": 872 - }, - { - "epoch": 4.258536585365854, - "grad_norm": 2.3518481254577637, - "learning_rate": 3.080819009342451e-06, - "loss": 0.0786, - "step": 873 - }, - { - "epoch": 4.263414634146342, - "grad_norm": 2.636204481124878, - "learning_rate": 3.077091949366908e-06, - "loss": 0.0677, - "step": 874 - }, - { - "epoch": 4.2682926829268295, - "grad_norm": 2.8670942783355713, - "learning_rate": 3.073363534086636e-06, - "loss": 0.1084, - "step": 875 - }, - { - "epoch": 4.273170731707317, - "grad_norm": 2.7044737339019775, - "learning_rate": 3.0696337722578444e-06, - "loss": 0.0681, - "step": 876 - }, - { - "epoch": 4.278048780487805, - "grad_norm": 3.481539487838745, - "learning_rate": 3.0659026726399072e-06, - "loss": 0.2262, - "step": 877 - }, - { - "epoch": 4.282926829268293, - "grad_norm": 3.7746224403381348, - "learning_rate": 3.0621702439953393e-06, - "loss": 0.2169, - "step": 878 - }, - { - "epoch": 4.287804878048781, - "grad_norm": 3.6386263370513916, - "learning_rate": 3.0584364950897768e-06, - "loss": 0.0581, - "step": 879 - }, - { - "epoch": 4.2926829268292686, - "grad_norm": 3.389408588409424, - "learning_rate": 3.0547014346919574e-06, - "loss": 0.1687, - "step": 880 - }, - { - "epoch": 4.297560975609756, - "grad_norm": 3.6510157585144043, - "learning_rate": 3.0509650715736977e-06, - "loss": 0.1362, - "step": 881 - }, - { - "epoch": 4.302439024390244, - "grad_norm": 3.334210157394409, - "learning_rate": 3.0472274145098744e-06, - "loss": 0.1865, - "step": 882 - }, - { - "epoch": 4.307317073170732, - "grad_norm": 4.747341632843018, - "learning_rate": 3.0434884722784026e-06, - "loss": 0.2385, - "step": 883 - }, - { - "epoch": 4.31219512195122, - "grad_norm": 3.9266858100891113, - "learning_rate": 3.0397482536602168e-06, - "loss": 0.1004, - "step": 884 - }, - { - "epoch": 4.317073170731708, - "grad_norm": 2.984821081161499, - "learning_rate": 3.0360067674392475e-06, - "loss": 0.1469, - "step": 885 - }, - { - "epoch": 4.321951219512195, - "grad_norm": 2.6379380226135254, - "learning_rate": 3.0322640224024024e-06, - "loss": 0.0829, - "step": 886 - }, - { - "epoch": 4.326829268292683, - "grad_norm": 3.885495185852051, - "learning_rate": 3.0285200273395478e-06, - "loss": 0.2256, - "step": 887 - }, - { - "epoch": 4.331707317073171, - "grad_norm": 3.950394868850708, - "learning_rate": 3.024774791043481e-06, - "loss": 0.2402, - "step": 888 - }, - { - "epoch": 4.336585365853659, - "grad_norm": 4.147830963134766, - "learning_rate": 3.021028322309921e-06, - "loss": 0.2198, - "step": 889 - }, - { - "epoch": 4.341463414634147, - "grad_norm": 4.0821638107299805, - "learning_rate": 3.0172806299374734e-06, - "loss": 0.2304, - "step": 890 - }, - { - "epoch": 4.3463414634146345, - "grad_norm": 4.142312049865723, - "learning_rate": 3.0135317227276247e-06, - "loss": 0.2864, - "step": 891 - }, - { - "epoch": 4.351219512195122, - "grad_norm": 3.008504867553711, - "learning_rate": 3.0097816094847104e-06, - "loss": 0.2045, - "step": 892 - }, - { - "epoch": 4.35609756097561, - "grad_norm": 3.1674623489379883, - "learning_rate": 3.0060302990158984e-06, - "loss": 0.0864, - "step": 893 - }, - { - "epoch": 4.360975609756098, - "grad_norm": 3.3412492275238037, - "learning_rate": 3.002277800131171e-06, - "loss": 0.076, - "step": 894 - }, - { - "epoch": 4.365853658536586, - "grad_norm": 3.067330837249756, - "learning_rate": 2.998524121643298e-06, - "loss": 0.1724, - "step": 895 - }, - { - "epoch": 4.3707317073170735, - "grad_norm": 3.9015982151031494, - "learning_rate": 2.994769272367822e-06, - "loss": 0.2, - "step": 896 - }, - { - "epoch": 4.375609756097561, - "grad_norm": 3.0136911869049072, - "learning_rate": 2.991013261123035e-06, - "loss": 0.0852, - "step": 897 - }, - { - "epoch": 4.380487804878049, - "grad_norm": 3.6834237575531006, - "learning_rate": 2.9872560967299554e-06, - "loss": 0.1449, - "step": 898 - }, - { - "epoch": 4.385365853658537, - "grad_norm": 3.3486039638519287, - "learning_rate": 2.9834977880123132e-06, - "loss": 0.0659, - "step": 899 - }, - { - "epoch": 4.390243902439025, - "grad_norm": 2.971315622329712, - "learning_rate": 2.9797383437965243e-06, - "loss": 0.1114, - "step": 900 - }, - { - "epoch": 4.3951219512195125, - "grad_norm": 2.683359146118164, - "learning_rate": 2.975977772911671e-06, - "loss": 0.0822, - "step": 901 - }, - { - "epoch": 4.4, - "grad_norm": 2.9941935539245605, - "learning_rate": 2.972216084189482e-06, - "loss": 0.0858, - "step": 902 - }, - { - "epoch": 4.404878048780488, - "grad_norm": 2.4938626289367676, - "learning_rate": 2.9684532864643123e-06, - "loss": 0.1162, - "step": 903 - }, - { - "epoch": 4.409756097560976, - "grad_norm": 2.9364712238311768, - "learning_rate": 2.964689388573118e-06, - "loss": 0.0821, - "step": 904 - }, - { - "epoch": 4.414634146341464, - "grad_norm": 3.3638134002685547, - "learning_rate": 2.9609243993554434e-06, - "loss": 0.25, - "step": 905 - }, - { - "epoch": 4.419512195121952, - "grad_norm": 3.657277822494507, - "learning_rate": 2.9571583276533923e-06, - "loss": 0.0852, - "step": 906 - }, - { - "epoch": 4.424390243902439, - "grad_norm": 5.486263275146484, - "learning_rate": 2.9533911823116124e-06, - "loss": 0.5123, - "step": 907 - }, - { - "epoch": 4.429268292682927, - "grad_norm": 5.194574356079102, - "learning_rate": 2.9496229721772734e-06, - "loss": 0.1854, - "step": 908 - }, - { - "epoch": 4.434146341463415, - "grad_norm": 3.520110845565796, - "learning_rate": 2.9458537061000435e-06, - "loss": 0.1785, - "step": 909 - }, - { - "epoch": 4.439024390243903, - "grad_norm": 3.417991876602173, - "learning_rate": 2.9420833929320726e-06, - "loss": 0.1603, - "step": 910 - }, - { - "epoch": 4.443902439024391, - "grad_norm": 5.225805282592773, - "learning_rate": 2.93831204152797e-06, - "loss": 0.3046, - "step": 911 - }, - { - "epoch": 4.4487804878048784, - "grad_norm": 3.541433572769165, - "learning_rate": 2.9345396607447807e-06, - "loss": 0.0631, - "step": 912 - }, - { - "epoch": 4.453658536585366, - "grad_norm": 3.909377098083496, - "learning_rate": 2.9307662594419704e-06, - "loss": 0.125, - "step": 913 - }, - { - "epoch": 4.458536585365854, - "grad_norm": 3.6604416370391846, - "learning_rate": 2.9269918464814e-06, - "loss": 0.156, - "step": 914 - }, - { - "epoch": 4.463414634146342, - "grad_norm": 3.7413833141326904, - "learning_rate": 2.923216430727306e-06, - "loss": 0.3334, - "step": 915 - }, - { - "epoch": 4.46829268292683, - "grad_norm": 3.531996011734009, - "learning_rate": 2.9194400210462808e-06, - "loss": 0.2534, - "step": 916 - }, - { - "epoch": 4.473170731707317, - "grad_norm": 4.163621425628662, - "learning_rate": 2.91566262630725e-06, - "loss": 0.352, - "step": 917 - }, - { - "epoch": 4.478048780487805, - "grad_norm": 3.923635482788086, - "learning_rate": 2.9118842553814526e-06, - "loss": 0.1132, - "step": 918 - }, - { - "epoch": 4.482926829268292, - "grad_norm": 2.833768844604492, - "learning_rate": 2.9081049171424223e-06, - "loss": 0.086, - "step": 919 - }, - { - "epoch": 4.487804878048781, - "grad_norm": 2.9006292819976807, - "learning_rate": 2.9043246204659624e-06, - "loss": 0.0693, - "step": 920 - }, - { - "epoch": 4.492682926829268, - "grad_norm": 3.699376344680786, - "learning_rate": 2.9005433742301274e-06, - "loss": 0.2463, - "step": 921 - }, - { - "epoch": 4.4975609756097565, - "grad_norm": 4.882141590118408, - "learning_rate": 2.8967611873152037e-06, - "loss": 0.2275, - "step": 922 - }, - { - "epoch": 4.5024390243902435, - "grad_norm": 3.0554678440093994, - "learning_rate": 2.892978068603683e-06, - "loss": 0.0752, - "step": 923 - }, - { - "epoch": 4.507317073170732, - "grad_norm": 3.1225268840789795, - "learning_rate": 2.889194026980249e-06, - "loss": 0.1649, - "step": 924 - }, - { - "epoch": 4.512195121951219, - "grad_norm": 17.75234031677246, - "learning_rate": 2.8854090713317514e-06, - "loss": 0.0437, - "step": 925 - }, - { - "epoch": 4.517073170731708, - "grad_norm": 3.011223554611206, - "learning_rate": 2.8816232105471864e-06, - "loss": 0.0747, - "step": 926 - }, - { - "epoch": 4.521951219512195, - "grad_norm": 4.327573299407959, - "learning_rate": 2.877836453517677e-06, - "loss": 0.3884, - "step": 927 - }, - { - "epoch": 4.526829268292683, - "grad_norm": 3.8694965839385986, - "learning_rate": 2.8740488091364492e-06, - "loss": 0.2741, - "step": 928 - }, - { - "epoch": 4.53170731707317, - "grad_norm": 5.375877380371094, - "learning_rate": 2.870260286298814e-06, - "loss": 0.364, - "step": 929 - }, - { - "epoch": 4.536585365853659, - "grad_norm": 3.380891799926758, - "learning_rate": 2.866470893902147e-06, - "loss": 0.1495, - "step": 930 - }, - { - "epoch": 4.541463414634146, - "grad_norm": 3.723992109298706, - "learning_rate": 2.8626806408458626e-06, - "loss": 0.1403, - "step": 931 - }, - { - "epoch": 4.546341463414635, - "grad_norm": 3.0534417629241943, - "learning_rate": 2.8588895360313983e-06, - "loss": 0.0946, - "step": 932 - }, - { - "epoch": 4.5512195121951216, - "grad_norm": 2.8875234127044678, - "learning_rate": 2.8550975883621935e-06, - "loss": 0.1851, - "step": 933 - }, - { - "epoch": 4.55609756097561, - "grad_norm": 3.532166004180908, - "learning_rate": 2.8513048067436644e-06, - "loss": 0.178, - "step": 934 - }, - { - "epoch": 4.560975609756097, - "grad_norm": 2.942798376083374, - "learning_rate": 2.847511200083187e-06, - "loss": 0.1131, - "step": 935 - }, - { - "epoch": 4.565853658536585, - "grad_norm": 2.926874876022339, - "learning_rate": 2.843716777290074e-06, - "loss": 0.1251, - "step": 936 - }, - { - "epoch": 4.570731707317073, - "grad_norm": 3.525895357131958, - "learning_rate": 2.839921547275556e-06, - "loss": 0.0946, - "step": 937 - }, - { - "epoch": 4.575609756097561, - "grad_norm": 3.7033681869506836, - "learning_rate": 2.836125518952759e-06, - "loss": 0.1529, - "step": 938 - }, - { - "epoch": 4.580487804878048, - "grad_norm": 3.235154867172241, - "learning_rate": 2.8323287012366845e-06, - "loss": 0.2511, - "step": 939 - }, - { - "epoch": 4.585365853658536, - "grad_norm": 3.5275583267211914, - "learning_rate": 2.828531103044186e-06, - "loss": 0.1474, - "step": 940 - }, - { - "epoch": 4.590243902439024, - "grad_norm": 3.1356353759765625, - "learning_rate": 2.8247327332939512e-06, - "loss": 0.2249, - "step": 941 - }, - { - "epoch": 4.595121951219512, - "grad_norm": 3.789210081100464, - "learning_rate": 2.82093360090648e-06, - "loss": 0.2258, - "step": 942 - }, - { - "epoch": 4.6, - "grad_norm": 4.841623306274414, - "learning_rate": 2.8171337148040636e-06, - "loss": 0.2235, - "step": 943 - }, - { - "epoch": 4.6048780487804875, - "grad_norm": 3.161630630493164, - "learning_rate": 2.813333083910761e-06, - "loss": 0.1562, - "step": 944 - }, - { - "epoch": 4.609756097560975, - "grad_norm": 2.8718132972717285, - "learning_rate": 2.8095317171523835e-06, - "loss": 0.0625, - "step": 945 - }, - { - "epoch": 4.614634146341463, - "grad_norm": 3.6432454586029053, - "learning_rate": 2.805729623456469e-06, - "loss": 0.2205, - "step": 946 - }, - { - "epoch": 4.619512195121951, - "grad_norm": 4.382034778594971, - "learning_rate": 2.8019268117522624e-06, - "loss": 0.3241, - "step": 947 - }, - { - "epoch": 4.624390243902439, - "grad_norm": 3.2998175621032715, - "learning_rate": 2.798123290970695e-06, - "loss": 0.1983, - "step": 948 - }, - { - "epoch": 4.6292682926829265, - "grad_norm": 3.8665990829467773, - "learning_rate": 2.794319070044365e-06, - "loss": 0.3391, - "step": 949 - }, - { - "epoch": 4.634146341463414, - "grad_norm": 3.628403425216675, - "learning_rate": 2.790514157907512e-06, - "loss": 0.1329, - "step": 950 - }, - { - "epoch": 4.639024390243902, - "grad_norm": 2.8889615535736084, - "learning_rate": 2.786708563496002e-06, - "loss": 0.141, - "step": 951 - }, - { - "epoch": 4.64390243902439, - "grad_norm": 4.07351541519165, - "learning_rate": 2.782902295747299e-06, - "loss": 0.2935, - "step": 952 - }, - { - "epoch": 4.648780487804878, - "grad_norm": 4.220067024230957, - "learning_rate": 2.7790953636004536e-06, - "loss": 0.318, - "step": 953 - }, - { - "epoch": 4.6536585365853655, - "grad_norm": 3.8444325923919678, - "learning_rate": 2.775287775996074e-06, - "loss": 0.3388, - "step": 954 - }, - { - "epoch": 4.658536585365853, - "grad_norm": 3.197313070297241, - "learning_rate": 2.7714795418763067e-06, - "loss": 0.0925, - "step": 955 - }, - { - "epoch": 4.663414634146341, - "grad_norm": 4.0050811767578125, - "learning_rate": 2.7676706701848187e-06, - "loss": 0.2811, - "step": 956 - }, - { - "epoch": 4.668292682926829, - "grad_norm": 3.217160224914551, - "learning_rate": 2.763861169866774e-06, - "loss": 0.311, - "step": 957 - }, - { - "epoch": 4.673170731707317, - "grad_norm": 2.9892494678497314, - "learning_rate": 2.7600510498688104e-06, - "loss": 0.0582, - "step": 958 - }, - { - "epoch": 4.678048780487805, - "grad_norm": 3.954805374145508, - "learning_rate": 2.7562403191390246e-06, - "loss": 0.1238, - "step": 959 - }, - { - "epoch": 4.682926829268292, - "grad_norm": 2.9582695960998535, - "learning_rate": 2.7524289866269467e-06, - "loss": 0.1243, - "step": 960 - }, - { - "epoch": 4.68780487804878, - "grad_norm": 2.807002544403076, - "learning_rate": 2.748617061283518e-06, - "loss": 0.1388, - "step": 961 - }, - { - "epoch": 4.692682926829268, - "grad_norm": 3.980499505996704, - "learning_rate": 2.744804552061074e-06, - "loss": 0.1144, - "step": 962 - }, - { - "epoch": 4.697560975609756, - "grad_norm": 3.6389007568359375, - "learning_rate": 2.740991467913321e-06, - "loss": 0.2155, - "step": 963 - }, - { - "epoch": 4.702439024390244, - "grad_norm": 3.0950801372528076, - "learning_rate": 2.737177817795315e-06, - "loss": 0.0983, - "step": 964 - }, - { - "epoch": 4.7073170731707314, - "grad_norm": 3.1723053455352783, - "learning_rate": 2.7333636106634414e-06, - "loss": 0.1365, - "step": 965 - }, - { - "epoch": 4.712195121951219, - "grad_norm": 3.83921217918396, - "learning_rate": 2.7295488554753957e-06, - "loss": 0.1977, - "step": 966 - }, - { - "epoch": 4.717073170731707, - "grad_norm": 3.348057746887207, - "learning_rate": 2.725733561190157e-06, - "loss": 0.1311, - "step": 967 - }, - { - "epoch": 4.721951219512195, - "grad_norm": 3.828483819961548, - "learning_rate": 2.721917736767973e-06, - "loss": 0.2464, - "step": 968 - }, - { - "epoch": 4.726829268292683, - "grad_norm": 2.6004624366760254, - "learning_rate": 2.7181013911703357e-06, - "loss": 0.1088, - "step": 969 - }, - { - "epoch": 4.7317073170731705, - "grad_norm": 3.316990852355957, - "learning_rate": 2.714284533359961e-06, - "loss": 0.1492, - "step": 970 - }, - { - "epoch": 4.736585365853658, - "grad_norm": 3.8770010471343994, - "learning_rate": 2.710467172300768e-06, - "loss": 0.218, - "step": 971 - }, - { - "epoch": 4.741463414634146, - "grad_norm": 4.456376552581787, - "learning_rate": 2.706649316957857e-06, - "loss": 0.2199, - "step": 972 - }, - { - "epoch": 4.746341463414634, - "grad_norm": 3.3376309871673584, - "learning_rate": 2.7028309762974897e-06, - "loss": 0.0595, - "step": 973 - }, - { - "epoch": 4.751219512195122, - "grad_norm": 3.6755495071411133, - "learning_rate": 2.699012159287069e-06, - "loss": 0.1653, - "step": 974 - }, - { - "epoch": 4.7560975609756095, - "grad_norm": 2.939887046813965, - "learning_rate": 2.6951928748951125e-06, - "loss": 0.0681, - "step": 975 - }, - { - "epoch": 4.760975609756097, - "grad_norm": 3.4101195335388184, - "learning_rate": 2.69137313209124e-06, - "loss": 0.2046, - "step": 976 - }, - { - "epoch": 4.765853658536585, - "grad_norm": 3.9811208248138428, - "learning_rate": 2.687552939846145e-06, - "loss": 0.2255, - "step": 977 - }, - { - "epoch": 4.770731707317073, - "grad_norm": 3.484255313873291, - "learning_rate": 2.6837323071315766e-06, - "loss": 0.0512, - "step": 978 - }, - { - "epoch": 4.775609756097561, - "grad_norm": 3.9005143642425537, - "learning_rate": 2.679911242920321e-06, - "loss": 0.162, - "step": 979 - }, - { - "epoch": 4.780487804878049, - "grad_norm": 4.933374881744385, - "learning_rate": 2.6760897561861742e-06, - "loss": 0.398, - "step": 980 - }, - { - "epoch": 4.785365853658536, - "grad_norm": 3.0741539001464844, - "learning_rate": 2.672267855903927e-06, - "loss": 0.0507, - "step": 981 - }, - { - "epoch": 4.790243902439024, - "grad_norm": 3.023772716522217, - "learning_rate": 2.6684455510493413e-06, - "loss": 0.2066, - "step": 982 - }, - { - "epoch": 4.795121951219512, - "grad_norm": 3.0102407932281494, - "learning_rate": 2.6646228505991267e-06, - "loss": 0.2296, - "step": 983 - }, - { - "epoch": 4.8, - "grad_norm": 3.902200222015381, - "learning_rate": 2.6607997635309246e-06, - "loss": 0.14, - "step": 984 - }, - { - "epoch": 4.804878048780488, - "grad_norm": 3.836185932159424, - "learning_rate": 2.6569762988232838e-06, - "loss": 0.1583, - "step": 985 - }, - { - "epoch": 4.809756097560975, - "grad_norm": 3.539628744125366, - "learning_rate": 2.653152465455639e-06, - "loss": 0.2619, - "step": 986 - }, - { - "epoch": 4.814634146341463, - "grad_norm": 4.716914653778076, - "learning_rate": 2.6493282724082913e-06, - "loss": 0.3029, - "step": 987 - }, - { - "epoch": 4.819512195121951, - "grad_norm": 3.466914176940918, - "learning_rate": 2.6455037286623864e-06, - "loss": 0.095, - "step": 988 - }, - { - "epoch": 4.824390243902439, - "grad_norm": 2.1798667907714844, - "learning_rate": 2.6416788431998935e-06, - "loss": 0.1232, - "step": 989 - }, - { - "epoch": 4.829268292682927, - "grad_norm": 3.309039354324341, - "learning_rate": 2.637853625003585e-06, - "loss": 0.3671, - "step": 990 - }, - { - "epoch": 4.8341463414634145, - "grad_norm": 3.2619435787200928, - "learning_rate": 2.6340280830570142e-06, - "loss": 0.194, - "step": 991 - }, - { - "epoch": 4.839024390243902, - "grad_norm": 3.601161003112793, - "learning_rate": 2.6302022263444947e-06, - "loss": 0.1214, - "step": 992 - }, - { - "epoch": 4.84390243902439, - "grad_norm": 4.13787841796875, - "learning_rate": 2.6263760638510793e-06, - "loss": 0.311, - "step": 993 - }, - { - "epoch": 4.848780487804878, - "grad_norm": 3.0474166870117188, - "learning_rate": 2.6225496045625394e-06, - "loss": 0.1853, - "step": 994 - }, - { - "epoch": 4.853658536585366, - "grad_norm": 4.481237411499023, - "learning_rate": 2.6187228574653428e-06, - "loss": 0.2088, - "step": 995 - }, - { - "epoch": 4.8585365853658535, - "grad_norm": 3.235966444015503, - "learning_rate": 2.614895831546633e-06, - "loss": 0.1439, - "step": 996 - }, - { - "epoch": 4.863414634146341, - "grad_norm": 4.103270053863525, - "learning_rate": 2.6110685357942096e-06, - "loss": 0.2823, - "step": 997 - }, - { - "epoch": 4.868292682926829, - "grad_norm": 4.134536266326904, - "learning_rate": 2.6072409791965048e-06, - "loss": 0.2963, - "step": 998 - }, - { - "epoch": 4.873170731707317, - "grad_norm": 4.124892711639404, - "learning_rate": 2.6034131707425638e-06, - "loss": 0.4127, - "step": 999 - }, - { - "epoch": 4.878048780487805, - "grad_norm": 3.565139055252075, - "learning_rate": 2.5995851194220223e-06, - "loss": 0.1601, - "step": 1000 - }, - { - "epoch": 4.882926829268293, - "grad_norm": 2.7548017501831055, - "learning_rate": 2.595756834225089e-06, - "loss": 0.161, - "step": 1001 - }, - { - "epoch": 4.88780487804878, - "grad_norm": 3.9297611713409424, - "learning_rate": 2.5919283241425188e-06, - "loss": 0.1013, - "step": 1002 - }, - { - "epoch": 4.892682926829268, - "grad_norm": 2.4904236793518066, - "learning_rate": 2.5880995981655965e-06, - "loss": 0.1177, - "step": 1003 - }, - { - "epoch": 4.897560975609756, - "grad_norm": 3.513308048248291, - "learning_rate": 2.584270665286113e-06, - "loss": 0.0682, - "step": 1004 - }, - { - "epoch": 4.902439024390244, - "grad_norm": 4.221067428588867, - "learning_rate": 2.580441534496346e-06, - "loss": 0.1502, - "step": 1005 - }, - { - "epoch": 4.907317073170732, - "grad_norm": 3.4298903942108154, - "learning_rate": 2.576612214789039e-06, - "loss": 0.1772, - "step": 1006 - }, - { - "epoch": 4.912195121951219, - "grad_norm": 4.402887344360352, - "learning_rate": 2.5727827151573747e-06, - "loss": 0.2029, - "step": 1007 - }, - { - "epoch": 4.917073170731707, - "grad_norm": 4.194999694824219, - "learning_rate": 2.568953044594964e-06, - "loss": 0.1269, - "step": 1008 - }, - { - "epoch": 4.921951219512195, - "grad_norm": 3.657607078552246, - "learning_rate": 2.5651232120958157e-06, - "loss": 0.1311, - "step": 1009 - }, - { - "epoch": 4.926829268292683, - "grad_norm": 4.092184543609619, - "learning_rate": 2.56129322665432e-06, - "loss": 0.1085, - "step": 1010 - }, - { - "epoch": 4.931707317073171, - "grad_norm": 3.3648242950439453, - "learning_rate": 2.5574630972652263e-06, - "loss": 0.0782, - "step": 1011 - }, - { - "epoch": 4.9365853658536585, - "grad_norm": 3.7215166091918945, - "learning_rate": 2.553632832923622e-06, - "loss": 0.1391, - "step": 1012 - }, - { - "epoch": 4.941463414634146, - "grad_norm": 4.045740127563477, - "learning_rate": 2.5498024426249107e-06, - "loss": 0.3141, - "step": 1013 - }, - { - "epoch": 4.946341463414634, - "grad_norm": 3.2363107204437256, - "learning_rate": 2.545971935364794e-06, - "loss": 0.0679, - "step": 1014 - }, - { - "epoch": 4.951219512195122, - "grad_norm": 3.057283639907837, - "learning_rate": 2.5421413201392443e-06, - "loss": 0.1382, - "step": 1015 - }, - { - "epoch": 4.95609756097561, - "grad_norm": 3.591535806655884, - "learning_rate": 2.538310605944491e-06, - "loss": 0.112, - "step": 1016 - }, - { - "epoch": 4.9609756097560975, - "grad_norm": 3.1629281044006348, - "learning_rate": 2.534479801776996e-06, - "loss": 0.1261, - "step": 1017 - }, - { - "epoch": 4.965853658536585, - "grad_norm": 2.691740036010742, - "learning_rate": 2.53064891663343e-06, - "loss": 0.2328, - "step": 1018 - }, - { - "epoch": 4.970731707317073, - "grad_norm": 3.2620503902435303, - "learning_rate": 2.526817959510655e-06, - "loss": 0.193, - "step": 1019 - }, - { - "epoch": 4.975609756097561, - "grad_norm": 3.0721535682678223, - "learning_rate": 2.5229869394057038e-06, - "loss": 0.2444, - "step": 1020 - }, - { - "epoch": 4.980487804878049, - "grad_norm": 2.6279208660125732, - "learning_rate": 2.5191558653157542e-06, - "loss": 0.1103, - "step": 1021 - }, - { - "epoch": 4.985365853658537, - "grad_norm": 2.9295670986175537, - "learning_rate": 2.515324746238113e-06, - "loss": 0.0553, - "step": 1022 - }, - { - "epoch": 4.990243902439024, - "grad_norm": 3.3960084915161133, - "learning_rate": 2.511493591170191e-06, - "loss": 0.1686, - "step": 1023 - }, - { - "epoch": 4.995121951219512, - "grad_norm": 4.138705253601074, - "learning_rate": 2.5076624091094846e-06, - "loss": 0.1208, - "step": 1024 - }, - { - "epoch": 5.0, - "grad_norm": 2.603870391845703, - "learning_rate": 2.503831209053554e-06, - "loss": 0.1216, - "step": 1025 - }, - { - "epoch": 5.004878048780488, - "grad_norm": 2.525205612182617, - "learning_rate": 2.5e-06, - "loss": 0.0984, - "step": 1026 - }, - { - "epoch": 5.009756097560976, - "grad_norm": 3.2502501010894775, - "learning_rate": 2.4961687909464462e-06, - "loss": 0.1323, - "step": 1027 - }, - { - "epoch": 5.014634146341463, - "grad_norm": 5.363409519195557, - "learning_rate": 2.492337590890516e-06, - "loss": 0.3516, - "step": 1028 - }, - { - "epoch": 5.019512195121951, - "grad_norm": 2.887723445892334, - "learning_rate": 2.4885064088298097e-06, - "loss": 0.1931, - "step": 1029 - }, - { - "epoch": 5.024390243902439, - "grad_norm": 3.4529435634613037, - "learning_rate": 2.4846752537618875e-06, - "loss": 0.0675, - "step": 1030 - }, - { - "epoch": 5.029268292682927, - "grad_norm": 4.202361106872559, - "learning_rate": 2.480844134684246e-06, - "loss": 0.1643, - "step": 1031 - }, - { - "epoch": 5.034146341463415, - "grad_norm": 2.910275459289551, - "learning_rate": 2.4770130605942966e-06, - "loss": 0.11, - "step": 1032 - }, - { - "epoch": 5.0390243902439025, - "grad_norm": 3.5430362224578857, - "learning_rate": 2.4731820404893457e-06, - "loss": 0.0614, - "step": 1033 - }, - { - "epoch": 5.04390243902439, - "grad_norm": 4.501879692077637, - "learning_rate": 2.469351083366571e-06, - "loss": 0.0954, - "step": 1034 - }, - { - "epoch": 5.048780487804878, - "grad_norm": 2.732261896133423, - "learning_rate": 2.4655201982230044e-06, - "loss": 0.0275, - "step": 1035 - }, - { - "epoch": 5.053658536585366, - "grad_norm": 3.5926437377929688, - "learning_rate": 2.4616893940555094e-06, - "loss": 0.0661, - "step": 1036 - }, - { - "epoch": 5.058536585365854, - "grad_norm": 4.790312767028809, - "learning_rate": 2.457858679860757e-06, - "loss": 0.2976, - "step": 1037 - }, - { - "epoch": 5.0634146341463415, - "grad_norm": 4.453246116638184, - "learning_rate": 2.4540280646352072e-06, - "loss": 0.1216, - "step": 1038 - }, - { - "epoch": 5.068292682926829, - "grad_norm": 3.288011074066162, - "learning_rate": 2.45019755737509e-06, - "loss": 0.0877, - "step": 1039 - }, - { - "epoch": 5.073170731707317, - "grad_norm": 3.566927671432495, - "learning_rate": 2.4463671670763787e-06, - "loss": 0.1661, - "step": 1040 - }, - { - "epoch": 5.078048780487805, - "grad_norm": 3.250047206878662, - "learning_rate": 2.4425369027347746e-06, - "loss": 0.211, - "step": 1041 - }, - { - "epoch": 5.082926829268293, - "grad_norm": 3.0214977264404297, - "learning_rate": 2.4387067733456804e-06, - "loss": 0.093, - "step": 1042 - }, - { - "epoch": 5.087804878048781, - "grad_norm": 3.8162097930908203, - "learning_rate": 2.4348767879041847e-06, - "loss": 0.0777, - "step": 1043 - }, - { - "epoch": 5.092682926829268, - "grad_norm": 3.8071560859680176, - "learning_rate": 2.4310469554050366e-06, - "loss": 0.087, - "step": 1044 - }, - { - "epoch": 5.097560975609756, - "grad_norm": 3.1032073497772217, - "learning_rate": 2.4272172848426257e-06, - "loss": 0.1105, - "step": 1045 - }, - { - "epoch": 5.102439024390244, - "grad_norm": 2.8980185985565186, - "learning_rate": 2.423387785210962e-06, - "loss": 0.0704, - "step": 1046 - }, - { - "epoch": 5.107317073170732, - "grad_norm": 3.9110755920410156, - "learning_rate": 2.4195584655036544e-06, - "loss": 0.2118, - "step": 1047 - }, - { - "epoch": 5.11219512195122, - "grad_norm": 2.678884506225586, - "learning_rate": 2.4157293347138877e-06, - "loss": 0.0664, - "step": 1048 - }, - { - "epoch": 5.117073170731707, - "grad_norm": 3.183046340942383, - "learning_rate": 2.4119004018344043e-06, - "loss": 0.1767, - "step": 1049 - }, - { - "epoch": 5.121951219512195, - "grad_norm": 3.9198925495147705, - "learning_rate": 2.408071675857482e-06, - "loss": 0.1288, - "step": 1050 - }, - { - "epoch": 5.126829268292683, - "grad_norm": 4.378621578216553, - "learning_rate": 2.404243165774912e-06, - "loss": 0.1724, - "step": 1051 - }, - { - "epoch": 5.131707317073171, - "grad_norm": 2.5509133338928223, - "learning_rate": 2.4004148805779785e-06, - "loss": 0.0382, - "step": 1052 - }, - { - "epoch": 5.136585365853659, - "grad_norm": 3.692396402359009, - "learning_rate": 2.3965868292574375e-06, - "loss": 0.0942, - "step": 1053 - }, - { - "epoch": 5.1414634146341465, - "grad_norm": 3.8537800312042236, - "learning_rate": 2.392759020803496e-06, - "loss": 0.0819, - "step": 1054 - }, - { - "epoch": 5.146341463414634, - "grad_norm": 4.02876091003418, - "learning_rate": 2.3889314642057916e-06, - "loss": 0.0866, - "step": 1055 - }, - { - "epoch": 5.151219512195122, - "grad_norm": 3.531857490539551, - "learning_rate": 2.3851041684533677e-06, - "loss": 0.1557, - "step": 1056 - }, - { - "epoch": 5.15609756097561, - "grad_norm": 2.231265068054199, - "learning_rate": 2.381277142534658e-06, - "loss": 0.0421, - "step": 1057 - }, - { - "epoch": 5.160975609756098, - "grad_norm": 3.159226894378662, - "learning_rate": 2.3774503954374614e-06, - "loss": 0.0395, - "step": 1058 - }, - { - "epoch": 5.1658536585365855, - "grad_norm": 3.0375123023986816, - "learning_rate": 2.373623936148921e-06, - "loss": 0.1869, - "step": 1059 - }, - { - "epoch": 5.170731707317073, - "grad_norm": 5.4905900955200195, - "learning_rate": 2.369797773655506e-06, - "loss": 0.1426, - "step": 1060 - }, - { - "epoch": 5.175609756097561, - "grad_norm": 2.8739638328552246, - "learning_rate": 2.3659719169429866e-06, - "loss": 0.0788, - "step": 1061 - }, - { - "epoch": 5.180487804878049, - "grad_norm": 2.612183094024658, - "learning_rate": 2.3621463749964153e-06, - "loss": 0.0449, - "step": 1062 - }, - { - "epoch": 5.185365853658537, - "grad_norm": 2.0573198795318604, - "learning_rate": 2.3583211568001073e-06, - "loss": 0.0264, - "step": 1063 - }, - { - "epoch": 5.190243902439025, - "grad_norm": 2.3667244911193848, - "learning_rate": 2.3544962713376144e-06, - "loss": 0.0507, - "step": 1064 - }, - { - "epoch": 5.195121951219512, - "grad_norm": 2.1223740577697754, - "learning_rate": 2.3506717275917095e-06, - "loss": 0.0576, - "step": 1065 - }, - { - "epoch": 5.2, - "grad_norm": 2.2630319595336914, - "learning_rate": 2.346847534544362e-06, - "loss": 0.0523, - "step": 1066 - }, - { - "epoch": 5.204878048780488, - "grad_norm": 3.201913595199585, - "learning_rate": 2.3430237011767166e-06, - "loss": 0.0847, - "step": 1067 - }, - { - "epoch": 5.209756097560976, - "grad_norm": 2.2149481773376465, - "learning_rate": 2.3392002364690762e-06, - "loss": 0.0215, - "step": 1068 - }, - { - "epoch": 5.214634146341464, - "grad_norm": 4.425244331359863, - "learning_rate": 2.335377149400874e-06, - "loss": 0.1018, - "step": 1069 - }, - { - "epoch": 5.219512195121951, - "grad_norm": 4.548358917236328, - "learning_rate": 2.3315544489506596e-06, - "loss": 0.1485, - "step": 1070 - }, - { - "epoch": 5.224390243902439, - "grad_norm": 3.635796546936035, - "learning_rate": 2.3277321440960733e-06, - "loss": 0.111, - "step": 1071 - }, - { - "epoch": 5.229268292682927, - "grad_norm": 2.3180043697357178, - "learning_rate": 2.323910243813826e-06, - "loss": 0.0267, - "step": 1072 - }, - { - "epoch": 5.234146341463415, - "grad_norm": 3.675490379333496, - "learning_rate": 2.3200887570796798e-06, - "loss": 0.153, - "step": 1073 - }, - { - "epoch": 5.239024390243903, - "grad_norm": 2.883225202560425, - "learning_rate": 2.316267692868424e-06, - "loss": 0.0968, - "step": 1074 - }, - { - "epoch": 5.2439024390243905, - "grad_norm": 3.0320188999176025, - "learning_rate": 2.312447060153856e-06, - "loss": 0.0786, - "step": 1075 - }, - { - "epoch": 5.248780487804878, - "grad_norm": 2.682695150375366, - "learning_rate": 2.308626867908761e-06, - "loss": 0.0677, - "step": 1076 - }, - { - "epoch": 5.253658536585366, - "grad_norm": 3.941967010498047, - "learning_rate": 2.3048071251048884e-06, - "loss": 0.1059, - "step": 1077 - }, - { - "epoch": 5.258536585365854, - "grad_norm": 6.485599517822266, - "learning_rate": 2.300987840712932e-06, - "loss": 0.1331, - "step": 1078 - }, - { - "epoch": 5.263414634146342, - "grad_norm": 3.809269905090332, - "learning_rate": 2.297169023702511e-06, - "loss": 0.169, - "step": 1079 - }, - { - "epoch": 5.2682926829268295, - "grad_norm": 3.115626573562622, - "learning_rate": 2.2933506830421436e-06, - "loss": 0.1349, - "step": 1080 - }, - { - "epoch": 5.273170731707317, - "grad_norm": 2.2234909534454346, - "learning_rate": 2.2895328276992325e-06, - "loss": 0.0191, - "step": 1081 - }, - { - "epoch": 5.278048780487805, - "grad_norm": 3.896925926208496, - "learning_rate": 2.28571546664004e-06, - "loss": 0.1961, - "step": 1082 - }, - { - "epoch": 5.282926829268293, - "grad_norm": 2.4134509563446045, - "learning_rate": 2.281898608829665e-06, - "loss": 0.02, - "step": 1083 - }, - { - "epoch": 5.287804878048781, - "grad_norm": 2.7599191665649414, - "learning_rate": 2.2780822632320273e-06, - "loss": 0.0763, - "step": 1084 - }, - { - "epoch": 5.2926829268292686, - "grad_norm": 2.465637683868408, - "learning_rate": 2.2742664388098435e-06, - "loss": 0.0403, - "step": 1085 - }, - { - "epoch": 5.297560975609756, - "grad_norm": 2.4026618003845215, - "learning_rate": 2.270451144524605e-06, - "loss": 0.0982, - "step": 1086 - }, - { - "epoch": 5.302439024390244, - "grad_norm": 3.3339459896087646, - "learning_rate": 2.266636389336559e-06, - "loss": 0.09, - "step": 1087 - }, - { - "epoch": 5.307317073170732, - "grad_norm": 2.113255023956299, - "learning_rate": 2.262822182204686e-06, - "loss": 0.0267, - "step": 1088 - }, - { - "epoch": 5.31219512195122, - "grad_norm": 3.1760852336883545, - "learning_rate": 2.2590085320866798e-06, - "loss": 0.0295, - "step": 1089 - }, - { - "epoch": 5.317073170731708, - "grad_norm": 2.9674434661865234, - "learning_rate": 2.255195447938927e-06, - "loss": 0.0261, - "step": 1090 - }, - { - "epoch": 5.321951219512195, - "grad_norm": 3.4384074211120605, - "learning_rate": 2.251382938716482e-06, - "loss": 0.0936, - "step": 1091 - }, - { - "epoch": 5.326829268292683, - "grad_norm": 3.3814568519592285, - "learning_rate": 2.2475710133730533e-06, - "loss": 0.0426, - "step": 1092 - }, - { - "epoch": 5.331707317073171, - "grad_norm": 3.081317663192749, - "learning_rate": 2.243759680860975e-06, - "loss": 0.0799, - "step": 1093 - }, - { - "epoch": 5.336585365853659, - "grad_norm": 3.5608482360839844, - "learning_rate": 2.2399489501311896e-06, - "loss": 0.0906, - "step": 1094 - }, - { - "epoch": 5.341463414634147, - "grad_norm": 3.7886314392089844, - "learning_rate": 2.2361388301332265e-06, - "loss": 0.2152, - "step": 1095 - }, - { - "epoch": 5.3463414634146345, - "grad_norm": 1.9531102180480957, - "learning_rate": 2.2323293298151817e-06, - "loss": 0.0359, - "step": 1096 - }, - { - "epoch": 5.351219512195122, - "grad_norm": 2.2828023433685303, - "learning_rate": 2.2285204581236937e-06, - "loss": 0.0368, - "step": 1097 - }, - { - "epoch": 5.35609756097561, - "grad_norm": 3.110262870788574, - "learning_rate": 2.2247122240039268e-06, - "loss": 0.0426, - "step": 1098 - }, - { - "epoch": 5.360975609756098, - "grad_norm": 2.3293566703796387, - "learning_rate": 2.2209046363995464e-06, - "loss": 0.0223, - "step": 1099 - }, - { - "epoch": 5.365853658536586, - "grad_norm": 2.990884780883789, - "learning_rate": 2.217097704252701e-06, - "loss": 0.1276, - "step": 1100 - }, - { - "epoch": 5.3707317073170735, - "grad_norm": 2.568014144897461, - "learning_rate": 2.2132914365039993e-06, - "loss": 0.0639, - "step": 1101 - }, - { - "epoch": 5.375609756097561, - "grad_norm": 2.618478536605835, - "learning_rate": 2.2094858420924882e-06, - "loss": 0.0166, - "step": 1102 - }, - { - "epoch": 5.380487804878049, - "grad_norm": 4.526919364929199, - "learning_rate": 2.205680929955635e-06, - "loss": 0.144, - "step": 1103 - }, - { - "epoch": 5.385365853658537, - "grad_norm": 2.7236886024475098, - "learning_rate": 2.201876709029305e-06, - "loss": 0.1004, - "step": 1104 - }, - { - "epoch": 5.390243902439025, - "grad_norm": 2.1577632427215576, - "learning_rate": 2.198073188247738e-06, - "loss": 0.0453, - "step": 1105 - }, - { - "epoch": 5.3951219512195125, - "grad_norm": 2.5170321464538574, - "learning_rate": 2.1942703765435317e-06, - "loss": 0.0195, - "step": 1106 - }, - { - "epoch": 5.4, - "grad_norm": 3.962658643722534, - "learning_rate": 2.190468282847617e-06, - "loss": 0.1512, - "step": 1107 - }, - { - "epoch": 5.404878048780488, - "grad_norm": 4.297860622406006, - "learning_rate": 2.186666916089239e-06, - "loss": 0.2572, - "step": 1108 - }, - { - "epoch": 5.409756097560976, - "grad_norm": 2.8933565616607666, - "learning_rate": 2.1828662851959377e-06, - "loss": 0.0536, - "step": 1109 - }, - { - "epoch": 5.414634146341464, - "grad_norm": 2.9397451877593994, - "learning_rate": 2.1790663990935203e-06, - "loss": 0.0778, - "step": 1110 - }, - { - "epoch": 5.419512195121952, - "grad_norm": 3.5210094451904297, - "learning_rate": 2.1752672667060488e-06, - "loss": 0.0558, - "step": 1111 - }, - { - "epoch": 5.424390243902439, - "grad_norm": 2.9027626514434814, - "learning_rate": 2.1714688969558146e-06, - "loss": 0.041, - "step": 1112 - }, - { - "epoch": 5.429268292682927, - "grad_norm": 3.7691168785095215, - "learning_rate": 2.167671298763316e-06, - "loss": 0.1644, - "step": 1113 - }, - { - "epoch": 5.434146341463415, - "grad_norm": 3.493008852005005, - "learning_rate": 2.1638744810472414e-06, - "loss": 0.1587, - "step": 1114 - }, - { - "epoch": 5.439024390243903, - "grad_norm": 2.711196184158325, - "learning_rate": 2.1600784527244445e-06, - "loss": 0.0605, - "step": 1115 - }, - { - "epoch": 5.443902439024391, - "grad_norm": 4.365038871765137, - "learning_rate": 2.1562832227099266e-06, - "loss": 0.1897, - "step": 1116 - }, - { - "epoch": 5.4487804878048784, - "grad_norm": 4.621466159820557, - "learning_rate": 2.152488799916814e-06, - "loss": 0.1525, - "step": 1117 - }, - { - "epoch": 5.453658536585366, - "grad_norm": 4.8721089363098145, - "learning_rate": 2.148695193256336e-06, - "loss": 0.189, - "step": 1118 - }, - { - "epoch": 5.458536585365854, - "grad_norm": 2.8999173641204834, - "learning_rate": 2.1449024116378064e-06, - "loss": 0.095, - "step": 1119 - }, - { - "epoch": 5.463414634146342, - "grad_norm": 2.4865314960479736, - "learning_rate": 2.1411104639686013e-06, - "loss": 0.0432, - "step": 1120 - }, - { - "epoch": 5.46829268292683, - "grad_norm": 3.8497228622436523, - "learning_rate": 2.137319359154138e-06, - "loss": 0.0954, - "step": 1121 - }, - { - "epoch": 5.473170731707317, - "grad_norm": 2.3643507957458496, - "learning_rate": 2.133529106097853e-06, - "loss": 0.0362, - "step": 1122 - }, - { - "epoch": 5.478048780487805, - "grad_norm": 3.017826795578003, - "learning_rate": 2.1297397137011862e-06, - "loss": 0.0875, - "step": 1123 - }, - { - "epoch": 5.482926829268292, - "grad_norm": 3.239320755004883, - "learning_rate": 2.125951190863551e-06, - "loss": 0.0758, - "step": 1124 - }, - { - "epoch": 5.487804878048781, - "grad_norm": 2.566241979598999, - "learning_rate": 2.1221635464823237e-06, - "loss": 0.0605, - "step": 1125 - }, - { - "epoch": 5.492682926829268, - "grad_norm": 4.810088157653809, - "learning_rate": 2.1183767894528135e-06, - "loss": 0.2403, - "step": 1126 - }, - { - "epoch": 5.4975609756097565, - "grad_norm": 2.083263397216797, - "learning_rate": 2.114590928668249e-06, - "loss": 0.0223, - "step": 1127 - }, - { - "epoch": 5.5024390243902435, - "grad_norm": 2.6812374591827393, - "learning_rate": 2.1108059730197517e-06, - "loss": 0.0617, - "step": 1128 - }, - { - "epoch": 5.507317073170732, - "grad_norm": 3.196735143661499, - "learning_rate": 2.1070219313963173e-06, - "loss": 0.043, - "step": 1129 - }, - { - "epoch": 5.512195121951219, - "grad_norm": 2.775470495223999, - "learning_rate": 2.1032388126847967e-06, - "loss": 0.0595, - "step": 1130 - }, - { - "epoch": 5.517073170731708, - "grad_norm": 2.8632407188415527, - "learning_rate": 2.099456625769872e-06, - "loss": 0.0186, - "step": 1131 - }, - { - "epoch": 5.521951219512195, - "grad_norm": 4.075018405914307, - "learning_rate": 2.0956753795340376e-06, - "loss": 0.0616, - "step": 1132 - }, - { - "epoch": 5.526829268292683, - "grad_norm": 3.206327199935913, - "learning_rate": 2.091895082857578e-06, - "loss": 0.1895, - "step": 1133 - }, - { - "epoch": 5.53170731707317, - "grad_norm": 2.967588186264038, - "learning_rate": 2.0881157446185474e-06, - "loss": 0.0484, - "step": 1134 - }, - { - "epoch": 5.536585365853659, - "grad_norm": 2.850929021835327, - "learning_rate": 2.0843373736927506e-06, - "loss": 0.037, - "step": 1135 - }, - { - "epoch": 5.541463414634146, - "grad_norm": 2.2505147457122803, - "learning_rate": 2.08055997895372e-06, - "loss": 0.0227, - "step": 1136 - }, - { - "epoch": 5.546341463414635, - "grad_norm": 2.5258476734161377, - "learning_rate": 2.0767835692726944e-06, - "loss": 0.0296, - "step": 1137 - }, - { - "epoch": 5.5512195121951216, - "grad_norm": 3.498741388320923, - "learning_rate": 2.0730081535186e-06, - "loss": 0.16, - "step": 1138 - }, - { - "epoch": 5.55609756097561, - "grad_norm": 2.8635222911834717, - "learning_rate": 2.06923374055803e-06, - "loss": 0.0725, - "step": 1139 - }, - { - "epoch": 5.560975609756097, - "grad_norm": 2.2779290676116943, - "learning_rate": 2.0654603392552193e-06, - "loss": 0.0198, - "step": 1140 - }, - { - "epoch": 5.565853658536585, - "grad_norm": 3.1651058197021484, - "learning_rate": 2.0616879584720305e-06, - "loss": 0.1144, - "step": 1141 - }, - { - "epoch": 5.570731707317073, - "grad_norm": 2.4238595962524414, - "learning_rate": 2.057916607067928e-06, - "loss": 0.0491, - "step": 1142 - }, - { - "epoch": 5.575609756097561, - "grad_norm": 2.3248515129089355, - "learning_rate": 2.054146293899957e-06, - "loss": 0.035, - "step": 1143 - }, - { - "epoch": 5.580487804878048, - "grad_norm": 2.9506516456604004, - "learning_rate": 2.0503770278227274e-06, - "loss": 0.0639, - "step": 1144 - }, - { - "epoch": 5.585365853658536, - "grad_norm": 2.6403958797454834, - "learning_rate": 2.0466088176883876e-06, - "loss": 0.0258, - "step": 1145 - }, - { - "epoch": 5.590243902439024, - "grad_norm": 3.150115728378296, - "learning_rate": 2.042841672346608e-06, - "loss": 0.0634, - "step": 1146 - }, - { - "epoch": 5.595121951219512, - "grad_norm": 2.742691993713379, - "learning_rate": 2.039075600644557e-06, - "loss": 0.0464, - "step": 1147 - }, - { - "epoch": 5.6, - "grad_norm": 2.733694076538086, - "learning_rate": 2.0353106114268824e-06, - "loss": 0.0829, - "step": 1148 - }, - { - "epoch": 5.6048780487804875, - "grad_norm": 2.511229991912842, - "learning_rate": 2.031546713535688e-06, - "loss": 0.0321, - "step": 1149 - }, - { - "epoch": 5.609756097560975, - "grad_norm": 3.019669532775879, - "learning_rate": 2.027783915810518e-06, - "loss": 0.05, - "step": 1150 - }, - { - "epoch": 5.614634146341463, - "grad_norm": 3.497159242630005, - "learning_rate": 2.024022227088329e-06, - "loss": 0.1984, - "step": 1151 - }, - { - "epoch": 5.619512195121951, - "grad_norm": 3.4637508392333984, - "learning_rate": 2.020261656203476e-06, - "loss": 0.1673, - "step": 1152 - }, - { - "epoch": 5.624390243902439, - "grad_norm": 2.4312477111816406, - "learning_rate": 2.016502211987687e-06, - "loss": 0.1106, - "step": 1153 - }, - { - "epoch": 5.6292682926829265, - "grad_norm": 2.7801673412323, - "learning_rate": 2.0127439032700446e-06, - "loss": 0.0374, - "step": 1154 - }, - { - "epoch": 5.634146341463414, - "grad_norm": 2.9346680641174316, - "learning_rate": 2.0089867388769664e-06, - "loss": 0.0674, - "step": 1155 - }, - { - "epoch": 5.639024390243902, - "grad_norm": 2.274888277053833, - "learning_rate": 2.0052307276321793e-06, - "loss": 0.0365, - "step": 1156 - }, - { - "epoch": 5.64390243902439, - "grad_norm": 3.069890022277832, - "learning_rate": 2.001475878356703e-06, - "loss": 0.0758, - "step": 1157 - }, - { - "epoch": 5.648780487804878, - "grad_norm": 3.8594915866851807, - "learning_rate": 1.99772219986883e-06, - "loss": 0.176, - "step": 1158 - }, - { - "epoch": 5.6536585365853655, - "grad_norm": 3.4886410236358643, - "learning_rate": 1.9939697009841024e-06, - "loss": 0.0491, - "step": 1159 - }, - { - "epoch": 5.658536585365853, - "grad_norm": 2.697946786880493, - "learning_rate": 1.990218390515291e-06, - "loss": 0.0741, - "step": 1160 - }, - { - "epoch": 5.663414634146341, - "grad_norm": 3.5290887355804443, - "learning_rate": 1.9864682772723757e-06, - "loss": 0.0826, - "step": 1161 - }, - { - "epoch": 5.668292682926829, - "grad_norm": 2.0601298809051514, - "learning_rate": 1.9827193700625274e-06, - "loss": 0.0378, - "step": 1162 - }, - { - "epoch": 5.673170731707317, - "grad_norm": 3.8458635807037354, - "learning_rate": 1.978971677690081e-06, - "loss": 0.2466, - "step": 1163 - }, - { - "epoch": 5.678048780487805, - "grad_norm": 2.788210153579712, - "learning_rate": 1.97522520895652e-06, - "loss": 0.0205, - "step": 1164 - }, - { - "epoch": 5.682926829268292, - "grad_norm": 3.1904587745666504, - "learning_rate": 1.971479972660454e-06, - "loss": 0.0998, - "step": 1165 - }, - { - "epoch": 5.68780487804878, - "grad_norm": 2.4664318561553955, - "learning_rate": 1.967735977597598e-06, - "loss": 0.0217, - "step": 1166 - }, - { - "epoch": 5.692682926829268, - "grad_norm": 2.1392667293548584, - "learning_rate": 1.9639932325607538e-06, - "loss": 0.048, - "step": 1167 - }, - { - "epoch": 5.697560975609756, - "grad_norm": 3.7127058506011963, - "learning_rate": 1.9602517463397845e-06, - "loss": 0.0302, - "step": 1168 - }, - { - "epoch": 5.702439024390244, - "grad_norm": 2.916168689727783, - "learning_rate": 1.9565115277215978e-06, - "loss": 0.0724, - "step": 1169 - }, - { - "epoch": 5.7073170731707314, - "grad_norm": 2.4352428913116455, - "learning_rate": 1.952772585490127e-06, - "loss": 0.0464, - "step": 1170 - }, - { - "epoch": 5.712195121951219, - "grad_norm": 2.8311455249786377, - "learning_rate": 1.9490349284263036e-06, - "loss": 0.0239, - "step": 1171 - }, - { - "epoch": 5.717073170731707, - "grad_norm": 3.3592801094055176, - "learning_rate": 1.9452985653080443e-06, - "loss": 0.0719, - "step": 1172 - }, - { - "epoch": 5.721951219512195, - "grad_norm": 2.450922966003418, - "learning_rate": 1.9415635049102245e-06, - "loss": 0.0408, - "step": 1173 - }, - { - "epoch": 5.726829268292683, - "grad_norm": 4.750118255615234, - "learning_rate": 1.937829756004662e-06, - "loss": 0.2049, - "step": 1174 - }, - { - "epoch": 5.7317073170731705, - "grad_norm": 3.0643811225891113, - "learning_rate": 1.9340973273600944e-06, - "loss": 0.0636, - "step": 1175 - }, - { - "epoch": 5.736585365853658, - "grad_norm": 3.313904047012329, - "learning_rate": 1.930366227742157e-06, - "loss": 0.1252, - "step": 1176 - }, - { - "epoch": 5.741463414634146, - "grad_norm": 3.8996808528900146, - "learning_rate": 1.9266364659133653e-06, - "loss": 0.0687, - "step": 1177 - }, - { - "epoch": 5.746341463414634, - "grad_norm": 2.727555274963379, - "learning_rate": 1.922908050633093e-06, - "loss": 0.0333, - "step": 1178 - }, - { - "epoch": 5.751219512195122, - "grad_norm": 3.270087718963623, - "learning_rate": 1.919180990657551e-06, - "loss": 0.0792, - "step": 1179 - }, - { - "epoch": 5.7560975609756095, - "grad_norm": 2.6631274223327637, - "learning_rate": 1.9154552947397668e-06, - "loss": 0.069, - "step": 1180 - }, - { - "epoch": 5.760975609756097, - "grad_norm": 4.4460554122924805, - "learning_rate": 1.9117309716295658e-06, - "loss": 0.115, - "step": 1181 - }, - { - "epoch": 5.765853658536585, - "grad_norm": 2.5652341842651367, - "learning_rate": 1.9080080300735478e-06, - "loss": 0.0537, - "step": 1182 - }, - { - "epoch": 5.770731707317073, - "grad_norm": 3.046436071395874, - "learning_rate": 1.9042864788150695e-06, - "loss": 0.0817, - "step": 1183 - }, - { - "epoch": 5.775609756097561, - "grad_norm": 2.121629238128662, - "learning_rate": 1.9005663265942206e-06, - "loss": 0.0289, - "step": 1184 - }, - { - "epoch": 5.780487804878049, - "grad_norm": 2.271918535232544, - "learning_rate": 1.8968475821478066e-06, - "loss": 0.0357, - "step": 1185 - }, - { - "epoch": 5.785365853658536, - "grad_norm": 2.582473039627075, - "learning_rate": 1.8931302542093274e-06, - "loss": 0.0584, - "step": 1186 - }, - { - "epoch": 5.790243902439024, - "grad_norm": 2.502952814102173, - "learning_rate": 1.8894143515089539e-06, - "loss": 0.0324, - "step": 1187 - }, - { - "epoch": 5.795121951219512, - "grad_norm": 1.9735453128814697, - "learning_rate": 1.8856998827735118e-06, - "loss": 0.0338, - "step": 1188 - }, - { - "epoch": 5.8, - "grad_norm": 4.441845893859863, - "learning_rate": 1.8819868567264588e-06, - "loss": 0.1706, - "step": 1189 - }, - { - "epoch": 5.804878048780488, - "grad_norm": 2.5450692176818848, - "learning_rate": 1.8782752820878636e-06, - "loss": 0.0463, - "step": 1190 - }, - { - "epoch": 5.809756097560975, - "grad_norm": 3.718183755874634, - "learning_rate": 1.8745651675743876e-06, - "loss": 0.1188, - "step": 1191 - }, - { - "epoch": 5.814634146341463, - "grad_norm": 3.246532678604126, - "learning_rate": 1.870856521899261e-06, - "loss": 0.0984, - "step": 1192 - }, - { - "epoch": 5.819512195121951, - "grad_norm": 2.9522783756256104, - "learning_rate": 1.867149353772267e-06, - "loss": 0.0195, - "step": 1193 - }, - { - "epoch": 5.824390243902439, - "grad_norm": 2.3266429901123047, - "learning_rate": 1.863443671899717e-06, - "loss": 0.0236, - "step": 1194 - }, - { - "epoch": 5.829268292682927, - "grad_norm": 3.696749448776245, - "learning_rate": 1.8597394849844319e-06, - "loss": 0.1108, - "step": 1195 - }, - { - "epoch": 5.8341463414634145, - "grad_norm": 2.375624179840088, - "learning_rate": 1.8560368017257229e-06, - "loss": 0.0388, - "step": 1196 - }, - { - "epoch": 5.839024390243902, - "grad_norm": 4.0437092781066895, - "learning_rate": 1.8523356308193696e-06, - "loss": 0.3098, - "step": 1197 - }, - { - "epoch": 5.84390243902439, - "grad_norm": 3.165165424346924, - "learning_rate": 1.8486359809575977e-06, - "loss": 0.0775, - "step": 1198 - }, - { - "epoch": 5.848780487804878, - "grad_norm": 4.1991190910339355, - "learning_rate": 1.8449378608290638e-06, - "loss": 0.1222, - "step": 1199 - }, - { - "epoch": 5.853658536585366, - "grad_norm": 4.6657819747924805, - "learning_rate": 1.8412412791188306e-06, - "loss": 0.1146, - "step": 1200 - }, - { - "epoch": 5.8585365853658535, - "grad_norm": 4.569516181945801, - "learning_rate": 1.8375462445083464e-06, - "loss": 0.1113, - "step": 1201 - }, - { - "epoch": 5.863414634146341, - "grad_norm": 3.1565654277801514, - "learning_rate": 1.8338527656754285e-06, - "loss": 0.0416, - "step": 1202 - }, - { - "epoch": 5.868292682926829, - "grad_norm": 3.3474619388580322, - "learning_rate": 1.830160851294239e-06, - "loss": 0.0613, - "step": 1203 - }, - { - "epoch": 5.873170731707317, - "grad_norm": 4.30797004699707, - "learning_rate": 1.8264705100352662e-06, - "loss": 0.197, - "step": 1204 - }, - { - "epoch": 5.878048780487805, - "grad_norm": 2.7259573936462402, - "learning_rate": 1.8227817505653045e-06, - "loss": 0.0821, - "step": 1205 - }, - { - "epoch": 5.882926829268293, - "grad_norm": 3.515812873840332, - "learning_rate": 1.8190945815474323e-06, - "loss": 0.1246, - "step": 1206 - }, - { - "epoch": 5.88780487804878, - "grad_norm": 2.9223313331604004, - "learning_rate": 1.8154090116409934e-06, - "loss": 0.0703, - "step": 1207 - }, - { - "epoch": 5.892682926829268, - "grad_norm": 3.9529640674591064, - "learning_rate": 1.811725049501577e-06, - "loss": 0.1078, - "step": 1208 - }, - { - "epoch": 5.897560975609756, - "grad_norm": 4.1674580574035645, - "learning_rate": 1.8080427037809941e-06, - "loss": 0.1648, - "step": 1209 - }, - { - "epoch": 5.902439024390244, - "grad_norm": 3.1308021545410156, - "learning_rate": 1.8043619831272623e-06, - "loss": 0.061, - "step": 1210 - }, - { - "epoch": 5.907317073170732, - "grad_norm": 3.9667179584503174, - "learning_rate": 1.8006828961845807e-06, - "loss": 0.1863, - "step": 1211 - }, - { - "epoch": 5.912195121951219, - "grad_norm": 5.438168048858643, - "learning_rate": 1.7970054515933124e-06, - "loss": 0.2387, - "step": 1212 - }, - { - "epoch": 5.917073170731707, - "grad_norm": 5.505797863006592, - "learning_rate": 1.793329657989964e-06, - "loss": 0.2053, - "step": 1213 - }, - { - "epoch": 5.921951219512195, - "grad_norm": 2.8043150901794434, - "learning_rate": 1.7896555240071627e-06, - "loss": 0.026, - "step": 1214 - }, - { - "epoch": 5.926829268292683, - "grad_norm": 2.836164712905884, - "learning_rate": 1.7859830582736406e-06, - "loss": 0.0735, - "step": 1215 - }, - { - "epoch": 5.931707317073171, - "grad_norm": 2.8286306858062744, - "learning_rate": 1.782312269414211e-06, - "loss": 0.0586, - "step": 1216 - }, - { - "epoch": 5.9365853658536585, - "grad_norm": 4.4354329109191895, - "learning_rate": 1.7786431660497474e-06, - "loss": 0.3086, - "step": 1217 - }, - { - "epoch": 5.941463414634146, - "grad_norm": 4.0963640213012695, - "learning_rate": 1.7749757567971678e-06, - "loss": 0.0978, - "step": 1218 - }, - { - "epoch": 5.946341463414634, - "grad_norm": 2.726062536239624, - "learning_rate": 1.7713100502694091e-06, - "loss": 0.0976, - "step": 1219 - }, - { - "epoch": 5.951219512195122, - "grad_norm": 2.6566951274871826, - "learning_rate": 1.7676460550754104e-06, - "loss": 0.02, - "step": 1220 - }, - { - "epoch": 5.95609756097561, - "grad_norm": 2.7710952758789062, - "learning_rate": 1.7639837798200923e-06, - "loss": 0.0741, - "step": 1221 - }, - { - "epoch": 5.9609756097560975, - "grad_norm": 2.3678600788116455, - "learning_rate": 1.7603232331043346e-06, - "loss": 0.0542, - "step": 1222 - }, - { - "epoch": 5.965853658536585, - "grad_norm": 6.45259428024292, - "learning_rate": 1.7566644235249591e-06, - "loss": 0.3552, - "step": 1223 - }, - { - "epoch": 5.970731707317073, - "grad_norm": 1.8916475772857666, - "learning_rate": 1.7530073596747072e-06, - "loss": 0.0405, - "step": 1224 - }, - { - "epoch": 5.975609756097561, - "grad_norm": 2.1637566089630127, - "learning_rate": 1.74935205014222e-06, - "loss": 0.0178, - "step": 1225 - }, - { - "epoch": 5.980487804878049, - "grad_norm": 2.5959200859069824, - "learning_rate": 1.7456985035120194e-06, - "loss": 0.0264, - "step": 1226 - }, - { - "epoch": 5.985365853658537, - "grad_norm": 2.50264573097229, - "learning_rate": 1.7420467283644877e-06, - "loss": 0.0555, - "step": 1227 - }, - { - "epoch": 5.990243902439024, - "grad_norm": 2.4692020416259766, - "learning_rate": 1.738396733275844e-06, - "loss": 0.0546, - "step": 1228 - }, - { - "epoch": 5.995121951219512, - "grad_norm": 5.540846824645996, - "learning_rate": 1.7347485268181309e-06, - "loss": 0.1967, - "step": 1229 - }, - { - "epoch": 6.0, - "grad_norm": 1.8322839736938477, - "learning_rate": 1.7311021175591868e-06, - "loss": 0.0491, - "step": 1230 - }, - { - "epoch": 6.004878048780488, - "grad_norm": 2.719622850418091, - "learning_rate": 1.7274575140626318e-06, - "loss": 0.0359, - "step": 1231 - }, - { - "epoch": 6.009756097560976, - "grad_norm": 2.859675884246826, - "learning_rate": 1.7238147248878444e-06, - "loss": 0.0585, - "step": 1232 - }, - { - "epoch": 6.014634146341463, - "grad_norm": 1.6761114597320557, - "learning_rate": 1.7201737585899415e-06, - "loss": 0.0188, - "step": 1233 - }, - { - "epoch": 6.019512195121951, - "grad_norm": 2.1588776111602783, - "learning_rate": 1.7165346237197594e-06, - "loss": 0.0484, - "step": 1234 - }, - { - "epoch": 6.024390243902439, - "grad_norm": 4.209983825683594, - "learning_rate": 1.7128973288238344e-06, - "loss": 0.0776, - "step": 1235 - }, - { - "epoch": 6.029268292682927, - "grad_norm": 2.3979365825653076, - "learning_rate": 1.709261882444379e-06, - "loss": 0.0338, - "step": 1236 - }, - { - "epoch": 6.034146341463415, - "grad_norm": 3.0030531883239746, - "learning_rate": 1.705628293119268e-06, - "loss": 0.0385, - "step": 1237 - }, - { - "epoch": 6.0390243902439025, - "grad_norm": 9.65616512298584, - "learning_rate": 1.701996569382011e-06, - "loss": 0.2601, - "step": 1238 - }, - { - "epoch": 6.04390243902439, - "grad_norm": 3.0590052604675293, - "learning_rate": 1.6983667197617386e-06, - "loss": 0.034, - "step": 1239 - }, - { - "epoch": 6.048780487804878, - "grad_norm": 3.6949822902679443, - "learning_rate": 1.6947387527831813e-06, - "loss": 0.0155, - "step": 1240 - }, - { - "epoch": 6.053658536585366, - "grad_norm": 1.2870460748672485, - "learning_rate": 1.6911126769666442e-06, - "loss": 0.0078, - "step": 1241 - }, - { - "epoch": 6.058536585365854, - "grad_norm": 4.307460784912109, - "learning_rate": 1.6874885008279945e-06, - "loss": 0.1429, - "step": 1242 - }, - { - "epoch": 6.0634146341463415, - "grad_norm": 2.334972858428955, - "learning_rate": 1.683866232878637e-06, - "loss": 0.0123, - "step": 1243 - }, - { - "epoch": 6.068292682926829, - "grad_norm": 2.4121835231781006, - "learning_rate": 1.6802458816254941e-06, - "loss": 0.0139, - "step": 1244 - }, - { - "epoch": 6.073170731707317, - "grad_norm": 1.9224514961242676, - "learning_rate": 1.676627455570988e-06, - "loss": 0.0312, - "step": 1245 - }, - { - "epoch": 6.078048780487805, - "grad_norm": 2.8293309211730957, - "learning_rate": 1.6730109632130199e-06, - "loss": 0.0464, - "step": 1246 - }, - { - "epoch": 6.082926829268293, - "grad_norm": 1.6368179321289062, - "learning_rate": 1.6693964130449472e-06, - "loss": 0.0085, - "step": 1247 - }, - { - "epoch": 6.087804878048781, - "grad_norm": 2.5535073280334473, - "learning_rate": 1.6657838135555696e-06, - "loss": 0.0482, - "step": 1248 - }, - { - "epoch": 6.092682926829268, - "grad_norm": 3.7743096351623535, - "learning_rate": 1.6621731732291024e-06, - "loss": 0.0235, - "step": 1249 - }, - { - "epoch": 6.097560975609756, - "grad_norm": 2.9921820163726807, - "learning_rate": 1.6585645005451623e-06, - "loss": 0.0455, - "step": 1250 - }, - { - "epoch": 6.102439024390244, - "grad_norm": 2.369581937789917, - "learning_rate": 1.6549578039787436e-06, - "loss": 0.0499, - "step": 1251 - }, - { - "epoch": 6.107317073170732, - "grad_norm": 2.163815498352051, - "learning_rate": 1.6513530920001998e-06, - "loss": 0.0118, - "step": 1252 - }, - { - "epoch": 6.11219512195122, - "grad_norm": 2.034928560256958, - "learning_rate": 1.6477503730752237e-06, - "loss": 0.0189, - "step": 1253 - }, - { - "epoch": 6.117073170731707, - "grad_norm": 2.7306160926818848, - "learning_rate": 1.6441496556648278e-06, - "loss": 0.0492, - "step": 1254 - }, - { - "epoch": 6.121951219512195, - "grad_norm": 3.7521040439605713, - "learning_rate": 1.6405509482253234e-06, - "loss": 0.1717, - "step": 1255 - }, - { - "epoch": 6.126829268292683, - "grad_norm": 1.8965831995010376, - "learning_rate": 1.636954259208302e-06, - "loss": 0.0194, - "step": 1256 - }, - { - "epoch": 6.131707317073171, - "grad_norm": 3.010024070739746, - "learning_rate": 1.6333595970606143e-06, - "loss": 0.0334, - "step": 1257 - }, - { - "epoch": 6.136585365853659, - "grad_norm": 3.7091450691223145, - "learning_rate": 1.62976697022435e-06, - "loss": 0.0705, - "step": 1258 - }, - { - "epoch": 6.1414634146341465, - "grad_norm": 3.5719785690307617, - "learning_rate": 1.6261763871368225e-06, - "loss": 0.0322, - "step": 1259 - }, - { - "epoch": 6.146341463414634, - "grad_norm": 3.3224213123321533, - "learning_rate": 1.6225878562305403e-06, - "loss": 0.0653, - "step": 1260 - }, - { - "epoch": 6.151219512195122, - "grad_norm": 3.78924822807312, - "learning_rate": 1.6190013859331958e-06, - "loss": 0.0557, - "step": 1261 - }, - { - "epoch": 6.15609756097561, - "grad_norm": 2.429412841796875, - "learning_rate": 1.6154169846676415e-06, - "loss": 0.0277, - "step": 1262 - }, - { - "epoch": 6.160975609756098, - "grad_norm": 2.626167058944702, - "learning_rate": 1.6118346608518698e-06, - "loss": 0.0305, - "step": 1263 - }, - { - "epoch": 6.1658536585365855, - "grad_norm": 2.44846248626709, - "learning_rate": 1.6082544228989958e-06, - "loss": 0.0093, - "step": 1264 - }, - { - "epoch": 6.170731707317073, - "grad_norm": 2.9345643520355225, - "learning_rate": 1.6046762792172336e-06, - "loss": 0.0198, - "step": 1265 - }, - { - "epoch": 6.175609756097561, - "grad_norm": 3.224313497543335, - "learning_rate": 1.6011002382098806e-06, - "loss": 0.0673, - "step": 1266 - }, - { - "epoch": 6.180487804878049, - "grad_norm": 1.9066869020462036, - "learning_rate": 1.5975263082752968e-06, - "loss": 0.0115, - "step": 1267 - }, - { - "epoch": 6.185365853658537, - "grad_norm": 2.7153308391571045, - "learning_rate": 1.5939544978068816e-06, - "loss": 0.0529, - "step": 1268 - }, - { - "epoch": 6.190243902439025, - "grad_norm": 2.2173709869384766, - "learning_rate": 1.590384815193059e-06, - "loss": 0.0643, - "step": 1269 - }, - { - "epoch": 6.195121951219512, - "grad_norm": 3.1238555908203125, - "learning_rate": 1.5868172688172559e-06, - "loss": 0.064, - "step": 1270 - }, - { - "epoch": 6.2, - "grad_norm": 2.7765870094299316, - "learning_rate": 1.5832518670578802e-06, - "loss": 0.0676, - "step": 1271 - }, - { - "epoch": 6.204878048780488, - "grad_norm": 2.9892525672912598, - "learning_rate": 1.5796886182883053e-06, - "loss": 0.074, - "step": 1272 - }, - { - "epoch": 6.209756097560976, - "grad_norm": 2.0955512523651123, - "learning_rate": 1.5761275308768476e-06, - "loss": 0.0311, - "step": 1273 - }, - { - "epoch": 6.214634146341464, - "grad_norm": 1.8085861206054688, - "learning_rate": 1.5725686131867462e-06, - "loss": 0.0108, - "step": 1274 - }, - { - "epoch": 6.219512195121951, - "grad_norm": 3.026421308517456, - "learning_rate": 1.569011873576147e-06, - "loss": 0.0464, - "step": 1275 - }, - { - "epoch": 6.224390243902439, - "grad_norm": 2.3395111560821533, - "learning_rate": 1.5654573203980782e-06, - "loss": 0.0221, - "step": 1276 - }, - { - "epoch": 6.229268292682927, - "grad_norm": 3.6158692836761475, - "learning_rate": 1.5619049620004354e-06, - "loss": 0.0693, - "step": 1277 - }, - { - "epoch": 6.234146341463415, - "grad_norm": 1.6186567544937134, - "learning_rate": 1.5583548067259584e-06, - "loss": 0.0198, - "step": 1278 - }, - { - "epoch": 6.239024390243903, - "grad_norm": 2.7193195819854736, - "learning_rate": 1.5548068629122126e-06, - "loss": 0.0687, - "step": 1279 - }, - { - "epoch": 6.2439024390243905, - "grad_norm": 2.7472658157348633, - "learning_rate": 1.5512611388915711e-06, - "loss": 0.053, - "step": 1280 - }, - { - "epoch": 6.248780487804878, - "grad_norm": 4.694706439971924, - "learning_rate": 1.5477176429911934e-06, - "loss": 0.2076, - "step": 1281 - }, - { - "epoch": 6.253658536585366, - "grad_norm": 1.609309434890747, - "learning_rate": 1.5441763835330048e-06, - "loss": 0.0108, - "step": 1282 - }, - { - "epoch": 6.258536585365854, - "grad_norm": 1.7064504623413086, - "learning_rate": 1.5406373688336807e-06, - "loss": 0.0114, - "step": 1283 - }, - { - "epoch": 6.263414634146342, - "grad_norm": 1.967726469039917, - "learning_rate": 1.5371006072046225e-06, - "loss": 0.0209, - "step": 1284 - }, - { - "epoch": 6.2682926829268295, - "grad_norm": 2.4065544605255127, - "learning_rate": 1.5335661069519408e-06, - "loss": 0.0741, - "step": 1285 - }, - { - "epoch": 6.273170731707317, - "grad_norm": 2.2167603969573975, - "learning_rate": 1.5300338763764371e-06, - "loss": 0.0121, - "step": 1286 - }, - { - "epoch": 6.278048780487805, - "grad_norm": 3.229228973388672, - "learning_rate": 1.5265039237735804e-06, - "loss": 0.0226, - "step": 1287 - }, - { - "epoch": 6.282926829268293, - "grad_norm": 1.889419674873352, - "learning_rate": 1.5229762574334903e-06, - "loss": 0.0116, - "step": 1288 - }, - { - "epoch": 6.287804878048781, - "grad_norm": 3.7595815658569336, - "learning_rate": 1.5194508856409181e-06, - "loss": 0.0775, - "step": 1289 - }, - { - "epoch": 6.2926829268292686, - "grad_norm": 2.527560234069824, - "learning_rate": 1.515927816675225e-06, - "loss": 0.0355, - "step": 1290 - }, - { - "epoch": 6.297560975609756, - "grad_norm": 1.9718955755233765, - "learning_rate": 1.5124070588103648e-06, - "loss": 0.0127, - "step": 1291 - }, - { - "epoch": 6.302439024390244, - "grad_norm": 1.9010120630264282, - "learning_rate": 1.5088886203148643e-06, - "loss": 0.0188, - "step": 1292 - }, - { - "epoch": 6.307317073170732, - "grad_norm": 3.2093472480773926, - "learning_rate": 1.505372509451801e-06, - "loss": 0.0845, - "step": 1293 - }, - { - "epoch": 6.31219512195122, - "grad_norm": 1.6723257303237915, - "learning_rate": 1.5018587344787888e-06, - "loss": 0.0265, - "step": 1294 - }, - { - "epoch": 6.317073170731708, - "grad_norm": 3.246812343597412, - "learning_rate": 1.498347303647953e-06, - "loss": 0.0833, - "step": 1295 - }, - { - "epoch": 6.321951219512195, - "grad_norm": 2.887834072113037, - "learning_rate": 1.4948382252059158e-06, - "loss": 0.0416, - "step": 1296 - }, - { - "epoch": 6.326829268292683, - "grad_norm": 2.5762557983398438, - "learning_rate": 1.4913315073937742e-06, - "loss": 0.0614, - "step": 1297 - }, - { - "epoch": 6.331707317073171, - "grad_norm": 3.3746497631073, - "learning_rate": 1.4878271584470805e-06, - "loss": 0.0601, - "step": 1298 - }, - { - "epoch": 6.336585365853659, - "grad_norm": 2.4984664916992188, - "learning_rate": 1.4843251865958242e-06, - "loss": 0.0189, - "step": 1299 - }, - { - "epoch": 6.341463414634147, - "grad_norm": 3.178300619125366, - "learning_rate": 1.4808256000644128e-06, - "loss": 0.038, - "step": 1300 - }, - { - "epoch": 6.3463414634146345, - "grad_norm": 2.6362273693084717, - "learning_rate": 1.4773284070716504e-06, - "loss": 0.041, - "step": 1301 - }, - { - "epoch": 6.351219512195122, - "grad_norm": 2.1512129306793213, - "learning_rate": 1.473833615830722e-06, - "loss": 0.0227, - "step": 1302 - }, - { - "epoch": 6.35609756097561, - "grad_norm": 2.2898178100585938, - "learning_rate": 1.4703412345491692e-06, - "loss": 0.039, - "step": 1303 - }, - { - "epoch": 6.360975609756098, - "grad_norm": 2.6641080379486084, - "learning_rate": 1.4668512714288763e-06, - "loss": 0.0431, - "step": 1304 - }, - { - "epoch": 6.365853658536586, - "grad_norm": 1.7466667890548706, - "learning_rate": 1.4633637346660478e-06, - "loss": 0.013, - "step": 1305 - }, - { - "epoch": 6.3707317073170735, - "grad_norm": 2.437889575958252, - "learning_rate": 1.4598786324511892e-06, - "loss": 0.0181, - "step": 1306 - }, - { - "epoch": 6.375609756097561, - "grad_norm": 2.5054142475128174, - "learning_rate": 1.456395972969089e-06, - "loss": 0.0248, - "step": 1307 - }, - { - "epoch": 6.380487804878049, - "grad_norm": 3.2294511795043945, - "learning_rate": 1.4529157643987995e-06, - "loss": 0.0561, - "step": 1308 - }, - { - "epoch": 6.385365853658537, - "grad_norm": 2.260188341140747, - "learning_rate": 1.4494380149136162e-06, - "loss": 0.0593, - "step": 1309 - }, - { - "epoch": 6.390243902439025, - "grad_norm": 2.4961163997650146, - "learning_rate": 1.4459627326810576e-06, - "loss": 0.0257, - "step": 1310 - }, - { - "epoch": 6.3951219512195125, - "grad_norm": 3.4153239727020264, - "learning_rate": 1.4424899258628533e-06, - "loss": 0.0223, - "step": 1311 - }, - { - "epoch": 6.4, - "grad_norm": 2.6308839321136475, - "learning_rate": 1.439019602614914e-06, - "loss": 0.0112, - "step": 1312 - }, - { - "epoch": 6.404878048780488, - "grad_norm": 2.754530191421509, - "learning_rate": 1.4355517710873184e-06, - "loss": 0.068, - "step": 1313 - }, - { - "epoch": 6.409756097560976, - "grad_norm": 4.473151683807373, - "learning_rate": 1.432086439424297e-06, - "loss": 0.0825, - "step": 1314 - }, - { - "epoch": 6.414634146341464, - "grad_norm": 4.85701322555542, - "learning_rate": 1.428623615764206e-06, - "loss": 0.1812, - "step": 1315 - }, - { - "epoch": 6.419512195121952, - "grad_norm": 1.6678224802017212, - "learning_rate": 1.4251633082395117e-06, - "loss": 0.0207, - "step": 1316 - }, - { - "epoch": 6.424390243902439, - "grad_norm": 2.9730937480926514, - "learning_rate": 1.4217055249767734e-06, - "loss": 0.0617, - "step": 1317 - }, - { - "epoch": 6.429268292682927, - "grad_norm": 2.503786563873291, - "learning_rate": 1.4182502740966203e-06, - "loss": 0.0137, - "step": 1318 - }, - { - "epoch": 6.434146341463415, - "grad_norm": 3.0798017978668213, - "learning_rate": 1.4147975637137334e-06, - "loss": 0.0329, - "step": 1319 - }, - { - "epoch": 6.439024390243903, - "grad_norm": 3.008155345916748, - "learning_rate": 1.411347401936831e-06, - "loss": 0.0487, - "step": 1320 - }, - { - "epoch": 6.443902439024391, - "grad_norm": 2.5451765060424805, - "learning_rate": 1.4078997968686425e-06, - "loss": 0.0582, - "step": 1321 - }, - { - "epoch": 6.4487804878048784, - "grad_norm": 2.042696475982666, - "learning_rate": 1.404454756605893e-06, - "loss": 0.0336, - "step": 1322 - }, - { - "epoch": 6.453658536585366, - "grad_norm": 3.0421411991119385, - "learning_rate": 1.4010122892392872e-06, - "loss": 0.1372, - "step": 1323 - }, - { - "epoch": 6.458536585365854, - "grad_norm": 2.0793251991271973, - "learning_rate": 1.3975724028534842e-06, - "loss": 0.0452, - "step": 1324 - }, - { - "epoch": 6.463414634146342, - "grad_norm": 2.6149914264678955, - "learning_rate": 1.394135105527083e-06, - "loss": 0.0431, - "step": 1325 - }, - { - "epoch": 6.46829268292683, - "grad_norm": 2.818507671356201, - "learning_rate": 1.3907004053326006e-06, - "loss": 0.0242, - "step": 1326 - }, - { - "epoch": 6.473170731707317, - "grad_norm": 2.328993558883667, - "learning_rate": 1.387268310336458e-06, - "loss": 0.0293, - "step": 1327 - }, - { - "epoch": 6.478048780487805, - "grad_norm": 2.2032642364501953, - "learning_rate": 1.3838388285989552e-06, - "loss": 0.0232, - "step": 1328 - }, - { - "epoch": 6.482926829268292, - "grad_norm": 2.039983034133911, - "learning_rate": 1.380411968174254e-06, - "loss": 0.0256, - "step": 1329 - }, - { - "epoch": 6.487804878048781, - "grad_norm": 3.7261271476745605, - "learning_rate": 1.3769877371103635e-06, - "loss": 0.1285, - "step": 1330 - }, - { - "epoch": 6.492682926829268, - "grad_norm": 3.7156264781951904, - "learning_rate": 1.373566143449115e-06, - "loss": 0.1621, - "step": 1331 - }, - { - "epoch": 6.4975609756097565, - "grad_norm": 1.5905455350875854, - "learning_rate": 1.3701471952261457e-06, - "loss": 0.0126, - "step": 1332 - }, - { - "epoch": 6.5024390243902435, - "grad_norm": 2.8808465003967285, - "learning_rate": 1.3667309004708832e-06, - "loss": 0.0211, - "step": 1333 - }, - { - "epoch": 6.507317073170732, - "grad_norm": 3.9190757274627686, - "learning_rate": 1.3633172672065195e-06, - "loss": 0.062, - "step": 1334 - }, - { - "epoch": 6.512195121951219, - "grad_norm": 1.6948635578155518, - "learning_rate": 1.359906303449997e-06, - "loss": 0.0126, - "step": 1335 - }, - { - "epoch": 6.517073170731708, - "grad_norm": 2.3967642784118652, - "learning_rate": 1.3564980172119913e-06, - "loss": 0.0111, - "step": 1336 - }, - { - "epoch": 6.521951219512195, - "grad_norm": 3.5275399684906006, - "learning_rate": 1.3530924164968873e-06, - "loss": 0.1024, - "step": 1337 - }, - { - "epoch": 6.526829268292683, - "grad_norm": 2.0768814086914062, - "learning_rate": 1.3496895093027617e-06, - "loss": 0.0254, - "step": 1338 - }, - { - "epoch": 6.53170731707317, - "grad_norm": 1.8964029550552368, - "learning_rate": 1.3462893036213706e-06, - "loss": 0.0188, - "step": 1339 - }, - { - "epoch": 6.536585365853659, - "grad_norm": 1.679545283317566, - "learning_rate": 1.3428918074381203e-06, - "loss": 0.0195, - "step": 1340 - }, - { - "epoch": 6.541463414634146, - "grad_norm": 2.204637050628662, - "learning_rate": 1.3394970287320553e-06, - "loss": 0.0317, - "step": 1341 - }, - { - "epoch": 6.546341463414635, - "grad_norm": 2.014052629470825, - "learning_rate": 1.3361049754758404e-06, - "loss": 0.0191, - "step": 1342 - }, - { - "epoch": 6.5512195121951216, - "grad_norm": 1.4630589485168457, - "learning_rate": 1.3327156556357369e-06, - "loss": 0.0079, - "step": 1343 - }, - { - "epoch": 6.55609756097561, - "grad_norm": 2.876132011413574, - "learning_rate": 1.3293290771715875e-06, - "loss": 0.0345, - "step": 1344 - }, - { - "epoch": 6.560975609756097, - "grad_norm": 1.793338656425476, - "learning_rate": 1.3259452480367963e-06, - "loss": 0.0409, - "step": 1345 - }, - { - "epoch": 6.565853658536585, - "grad_norm": 2.2791552543640137, - "learning_rate": 1.3225641761783126e-06, - "loss": 0.0494, - "step": 1346 - }, - { - "epoch": 6.570731707317073, - "grad_norm": 4.255206108093262, - "learning_rate": 1.3191858695366084e-06, - "loss": 0.0842, - "step": 1347 - }, - { - "epoch": 6.575609756097561, - "grad_norm": 2.449460506439209, - "learning_rate": 1.3158103360456603e-06, - "loss": 0.0399, - "step": 1348 - }, - { - "epoch": 6.580487804878048, - "grad_norm": 2.780730724334717, - "learning_rate": 1.3124375836329362e-06, - "loss": 0.0272, - "step": 1349 - }, - { - "epoch": 6.585365853658536, - "grad_norm": 1.925681233406067, - "learning_rate": 1.3090676202193692e-06, - "loss": 0.007, - "step": 1350 - }, - { - "epoch": 6.590243902439024, - "grad_norm": 2.069791555404663, - "learning_rate": 1.3057004537193424e-06, - "loss": 0.016, - "step": 1351 - }, - { - "epoch": 6.595121951219512, - "grad_norm": 1.863872766494751, - "learning_rate": 1.302336092040673e-06, - "loss": 0.016, - "step": 1352 - }, - { - "epoch": 6.6, - "grad_norm": 2.351259231567383, - "learning_rate": 1.298974543084589e-06, - "loss": 0.0172, - "step": 1353 - }, - { - "epoch": 6.6048780487804875, - "grad_norm": 1.848115086555481, - "learning_rate": 1.2956158147457116e-06, - "loss": 0.0412, - "step": 1354 - }, - { - "epoch": 6.609756097560975, - "grad_norm": 1.6395928859710693, - "learning_rate": 1.2922599149120412e-06, - "loss": 0.0181, - "step": 1355 - }, - { - "epoch": 6.614634146341463, - "grad_norm": 2.1267426013946533, - "learning_rate": 1.2889068514649328e-06, - "loss": 0.04, - "step": 1356 - }, - { - "epoch": 6.619512195121951, - "grad_norm": 1.6603496074676514, - "learning_rate": 1.2855566322790796e-06, - "loss": 0.0108, - "step": 1357 - }, - { - "epoch": 6.624390243902439, - "grad_norm": 2.2724838256835938, - "learning_rate": 1.2822092652224989e-06, - "loss": 0.0284, - "step": 1358 - }, - { - "epoch": 6.6292682926829265, - "grad_norm": 2.222623825073242, - "learning_rate": 1.2788647581565048e-06, - "loss": 0.0128, - "step": 1359 - }, - { - "epoch": 6.634146341463414, - "grad_norm": 2.710681676864624, - "learning_rate": 1.275523118935697e-06, - "loss": 0.0184, - "step": 1360 - }, - { - "epoch": 6.639024390243902, - "grad_norm": 2.354264736175537, - "learning_rate": 1.2721843554079418e-06, - "loss": 0.0313, - "step": 1361 - }, - { - "epoch": 6.64390243902439, - "grad_norm": 3.886909008026123, - "learning_rate": 1.2688484754143493e-06, - "loss": 0.1184, - "step": 1362 - }, - { - "epoch": 6.648780487804878, - "grad_norm": 3.088468313217163, - "learning_rate": 1.2655154867892577e-06, - "loss": 0.0353, - "step": 1363 - }, - { - "epoch": 6.6536585365853655, - "grad_norm": 2.987576484680176, - "learning_rate": 1.2621853973602158e-06, - "loss": 0.0349, - "step": 1364 - }, - { - "epoch": 6.658536585365853, - "grad_norm": 1.719212293624878, - "learning_rate": 1.2588582149479645e-06, - "loss": 0.0081, - "step": 1365 - }, - { - "epoch": 6.663414634146341, - "grad_norm": 2.1641178131103516, - "learning_rate": 1.2555339473664151e-06, - "loss": 0.0279, - "step": 1366 - }, - { - "epoch": 6.668292682926829, - "grad_norm": 2.9424984455108643, - "learning_rate": 1.2522126024226347e-06, - "loss": 0.0492, - "step": 1367 - }, - { - "epoch": 6.673170731707317, - "grad_norm": 1.961077332496643, - "learning_rate": 1.2488941879168278e-06, - "loss": 0.0084, - "step": 1368 - }, - { - "epoch": 6.678048780487805, - "grad_norm": 2.302565097808838, - "learning_rate": 1.2455787116423148e-06, - "loss": 0.0486, - "step": 1369 - }, - { - "epoch": 6.682926829268292, - "grad_norm": 2.187194347381592, - "learning_rate": 1.2422661813855158e-06, - "loss": 0.0319, - "step": 1370 - }, - { - "epoch": 6.68780487804878, - "grad_norm": 2.0076377391815186, - "learning_rate": 1.238956604925934e-06, - "loss": 0.016, - "step": 1371 - }, - { - "epoch": 6.692682926829268, - "grad_norm": 4.137681484222412, - "learning_rate": 1.2356499900361333e-06, - "loss": 0.0557, - "step": 1372 - }, - { - "epoch": 6.697560975609756, - "grad_norm": 2.0039637088775635, - "learning_rate": 1.2323463444817227e-06, - "loss": 0.0219, - "step": 1373 - }, - { - "epoch": 6.702439024390244, - "grad_norm": 2.943314552307129, - "learning_rate": 1.2290456760213405e-06, - "loss": 0.0849, - "step": 1374 - }, - { - "epoch": 6.7073170731707314, - "grad_norm": 2.715120553970337, - "learning_rate": 1.2257479924066296e-06, - "loss": 0.0857, - "step": 1375 - }, - { - "epoch": 6.712195121951219, - "grad_norm": 3.144104480743408, - "learning_rate": 1.2224533013822237e-06, - "loss": 0.0648, - "step": 1376 - }, - { - "epoch": 6.717073170731707, - "grad_norm": 2.830066680908203, - "learning_rate": 1.2191616106857312e-06, - "loss": 0.0426, - "step": 1377 - }, - { - "epoch": 6.721951219512195, - "grad_norm": 3.1005899906158447, - "learning_rate": 1.2158729280477112e-06, - "loss": 0.0478, - "step": 1378 - }, - { - "epoch": 6.726829268292683, - "grad_norm": 2.2102460861206055, - "learning_rate": 1.2125872611916578e-06, - "loss": 0.0273, - "step": 1379 - }, - { - "epoch": 6.7317073170731705, - "grad_norm": 2.860288619995117, - "learning_rate": 1.2093046178339869e-06, - "loss": 0.0201, - "step": 1380 - }, - { - "epoch": 6.736585365853658, - "grad_norm": 1.5914067029953003, - "learning_rate": 1.206025005684009e-06, - "loss": 0.0148, - "step": 1381 - }, - { - "epoch": 6.741463414634146, - "grad_norm": 1.8609223365783691, - "learning_rate": 1.202748432443918e-06, - "loss": 0.0073, - "step": 1382 - }, - { - "epoch": 6.746341463414634, - "grad_norm": 3.0532407760620117, - "learning_rate": 1.1994749058087695e-06, - "loss": 0.0344, - "step": 1383 - }, - { - "epoch": 6.751219512195122, - "grad_norm": 4.0601677894592285, - "learning_rate": 1.196204433466467e-06, - "loss": 0.0837, - "step": 1384 - }, - { - "epoch": 6.7560975609756095, - "grad_norm": 2.6982672214508057, - "learning_rate": 1.192937023097738e-06, - "loss": 0.0425, - "step": 1385 - }, - { - "epoch": 6.760975609756097, - "grad_norm": 1.431360125541687, - "learning_rate": 1.1896726823761195e-06, - "loss": 0.0065, - "step": 1386 - }, - { - "epoch": 6.765853658536585, - "grad_norm": 2.116907835006714, - "learning_rate": 1.1864114189679413e-06, - "loss": 0.0133, - "step": 1387 - }, - { - "epoch": 6.770731707317073, - "grad_norm": 2.6869874000549316, - "learning_rate": 1.183153240532304e-06, - "loss": 0.0188, - "step": 1388 - }, - { - "epoch": 6.775609756097561, - "grad_norm": 2.0294089317321777, - "learning_rate": 1.179898154721063e-06, - "loss": 0.0234, - "step": 1389 - }, - { - "epoch": 6.780487804878049, - "grad_norm": 2.3081958293914795, - "learning_rate": 1.1766461691788137e-06, - "loss": 0.0208, - "step": 1390 - }, - { - "epoch": 6.785365853658536, - "grad_norm": 3.4795000553131104, - "learning_rate": 1.1733972915428665e-06, - "loss": 0.0728, - "step": 1391 - }, - { - "epoch": 6.790243902439024, - "grad_norm": 2.5121219158172607, - "learning_rate": 1.1701515294432348e-06, - "loss": 0.0291, - "step": 1392 - }, - { - "epoch": 6.795121951219512, - "grad_norm": 5.1100172996521, - "learning_rate": 1.1669088905026156e-06, - "loss": 0.0988, - "step": 1393 - }, - { - "epoch": 6.8, - "grad_norm": 2.5434396266937256, - "learning_rate": 1.163669382336371e-06, - "loss": 0.0399, - "step": 1394 - }, - { - "epoch": 6.804878048780488, - "grad_norm": 2.7811660766601562, - "learning_rate": 1.160433012552508e-06, - "loss": 0.0134, - "step": 1395 - }, - { - "epoch": 6.809756097560975, - "grad_norm": 3.2409870624542236, - "learning_rate": 1.1571997887516672e-06, - "loss": 0.0795, - "step": 1396 - }, - { - "epoch": 6.814634146341463, - "grad_norm": 2.5300986766815186, - "learning_rate": 1.1539697185270982e-06, - "loss": 0.0329, - "step": 1397 - }, - { - "epoch": 6.819512195121951, - "grad_norm": 1.8510549068450928, - "learning_rate": 1.1507428094646448e-06, - "loss": 0.0213, - "step": 1398 - }, - { - "epoch": 6.824390243902439, - "grad_norm": 1.8820618391036987, - "learning_rate": 1.1475190691427255e-06, - "loss": 0.0172, - "step": 1399 - }, - { - "epoch": 6.829268292682927, - "grad_norm": 1.3415460586547852, - "learning_rate": 1.1442985051323205e-06, - "loss": 0.0029, - "step": 1400 - }, - { - "epoch": 6.8341463414634145, - "grad_norm": 6.033786296844482, - "learning_rate": 1.1410811249969475e-06, - "loss": 0.1638, - "step": 1401 - }, - { - "epoch": 6.839024390243902, - "grad_norm": 2.990328311920166, - "learning_rate": 1.1378669362926468e-06, - "loss": 0.0779, - "step": 1402 - }, - { - "epoch": 6.84390243902439, - "grad_norm": 3.2766308784484863, - "learning_rate": 1.1346559465679656e-06, - "loss": 0.0528, - "step": 1403 - }, - { - "epoch": 6.848780487804878, - "grad_norm": 1.266032338142395, - "learning_rate": 1.1314481633639374e-06, - "loss": 0.0057, - "step": 1404 - }, - { - "epoch": 6.853658536585366, - "grad_norm": 3.1048431396484375, - "learning_rate": 1.1282435942140632e-06, - "loss": 0.1772, - "step": 1405 - }, - { - "epoch": 6.8585365853658535, - "grad_norm": 2.264822483062744, - "learning_rate": 1.1250422466442992e-06, - "loss": 0.0176, - "step": 1406 - }, - { - "epoch": 6.863414634146341, - "grad_norm": 2.0890846252441406, - "learning_rate": 1.1218441281730334e-06, - "loss": 0.0184, - "step": 1407 - }, - { - "epoch": 6.868292682926829, - "grad_norm": 1.8351202011108398, - "learning_rate": 1.1186492463110696e-06, - "loss": 0.0127, - "step": 1408 - }, - { - "epoch": 6.873170731707317, - "grad_norm": 1.447196125984192, - "learning_rate": 1.1154576085616135e-06, - "loss": 0.0094, - "step": 1409 - }, - { - "epoch": 6.878048780487805, - "grad_norm": 1.6414039134979248, - "learning_rate": 1.1122692224202491e-06, - "loss": 0.0138, - "step": 1410 - }, - { - "epoch": 6.882926829268293, - "grad_norm": 2.87068772315979, - "learning_rate": 1.1090840953749253e-06, - "loss": 0.0821, - "step": 1411 - }, - { - "epoch": 6.88780487804878, - "grad_norm": 2.0476415157318115, - "learning_rate": 1.1059022349059362e-06, - "loss": 0.0222, - "step": 1412 - }, - { - "epoch": 6.892682926829268, - "grad_norm": 4.169386863708496, - "learning_rate": 1.102723648485905e-06, - "loss": 0.1183, - "step": 1413 - }, - { - "epoch": 6.897560975609756, - "grad_norm": 4.47883415222168, - "learning_rate": 1.0995483435797643e-06, - "loss": 0.0528, - "step": 1414 - }, - { - "epoch": 6.902439024390244, - "grad_norm": 2.0025508403778076, - "learning_rate": 1.0963763276447435e-06, - "loss": 0.0106, - "step": 1415 - }, - { - "epoch": 6.907317073170732, - "grad_norm": 2.4212136268615723, - "learning_rate": 1.0932076081303442e-06, - "loss": 0.0454, - "step": 1416 - }, - { - "epoch": 6.912195121951219, - "grad_norm": 1.7873961925506592, - "learning_rate": 1.0900421924783272e-06, - "loss": 0.022, - "step": 1417 - }, - { - "epoch": 6.917073170731707, - "grad_norm": 2.0345218181610107, - "learning_rate": 1.0868800881226962e-06, - "loss": 0.0261, - "step": 1418 - }, - { - "epoch": 6.921951219512195, - "grad_norm": 3.086538314819336, - "learning_rate": 1.0837213024896764e-06, - "loss": 0.0257, - "step": 1419 - }, - { - "epoch": 6.926829268292683, - "grad_norm": 2.9401397705078125, - "learning_rate": 1.080565842997698e-06, - "loss": 0.087, - "step": 1420 - }, - { - "epoch": 6.931707317073171, - "grad_norm": 1.305415153503418, - "learning_rate": 1.0774137170573826e-06, - "loss": 0.0147, - "step": 1421 - }, - { - "epoch": 6.9365853658536585, - "grad_norm": 3.0256683826446533, - "learning_rate": 1.074264932071521e-06, - "loss": 0.1183, - "step": 1422 - }, - { - "epoch": 6.941463414634146, - "grad_norm": 2.3618743419647217, - "learning_rate": 1.0711194954350568e-06, - "loss": 0.0186, - "step": 1423 - }, - { - "epoch": 6.946341463414634, - "grad_norm": 2.004451036453247, - "learning_rate": 1.0679774145350735e-06, - "loss": 0.0222, - "step": 1424 - }, - { - "epoch": 6.951219512195122, - "grad_norm": 3.089723587036133, - "learning_rate": 1.0648386967507703e-06, - "loss": 0.0824, - "step": 1425 - }, - { - "epoch": 6.95609756097561, - "grad_norm": 1.9310235977172852, - "learning_rate": 1.0617033494534486e-06, - "loss": 0.0247, - "step": 1426 - }, - { - "epoch": 6.9609756097560975, - "grad_norm": 1.973836898803711, - "learning_rate": 1.0585713800064964e-06, - "loss": 0.0142, - "step": 1427 - }, - { - "epoch": 6.965853658536585, - "grad_norm": 2.9914112091064453, - "learning_rate": 1.0554427957653663e-06, - "loss": 0.0681, - "step": 1428 - }, - { - "epoch": 6.970731707317073, - "grad_norm": 3.356689691543579, - "learning_rate": 1.0523176040775615e-06, - "loss": 0.0916, - "step": 1429 - }, - { - "epoch": 6.975609756097561, - "grad_norm": 2.3305246829986572, - "learning_rate": 1.0491958122826173e-06, - "loss": 0.0611, - "step": 1430 - }, - { - "epoch": 6.980487804878049, - "grad_norm": 1.7383835315704346, - "learning_rate": 1.0460774277120866e-06, - "loss": 0.0182, - "step": 1431 - }, - { - "epoch": 6.985365853658537, - "grad_norm": 2.585674524307251, - "learning_rate": 1.0429624576895177e-06, - "loss": 0.0084, - "step": 1432 - }, - { - "epoch": 6.990243902439024, - "grad_norm": 3.023864269256592, - "learning_rate": 1.03985090953044e-06, - "loss": 0.0411, - "step": 1433 - }, - { - "epoch": 6.995121951219512, - "grad_norm": 2.281674861907959, - "learning_rate": 1.0367427905423497e-06, - "loss": 0.0464, - "step": 1434 - }, - { - "epoch": 7.0, - "grad_norm": 1.4372339248657227, - "learning_rate": 1.0336381080246858e-06, - "loss": 0.0124, - "step": 1435 - }, - { - "epoch": 7.004878048780488, - "grad_norm": 1.9526969194412231, - "learning_rate": 1.0305368692688175e-06, - "loss": 0.0179, - "step": 1436 - }, - { - "epoch": 7.009756097560976, - "grad_norm": 1.7297903299331665, - "learning_rate": 1.027439081558029e-06, - "loss": 0.0119, - "step": 1437 - }, - { - "epoch": 7.014634146341463, - "grad_norm": 2.2754275798797607, - "learning_rate": 1.0243447521674967e-06, - "loss": 0.0278, - "step": 1438 - }, - { - "epoch": 7.019512195121951, - "grad_norm": 5.485769271850586, - "learning_rate": 1.021253888364276e-06, - "loss": 0.1259, - "step": 1439 - }, - { - "epoch": 7.024390243902439, - "grad_norm": 0.9085121750831604, - "learning_rate": 1.018166497407284e-06, - "loss": 0.0047, - "step": 1440 - }, - { - "epoch": 7.029268292682927, - "grad_norm": 1.0291047096252441, - "learning_rate": 1.0150825865472813e-06, - "loss": 0.0044, - "step": 1441 - }, - { - "epoch": 7.034146341463415, - "grad_norm": 0.8040009140968323, - "learning_rate": 1.0120021630268542e-06, - "loss": 0.0044, - "step": 1442 - }, - { - "epoch": 7.0390243902439025, - "grad_norm": 1.3701342344284058, - "learning_rate": 1.0089252340804025e-06, - "loss": 0.0081, - "step": 1443 - }, - { - "epoch": 7.04390243902439, - "grad_norm": 2.89591646194458, - "learning_rate": 1.0058518069341152e-06, - "loss": 0.0318, - "step": 1444 - }, - { - "epoch": 7.048780487804878, - "grad_norm": 1.3153692483901978, - "learning_rate": 1.002781888805958e-06, - "loss": 0.0067, - "step": 1445 - }, - { - "epoch": 7.053658536585366, - "grad_norm": 1.4490022659301758, - "learning_rate": 9.997154869056588e-07, - "loss": 0.0064, - "step": 1446 - }, - { - "epoch": 7.058536585365854, - "grad_norm": 1.7938638925552368, - "learning_rate": 9.966526084346837e-07, - "loss": 0.0057, - "step": 1447 - }, - { - "epoch": 7.0634146341463415, - "grad_norm": 3.7182836532592773, - "learning_rate": 9.935932605862258e-07, - "loss": 0.0365, - "step": 1448 - }, - { - "epoch": 7.068292682926829, - "grad_norm": 1.7843579053878784, - "learning_rate": 9.905374505451853e-07, - "loss": 0.0345, - "step": 1449 - }, - { - "epoch": 7.073170731707317, - "grad_norm": 2.9557483196258545, - "learning_rate": 9.874851854881565e-07, - "loss": 0.0384, - "step": 1450 - }, - { - "epoch": 7.078048780487805, - "grad_norm": 1.6237356662750244, - "learning_rate": 9.844364725834058e-07, - "loss": 0.0116, - "step": 1451 - }, - { - "epoch": 7.082926829268293, - "grad_norm": 3.7120912075042725, - "learning_rate": 9.813913189908571e-07, - "loss": 0.0267, - "step": 1452 - }, - { - "epoch": 7.087804878048781, - "grad_norm": 1.9991087913513184, - "learning_rate": 9.783497318620783e-07, - "loss": 0.0376, - "step": 1453 - }, - { - "epoch": 7.092682926829268, - "grad_norm": 1.5474026203155518, - "learning_rate": 9.75311718340258e-07, - "loss": 0.0057, - "step": 1454 - }, - { - "epoch": 7.097560975609756, - "grad_norm": 2.060807943344116, - "learning_rate": 9.722772855601927e-07, - "loss": 0.0386, - "step": 1455 - }, - { - "epoch": 7.102439024390244, - "grad_norm": 1.1991411447525024, - "learning_rate": 9.692464406482727e-07, - "loss": 0.006, - "step": 1456 - }, - { - "epoch": 7.107317073170732, - "grad_norm": 1.8907703161239624, - "learning_rate": 9.662191907224582e-07, - "loss": 0.0066, - "step": 1457 - }, - { - "epoch": 7.11219512195122, - "grad_norm": 2.0351309776306152, - "learning_rate": 9.63195542892268e-07, - "loss": 0.0201, - "step": 1458 - }, - { - "epoch": 7.117073170731707, - "grad_norm": 1.3973944187164307, - "learning_rate": 9.601755042587624e-07, - "loss": 0.0112, - "step": 1459 - }, - { - "epoch": 7.121951219512195, - "grad_norm": 1.3639394044876099, - "learning_rate": 9.571590819145244e-07, - "loss": 0.0066, - "step": 1460 - }, - { - "epoch": 7.126829268292683, - "grad_norm": 1.7362885475158691, - "learning_rate": 9.541462829436426e-07, - "loss": 0.0136, - "step": 1461 - }, - { - "epoch": 7.131707317073171, - "grad_norm": 2.9414384365081787, - "learning_rate": 9.511371144217005e-07, - "loss": 0.0228, - "step": 1462 - }, - { - "epoch": 7.136585365853659, - "grad_norm": 2.944575548171997, - "learning_rate": 9.481315834157512e-07, - "loss": 0.027, - "step": 1463 - }, - { - "epoch": 7.1414634146341465, - "grad_norm": 2.4692747592926025, - "learning_rate": 9.451296969843058e-07, - "loss": 0.0152, - "step": 1464 - }, - { - "epoch": 7.146341463414634, - "grad_norm": 1.804129719734192, - "learning_rate": 9.42131462177319e-07, - "loss": 0.0071, - "step": 1465 - }, - { - "epoch": 7.151219512195122, - "grad_norm": 1.8012168407440186, - "learning_rate": 9.39136886036166e-07, - "loss": 0.0054, - "step": 1466 - }, - { - "epoch": 7.15609756097561, - "grad_norm": 1.9471648931503296, - "learning_rate": 9.361459755936316e-07, - "loss": 0.0067, - "step": 1467 - }, - { - "epoch": 7.160975609756098, - "grad_norm": 1.8837870359420776, - "learning_rate": 9.331587378738902e-07, - "loss": 0.0105, - "step": 1468 - }, - { - "epoch": 7.1658536585365855, - "grad_norm": 2.358891487121582, - "learning_rate": 9.301751798924935e-07, - "loss": 0.0331, - "step": 1469 - }, - { - "epoch": 7.170731707317073, - "grad_norm": 1.1501671075820923, - "learning_rate": 9.27195308656349e-07, - "loss": 0.0076, - "step": 1470 - }, - { - "epoch": 7.175609756097561, - "grad_norm": 2.3329083919525146, - "learning_rate": 9.24219131163705e-07, - "loss": 0.0243, - "step": 1471 - }, - { - "epoch": 7.180487804878049, - "grad_norm": 1.6030691862106323, - "learning_rate": 9.212466544041385e-07, - "loss": 0.0051, - "step": 1472 - }, - { - "epoch": 7.185365853658537, - "grad_norm": 2.005582094192505, - "learning_rate": 9.182778853585325e-07, - "loss": 0.0146, - "step": 1473 - }, - { - "epoch": 7.190243902439025, - "grad_norm": 1.86012601852417, - "learning_rate": 9.153128309990622e-07, - "loss": 0.0273, - "step": 1474 - }, - { - "epoch": 7.195121951219512, - "grad_norm": 2.218923568725586, - "learning_rate": 9.123514982891813e-07, - "loss": 0.0225, - "step": 1475 - }, - { - "epoch": 7.2, - "grad_norm": 1.9950376749038696, - "learning_rate": 9.093938941836012e-07, - "loss": 0.0156, - "step": 1476 - }, - { - "epoch": 7.204878048780488, - "grad_norm": 1.6428661346435547, - "learning_rate": 9.064400256282757e-07, - "loss": 0.0158, - "step": 1477 - }, - { - "epoch": 7.209756097560976, - "grad_norm": 1.7983390092849731, - "learning_rate": 9.034898995603894e-07, - "loss": 0.0138, - "step": 1478 - }, - { - "epoch": 7.214634146341464, - "grad_norm": 2.2069218158721924, - "learning_rate": 9.00543522908334e-07, - "loss": 0.0308, - "step": 1479 - }, - { - "epoch": 7.219512195121951, - "grad_norm": 1.4668920040130615, - "learning_rate": 8.976009025916962e-07, - "loss": 0.006, - "step": 1480 - }, - { - "epoch": 7.224390243902439, - "grad_norm": 1.8956354856491089, - "learning_rate": 8.946620455212438e-07, - "loss": 0.0121, - "step": 1481 - }, - { - "epoch": 7.229268292682927, - "grad_norm": 2.5479676723480225, - "learning_rate": 8.917269585989027e-07, - "loss": 0.0424, - "step": 1482 - }, - { - "epoch": 7.234146341463415, - "grad_norm": 1.7482987642288208, - "learning_rate": 8.887956487177462e-07, - "loss": 0.0189, - "step": 1483 - }, - { - "epoch": 7.239024390243903, - "grad_norm": 1.5023657083511353, - "learning_rate": 8.858681227619789e-07, - "loss": 0.0118, - "step": 1484 - }, - { - "epoch": 7.2439024390243905, - "grad_norm": 1.2069121599197388, - "learning_rate": 8.829443876069163e-07, - "loss": 0.0043, - "step": 1485 - }, - { - "epoch": 7.248780487804878, - "grad_norm": 1.5843572616577148, - "learning_rate": 8.800244501189722e-07, - "loss": 0.0111, - "step": 1486 - }, - { - "epoch": 7.253658536585366, - "grad_norm": 2.541588544845581, - "learning_rate": 8.771083171556407e-07, - "loss": 0.0582, - "step": 1487 - }, - { - "epoch": 7.258536585365854, - "grad_norm": 0.9306992292404175, - "learning_rate": 8.741959955654833e-07, - "loss": 0.0051, - "step": 1488 - }, - { - "epoch": 7.263414634146342, - "grad_norm": 1.4105901718139648, - "learning_rate": 8.712874921881082e-07, - "loss": 0.0175, - "step": 1489 - }, - { - "epoch": 7.2682926829268295, - "grad_norm": 2.8943028450012207, - "learning_rate": 8.683828138541559e-07, - "loss": 0.0827, - "step": 1490 - }, - { - "epoch": 7.273170731707317, - "grad_norm": 2.512991428375244, - "learning_rate": 8.654819673852874e-07, - "loss": 0.0347, - "step": 1491 - }, - { - "epoch": 7.278048780487805, - "grad_norm": 1.6571681499481201, - "learning_rate": 8.625849595941608e-07, - "loss": 0.0055, - "step": 1492 - }, - { - "epoch": 7.282926829268293, - "grad_norm": 1.3162294626235962, - "learning_rate": 8.596917972844199e-07, - "loss": 0.0043, - "step": 1493 - }, - { - "epoch": 7.287804878048781, - "grad_norm": 1.761405110359192, - "learning_rate": 8.568024872506792e-07, - "loss": 0.0176, - "step": 1494 - }, - { - "epoch": 7.2926829268292686, - "grad_norm": 0.7546011805534363, - "learning_rate": 8.539170362785043e-07, - "loss": 0.0025, - "step": 1495 - }, - { - "epoch": 7.297560975609756, - "grad_norm": 1.6910885572433472, - "learning_rate": 8.510354511443975e-07, - "loss": 0.0093, - "step": 1496 - }, - { - "epoch": 7.302439024390244, - "grad_norm": 1.6627765893936157, - "learning_rate": 8.48157738615784e-07, - "loss": 0.0066, - "step": 1497 - }, - { - "epoch": 7.307317073170732, - "grad_norm": 0.8881242871284485, - "learning_rate": 8.452839054509926e-07, - "loss": 0.0055, - "step": 1498 - }, - { - "epoch": 7.31219512195122, - "grad_norm": 1.0791494846343994, - "learning_rate": 8.42413958399241e-07, - "loss": 0.0059, - "step": 1499 - }, - { - "epoch": 7.317073170731708, - "grad_norm": 1.5198945999145508, - "learning_rate": 8.39547904200623e-07, - "loss": 0.0049, - "step": 1500 - }, - { - "epoch": 7.321951219512195, - "grad_norm": 1.7168906927108765, - "learning_rate": 8.366857495860869e-07, - "loss": 0.0204, - "step": 1501 - }, - { - "epoch": 7.326829268292683, - "grad_norm": 1.70030677318573, - "learning_rate": 8.338275012774247e-07, - "loss": 0.0161, - "step": 1502 - }, - { - "epoch": 7.331707317073171, - "grad_norm": 2.1044130325317383, - "learning_rate": 8.309731659872522e-07, - "loss": 0.0088, - "step": 1503 - }, - { - "epoch": 7.336585365853659, - "grad_norm": 1.5040123462677002, - "learning_rate": 8.281227504189992e-07, - "loss": 0.0204, - "step": 1504 - }, - { - "epoch": 7.341463414634147, - "grad_norm": 1.6814212799072266, - "learning_rate": 8.252762612668869e-07, - "loss": 0.0238, - "step": 1505 - }, - { - "epoch": 7.3463414634146345, - "grad_norm": 2.2541606426239014, - "learning_rate": 8.224337052159154e-07, - "loss": 0.0063, - "step": 1506 - }, - { - "epoch": 7.351219512195122, - "grad_norm": 2.3999500274658203, - "learning_rate": 8.195950889418503e-07, - "loss": 0.0123, - "step": 1507 - }, - { - "epoch": 7.35609756097561, - "grad_norm": 2.8464221954345703, - "learning_rate": 8.167604191112021e-07, - "loss": 0.0296, - "step": 1508 - }, - { - "epoch": 7.360975609756098, - "grad_norm": 2.178104877471924, - "learning_rate": 8.139297023812131e-07, - "loss": 0.0148, - "step": 1509 - }, - { - "epoch": 7.365853658536586, - "grad_norm": 1.6489804983139038, - "learning_rate": 8.111029453998448e-07, - "loss": 0.0057, - "step": 1510 - }, - { - "epoch": 7.3707317073170735, - "grad_norm": 1.9705169200897217, - "learning_rate": 8.082801548057553e-07, - "loss": 0.0098, - "step": 1511 - }, - { - "epoch": 7.375609756097561, - "grad_norm": 1.2231075763702393, - "learning_rate": 8.05461337228289e-07, - "loss": 0.007, - "step": 1512 - }, - { - "epoch": 7.380487804878049, - "grad_norm": 1.5212552547454834, - "learning_rate": 8.026464992874617e-07, - "loss": 0.0058, - "step": 1513 - }, - { - "epoch": 7.385365853658537, - "grad_norm": 0.5752282738685608, - "learning_rate": 7.998356475939398e-07, - "loss": 0.0011, - "step": 1514 - }, - { - "epoch": 7.390243902439025, - "grad_norm": 1.3227447271347046, - "learning_rate": 7.970287887490289e-07, - "loss": 0.0041, - "step": 1515 - }, - { - "epoch": 7.3951219512195125, - "grad_norm": 1.2051570415496826, - "learning_rate": 7.942259293446594e-07, - "loss": 0.0027, - "step": 1516 - }, - { - "epoch": 7.4, - "grad_norm": 1.4740777015686035, - "learning_rate": 7.914270759633669e-07, - "loss": 0.006, - "step": 1517 - }, - { - "epoch": 7.404878048780488, - "grad_norm": 1.8853001594543457, - "learning_rate": 7.886322351782782e-07, - "loss": 0.0066, - "step": 1518 - }, - { - "epoch": 7.409756097560976, - "grad_norm": 1.907251238822937, - "learning_rate": 7.858414135530995e-07, - "loss": 0.0133, - "step": 1519 - }, - { - "epoch": 7.414634146341464, - "grad_norm": 1.3397895097732544, - "learning_rate": 7.83054617642095e-07, - "loss": 0.0092, - "step": 1520 - }, - { - "epoch": 7.419512195121952, - "grad_norm": 2.878927707672119, - "learning_rate": 7.802718539900761e-07, - "loss": 0.0113, - "step": 1521 - }, - { - "epoch": 7.424390243902439, - "grad_norm": 1.0312106609344482, - "learning_rate": 7.774931291323826e-07, - "loss": 0.0045, - "step": 1522 - }, - { - "epoch": 7.429268292682927, - "grad_norm": 2.2703888416290283, - "learning_rate": 7.747184495948723e-07, - "loss": 0.0692, - "step": 1523 - }, - { - "epoch": 7.434146341463415, - "grad_norm": 3.0323078632354736, - "learning_rate": 7.719478218939e-07, - "loss": 0.0462, - "step": 1524 - }, - { - "epoch": 7.439024390243903, - "grad_norm": 1.4211952686309814, - "learning_rate": 7.691812525363044e-07, - "loss": 0.008, - "step": 1525 - }, - { - "epoch": 7.443902439024391, - "grad_norm": 0.9588236808776855, - "learning_rate": 7.66418748019396e-07, - "loss": 0.0042, - "step": 1526 - }, - { - "epoch": 7.4487804878048784, - "grad_norm": 2.837219476699829, - "learning_rate": 7.636603148309363e-07, - "loss": 0.0033, - "step": 1527 - }, - { - "epoch": 7.453658536585366, - "grad_norm": 1.8552638292312622, - "learning_rate": 7.609059594491253e-07, - "loss": 0.0181, - "step": 1528 - }, - { - "epoch": 7.458536585365854, - "grad_norm": 4.836069583892822, - "learning_rate": 7.581556883425886e-07, - "loss": 0.1868, - "step": 1529 - }, - { - "epoch": 7.463414634146342, - "grad_norm": 2.180760622024536, - "learning_rate": 7.55409507970358e-07, - "loss": 0.0305, - "step": 1530 - }, - { - "epoch": 7.46829268292683, - "grad_norm": 1.0799378156661987, - "learning_rate": 7.526674247818569e-07, - "loss": 0.0027, - "step": 1531 - }, - { - "epoch": 7.473170731707317, - "grad_norm": 2.1196658611297607, - "learning_rate": 7.499294452168904e-07, - "loss": 0.019, - "step": 1532 - }, - { - "epoch": 7.478048780487805, - "grad_norm": 1.6932553052902222, - "learning_rate": 7.471955757056227e-07, - "loss": 0.0101, - "step": 1533 - }, - { - "epoch": 7.482926829268292, - "grad_norm": 1.3473751544952393, - "learning_rate": 7.444658226685656e-07, - "loss": 0.0066, - "step": 1534 - }, - { - "epoch": 7.487804878048781, - "grad_norm": 2.3404016494750977, - "learning_rate": 7.417401925165666e-07, - "loss": 0.0139, - "step": 1535 - }, - { - "epoch": 7.492682926829268, - "grad_norm": 1.2845433950424194, - "learning_rate": 7.390186916507869e-07, - "loss": 0.0053, - "step": 1536 - }, - { - "epoch": 7.4975609756097565, - "grad_norm": 1.0809649229049683, - "learning_rate": 7.363013264626914e-07, - "loss": 0.0031, - "step": 1537 - }, - { - "epoch": 7.5024390243902435, - "grad_norm": 2.2649292945861816, - "learning_rate": 7.335881033340334e-07, - "loss": 0.0257, - "step": 1538 - }, - { - "epoch": 7.507317073170732, - "grad_norm": 1.3488918542861938, - "learning_rate": 7.308790286368373e-07, - "loss": 0.0092, - "step": 1539 - }, - { - "epoch": 7.512195121951219, - "grad_norm": 2.239190101623535, - "learning_rate": 7.281741087333846e-07, - "loss": 0.024, - "step": 1540 - }, - { - "epoch": 7.517073170731708, - "grad_norm": 1.9454522132873535, - "learning_rate": 7.254733499761993e-07, - "loss": 0.0177, - "step": 1541 - }, - { - "epoch": 7.521951219512195, - "grad_norm": 1.9299415349960327, - "learning_rate": 7.22776758708035e-07, - "loss": 0.0439, - "step": 1542 - }, - { - "epoch": 7.526829268292683, - "grad_norm": 2.2676074504852295, - "learning_rate": 7.200843412618555e-07, - "loss": 0.0387, - "step": 1543 - }, - { - "epoch": 7.53170731707317, - "grad_norm": 1.2385426759719849, - "learning_rate": 7.173961039608227e-07, - "loss": 0.0082, - "step": 1544 - }, - { - "epoch": 7.536585365853659, - "grad_norm": 1.8637615442276, - "learning_rate": 7.147120531182828e-07, - "loss": 0.0194, - "step": 1545 - }, - { - "epoch": 7.541463414634146, - "grad_norm": 1.6695958375930786, - "learning_rate": 7.120321950377487e-07, - "loss": 0.006, - "step": 1546 - }, - { - "epoch": 7.546341463414635, - "grad_norm": 1.916746735572815, - "learning_rate": 7.093565360128863e-07, - "loss": 0.0104, - "step": 1547 - }, - { - "epoch": 7.5512195121951216, - "grad_norm": 1.6002378463745117, - "learning_rate": 7.066850823275024e-07, - "loss": 0.0173, - "step": 1548 - }, - { - "epoch": 7.55609756097561, - "grad_norm": 1.5249438285827637, - "learning_rate": 7.040178402555245e-07, - "loss": 0.0088, - "step": 1549 - }, - { - "epoch": 7.560975609756097, - "grad_norm": 2.1726534366607666, - "learning_rate": 7.013548160609901e-07, - "loss": 0.0098, - "step": 1550 - }, - { - "epoch": 7.565853658536585, - "grad_norm": 1.901904582977295, - "learning_rate": 6.986960159980327e-07, - "loss": 0.0196, - "step": 1551 - }, - { - "epoch": 7.570731707317073, - "grad_norm": 2.577242136001587, - "learning_rate": 6.960414463108631e-07, - "loss": 0.021, - "step": 1552 - }, - { - "epoch": 7.575609756097561, - "grad_norm": 1.4463082551956177, - "learning_rate": 6.933911132337575e-07, - "loss": 0.0076, - "step": 1553 - }, - { - "epoch": 7.580487804878048, - "grad_norm": 2.5811946392059326, - "learning_rate": 6.907450229910443e-07, - "loss": 0.0204, - "step": 1554 - }, - { - "epoch": 7.585365853658536, - "grad_norm": 1.0530297756195068, - "learning_rate": 6.881031817970848e-07, - "loss": 0.0046, - "step": 1555 - }, - { - "epoch": 7.590243902439024, - "grad_norm": 2.995915651321411, - "learning_rate": 6.854655958562625e-07, - "loss": 0.0566, - "step": 1556 - }, - { - "epoch": 7.595121951219512, - "grad_norm": 1.253089189529419, - "learning_rate": 6.82832271362969e-07, - "loss": 0.0048, - "step": 1557 - }, - { - "epoch": 7.6, - "grad_norm": 2.830667495727539, - "learning_rate": 6.802032145015855e-07, - "loss": 0.0351, - "step": 1558 - }, - { - "epoch": 7.6048780487804875, - "grad_norm": 2.8280539512634277, - "learning_rate": 6.775784314464717e-07, - "loss": 0.0171, - "step": 1559 - }, - { - "epoch": 7.609756097560975, - "grad_norm": 1.7876580953598022, - "learning_rate": 6.749579283619492e-07, - "loss": 0.01, - "step": 1560 - }, - { - "epoch": 7.614634146341463, - "grad_norm": 1.540212869644165, - "learning_rate": 6.723417114022907e-07, - "loss": 0.0162, - "step": 1561 - }, - { - "epoch": 7.619512195121951, - "grad_norm": 2.5126969814300537, - "learning_rate": 6.697297867117e-07, - "loss": 0.0237, - "step": 1562 - }, - { - "epoch": 7.624390243902439, - "grad_norm": 1.5419458150863647, - "learning_rate": 6.671221604243014e-07, - "loss": 0.0116, - "step": 1563 - }, - { - "epoch": 7.6292682926829265, - "grad_norm": 3.469961404800415, - "learning_rate": 6.645188386641257e-07, - "loss": 0.0506, - "step": 1564 - }, - { - "epoch": 7.634146341463414, - "grad_norm": 0.8771130442619324, - "learning_rate": 6.61919827545093e-07, - "loss": 0.002, - "step": 1565 - }, - { - "epoch": 7.639024390243902, - "grad_norm": 3.036559820175171, - "learning_rate": 6.593251331709993e-07, - "loss": 0.0673, - "step": 1566 - }, - { - "epoch": 7.64390243902439, - "grad_norm": 3.379220724105835, - "learning_rate": 6.567347616355049e-07, - "loss": 0.063, - "step": 1567 - }, - { - "epoch": 7.648780487804878, - "grad_norm": 0.7666990756988525, - "learning_rate": 6.541487190221163e-07, - "loss": 0.003, - "step": 1568 - }, - { - "epoch": 7.6536585365853655, - "grad_norm": 1.2181665897369385, - "learning_rate": 6.515670114041725e-07, - "loss": 0.0037, - "step": 1569 - }, - { - "epoch": 7.658536585365853, - "grad_norm": 1.0194541215896606, - "learning_rate": 6.489896448448349e-07, - "loss": 0.0043, - "step": 1570 - }, - { - "epoch": 7.663414634146341, - "grad_norm": 2.2625741958618164, - "learning_rate": 6.464166253970672e-07, - "loss": 0.0144, - "step": 1571 - }, - { - "epoch": 7.668292682926829, - "grad_norm": 1.0256692171096802, - "learning_rate": 6.43847959103624e-07, - "loss": 0.0029, - "step": 1572 - }, - { - "epoch": 7.673170731707317, - "grad_norm": 2.0418128967285156, - "learning_rate": 6.412836519970383e-07, - "loss": 0.0144, - "step": 1573 - }, - { - "epoch": 7.678048780487805, - "grad_norm": 0.8498746752738953, - "learning_rate": 6.387237100996041e-07, - "loss": 0.0026, - "step": 1574 - }, - { - "epoch": 7.682926829268292, - "grad_norm": 1.1043775081634521, - "learning_rate": 6.361681394233631e-07, - "loss": 0.0093, - "step": 1575 - }, - { - "epoch": 7.68780487804878, - "grad_norm": 1.064835786819458, - "learning_rate": 6.336169459700933e-07, - "loss": 0.0081, - "step": 1576 - }, - { - "epoch": 7.692682926829268, - "grad_norm": 1.2024056911468506, - "learning_rate": 6.310701357312909e-07, - "loss": 0.0054, - "step": 1577 - }, - { - "epoch": 7.697560975609756, - "grad_norm": 1.9509804248809814, - "learning_rate": 6.285277146881588e-07, - "loss": 0.0051, - "step": 1578 - }, - { - "epoch": 7.702439024390244, - "grad_norm": 1.8738386631011963, - "learning_rate": 6.259896888115904e-07, - "loss": 0.0118, - "step": 1579 - }, - { - "epoch": 7.7073170731707314, - "grad_norm": 1.356726884841919, - "learning_rate": 6.234560640621606e-07, - "loss": 0.009, - "step": 1580 - }, - { - "epoch": 7.712195121951219, - "grad_norm": 0.6530736684799194, - "learning_rate": 6.209268463901047e-07, - "loss": 0.0015, - "step": 1581 - }, - { - "epoch": 7.717073170731707, - "grad_norm": 1.3714262247085571, - "learning_rate": 6.184020417353084e-07, - "loss": 0.0051, - "step": 1582 - }, - { - "epoch": 7.721951219512195, - "grad_norm": 3.015583038330078, - "learning_rate": 6.158816560272962e-07, - "loss": 0.0383, - "step": 1583 - }, - { - "epoch": 7.726829268292683, - "grad_norm": 3.2355704307556152, - "learning_rate": 6.133656951852113e-07, - "loss": 0.0422, - "step": 1584 - }, - { - "epoch": 7.7317073170731705, - "grad_norm": 1.2933087348937988, - "learning_rate": 6.10854165117806e-07, - "loss": 0.0082, - "step": 1585 - }, - { - "epoch": 7.736585365853658, - "grad_norm": 1.6866157054901123, - "learning_rate": 6.083470717234285e-07, - "loss": 0.0052, - "step": 1586 - }, - { - "epoch": 7.741463414634146, - "grad_norm": 1.4597362279891968, - "learning_rate": 6.058444208900061e-07, - "loss": 0.0094, - "step": 1587 - }, - { - "epoch": 7.746341463414634, - "grad_norm": 0.9200596213340759, - "learning_rate": 6.033462184950317e-07, - "loss": 0.0034, - "step": 1588 - }, - { - "epoch": 7.751219512195122, - "grad_norm": 1.707422137260437, - "learning_rate": 6.008524704055535e-07, - "loss": 0.0141, - "step": 1589 - }, - { - "epoch": 7.7560975609756095, - "grad_norm": 1.8554565906524658, - "learning_rate": 5.983631824781572e-07, - "loss": 0.0108, - "step": 1590 - }, - { - "epoch": 7.760975609756097, - "grad_norm": 1.5421279668807983, - "learning_rate": 5.95878360558953e-07, - "loss": 0.0075, - "step": 1591 - }, - { - "epoch": 7.765853658536585, - "grad_norm": 1.5643326044082642, - "learning_rate": 5.933980104835652e-07, - "loss": 0.018, - "step": 1592 - }, - { - "epoch": 7.770731707317073, - "grad_norm": 1.7024025917053223, - "learning_rate": 5.909221380771132e-07, - "loss": 0.0207, - "step": 1593 - }, - { - "epoch": 7.775609756097561, - "grad_norm": 1.820544719696045, - "learning_rate": 5.884507491542024e-07, - "loss": 0.0217, - "step": 1594 - }, - { - "epoch": 7.780487804878049, - "grad_norm": 1.6761897802352905, - "learning_rate": 5.859838495189068e-07, - "loss": 0.0055, - "step": 1595 - }, - { - "epoch": 7.785365853658536, - "grad_norm": 2.3035616874694824, - "learning_rate": 5.835214449647602e-07, - "loss": 0.0147, - "step": 1596 - }, - { - "epoch": 7.790243902439024, - "grad_norm": 2.0507681369781494, - "learning_rate": 5.810635412747373e-07, - "loss": 0.0065, - "step": 1597 - }, - { - "epoch": 7.795121951219512, - "grad_norm": 1.3789564371109009, - "learning_rate": 5.786101442212422e-07, - "loss": 0.0077, - "step": 1598 - }, - { - "epoch": 7.8, - "grad_norm": 3.313107490539551, - "learning_rate": 5.761612595660979e-07, - "loss": 0.0699, - "step": 1599 - }, - { - "epoch": 7.804878048780488, - "grad_norm": 1.2391237020492554, - "learning_rate": 5.737168930605272e-07, - "loss": 0.0017, - "step": 1600 - }, - { - "epoch": 7.809756097560975, - "grad_norm": 1.1187714338302612, - "learning_rate": 5.712770504451426e-07, - "loss": 0.0101, - "step": 1601 - }, - { - "epoch": 7.814634146341463, - "grad_norm": 2.7611069679260254, - "learning_rate": 5.688417374499336e-07, - "loss": 0.0143, - "step": 1602 - }, - { - "epoch": 7.819512195121951, - "grad_norm": 1.627295732498169, - "learning_rate": 5.664109597942504e-07, - "loss": 0.0062, - "step": 1603 - }, - { - "epoch": 7.824390243902439, - "grad_norm": 4.538354396820068, - "learning_rate": 5.639847231867917e-07, - "loss": 0.1058, - "step": 1604 - }, - { - "epoch": 7.829268292682927, - "grad_norm": 1.783469319343567, - "learning_rate": 5.61563033325594e-07, - "loss": 0.0178, - "step": 1605 - }, - { - "epoch": 7.8341463414634145, - "grad_norm": 2.259584665298462, - "learning_rate": 5.591458958980123e-07, - "loss": 0.0204, - "step": 1606 - }, - { - "epoch": 7.839024390243902, - "grad_norm": 2.0741965770721436, - "learning_rate": 5.567333165807115e-07, - "loss": 0.0201, - "step": 1607 - }, - { - "epoch": 7.84390243902439, - "grad_norm": 0.8751707077026367, - "learning_rate": 5.543253010396538e-07, - "loss": 0.0077, - "step": 1608 - }, - { - "epoch": 7.848780487804878, - "grad_norm": 1.7383732795715332, - "learning_rate": 5.519218549300806e-07, - "loss": 0.0176, - "step": 1609 - }, - { - "epoch": 7.853658536585366, - "grad_norm": 2.0462191104888916, - "learning_rate": 5.495229838965021e-07, - "loss": 0.031, - "step": 1610 - }, - { - "epoch": 7.8585365853658535, - "grad_norm": 1.3201459646224976, - "learning_rate": 5.471286935726866e-07, - "loss": 0.0062, - "step": 1611 - }, - { - "epoch": 7.863414634146341, - "grad_norm": 2.9285616874694824, - "learning_rate": 5.447389895816416e-07, - "loss": 0.0615, - "step": 1612 - }, - { - "epoch": 7.868292682926829, - "grad_norm": 3.1918647289276123, - "learning_rate": 5.423538775356049e-07, - "loss": 0.0377, - "step": 1613 - }, - { - "epoch": 7.873170731707317, - "grad_norm": 1.406246542930603, - "learning_rate": 5.399733630360287e-07, - "loss": 0.0122, - "step": 1614 - }, - { - "epoch": 7.878048780487805, - "grad_norm": 1.7651537656784058, - "learning_rate": 5.375974516735713e-07, - "loss": 0.015, - "step": 1615 - }, - { - "epoch": 7.882926829268293, - "grad_norm": 1.9614673852920532, - "learning_rate": 5.352261490280767e-07, - "loss": 0.0058, - "step": 1616 - }, - { - "epoch": 7.88780487804878, - "grad_norm": 1.6031639575958252, - "learning_rate": 5.328594606685661e-07, - "loss": 0.0041, - "step": 1617 - }, - { - "epoch": 7.892682926829268, - "grad_norm": 0.9787303805351257, - "learning_rate": 5.304973921532264e-07, - "loss": 0.0067, - "step": 1618 - }, - { - "epoch": 7.897560975609756, - "grad_norm": 1.2693779468536377, - "learning_rate": 5.281399490293923e-07, - "loss": 0.0064, - "step": 1619 - }, - { - "epoch": 7.902439024390244, - "grad_norm": 1.8421361446380615, - "learning_rate": 5.257871368335357e-07, - "loss": 0.0182, - "step": 1620 - }, - { - "epoch": 7.907317073170732, - "grad_norm": 0.9667096138000488, - "learning_rate": 5.234389610912552e-07, - "loss": 0.0024, - "step": 1621 - }, - { - "epoch": 7.912195121951219, - "grad_norm": 3.2266018390655518, - "learning_rate": 5.210954273172578e-07, - "loss": 0.02, - "step": 1622 - }, - { - "epoch": 7.917073170731707, - "grad_norm": 1.5821634531021118, - "learning_rate": 5.187565410153497e-07, - "loss": 0.024, - "step": 1623 - }, - { - "epoch": 7.921951219512195, - "grad_norm": 1.9864275455474854, - "learning_rate": 5.164223076784239e-07, - "loss": 0.0103, - "step": 1624 - }, - { - "epoch": 7.926829268292683, - "grad_norm": 1.866466999053955, - "learning_rate": 5.14092732788444e-07, - "loss": 0.0268, - "step": 1625 - }, - { - "epoch": 7.931707317073171, - "grad_norm": 1.165686011314392, - "learning_rate": 5.117678218164337e-07, - "loss": 0.0085, - "step": 1626 - }, - { - "epoch": 7.9365853658536585, - "grad_norm": 1.1883208751678467, - "learning_rate": 5.094475802224644e-07, - "loss": 0.006, - "step": 1627 - }, - { - "epoch": 7.941463414634146, - "grad_norm": 1.5121057033538818, - "learning_rate": 5.071320134556404e-07, - "loss": 0.003, - "step": 1628 - }, - { - "epoch": 7.946341463414634, - "grad_norm": 1.1923614740371704, - "learning_rate": 5.048211269540868e-07, - "loss": 0.0064, - "step": 1629 - }, - { - "epoch": 7.951219512195122, - "grad_norm": 1.33751380443573, - "learning_rate": 5.025149261449391e-07, - "loss": 0.0082, - "step": 1630 - }, - { - "epoch": 7.95609756097561, - "grad_norm": 1.9143925905227661, - "learning_rate": 5.002134164443262e-07, - "loss": 0.0202, - "step": 1631 - }, - { - "epoch": 7.9609756097560975, - "grad_norm": 1.2547078132629395, - "learning_rate": 4.979166032573607e-07, - "loss": 0.0033, - "step": 1632 - }, - { - "epoch": 7.965853658536585, - "grad_norm": 2.3050332069396973, - "learning_rate": 4.956244919781247e-07, - "loss": 0.052, - "step": 1633 - }, - { - "epoch": 7.970731707317073, - "grad_norm": 1.4462478160858154, - "learning_rate": 4.933370879896604e-07, - "loss": 0.0049, - "step": 1634 - }, - { - "epoch": 7.975609756097561, - "grad_norm": 1.519913911819458, - "learning_rate": 4.91054396663952e-07, - "loss": 0.0102, - "step": 1635 - }, - { - "epoch": 7.980487804878049, - "grad_norm": 2.9544193744659424, - "learning_rate": 4.887764233619163e-07, - "loss": 0.0112, - "step": 1636 - }, - { - "epoch": 7.985365853658537, - "grad_norm": 0.9778392314910889, - "learning_rate": 4.865031734333919e-07, - "loss": 0.0032, - "step": 1637 - }, - { - "epoch": 7.990243902439024, - "grad_norm": 2.783501386642456, - "learning_rate": 4.842346522171226e-07, - "loss": 0.012, - "step": 1638 - }, - { - "epoch": 7.995121951219512, - "grad_norm": 1.5644093751907349, - "learning_rate": 4.819708650407467e-07, - "loss": 0.0184, - "step": 1639 - }, - { - "epoch": 8.0, - "grad_norm": 1.5741018056869507, - "learning_rate": 4.797118172207863e-07, - "loss": 0.0112, - "step": 1640 - }, - { - "epoch": 8.004878048780487, - "grad_norm": 0.9010241031646729, - "learning_rate": 4.774575140626317e-07, - "loss": 0.0064, - "step": 1641 - }, - { - "epoch": 8.009756097560976, - "grad_norm": 0.8204272985458374, - "learning_rate": 4.752079608605295e-07, - "loss": 0.003, - "step": 1642 - }, - { - "epoch": 8.014634146341463, - "grad_norm": 1.8131763935089111, - "learning_rate": 4.7296316289757366e-07, - "loss": 0.0063, - "step": 1643 - }, - { - "epoch": 8.019512195121951, - "grad_norm": 0.9918075799942017, - "learning_rate": 4.7072312544568844e-07, - "loss": 0.0039, - "step": 1644 - }, - { - "epoch": 8.024390243902438, - "grad_norm": 0.5097177028656006, - "learning_rate": 4.6848785376561733e-07, - "loss": 0.0028, - "step": 1645 - }, - { - "epoch": 8.029268292682927, - "grad_norm": 0.3497299253940582, - "learning_rate": 4.6625735310691396e-07, - "loss": 0.0021, - "step": 1646 - }, - { - "epoch": 8.034146341463414, - "grad_norm": 0.9271900057792664, - "learning_rate": 4.6403162870792524e-07, - "loss": 0.005, - "step": 1647 - }, - { - "epoch": 8.039024390243902, - "grad_norm": 0.951755940914154, - "learning_rate": 4.618106857957805e-07, - "loss": 0.0042, - "step": 1648 - }, - { - "epoch": 8.04390243902439, - "grad_norm": 0.6863508820533752, - "learning_rate": 4.5959452958638213e-07, - "loss": 0.0014, - "step": 1649 - }, - { - "epoch": 8.048780487804878, - "grad_norm": 0.45382270216941833, - "learning_rate": 4.573831652843888e-07, - "loss": 0.0012, - "step": 1650 - }, - { - "epoch": 8.053658536585365, - "grad_norm": 1.8319289684295654, - "learning_rate": 4.55176598083206e-07, - "loss": 0.0234, - "step": 1651 - }, - { - "epoch": 8.058536585365854, - "grad_norm": 1.2312507629394531, - "learning_rate": 4.5297483316497276e-07, - "loss": 0.0042, - "step": 1652 - }, - { - "epoch": 8.06341463414634, - "grad_norm": 1.4057971239089966, - "learning_rate": 4.5077787570055097e-07, - "loss": 0.0085, - "step": 1653 - }, - { - "epoch": 8.06829268292683, - "grad_norm": 3.3510940074920654, - "learning_rate": 4.4858573084951173e-07, - "loss": 0.0628, - "step": 1654 - }, - { - "epoch": 8.073170731707316, - "grad_norm": 0.6469231247901917, - "learning_rate": 4.463984037601224e-07, - "loss": 0.0026, - "step": 1655 - }, - { - "epoch": 8.078048780487805, - "grad_norm": 0.9491491317749023, - "learning_rate": 4.4421589956933827e-07, - "loss": 0.0021, - "step": 1656 - }, - { - "epoch": 8.082926829268292, - "grad_norm": 1.0847301483154297, - "learning_rate": 4.420382234027859e-07, - "loss": 0.0042, - "step": 1657 - }, - { - "epoch": 8.08780487804878, - "grad_norm": 0.5364987254142761, - "learning_rate": 4.398653803747532e-07, - "loss": 0.0045, - "step": 1658 - }, - { - "epoch": 8.092682926829267, - "grad_norm": 1.057804822921753, - "learning_rate": 4.3769737558817996e-07, - "loss": 0.0015, - "step": 1659 - }, - { - "epoch": 8.097560975609756, - "grad_norm": 1.2050957679748535, - "learning_rate": 4.355342141346405e-07, - "loss": 0.0124, - "step": 1660 - }, - { - "epoch": 8.102439024390243, - "grad_norm": 0.2821386754512787, - "learning_rate": 4.3337590109433505e-07, - "loss": 0.002, - "step": 1661 - }, - { - "epoch": 8.107317073170732, - "grad_norm": 0.7883970141410828, - "learning_rate": 4.3122244153607914e-07, - "loss": 0.0013, - "step": 1662 - }, - { - "epoch": 8.112195121951219, - "grad_norm": 1.1907166242599487, - "learning_rate": 4.2907384051728754e-07, - "loss": 0.0201, - "step": 1663 - }, - { - "epoch": 8.117073170731707, - "grad_norm": 1.3646314144134521, - "learning_rate": 4.2693010308396566e-07, - "loss": 0.0039, - "step": 1664 - }, - { - "epoch": 8.121951219512194, - "grad_norm": 2.0689423084259033, - "learning_rate": 4.247912342706975e-07, - "loss": 0.0035, - "step": 1665 - }, - { - "epoch": 8.126829268292683, - "grad_norm": 0.4086499810218811, - "learning_rate": 4.22657239100632e-07, - "loss": 0.0009, - "step": 1666 - }, - { - "epoch": 8.13170731707317, - "grad_norm": 0.9431869387626648, - "learning_rate": 4.2052812258547265e-07, - "loss": 0.0018, - "step": 1667 - }, - { - "epoch": 8.136585365853659, - "grad_norm": 0.9063575863838196, - "learning_rate": 4.184038897254655e-07, - "loss": 0.0021, - "step": 1668 - }, - { - "epoch": 8.141463414634146, - "grad_norm": 2.707298517227173, - "learning_rate": 4.1628454550938697e-07, - "loss": 0.019, - "step": 1669 - }, - { - "epoch": 8.146341463414634, - "grad_norm": 1.687988042831421, - "learning_rate": 4.141700949145322e-07, - "loss": 0.0144, - "step": 1670 - }, - { - "epoch": 8.151219512195121, - "grad_norm": 0.8905831575393677, - "learning_rate": 4.1206054290670537e-07, - "loss": 0.0088, - "step": 1671 - }, - { - "epoch": 8.15609756097561, - "grad_norm": 1.418512225151062, - "learning_rate": 4.0995589444020433e-07, - "loss": 0.0083, - "step": 1672 - }, - { - "epoch": 8.160975609756097, - "grad_norm": 1.1676236391067505, - "learning_rate": 4.0785615445781106e-07, - "loss": 0.0027, - "step": 1673 - }, - { - "epoch": 8.165853658536586, - "grad_norm": 1.5615407228469849, - "learning_rate": 4.057613278907818e-07, - "loss": 0.0089, - "step": 1674 - }, - { - "epoch": 8.170731707317072, - "grad_norm": 1.0604172945022583, - "learning_rate": 4.036714196588318e-07, - "loss": 0.0034, - "step": 1675 - }, - { - "epoch": 8.175609756097561, - "grad_norm": 1.3175733089447021, - "learning_rate": 4.015864346701251e-07, - "loss": 0.0021, - "step": 1676 - }, - { - "epoch": 8.180487804878048, - "grad_norm": 0.2539370059967041, - "learning_rate": 3.99506377821266e-07, - "loss": 0.0005, - "step": 1677 - }, - { - "epoch": 8.185365853658537, - "grad_norm": 0.8106228113174438, - "learning_rate": 3.97431253997283e-07, - "loss": 0.003, - "step": 1678 - }, - { - "epoch": 8.190243902439024, - "grad_norm": 0.6703351140022278, - "learning_rate": 3.9536106807161857e-07, - "loss": 0.0028, - "step": 1679 - }, - { - "epoch": 8.195121951219512, - "grad_norm": 1.2921632528305054, - "learning_rate": 3.932958249061214e-07, - "loss": 0.0097, - "step": 1680 - }, - { - "epoch": 8.2, - "grad_norm": 0.7795253992080688, - "learning_rate": 3.9123552935102976e-07, - "loss": 0.004, - "step": 1681 - }, - { - "epoch": 8.204878048780488, - "grad_norm": 1.3402642011642456, - "learning_rate": 3.891801862449629e-07, - "loss": 0.0189, - "step": 1682 - }, - { - "epoch": 8.209756097560975, - "grad_norm": 0.6951391696929932, - "learning_rate": 3.8712980041490905e-07, - "loss": 0.0038, - "step": 1683 - }, - { - "epoch": 8.214634146341464, - "grad_norm": 0.8145114183425903, - "learning_rate": 3.850843766762155e-07, - "loss": 0.0038, - "step": 1684 - }, - { - "epoch": 8.21951219512195, - "grad_norm": 0.30702775716781616, - "learning_rate": 3.830439198325747e-07, - "loss": 0.0008, - "step": 1685 - }, - { - "epoch": 8.22439024390244, - "grad_norm": 0.45050138235092163, - "learning_rate": 3.81008434676014e-07, - "loss": 0.0013, - "step": 1686 - }, - { - "epoch": 8.229268292682926, - "grad_norm": 0.7875486612319946, - "learning_rate": 3.789779259868864e-07, - "loss": 0.0016, - "step": 1687 - }, - { - "epoch": 8.234146341463415, - "grad_norm": 0.9437265396118164, - "learning_rate": 3.769523985338566e-07, - "loss": 0.0045, - "step": 1688 - }, - { - "epoch": 8.239024390243902, - "grad_norm": 1.2928845882415771, - "learning_rate": 3.749318570738897e-07, - "loss": 0.0057, - "step": 1689 - }, - { - "epoch": 8.24390243902439, - "grad_norm": 0.9615103006362915, - "learning_rate": 3.7291630635224397e-07, - "loss": 0.0026, - "step": 1690 - }, - { - "epoch": 8.248780487804877, - "grad_norm": 0.8654932975769043, - "learning_rate": 3.709057511024541e-07, - "loss": 0.0056, - "step": 1691 - }, - { - "epoch": 8.253658536585366, - "grad_norm": 1.1101908683776855, - "learning_rate": 3.689001960463243e-07, - "loss": 0.0019, - "step": 1692 - }, - { - "epoch": 8.258536585365853, - "grad_norm": 0.9586653709411621, - "learning_rate": 3.668996458939156e-07, - "loss": 0.003, - "step": 1693 - }, - { - "epoch": 8.263414634146342, - "grad_norm": 1.1638360023498535, - "learning_rate": 3.649041053435354e-07, - "loss": 0.0031, - "step": 1694 - }, - { - "epoch": 8.268292682926829, - "grad_norm": 0.41364336013793945, - "learning_rate": 3.62913579081724e-07, - "loss": 0.0012, - "step": 1695 - }, - { - "epoch": 8.273170731707317, - "grad_norm": 1.1794198751449585, - "learning_rate": 3.609280717832489e-07, - "loss": 0.0067, - "step": 1696 - }, - { - "epoch": 8.278048780487804, - "grad_norm": 0.7281041741371155, - "learning_rate": 3.5894758811108795e-07, - "loss": 0.002, - "step": 1697 - }, - { - "epoch": 8.282926829268293, - "grad_norm": 0.42419376969337463, - "learning_rate": 3.5697213271642164e-07, - "loss": 0.0008, - "step": 1698 - }, - { - "epoch": 8.28780487804878, - "grad_norm": 0.6596572995185852, - "learning_rate": 3.5500171023862136e-07, - "loss": 0.0028, - "step": 1699 - }, - { - "epoch": 8.292682926829269, - "grad_norm": 1.236666202545166, - "learning_rate": 3.530363253052399e-07, - "loss": 0.0032, - "step": 1700 - }, - { - "epoch": 8.297560975609755, - "grad_norm": 0.977694571018219, - "learning_rate": 3.510759825319976e-07, - "loss": 0.0068, - "step": 1701 - }, - { - "epoch": 8.302439024390244, - "grad_norm": 1.0168365240097046, - "learning_rate": 3.491206865227739e-07, - "loss": 0.0017, - "step": 1702 - }, - { - "epoch": 8.307317073170731, - "grad_norm": 2.269639253616333, - "learning_rate": 3.4717044186959676e-07, - "loss": 0.0398, - "step": 1703 - }, - { - "epoch": 8.31219512195122, - "grad_norm": 1.0657192468643188, - "learning_rate": 3.452252531526301e-07, - "loss": 0.0049, - "step": 1704 - }, - { - "epoch": 8.317073170731707, - "grad_norm": 1.50715970993042, - "learning_rate": 3.432851249401628e-07, - "loss": 0.0164, - "step": 1705 - }, - { - "epoch": 8.321951219512195, - "grad_norm": 0.701214611530304, - "learning_rate": 3.413500617886023e-07, - "loss": 0.0038, - "step": 1706 - }, - { - "epoch": 8.326829268292682, - "grad_norm": 1.6810981035232544, - "learning_rate": 3.394200682424578e-07, - "loss": 0.0118, - "step": 1707 - }, - { - "epoch": 8.331707317073171, - "grad_norm": 1.4712997674942017, - "learning_rate": 3.374951488343328e-07, - "loss": 0.006, - "step": 1708 - }, - { - "epoch": 8.336585365853658, - "grad_norm": 0.6115317940711975, - "learning_rate": 3.355753080849164e-07, - "loss": 0.0011, - "step": 1709 - }, - { - "epoch": 8.341463414634147, - "grad_norm": 0.8171163201332092, - "learning_rate": 3.3366055050296776e-07, - "loss": 0.0024, - "step": 1710 - }, - { - "epoch": 8.346341463414634, - "grad_norm": 0.7722201943397522, - "learning_rate": 3.3175088058530925e-07, - "loss": 0.0028, - "step": 1711 - }, - { - "epoch": 8.351219512195122, - "grad_norm": 3.0709617137908936, - "learning_rate": 3.2984630281681556e-07, - "loss": 0.0109, - "step": 1712 - }, - { - "epoch": 8.35609756097561, - "grad_norm": 1.7634369134902954, - "learning_rate": 3.2794682167040125e-07, - "loss": 0.0031, - "step": 1713 - }, - { - "epoch": 8.360975609756098, - "grad_norm": 1.7657215595245361, - "learning_rate": 3.2605244160701155e-07, - "loss": 0.01, - "step": 1714 - }, - { - "epoch": 8.365853658536585, - "grad_norm": 1.432230830192566, - "learning_rate": 3.2416316707561316e-07, - "loss": 0.0042, - "step": 1715 - }, - { - "epoch": 8.370731707317074, - "grad_norm": 0.465900719165802, - "learning_rate": 3.2227900251318055e-07, - "loss": 0.0021, - "step": 1716 - }, - { - "epoch": 8.37560975609756, - "grad_norm": 1.3770387172698975, - "learning_rate": 3.2039995234468854e-07, - "loss": 0.0031, - "step": 1717 - }, - { - "epoch": 8.38048780487805, - "grad_norm": 0.4842236638069153, - "learning_rate": 3.1852602098309984e-07, - "loss": 0.0009, - "step": 1718 - }, - { - "epoch": 8.385365853658536, - "grad_norm": 0.6840565204620361, - "learning_rate": 3.1665721282935683e-07, - "loss": 0.0047, - "step": 1719 - }, - { - "epoch": 8.390243902439025, - "grad_norm": 0.5206313729286194, - "learning_rate": 3.147935322723694e-07, - "loss": 0.0026, - "step": 1720 - }, - { - "epoch": 8.395121951219512, - "grad_norm": 1.131412386894226, - "learning_rate": 3.1293498368900414e-07, - "loss": 0.0019, - "step": 1721 - }, - { - "epoch": 8.4, - "grad_norm": 0.5872076153755188, - "learning_rate": 3.1108157144407765e-07, - "loss": 0.0009, - "step": 1722 - }, - { - "epoch": 8.404878048780487, - "grad_norm": 1.1455132961273193, - "learning_rate": 3.092332998903416e-07, - "loss": 0.0047, - "step": 1723 - }, - { - "epoch": 8.409756097560976, - "grad_norm": 1.4331532716751099, - "learning_rate": 3.073901733684748e-07, - "loss": 0.0162, - "step": 1724 - }, - { - "epoch": 8.414634146341463, - "grad_norm": 0.8186633586883545, - "learning_rate": 3.055521962070751e-07, - "loss": 0.0078, - "step": 1725 - }, - { - "epoch": 8.419512195121952, - "grad_norm": 0.9004407525062561, - "learning_rate": 3.0371937272264454e-07, - "loss": 0.0035, - "step": 1726 - }, - { - "epoch": 8.424390243902439, - "grad_norm": 0.8009728789329529, - "learning_rate": 3.0189170721958234e-07, - "loss": 0.0011, - "step": 1727 - }, - { - "epoch": 8.429268292682927, - "grad_norm": 0.7846589088439941, - "learning_rate": 3.000692039901756e-07, - "loss": 0.0042, - "step": 1728 - }, - { - "epoch": 8.434146341463414, - "grad_norm": 1.2301117181777954, - "learning_rate": 2.982518673145862e-07, - "loss": 0.0159, - "step": 1729 - }, - { - "epoch": 8.439024390243903, - "grad_norm": 0.8503583073616028, - "learning_rate": 2.9643970146084193e-07, - "loss": 0.0021, - "step": 1730 - }, - { - "epoch": 8.44390243902439, - "grad_norm": 1.661842942237854, - "learning_rate": 2.9463271068482955e-07, - "loss": 0.0124, - "step": 1731 - }, - { - "epoch": 8.448780487804878, - "grad_norm": 0.7799263000488281, - "learning_rate": 2.928308992302792e-07, - "loss": 0.0038, - "step": 1732 - }, - { - "epoch": 8.453658536585365, - "grad_norm": 0.6021434664726257, - "learning_rate": 2.9103427132875785e-07, - "loss": 0.0013, - "step": 1733 - }, - { - "epoch": 8.458536585365854, - "grad_norm": 1.430431604385376, - "learning_rate": 2.892428311996609e-07, - "loss": 0.0151, - "step": 1734 - }, - { - "epoch": 8.463414634146341, - "grad_norm": 1.1589592695236206, - "learning_rate": 2.8745658305019824e-07, - "loss": 0.0037, - "step": 1735 - }, - { - "epoch": 8.46829268292683, - "grad_norm": 0.7232568860054016, - "learning_rate": 2.856755310753867e-07, - "loss": 0.0046, - "step": 1736 - }, - { - "epoch": 8.473170731707317, - "grad_norm": 0.6265125274658203, - "learning_rate": 2.8389967945803984e-07, - "loss": 0.0014, - "step": 1737 - }, - { - "epoch": 8.478048780487805, - "grad_norm": 0.7115193009376526, - "learning_rate": 2.821290323687592e-07, - "loss": 0.0036, - "step": 1738 - }, - { - "epoch": 8.482926829268292, - "grad_norm": 0.5157519578933716, - "learning_rate": 2.803635939659222e-07, - "loss": 0.0016, - "step": 1739 - }, - { - "epoch": 8.487804878048781, - "grad_norm": 0.9217156767845154, - "learning_rate": 2.786033683956732e-07, - "loss": 0.0052, - "step": 1740 - }, - { - "epoch": 8.492682926829268, - "grad_norm": 4.063957691192627, - "learning_rate": 2.7684835979191664e-07, - "loss": 0.0999, - "step": 1741 - }, - { - "epoch": 8.497560975609757, - "grad_norm": 0.38870275020599365, - "learning_rate": 2.7509857227630223e-07, - "loss": 0.0009, - "step": 1742 - }, - { - "epoch": 8.502439024390243, - "grad_norm": 0.8282430768013, - "learning_rate": 2.733540099582188e-07, - "loss": 0.0026, - "step": 1743 - }, - { - "epoch": 8.507317073170732, - "grad_norm": 1.7269257307052612, - "learning_rate": 2.7161467693478493e-07, - "loss": 0.0094, - "step": 1744 - }, - { - "epoch": 8.512195121951219, - "grad_norm": 1.4464598894119263, - "learning_rate": 2.6988057729083613e-07, - "loss": 0.006, - "step": 1745 - }, - { - "epoch": 8.517073170731708, - "grad_norm": 0.9648481011390686, - "learning_rate": 2.681517150989185e-07, - "loss": 0.0043, - "step": 1746 - }, - { - "epoch": 8.521951219512195, - "grad_norm": 0.7762707471847534, - "learning_rate": 2.664280944192782e-07, - "loss": 0.0026, - "step": 1747 - }, - { - "epoch": 8.526829268292683, - "grad_norm": 0.9751222133636475, - "learning_rate": 2.64709719299851e-07, - "loss": 0.0044, - "step": 1748 - }, - { - "epoch": 8.53170731707317, - "grad_norm": 0.5906254053115845, - "learning_rate": 2.6299659377625296e-07, - "loss": 0.0008, - "step": 1749 - }, - { - "epoch": 8.536585365853659, - "grad_norm": 1.9417753219604492, - "learning_rate": 2.612887218717733e-07, - "loss": 0.0324, - "step": 1750 - }, - { - "epoch": 8.541463414634146, - "grad_norm": 0.6434907913208008, - "learning_rate": 2.5958610759736133e-07, - "loss": 0.0028, - "step": 1751 - }, - { - "epoch": 8.546341463414635, - "grad_norm": 0.8546578884124756, - "learning_rate": 2.5788875495161846e-07, - "loss": 0.0019, - "step": 1752 - }, - { - "epoch": 8.551219512195122, - "grad_norm": 0.8363909721374512, - "learning_rate": 2.561966679207917e-07, - "loss": 0.0028, - "step": 1753 - }, - { - "epoch": 8.55609756097561, - "grad_norm": 1.4901739358901978, - "learning_rate": 2.545098504787588e-07, - "loss": 0.0266, - "step": 1754 - }, - { - "epoch": 8.560975609756097, - "grad_norm": 0.6730532646179199, - "learning_rate": 2.5282830658702323e-07, - "loss": 0.0009, - "step": 1755 - }, - { - "epoch": 8.565853658536586, - "grad_norm": 0.7190845608711243, - "learning_rate": 2.511520401947032e-07, - "loss": 0.0056, - "step": 1756 - }, - { - "epoch": 8.570731707317073, - "grad_norm": 0.441381573677063, - "learning_rate": 2.494810552385232e-07, - "loss": 0.0009, - "step": 1757 - }, - { - "epoch": 8.575609756097561, - "grad_norm": 1.103507399559021, - "learning_rate": 2.47815355642804e-07, - "loss": 0.0023, - "step": 1758 - }, - { - "epoch": 8.580487804878048, - "grad_norm": 1.994994878768921, - "learning_rate": 2.461549453194523e-07, - "loss": 0.0454, - "step": 1759 - }, - { - "epoch": 8.585365853658537, - "grad_norm": 2.3645970821380615, - "learning_rate": 2.444998281679553e-07, - "loss": 0.0204, - "step": 1760 - }, - { - "epoch": 8.590243902439024, - "grad_norm": 1.7933200597763062, - "learning_rate": 2.428500080753676e-07, - "loss": 0.0387, - "step": 1761 - }, - { - "epoch": 8.595121951219513, - "grad_norm": 1.6070597171783447, - "learning_rate": 2.412054889163035e-07, - "loss": 0.0014, - "step": 1762 - }, - { - "epoch": 8.6, - "grad_norm": 0.2842216193675995, - "learning_rate": 2.3956627455292924e-07, - "loss": 0.0011, - "step": 1763 - }, - { - "epoch": 8.604878048780488, - "grad_norm": 0.8213078379631042, - "learning_rate": 2.3793236883495164e-07, - "loss": 0.003, - "step": 1764 - }, - { - "epoch": 8.609756097560975, - "grad_norm": 0.9147091507911682, - "learning_rate": 2.363037755996095e-07, - "loss": 0.0032, - "step": 1765 - }, - { - "epoch": 8.614634146341464, - "grad_norm": 1.4246805906295776, - "learning_rate": 2.3468049867166747e-07, - "loss": 0.0037, - "step": 1766 - }, - { - "epoch": 8.61951219512195, - "grad_norm": 0.5553964376449585, - "learning_rate": 2.3306254186340305e-07, - "loss": 0.0014, - "step": 1767 - }, - { - "epoch": 8.62439024390244, - "grad_norm": 1.6941331624984741, - "learning_rate": 2.314499089745989e-07, - "loss": 0.0125, - "step": 1768 - }, - { - "epoch": 8.629268292682926, - "grad_norm": 2.965517520904541, - "learning_rate": 2.2984260379253693e-07, - "loss": 0.0855, - "step": 1769 - }, - { - "epoch": 8.634146341463415, - "grad_norm": 0.9295977354049683, - "learning_rate": 2.2824063009198428e-07, - "loss": 0.0031, - "step": 1770 - }, - { - "epoch": 8.639024390243902, - "grad_norm": 0.990189254283905, - "learning_rate": 2.2664399163518786e-07, - "loss": 0.0056, - "step": 1771 - }, - { - "epoch": 8.64390243902439, - "grad_norm": 1.7282871007919312, - "learning_rate": 2.25052692171866e-07, - "loss": 0.022, - "step": 1772 - }, - { - "epoch": 8.648780487804878, - "grad_norm": 1.2093932628631592, - "learning_rate": 2.2346673543919645e-07, - "loss": 0.0025, - "step": 1773 - }, - { - "epoch": 8.653658536585366, - "grad_norm": 0.9555385112762451, - "learning_rate": 2.2188612516181067e-07, - "loss": 0.0081, - "step": 1774 - }, - { - "epoch": 8.658536585365853, - "grad_norm": 0.7467104196548462, - "learning_rate": 2.203108650517835e-07, - "loss": 0.0015, - "step": 1775 - }, - { - "epoch": 8.663414634146342, - "grad_norm": 0.893450140953064, - "learning_rate": 2.1874095880862505e-07, - "loss": 0.0023, - "step": 1776 - }, - { - "epoch": 8.668292682926829, - "grad_norm": 1.0488923788070679, - "learning_rate": 2.171764101192722e-07, - "loss": 0.002, - "step": 1777 - }, - { - "epoch": 8.673170731707318, - "grad_norm": 1.1046003103256226, - "learning_rate": 2.1561722265807827e-07, - "loss": 0.002, - "step": 1778 - }, - { - "epoch": 8.678048780487805, - "grad_norm": 0.38860198855400085, - "learning_rate": 2.1406340008680748e-07, - "loss": 0.0015, - "step": 1779 - }, - { - "epoch": 8.682926829268293, - "grad_norm": 0.9970881938934326, - "learning_rate": 2.1251494605462358e-07, - "loss": 0.0028, - "step": 1780 - }, - { - "epoch": 8.68780487804878, - "grad_norm": 0.32808956503868103, - "learning_rate": 2.1097186419808151e-07, - "loss": 0.0008, - "step": 1781 - }, - { - "epoch": 8.692682926829269, - "grad_norm": 0.25458696484565735, - "learning_rate": 2.094341581411216e-07, - "loss": 0.0012, - "step": 1782 - }, - { - "epoch": 8.697560975609756, - "grad_norm": 0.3530316948890686, - "learning_rate": 2.0790183149505733e-07, - "loss": 0.0021, - "step": 1783 - }, - { - "epoch": 8.702439024390245, - "grad_norm": 0.6706930994987488, - "learning_rate": 2.063748878585689e-07, - "loss": 0.0028, - "step": 1784 - }, - { - "epoch": 8.707317073170731, - "grad_norm": 0.9568914175033569, - "learning_rate": 2.0485333081769588e-07, - "loss": 0.0018, - "step": 1785 - }, - { - "epoch": 8.71219512195122, - "grad_norm": 1.2713409662246704, - "learning_rate": 2.0333716394582536e-07, - "loss": 0.0142, - "step": 1786 - }, - { - "epoch": 8.717073170731707, - "grad_norm": 1.7427871227264404, - "learning_rate": 2.0182639080368634e-07, - "loss": 0.0135, - "step": 1787 - }, - { - "epoch": 8.721951219512196, - "grad_norm": 0.8939143419265747, - "learning_rate": 2.003210149393417e-07, - "loss": 0.0078, - "step": 1788 - }, - { - "epoch": 8.726829268292683, - "grad_norm": 1.1459598541259766, - "learning_rate": 1.9882103988817735e-07, - "loss": 0.0066, - "step": 1789 - }, - { - "epoch": 8.731707317073171, - "grad_norm": 0.875706672668457, - "learning_rate": 1.9732646917289545e-07, - "loss": 0.0051, - "step": 1790 - }, - { - "epoch": 8.736585365853658, - "grad_norm": 0.2884235084056854, - "learning_rate": 1.958373063035071e-07, - "loss": 0.001, - "step": 1791 - }, - { - "epoch": 8.741463414634147, - "grad_norm": 1.3679368495941162, - "learning_rate": 1.9435355477732205e-07, - "loss": 0.0057, - "step": 1792 - }, - { - "epoch": 8.746341463414634, - "grad_norm": 0.5913633108139038, - "learning_rate": 1.928752180789417e-07, - "loss": 0.0023, - "step": 1793 - }, - { - "epoch": 8.751219512195123, - "grad_norm": 1.565428376197815, - "learning_rate": 1.9140229968025058e-07, - "loss": 0.0191, - "step": 1794 - }, - { - "epoch": 8.75609756097561, - "grad_norm": 1.4710811376571655, - "learning_rate": 1.8993480304040912e-07, - "loss": 0.0114, - "step": 1795 - }, - { - "epoch": 8.760975609756098, - "grad_norm": 1.803842306137085, - "learning_rate": 1.8847273160584378e-07, - "loss": 0.0046, - "step": 1796 - }, - { - "epoch": 8.765853658536585, - "grad_norm": 0.694587230682373, - "learning_rate": 1.8701608881023957e-07, - "loss": 0.0014, - "step": 1797 - }, - { - "epoch": 8.770731707317074, - "grad_norm": 0.7563489675521851, - "learning_rate": 1.855648780745342e-07, - "loss": 0.0085, - "step": 1798 - }, - { - "epoch": 8.77560975609756, - "grad_norm": 1.1587045192718506, - "learning_rate": 1.8411910280690588e-07, - "loss": 0.0034, - "step": 1799 - }, - { - "epoch": 8.78048780487805, - "grad_norm": 1.7251181602478027, - "learning_rate": 1.826787664027685e-07, - "loss": 0.0119, - "step": 1800 - }, - { - "epoch": 8.785365853658536, - "grad_norm": 1.3170053958892822, - "learning_rate": 1.8124387224476347e-07, - "loss": 0.0059, - "step": 1801 - }, - { - "epoch": 8.790243902439025, - "grad_norm": 0.927018940448761, - "learning_rate": 1.7981442370274993e-07, - "loss": 0.0021, - "step": 1802 - }, - { - "epoch": 8.795121951219512, - "grad_norm": 2.3129045963287354, - "learning_rate": 1.783904241337983e-07, - "loss": 0.0085, - "step": 1803 - }, - { - "epoch": 8.8, - "grad_norm": 1.1010651588439941, - "learning_rate": 1.7697187688218291e-07, - "loss": 0.0037, - "step": 1804 - }, - { - "epoch": 8.804878048780488, - "grad_norm": 0.3990725576877594, - "learning_rate": 1.7555878527937164e-07, - "loss": 0.0008, - "step": 1805 - }, - { - "epoch": 8.809756097560976, - "grad_norm": 1.022905707359314, - "learning_rate": 1.7415115264402065e-07, - "loss": 0.0092, - "step": 1806 - }, - { - "epoch": 8.814634146341463, - "grad_norm": 0.7391730546951294, - "learning_rate": 1.727489822819664e-07, - "loss": 0.0016, - "step": 1807 - }, - { - "epoch": 8.819512195121952, - "grad_norm": 0.5859627723693848, - "learning_rate": 1.7135227748621585e-07, - "loss": 0.0012, - "step": 1808 - }, - { - "epoch": 8.824390243902439, - "grad_norm": 1.5222235918045044, - "learning_rate": 1.699610415369407e-07, - "loss": 0.0126, - "step": 1809 - }, - { - "epoch": 8.829268292682928, - "grad_norm": 0.8635048270225525, - "learning_rate": 1.6857527770146876e-07, - "loss": 0.0086, - "step": 1810 - }, - { - "epoch": 8.834146341463414, - "grad_norm": 0.8385710120201111, - "learning_rate": 1.6719498923427697e-07, - "loss": 0.0031, - "step": 1811 - }, - { - "epoch": 8.839024390243903, - "grad_norm": 1.0619077682495117, - "learning_rate": 1.6582017937698287e-07, - "loss": 0.0083, - "step": 1812 - }, - { - "epoch": 8.84390243902439, - "grad_norm": 0.6677606701850891, - "learning_rate": 1.6445085135833732e-07, - "loss": 0.002, - "step": 1813 - }, - { - "epoch": 8.848780487804879, - "grad_norm": 0.703705370426178, - "learning_rate": 1.6308700839421793e-07, - "loss": 0.0027, - "step": 1814 - }, - { - "epoch": 8.853658536585366, - "grad_norm": 0.7628077864646912, - "learning_rate": 1.6172865368762004e-07, - "loss": 0.0028, - "step": 1815 - }, - { - "epoch": 8.858536585365854, - "grad_norm": 0.7577258348464966, - "learning_rate": 1.6037579042864876e-07, - "loss": 0.0011, - "step": 1816 - }, - { - "epoch": 8.863414634146341, - "grad_norm": 1.2882269620895386, - "learning_rate": 1.5902842179451482e-07, - "loss": 0.0082, - "step": 1817 - }, - { - "epoch": 8.86829268292683, - "grad_norm": 1.030044436454773, - "learning_rate": 1.576865509495229e-07, - "loss": 0.0068, - "step": 1818 - }, - { - "epoch": 8.873170731707317, - "grad_norm": 1.9678841829299927, - "learning_rate": 1.5635018104506627e-07, - "loss": 0.0085, - "step": 1819 - }, - { - "epoch": 8.878048780487806, - "grad_norm": 0.756213366985321, - "learning_rate": 1.5501931521962055e-07, - "loss": 0.0062, - "step": 1820 - }, - { - "epoch": 8.882926829268293, - "grad_norm": 1.1753418445587158, - "learning_rate": 1.5369395659873305e-07, - "loss": 0.0043, - "step": 1821 - }, - { - "epoch": 8.887804878048781, - "grad_norm": 0.8144367933273315, - "learning_rate": 1.5237410829501864e-07, - "loss": 0.0042, - "step": 1822 - }, - { - "epoch": 8.892682926829268, - "grad_norm": 1.0879873037338257, - "learning_rate": 1.510597734081512e-07, - "loss": 0.0077, - "step": 1823 - }, - { - "epoch": 8.897560975609757, - "grad_norm": 1.7992119789123535, - "learning_rate": 1.497509550248555e-07, - "loss": 0.0013, - "step": 1824 - }, - { - "epoch": 8.902439024390244, - "grad_norm": 1.0460071563720703, - "learning_rate": 1.4844765621890135e-07, - "loss": 0.0091, - "step": 1825 - }, - { - "epoch": 8.907317073170733, - "grad_norm": 1.5372941493988037, - "learning_rate": 1.471498800510962e-07, - "loss": 0.005, - "step": 1826 - }, - { - "epoch": 8.91219512195122, - "grad_norm": 0.3672512173652649, - "learning_rate": 1.4585762956927624e-07, - "loss": 0.0014, - "step": 1827 - }, - { - "epoch": 8.917073170731708, - "grad_norm": 1.0456454753875732, - "learning_rate": 1.4457090780830185e-07, - "loss": 0.0063, - "step": 1828 - }, - { - "epoch": 8.921951219512195, - "grad_norm": 0.9190329909324646, - "learning_rate": 1.432897177900483e-07, - "loss": 0.0065, - "step": 1829 - }, - { - "epoch": 8.926829268292684, - "grad_norm": 1.8261685371398926, - "learning_rate": 1.4201406252340038e-07, - "loss": 0.0099, - "step": 1830 - }, - { - "epoch": 8.93170731707317, - "grad_norm": 1.1341190338134766, - "learning_rate": 1.407439450042433e-07, - "loss": 0.0042, - "step": 1831 - }, - { - "epoch": 8.93658536585366, - "grad_norm": 11.465933799743652, - "learning_rate": 1.3947936821545772e-07, - "loss": 0.004, - "step": 1832 - }, - { - "epoch": 8.941463414634146, - "grad_norm": 0.5747786164283752, - "learning_rate": 1.3822033512691209e-07, - "loss": 0.0009, - "step": 1833 - }, - { - "epoch": 8.946341463414633, - "grad_norm": 1.1908810138702393, - "learning_rate": 1.369668486954545e-07, - "loss": 0.0028, - "step": 1834 - }, - { - "epoch": 8.951219512195122, - "grad_norm": 0.2560107111930847, - "learning_rate": 1.3571891186490687e-07, - "loss": 0.001, - "step": 1835 - }, - { - "epoch": 8.95609756097561, - "grad_norm": 0.5070216059684753, - "learning_rate": 1.3447652756605894e-07, - "loss": 0.0024, - "step": 1836 - }, - { - "epoch": 8.960975609756098, - "grad_norm": 0.507199227809906, - "learning_rate": 1.3323969871665897e-07, - "loss": 0.0015, - "step": 1837 - }, - { - "epoch": 8.965853658536584, - "grad_norm": 0.29779553413391113, - "learning_rate": 1.3200842822140818e-07, - "loss": 0.0007, - "step": 1838 - }, - { - "epoch": 8.970731707317073, - "grad_norm": 0.4603523015975952, - "learning_rate": 1.3078271897195572e-07, - "loss": 0.0018, - "step": 1839 - }, - { - "epoch": 8.975609756097562, - "grad_norm": 1.0771223306655884, - "learning_rate": 1.2956257384688807e-07, - "loss": 0.0063, - "step": 1840 - }, - { - "epoch": 8.980487804878049, - "grad_norm": 0.798372745513916, - "learning_rate": 1.283479957117248e-07, - "loss": 0.002, - "step": 1841 - }, - { - "epoch": 8.985365853658536, - "grad_norm": 2.3283369541168213, - "learning_rate": 1.2713898741891244e-07, - "loss": 0.0398, - "step": 1842 - }, - { - "epoch": 8.990243902439024, - "grad_norm": 0.18683794140815735, - "learning_rate": 1.2593555180781591e-07, - "loss": 0.0004, - "step": 1843 - }, - { - "epoch": 8.995121951219513, - "grad_norm": 2.2289419174194336, - "learning_rate": 1.2473769170471188e-07, - "loss": 0.0713, - "step": 1844 - }, - { - "epoch": 9.0, - "grad_norm": 0.9360214471817017, - "learning_rate": 1.2354540992278452e-07, - "loss": 0.002, - "step": 1845 - }, - { - "epoch": 9.004878048780487, - "grad_norm": 0.11728485673666, - "learning_rate": 1.223587092621162e-07, - "loss": 0.0004, - "step": 1846 - }, - { - "epoch": 9.009756097560976, - "grad_norm": 2.8439087867736816, - "learning_rate": 1.2117759250968225e-07, - "loss": 0.0791, - "step": 1847 - }, - { - "epoch": 9.014634146341463, - "grad_norm": 0.3048456311225891, - "learning_rate": 1.2000206243934358e-07, - "loss": 0.0021, - "step": 1848 - }, - { - "epoch": 9.019512195121951, - "grad_norm": 0.35457128286361694, - "learning_rate": 1.1883212181184212e-07, - "loss": 0.0014, - "step": 1849 - }, - { - "epoch": 9.024390243902438, - "grad_norm": 0.4256647527217865, - "learning_rate": 1.176677733747919e-07, - "loss": 0.003, - "step": 1850 - }, - { - "epoch": 9.029268292682927, - "grad_norm": 0.14073246717453003, - "learning_rate": 1.1650901986267365e-07, - "loss": 0.0009, - "step": 1851 - }, - { - "epoch": 9.034146341463414, - "grad_norm": 0.2287226915359497, - "learning_rate": 1.1535586399682885e-07, - "loss": 0.001, - "step": 1852 - }, - { - "epoch": 9.039024390243902, - "grad_norm": 0.1520719975233078, - "learning_rate": 1.1420830848545256e-07, - "loss": 0.0008, - "step": 1853 - }, - { - "epoch": 9.04390243902439, - "grad_norm": 0.7066623568534851, - "learning_rate": 1.1306635602358673e-07, - "loss": 0.0086, - "step": 1854 - }, - { - "epoch": 9.048780487804878, - "grad_norm": 0.5992008447647095, - "learning_rate": 1.1193000929311638e-07, - "loss": 0.0023, - "step": 1855 - }, - { - "epoch": 9.053658536585365, - "grad_norm": 1.6487441062927246, - "learning_rate": 1.1079927096275978e-07, - "loss": 0.0235, - "step": 1856 - }, - { - "epoch": 9.058536585365854, - "grad_norm": 0.2044752836227417, - "learning_rate": 1.0967414368806384e-07, - "loss": 0.0013, - "step": 1857 - }, - { - "epoch": 9.06341463414634, - "grad_norm": 0.13774175941944122, - "learning_rate": 1.0855463011139905e-07, - "loss": 0.0005, - "step": 1858 - }, - { - "epoch": 9.06829268292683, - "grad_norm": 0.1757974475622177, - "learning_rate": 1.0744073286195089e-07, - "loss": 0.0011, - "step": 1859 - }, - { - "epoch": 9.073170731707316, - "grad_norm": 0.3755623698234558, - "learning_rate": 1.0633245455571511e-07, - "loss": 0.0015, - "step": 1860 - }, - { - "epoch": 9.078048780487805, - "grad_norm": 0.1744445413351059, - "learning_rate": 1.052297977954922e-07, - "loss": 0.001, - "step": 1861 - }, - { - "epoch": 9.082926829268292, - "grad_norm": 0.23634043335914612, - "learning_rate": 1.0413276517087956e-07, - "loss": 0.0005, - "step": 1862 - }, - { - "epoch": 9.08780487804878, - "grad_norm": 0.31559380888938904, - "learning_rate": 1.0304135925826603e-07, - "loss": 0.002, - "step": 1863 - }, - { - "epoch": 9.092682926829267, - "grad_norm": 0.44957175850868225, - "learning_rate": 1.0195558262082683e-07, - "loss": 0.0052, - "step": 1864 - }, - { - "epoch": 9.097560975609756, - "grad_norm": 0.23585057258605957, - "learning_rate": 1.0087543780851666e-07, - "loss": 0.0009, - "step": 1865 - }, - { - "epoch": 9.102439024390243, - "grad_norm": 0.26482903957366943, - "learning_rate": 9.98009273580633e-08, - "loss": 0.0017, - "step": 1866 - }, - { - "epoch": 9.107317073170732, - "grad_norm": 0.21670401096343994, - "learning_rate": 9.87320537929623e-08, - "loss": 0.0006, - "step": 1867 - }, - { - "epoch": 9.112195121951219, - "grad_norm": 0.11546074599027634, - "learning_rate": 9.766881962347208e-08, - "loss": 0.0006, - "step": 1868 - }, - { - "epoch": 9.117073170731707, - "grad_norm": 0.35039573907852173, - "learning_rate": 9.661122734660521e-08, - "loss": 0.0017, - "step": 1869 - }, - { - "epoch": 9.121951219512194, - "grad_norm": 0.25725650787353516, - "learning_rate": 9.555927944612492e-08, - "loss": 0.0015, - "step": 1870 - }, - { - "epoch": 9.126829268292683, - "grad_norm": 1.0865508317947388, - "learning_rate": 9.451297839253915e-08, - "loss": 0.0179, - "step": 1871 - }, - { - "epoch": 9.13170731707317, - "grad_norm": 0.29501980543136597, - "learning_rate": 9.34723266430937e-08, - "loss": 0.0013, - "step": 1872 - }, - { - "epoch": 9.136585365853659, - "grad_norm": 0.3127771019935608, - "learning_rate": 9.243732664176636e-08, - "loss": 0.0013, - "step": 1873 - }, - { - "epoch": 9.141463414634146, - "grad_norm": 0.47584185004234314, - "learning_rate": 9.140798081926277e-08, - "loss": 0.0042, - "step": 1874 - }, - { - "epoch": 9.146341463414634, - "grad_norm": 0.22509703040122986, - "learning_rate": 9.03842915930095e-08, - "loss": 0.0014, - "step": 1875 - }, - { - "epoch": 9.151219512195121, - "grad_norm": 0.2254130244255066, - "learning_rate": 8.936626136714754e-08, - "loss": 0.0014, - "step": 1876 - }, - { - "epoch": 9.15609756097561, - "grad_norm": 0.4184035360813141, - "learning_rate": 8.835389253252918e-08, - "loss": 0.0036, - "step": 1877 - }, - { - "epoch": 9.160975609756097, - "grad_norm": 0.8849661946296692, - "learning_rate": 8.734718746670978e-08, - "loss": 0.0195, - "step": 1878 - }, - { - "epoch": 9.165853658536586, - "grad_norm": 0.3465995192527771, - "learning_rate": 8.634614853394341e-08, - "loss": 0.0009, - "step": 1879 - }, - { - "epoch": 9.170731707317072, - "grad_norm": 0.5498316884040833, - "learning_rate": 8.53507780851781e-08, - "loss": 0.0028, - "step": 1880 - }, - { - "epoch": 9.175609756097561, - "grad_norm": 0.4553240239620209, - "learning_rate": 8.436107845804842e-08, - "loss": 0.0023, - "step": 1881 - }, - { - "epoch": 9.180487804878048, - "grad_norm": 0.3339614272117615, - "learning_rate": 8.33770519768709e-08, - "loss": 0.0008, - "step": 1882 - }, - { - "epoch": 9.185365853658537, - "grad_norm": 0.3212447762489319, - "learning_rate": 8.239870095263974e-08, - "loss": 0.0018, - "step": 1883 - }, - { - "epoch": 9.190243902439024, - "grad_norm": 0.2665475904941559, - "learning_rate": 8.142602768301921e-08, - "loss": 0.0016, - "step": 1884 - }, - { - "epoch": 9.195121951219512, - "grad_norm": 0.8188057541847229, - "learning_rate": 8.045903445233982e-08, - "loss": 0.0063, - "step": 1885 - }, - { - "epoch": 9.2, - "grad_norm": 0.458200603723526, - "learning_rate": 7.949772353159191e-08, - "loss": 0.0049, - "step": 1886 - }, - { - "epoch": 9.204878048780488, - "grad_norm": 0.50230872631073, - "learning_rate": 7.854209717842231e-08, - "loss": 0.0006, - "step": 1887 - }, - { - "epoch": 9.209756097560975, - "grad_norm": 0.12954330444335938, - "learning_rate": 7.759215763712579e-08, - "loss": 0.0007, - "step": 1888 - }, - { - "epoch": 9.214634146341464, - "grad_norm": 0.3889886438846588, - "learning_rate": 7.664790713864223e-08, - "loss": 0.0038, - "step": 1889 - }, - { - "epoch": 9.21951219512195, - "grad_norm": 0.8406491875648499, - "learning_rate": 7.57093479005519e-08, - "loss": 0.0059, - "step": 1890 - }, - { - "epoch": 9.22439024390244, - "grad_norm": 0.27930590510368347, - "learning_rate": 7.477648212706746e-08, - "loss": 0.0009, - "step": 1891 - }, - { - "epoch": 9.229268292682926, - "grad_norm": 0.2927345037460327, - "learning_rate": 7.384931200903084e-08, - "loss": 0.0019, - "step": 1892 - }, - { - "epoch": 9.234146341463415, - "grad_norm": 0.5030691027641296, - "learning_rate": 7.29278397239086e-08, - "loss": 0.0036, - "step": 1893 - }, - { - "epoch": 9.239024390243902, - "grad_norm": 0.14574876427650452, - "learning_rate": 7.20120674357852e-08, - "loss": 0.0005, - "step": 1894 - }, - { - "epoch": 9.24390243902439, - "grad_norm": 0.286927729845047, - "learning_rate": 7.110199729535805e-08, - "loss": 0.0009, - "step": 1895 - }, - { - "epoch": 9.248780487804877, - "grad_norm": 0.44844964146614075, - "learning_rate": 7.019763143993441e-08, - "loss": 0.0047, - "step": 1896 - }, - { - "epoch": 9.253658536585366, - "grad_norm": 0.16901901364326477, - "learning_rate": 6.929897199342395e-08, - "loss": 0.0006, - "step": 1897 - }, - { - "epoch": 9.258536585365853, - "grad_norm": 0.19660663604736328, - "learning_rate": 6.840602106633425e-08, - "loss": 0.0005, - "step": 1898 - }, - { - "epoch": 9.263414634146342, - "grad_norm": 0.2517840564250946, - "learning_rate": 6.751878075576867e-08, - "loss": 0.001, - "step": 1899 - }, - { - "epoch": 9.268292682926829, - "grad_norm": 0.6886439323425293, - "learning_rate": 6.663725314541652e-08, - "loss": 0.0046, - "step": 1900 - }, - { - "epoch": 9.273170731707317, - "grad_norm": 0.2044619917869568, - "learning_rate": 6.576144030555259e-08, - "loss": 0.0009, - "step": 1901 - }, - { - "epoch": 9.278048780487804, - "grad_norm": 0.5199993848800659, - "learning_rate": 6.489134429302906e-08, - "loss": 0.0038, - "step": 1902 - }, - { - "epoch": 9.282926829268293, - "grad_norm": 0.20676910877227783, - "learning_rate": 6.402696715127387e-08, - "loss": 0.0007, - "step": 1903 - }, - { - "epoch": 9.28780487804878, - "grad_norm": 0.13005101680755615, - "learning_rate": 6.316831091028237e-08, - "loss": 0.0005, - "step": 1904 - }, - { - "epoch": 9.292682926829269, - "grad_norm": 0.12870948016643524, - "learning_rate": 6.23153775866156e-08, - "loss": 0.0004, - "step": 1905 - }, - { - "epoch": 9.297560975609755, - "grad_norm": 0.4530372619628906, - "learning_rate": 6.14681691833935e-08, - "loss": 0.0027, - "step": 1906 - }, - { - "epoch": 9.302439024390244, - "grad_norm": 0.14936240017414093, - "learning_rate": 6.062668769029168e-08, - "loss": 0.0006, - "step": 1907 - }, - { - "epoch": 9.307317073170731, - "grad_norm": 1.0447592735290527, - "learning_rate": 5.979093508353489e-08, - "loss": 0.0033, - "step": 1908 - }, - { - "epoch": 9.31219512195122, - "grad_norm": 0.9839334487915039, - "learning_rate": 5.896091332589532e-08, - "loss": 0.0143, - "step": 1909 - }, - { - "epoch": 9.317073170731707, - "grad_norm": 0.13809092342853546, - "learning_rate": 5.813662436668477e-08, - "loss": 0.0006, - "step": 1910 - }, - { - "epoch": 9.321951219512195, - "grad_norm": 0.2679869830608368, - "learning_rate": 5.731807014175195e-08, - "loss": 0.0009, - "step": 1911 - }, - { - "epoch": 9.326829268292682, - "grad_norm": 0.09745966643095016, - "learning_rate": 5.650525257347744e-08, - "loss": 0.0004, - "step": 1912 - }, - { - "epoch": 9.331707317073171, - "grad_norm": 0.15892420709133148, - "learning_rate": 5.569817357076984e-08, - "loss": 0.0012, - "step": 1913 - }, - { - "epoch": 9.336585365853658, - "grad_norm": 0.9430788159370422, - "learning_rate": 5.489683502905935e-08, - "loss": 0.0039, - "step": 1914 - }, - { - "epoch": 9.341463414634147, - "grad_norm": 0.15283145010471344, - "learning_rate": 5.410123883029639e-08, - "loss": 0.001, - "step": 1915 - }, - { - "epoch": 9.346341463414634, - "grad_norm": 0.3713572025299072, - "learning_rate": 5.3311386842944125e-08, - "loss": 0.0017, - "step": 1916 - }, - { - "epoch": 9.351219512195122, - "grad_norm": 0.29313772916793823, - "learning_rate": 5.25272809219754e-08, - "loss": 0.0021, - "step": 1917 - }, - { - "epoch": 9.35609756097561, - "grad_norm": 0.219829261302948, - "learning_rate": 5.17489229088694e-08, - "loss": 0.001, - "step": 1918 - }, - { - "epoch": 9.360975609756098, - "grad_norm": 0.35704150795936584, - "learning_rate": 5.097631463160585e-08, - "loss": 0.002, - "step": 1919 - }, - { - "epoch": 9.365853658536585, - "grad_norm": 0.44924139976501465, - "learning_rate": 5.020945790466025e-08, - "loss": 0.0007, - "step": 1920 - }, - { - "epoch": 9.370731707317074, - "grad_norm": 0.10656553506851196, - "learning_rate": 4.944835452900199e-08, - "loss": 0.0005, - "step": 1921 - }, - { - "epoch": 9.37560975609756, - "grad_norm": 0.8902695178985596, - "learning_rate": 4.869300629208762e-08, - "loss": 0.0084, - "step": 1922 - }, - { - "epoch": 9.38048780487805, - "grad_norm": 0.3544962406158447, - "learning_rate": 4.7943414967858426e-08, - "loss": 0.0024, - "step": 1923 - }, - { - "epoch": 9.385365853658536, - "grad_norm": 0.3950733244419098, - "learning_rate": 4.7199582316734827e-08, - "loss": 0.0006, - "step": 1924 - }, - { - "epoch": 9.390243902439025, - "grad_norm": 0.45972177386283875, - "learning_rate": 4.6461510085613616e-08, - "loss": 0.0012, - "step": 1925 - }, - { - "epoch": 9.395121951219512, - "grad_norm": 0.12690195441246033, - "learning_rate": 4.5729200007862686e-08, - "loss": 0.0004, - "step": 1926 - }, - { - "epoch": 9.4, - "grad_norm": 0.2692466974258423, - "learning_rate": 4.5002653803317975e-08, - "loss": 0.0016, - "step": 1927 - }, - { - "epoch": 9.404878048780487, - "grad_norm": 0.5844394564628601, - "learning_rate": 4.428187317827848e-08, - "loss": 0.001, - "step": 1928 - }, - { - "epoch": 9.409756097560976, - "grad_norm": 0.5482091307640076, - "learning_rate": 4.356685982550263e-08, - "loss": 0.0016, - "step": 1929 - }, - { - "epoch": 9.414634146341463, - "grad_norm": 0.06951025128364563, - "learning_rate": 4.285761542420497e-08, - "loss": 0.0004, - "step": 1930 - }, - { - "epoch": 9.419512195121952, - "grad_norm": 0.1519978791475296, - "learning_rate": 4.215414164005116e-08, - "loss": 0.0007, - "step": 1931 - }, - { - "epoch": 9.424390243902439, - "grad_norm": 0.3855389654636383, - "learning_rate": 4.145644012515465e-08, - "loss": 0.0012, - "step": 1932 - }, - { - "epoch": 9.429268292682927, - "grad_norm": 0.28962311148643494, - "learning_rate": 4.076451251807223e-08, - "loss": 0.0024, - "step": 1933 - }, - { - "epoch": 9.434146341463414, - "grad_norm": 0.12305665761232376, - "learning_rate": 4.0078360443801535e-08, - "loss": 0.0005, - "step": 1934 - }, - { - "epoch": 9.439024390243903, - "grad_norm": 0.5113069415092468, - "learning_rate": 3.9397985513775495e-08, - "loss": 0.0007, - "step": 1935 - }, - { - "epoch": 9.44390243902439, - "grad_norm": 0.11020799726247787, - "learning_rate": 3.872338932585984e-08, - "loss": 0.0006, - "step": 1936 - }, - { - "epoch": 9.448780487804878, - "grad_norm": 0.24607239663600922, - "learning_rate": 3.8054573464348655e-08, - "loss": 0.0012, - "step": 1937 - }, - { - "epoch": 9.453658536585365, - "grad_norm": 0.09522794187068939, - "learning_rate": 3.739153949996105e-08, - "loss": 0.0004, - "step": 1938 - }, - { - "epoch": 9.458536585365854, - "grad_norm": 0.3217187523841858, - "learning_rate": 3.6734288989836994e-08, - "loss": 0.0018, - "step": 1939 - }, - { - "epoch": 9.463414634146341, - "grad_norm": 0.10770946741104126, - "learning_rate": 3.608282347753428e-08, - "loss": 0.0005, - "step": 1940 - }, - { - "epoch": 9.46829268292683, - "grad_norm": 0.18529640138149261, - "learning_rate": 3.543714449302488e-08, - "loss": 0.0014, - "step": 1941 - }, - { - "epoch": 9.473170731707317, - "grad_norm": 0.3584231436252594, - "learning_rate": 3.479725355268998e-08, - "loss": 0.0011, - "step": 1942 - }, - { - "epoch": 9.478048780487805, - "grad_norm": 0.3854292035102844, - "learning_rate": 3.4163152159318866e-08, - "loss": 0.0019, - "step": 1943 - }, - { - "epoch": 9.482926829268292, - "grad_norm": 0.08858831971883774, - "learning_rate": 3.353484180210337e-08, - "loss": 0.0003, - "step": 1944 - }, - { - "epoch": 9.487804878048781, - "grad_norm": 0.5076143741607666, - "learning_rate": 3.291232395663424e-08, - "loss": 0.0057, - "step": 1945 - }, - { - "epoch": 9.492682926829268, - "grad_norm": 0.38053473830223083, - "learning_rate": 3.229560008490007e-08, - "loss": 0.0021, - "step": 1946 - }, - { - "epoch": 9.497560975609757, - "grad_norm": 1.4997718334197998, - "learning_rate": 3.168467163528116e-08, - "loss": 0.0079, - "step": 1947 - }, - { - "epoch": 9.502439024390243, - "grad_norm": 0.7466314435005188, - "learning_rate": 3.1079540042547315e-08, - "loss": 0.0077, - "step": 1948 - }, - { - "epoch": 9.507317073170732, - "grad_norm": 0.11087851971387863, - "learning_rate": 3.0480206727855066e-08, - "loss": 0.0004, - "step": 1949 - }, - { - "epoch": 9.512195121951219, - "grad_norm": 0.2965907156467438, - "learning_rate": 2.988667309874294e-08, - "loss": 0.0015, - "step": 1950 - }, - { - "epoch": 9.517073170731708, - "grad_norm": 1.4327231645584106, - "learning_rate": 2.9298940549128962e-08, - "loss": 0.0132, - "step": 1951 - }, - { - "epoch": 9.521951219512195, - "grad_norm": 0.9336621165275574, - "learning_rate": 2.871701045930708e-08, - "loss": 0.0019, - "step": 1952 - }, - { - "epoch": 9.526829268292683, - "grad_norm": 1.2587624788284302, - "learning_rate": 2.8140884195945184e-08, - "loss": 0.0024, - "step": 1953 - }, - { - "epoch": 9.53170731707317, - "grad_norm": 0.13109427690505981, - "learning_rate": 2.7570563112079564e-08, - "loss": 0.001, - "step": 1954 - }, - { - "epoch": 9.536585365853659, - "grad_norm": 0.2514895796775818, - "learning_rate": 2.700604854711353e-08, - "loss": 0.0013, - "step": 1955 - }, - { - "epoch": 9.541463414634146, - "grad_norm": 0.3432636857032776, - "learning_rate": 2.6447341826814077e-08, - "loss": 0.0005, - "step": 1956 - }, - { - "epoch": 9.546341463414635, - "grad_norm": 0.4550987780094147, - "learning_rate": 2.5894444263307728e-08, - "loss": 0.001, - "step": 1957 - }, - { - "epoch": 9.551219512195122, - "grad_norm": 1.2675397396087646, - "learning_rate": 2.5347357155078577e-08, - "loss": 0.0103, - "step": 1958 - }, - { - "epoch": 9.55609756097561, - "grad_norm": 0.1289552003145218, - "learning_rate": 2.4806081786964974e-08, - "loss": 0.0006, - "step": 1959 - }, - { - "epoch": 9.560975609756097, - "grad_norm": 0.6298596858978271, - "learning_rate": 2.4270619430156183e-08, - "loss": 0.0019, - "step": 1960 - }, - { - "epoch": 9.565853658536586, - "grad_norm": 0.538487434387207, - "learning_rate": 2.3740971342189056e-08, - "loss": 0.0027, - "step": 1961 - }, - { - "epoch": 9.570731707317073, - "grad_norm": 0.8478948473930359, - "learning_rate": 2.321713876694637e-08, - "loss": 0.0133, - "step": 1962 - }, - { - "epoch": 9.575609756097561, - "grad_norm": 1.0609294176101685, - "learning_rate": 2.269912293465293e-08, - "loss": 0.008, - "step": 1963 - }, - { - "epoch": 9.580487804878048, - "grad_norm": 0.634739100933075, - "learning_rate": 2.2186925061872532e-08, - "loss": 0.0025, - "step": 1964 - }, - { - "epoch": 9.585365853658537, - "grad_norm": 0.43630343675613403, - "learning_rate": 2.1680546351506016e-08, - "loss": 0.003, - "step": 1965 - }, - { - "epoch": 9.590243902439024, - "grad_norm": 0.3712899684906006, - "learning_rate": 2.117998799278709e-08, - "loss": 0.0036, - "step": 1966 - }, - { - "epoch": 9.595121951219513, - "grad_norm": 0.13679739832878113, - "learning_rate": 2.068525116128095e-08, - "loss": 0.0006, - "step": 1967 - }, - { - "epoch": 9.6, - "grad_norm": 1.8157588243484497, - "learning_rate": 2.0196337018880962e-08, - "loss": 0.0659, - "step": 1968 - }, - { - "epoch": 9.604878048780488, - "grad_norm": 0.07176486402750015, - "learning_rate": 1.9713246713805588e-08, - "loss": 0.0003, - "step": 1969 - }, - { - "epoch": 9.609756097560975, - "grad_norm": 0.33367958664894104, - "learning_rate": 1.9235981380595625e-08, - "loss": 0.0013, - "step": 1970 - }, - { - "epoch": 9.614634146341464, - "grad_norm": 0.08895006775856018, - "learning_rate": 1.876454214011253e-08, - "loss": 0.0004, - "step": 1971 - }, - { - "epoch": 9.61951219512195, - "grad_norm": 0.2062547653913498, - "learning_rate": 1.8298930099534817e-08, - "loss": 0.001, - "step": 1972 - }, - { - "epoch": 9.62439024390244, - "grad_norm": 0.1351477950811386, - "learning_rate": 1.783914635235584e-08, - "loss": 0.0006, - "step": 1973 - }, - { - "epoch": 9.629268292682926, - "grad_norm": 0.5446783304214478, - "learning_rate": 1.738519197838101e-08, - "loss": 0.0038, - "step": 1974 - }, - { - "epoch": 9.634146341463415, - "grad_norm": 0.12655134499073029, - "learning_rate": 1.6937068043725856e-08, - "loss": 0.0006, - "step": 1975 - }, - { - "epoch": 9.639024390243902, - "grad_norm": 0.7479956150054932, - "learning_rate": 1.6494775600812418e-08, - "loss": 0.0026, - "step": 1976 - }, - { - "epoch": 9.64390243902439, - "grad_norm": 0.39983221888542175, - "learning_rate": 1.6058315688367852e-08, - "loss": 0.003, - "step": 1977 - }, - { - "epoch": 9.648780487804878, - "grad_norm": 0.2727876305580139, - "learning_rate": 1.5627689331421946e-08, - "loss": 0.0015, - "step": 1978 - }, - { - "epoch": 9.653658536585366, - "grad_norm": 0.17525868117809296, - "learning_rate": 1.520289754130322e-08, - "loss": 0.001, - "step": 1979 - }, - { - "epoch": 9.658536585365853, - "grad_norm": 0.2446790337562561, - "learning_rate": 1.478394131563865e-08, - "loss": 0.0011, - "step": 1980 - }, - { - "epoch": 9.663414634146342, - "grad_norm": 0.37458178400993347, - "learning_rate": 1.4370821638350353e-08, - "loss": 0.0022, - "step": 1981 - }, - { - "epoch": 9.668292682926829, - "grad_norm": 0.1664375215768814, - "learning_rate": 1.396353947965251e-08, - "loss": 0.0006, - "step": 1982 - }, - { - "epoch": 9.673170731707318, - "grad_norm": 0.08668441325426102, - "learning_rate": 1.3562095796050279e-08, - "loss": 0.0003, - "step": 1983 - }, - { - "epoch": 9.678048780487805, - "grad_norm": 0.2897089719772339, - "learning_rate": 1.3166491530337555e-08, - "loss": 0.001, - "step": 1984 - }, - { - "epoch": 9.682926829268293, - "grad_norm": 0.21582652628421783, - "learning_rate": 1.2776727611593653e-08, - "loss": 0.0007, - "step": 1985 - }, - { - "epoch": 9.68780487804878, - "grad_norm": 0.3643123507499695, - "learning_rate": 1.2392804955181915e-08, - "loss": 0.002, - "step": 1986 - }, - { - "epoch": 9.692682926829269, - "grad_norm": 0.5870813131332397, - "learning_rate": 1.2014724462747763e-08, - "loss": 0.0016, - "step": 1987 - }, - { - "epoch": 9.697560975609756, - "grad_norm": 0.19344697892665863, - "learning_rate": 1.1642487022215931e-08, - "loss": 0.0008, - "step": 1988 - }, - { - "epoch": 9.702439024390245, - "grad_norm": 0.15417703986167908, - "learning_rate": 1.1276093507788798e-08, - "loss": 0.001, - "step": 1989 - }, - { - "epoch": 9.707317073170731, - "grad_norm": 0.2714616358280182, - "learning_rate": 1.0915544779944164e-08, - "loss": 0.0022, - "step": 1990 - }, - { - "epoch": 9.71219512195122, - "grad_norm": 0.14375440776348114, - "learning_rate": 1.0560841685433864e-08, - "loss": 0.0008, - "step": 1991 - }, - { - "epoch": 9.717073170731707, - "grad_norm": 0.19977939128875732, - "learning_rate": 1.021198505728016e-08, - "loss": 0.0011, - "step": 1992 - }, - { - "epoch": 9.721951219512196, - "grad_norm": 0.20787867903709412, - "learning_rate": 9.868975714775741e-09, - "loss": 0.0012, - "step": 1993 - }, - { - "epoch": 9.726829268292683, - "grad_norm": 0.3878643810749054, - "learning_rate": 9.531814463480394e-09, - "loss": 0.0008, - "step": 1994 - }, - { - "epoch": 9.731707317073171, - "grad_norm": 0.5140596032142639, - "learning_rate": 9.200502095220166e-09, - "loss": 0.0034, - "step": 1995 - }, - { - "epoch": 9.736585365853658, - "grad_norm": 0.2106190174818039, - "learning_rate": 8.875039388084317e-09, - "loss": 0.0008, - "step": 1996 - }, - { - "epoch": 9.741463414634147, - "grad_norm": 0.09516038745641708, - "learning_rate": 8.555427106424485e-09, - "loss": 0.0005, - "step": 1997 - }, - { - "epoch": 9.746341463414634, - "grad_norm": 1.439642310142517, - "learning_rate": 8.241666000852466e-09, - "loss": 0.0314, - "step": 1998 - }, - { - "epoch": 9.751219512195123, - "grad_norm": 0.10020413249731064, - "learning_rate": 7.933756808238823e-09, - "loss": 0.0004, - "step": 1999 - }, - { - "epoch": 9.75609756097561, - "grad_norm": 0.4296906888484955, - "learning_rate": 7.631700251710116e-09, - "loss": 0.0022, - "step": 2000 - }, - { - "epoch": 9.760975609756098, - "grad_norm": 0.4867343604564667, - "learning_rate": 7.335497040648898e-09, - "loss": 0.0024, - "step": 2001 - }, - { - "epoch": 9.765853658536585, - "grad_norm": 3.0838112831115723, - "learning_rate": 7.045147870690105e-09, - "loss": 0.0796, - "step": 2002 - }, - { - "epoch": 9.770731707317074, - "grad_norm": 0.26949402689933777, - "learning_rate": 6.760653423721619e-09, - "loss": 0.0012, - "step": 2003 - }, - { - "epoch": 9.77560975609756, - "grad_norm": 0.854682445526123, - "learning_rate": 6.4820143678800964e-09, - "loss": 0.0059, - "step": 2004 - }, - { - "epoch": 9.78048780487805, - "grad_norm": 0.06472957879304886, - "learning_rate": 6.209231357551526e-09, - "loss": 0.0003, - "step": 2005 - }, - { - "epoch": 9.785365853658536, - "grad_norm": 0.9941632747650146, - "learning_rate": 5.942305033369289e-09, - "loss": 0.0113, - "step": 2006 - }, - { - "epoch": 9.790243902439025, - "grad_norm": 0.08150490373373032, - "learning_rate": 5.681236022211378e-09, - "loss": 0.0003, - "step": 2007 - }, - { - "epoch": 9.795121951219512, - "grad_norm": 0.37303054332733154, - "learning_rate": 5.426024937200402e-09, - "loss": 0.0021, - "step": 2008 - }, - { - "epoch": 9.8, - "grad_norm": 0.12861268222332, - "learning_rate": 5.176672377701364e-09, - "loss": 0.0004, - "step": 2009 - }, - { - "epoch": 9.804878048780488, - "grad_norm": 0.13954521715641022, - "learning_rate": 4.933178929321103e-09, - "loss": 0.0006, - "step": 2010 - }, - { - "epoch": 9.809756097560976, - "grad_norm": 0.8102789521217346, - "learning_rate": 4.695545163905524e-09, - "loss": 0.0047, - "step": 2011 - }, - { - "epoch": 9.814634146341463, - "grad_norm": 0.8437443971633911, - "learning_rate": 4.463771639539038e-09, - "loss": 0.0013, - "step": 2012 - }, - { - "epoch": 9.819512195121952, - "grad_norm": 0.3098134994506836, - "learning_rate": 4.237858900543734e-09, - "loss": 0.0025, - "step": 2013 - }, - { - "epoch": 9.824390243902439, - "grad_norm": 0.7686973214149475, - "learning_rate": 4.017807477477154e-09, - "loss": 0.0045, - "step": 2014 - }, - { - "epoch": 9.829268292682928, - "grad_norm": 0.45219677686691284, - "learning_rate": 3.803617887132016e-09, - "loss": 0.0017, - "step": 2015 - }, - { - "epoch": 9.834146341463414, - "grad_norm": 0.529446542263031, - "learning_rate": 3.5952906325339988e-09, - "loss": 0.0043, - "step": 2016 - }, - { - "epoch": 9.839024390243903, - "grad_norm": 0.35920700430870056, - "learning_rate": 3.3928262029411794e-09, - "loss": 0.0025, - "step": 2017 - }, - { - "epoch": 9.84390243902439, - "grad_norm": 0.3075787127017975, - "learning_rate": 3.196225073842929e-09, - "loss": 0.0025, - "step": 2018 - }, - { - "epoch": 9.848780487804879, - "grad_norm": 0.1374140977859497, - "learning_rate": 3.005487706958243e-09, - "loss": 0.0005, - "step": 2019 - }, - { - "epoch": 9.853658536585366, - "grad_norm": 0.5697541236877441, - "learning_rate": 2.8206145502354678e-09, - "loss": 0.0026, - "step": 2020 - }, - { - "epoch": 9.858536585365854, - "grad_norm": 1.0206952095031738, - "learning_rate": 2.641606037850353e-09, - "loss": 0.0101, - "step": 2021 - }, - { - "epoch": 9.863414634146341, - "grad_norm": 0.29209089279174805, - "learning_rate": 2.468462590205778e-09, - "loss": 0.0022, - "step": 2022 - }, - { - "epoch": 9.86829268292683, - "grad_norm": 0.13821417093276978, - "learning_rate": 2.3011846139306404e-09, - "loss": 0.0006, - "step": 2023 - }, - { - "epoch": 9.873170731707317, - "grad_norm": 0.4531463086605072, - "learning_rate": 2.13977250187819e-09, - "loss": 0.0018, - "step": 2024 - }, - { - "epoch": 9.878048780487806, - "grad_norm": 0.4380701184272766, - "learning_rate": 1.9842266331260296e-09, - "loss": 0.0031, - "step": 2025 - }, - { - "epoch": 9.882926829268293, - "grad_norm": 0.33851730823516846, - "learning_rate": 1.834547372975004e-09, - "loss": 0.0013, - "step": 2026 - }, - { - "epoch": 9.887804878048781, - "grad_norm": 0.4231720566749573, - "learning_rate": 1.6907350729478133e-09, - "loss": 0.0024, - "step": 2027 - }, - { - "epoch": 9.892682926829268, - "grad_norm": 0.4602144658565521, - "learning_rate": 1.5527900707887344e-09, - "loss": 0.004, - "step": 2028 - }, - { - "epoch": 9.897560975609757, - "grad_norm": 0.9638814330101013, - "learning_rate": 1.4207126904625114e-09, - "loss": 0.0097, - "step": 2029 - }, - { - "epoch": 9.902439024390244, - "grad_norm": 0.1374921053647995, - "learning_rate": 1.2945032421540771e-09, - "loss": 0.0005, - "step": 2030 - }, - { - "epoch": 9.907317073170733, - "grad_norm": 0.45432549715042114, - "learning_rate": 1.1741620222671667e-09, - "loss": 0.0041, - "step": 2031 - }, - { - "epoch": 9.91219512195122, - "grad_norm": 0.1905360370874405, - "learning_rate": 1.0596893134240394e-09, - "loss": 0.0007, - "step": 2032 - }, - { - "epoch": 9.917073170731708, - "grad_norm": 0.41532090306282043, - "learning_rate": 9.51085384464645e-10, - "loss": 0.007, - "step": 2033 - }, - { - "epoch": 9.921951219512195, - "grad_norm": 0.5167479515075684, - "learning_rate": 8.48350490446348e-10, - "loss": 0.0018, - "step": 2034 - }, - { - "epoch": 9.926829268292684, - "grad_norm": 0.4736052453517914, - "learning_rate": 7.514848726422608e-10, - "loss": 0.0028, - "step": 2035 - }, - { - "epoch": 9.93170731707317, - "grad_norm": 0.19710786640644073, - "learning_rate": 6.604887585426323e-10, - "loss": 0.0009, - "step": 2036 - }, - { - "epoch": 9.93658536585366, - "grad_norm": 0.4432890713214874, - "learning_rate": 5.753623618520721e-10, - "loss": 0.0049, - "step": 2037 - }, - { - "epoch": 9.941463414634146, - "grad_norm": 1.073500394821167, - "learning_rate": 4.961058824909382e-10, - "loss": 0.0044, - "step": 2038 - }, - { - "epoch": 9.946341463414633, - "grad_norm": 0.17539291083812714, - "learning_rate": 4.2271950659311665e-10, - "loss": 0.0005, - "step": 2039 - }, - { - "epoch": 9.951219512195122, - "grad_norm": 0.3674123287200928, - "learning_rate": 3.5520340650768705e-10, - "loss": 0.0012, - "step": 2040 - }, - { - "epoch": 9.95609756097561, - "grad_norm": 0.20704206824302673, - "learning_rate": 2.9355774079614653e-10, - "loss": 0.0007, - "step": 2041 - }, - { - "epoch": 9.960975609756098, - "grad_norm": 0.07566344738006592, - "learning_rate": 2.377826542343531e-10, - "loss": 0.0003, - "step": 2042 - }, - { - "epoch": 9.965853658536584, - "grad_norm": 0.1342095583677292, - "learning_rate": 1.8787827781002743e-10, - "loss": 0.0005, - "step": 2043 - }, - { - "epoch": 9.970731707317073, - "grad_norm": 0.7139898538589478, - "learning_rate": 1.4384472872414067e-10, - "loss": 0.0013, - "step": 2044 - }, - { - "epoch": 9.975609756097562, - "grad_norm": 0.11133516579866409, - "learning_rate": 1.056821103900818e-10, - "loss": 0.0006, - "step": 2045 - }, - { - "epoch": 9.980487804878049, - "grad_norm": 0.16971242427825928, - "learning_rate": 7.339051243254735e-11, - "loss": 0.0011, - "step": 2046 - }, - { - "epoch": 9.985365853658536, - "grad_norm": 1.3622301816940308, - "learning_rate": 4.697001068892926e-11, - "loss": 0.0202, - "step": 2047 - }, - { - "epoch": 9.990243902439024, - "grad_norm": 0.10895299166440964, - "learning_rate": 2.642066720792702e-11, - "loss": 0.0005, - "step": 2048 - }, - { - "epoch": 9.995121951219513, - "grad_norm": 0.32567188143730164, - "learning_rate": 1.1742530249547745e-11, - "loss": 0.0017, - "step": 2049 - }, - { - "epoch": 10.0, - "grad_norm": 0.06682642549276352, - "learning_rate": 2.9356342859387933e-12, - "loss": 0.0002, - "step": 2050 - }, - { - "epoch": 10.0, - "step": 2050, - "total_flos": 5.892331269877924e+17, - "train_loss": 0.2632100873960966, - "train_runtime": 9760.8256, - "train_samples_per_second": 0.837, - "train_steps_per_second": 0.21 - } - ], - "logging_steps": 1, - "max_steps": 2050, - "num_input_tokens_seen": 0, - "num_train_epochs": 10, - "save_steps": 206, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": true - }, - "attributes": {} - } - }, - "total_flos": 5.892331269877924e+17, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/metallama3_8b/limo/training_loss.png b/metallama3_8b/limo/training_loss.png deleted file mode 100644 index a3d084d84f268c0ea53c2724982c18ee139499a9..0000000000000000000000000000000000000000 Binary files a/metallama3_8b/limo/training_loss.png and /dev/null differ diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-187/chat_template.jinja b/metallama3_8b/limo_filtered_combined/checkpoint-187/chat_template.jinja deleted file mode 100644 index 39bd0c9f7fe30aea14eda194fee17703da4a4dbf..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-187/chat_template.jinja +++ /dev/null @@ -1,5 +0,0 @@ -{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> - -'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> - -' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-187/config.json b/metallama3_8b/limo_filtered_combined/checkpoint-187/config.json deleted file mode 100644 index ec5612543540085e09eed37e81b17ae51d1a6973..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-187/config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 128000, - "eos_token_id": 128009, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 8192, - "mlp_bias": false, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 500000.0, - "tie_word_embeddings": false, - "torch_dtype": "float32", - "transformers_version": "4.55.0", - "use_cache": false, - "vocab_size": 128256 -} diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-187/generation_config.json b/metallama3_8b/limo_filtered_combined/checkpoint-187/generation_config.json deleted file mode 100644 index f53ccb516e57388491adda6b9950bcfa872e93ae..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-187/generation_config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 128000, - "eos_token_id": 128009, - "transformers_version": "4.55.0", - "use_cache": false -} diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-187/model-00001-of-00007.safetensors b/metallama3_8b/limo_filtered_combined/checkpoint-187/model-00001-of-00007.safetensors deleted file mode 100644 index 90637aa8e62c7e12d72a55891760755077e37956..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-187/model-00001-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6dbd26e66ccdebadd434dc7bf664c1555680ca7664c2c13e52e29d2427ce34f1 -size 4886466168 diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-187/model-00002-of-00007.safetensors b/metallama3_8b/limo_filtered_combined/checkpoint-187/model-00002-of-00007.safetensors deleted file mode 100644 index fb53c6ecc340c89289c3df296486c87a74c4aa43..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-187/model-00002-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e823f457d52c520a0f90a9c25f9891d4188730bc6c6f56998fb029af4e6c0a32 -size 4832007448 diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-187/model-00003-of-00007.safetensors b/metallama3_8b/limo_filtered_combined/checkpoint-187/model-00003-of-00007.safetensors deleted file mode 100644 index b7f3c58e2bea0c0b9c7fcadcd07b76a774e55e05..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-187/model-00003-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a424192c007998472dae6b2d741b27c6a41bfe89057cdc77918b7c0b5ab2ef34 -size 4999813112 diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-187/model-00004-of-00007.safetensors b/metallama3_8b/limo_filtered_combined/checkpoint-187/model-00004-of-00007.safetensors deleted file mode 100644 index 8ec59d03054bdb75840ca56dd56674925b999b8a..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-187/model-00004-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:db17b4d9b30e94ce2094d033177b8f79e2b5da172c3e433f537ea10f60391e51 -size 4999813128 diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-187/model-00005-of-00007.safetensors b/metallama3_8b/limo_filtered_combined/checkpoint-187/model-00005-of-00007.safetensors deleted file mode 100644 index f9b96c45250d3fc755c7f8905d8ecdced5fae897..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-187/model-00005-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5d5ada34d9a8d39e6d19c9ebd827caf51b216c695b28e00d5f3b8424ea2d273e -size 4832007496 diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-187/model-00006-of-00007.safetensors b/metallama3_8b/limo_filtered_combined/checkpoint-187/model-00006-of-00007.safetensors deleted file mode 100644 index ce964b4436fc6267bbb3b609627fd973d509f881..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-187/model-00006-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1081f33d3192a2dd6239baec982c40e1dc83b1524c44ac03f5c9817ce07a21d5 -size 4999813120 diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-187/model-00007-of-00007.safetensors b/metallama3_8b/limo_filtered_combined/checkpoint-187/model-00007-of-00007.safetensors deleted file mode 100644 index c57f3efa6aa2a3cdbf3216157ecedfaf5443c7b3..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-187/model-00007-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ddf9a675ef729c1b83d68ccea26db8bb5ba41ee246b3f66490884eecd7063c7c -size 2571158184 diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-187/model.safetensors.index.json b/metallama3_8b/limo_filtered_combined/checkpoint-187/model.safetensors.index.json deleted file mode 100644 index 30d31d54f352f0c71ad48745af612a088822fa48..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-187/model.safetensors.index.json +++ /dev/null @@ -1,299 +0,0 @@ -{ - "metadata": { - "total_parameters": 2007565312, - "total_size": 32121044992 - }, - "weight_map": { - "lm_head.weight": "model-00007-of-00007.safetensors", - "model.embed_tokens.weight": "model-00001-of-00007.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.norm.weight": "model-00007-of-00007.safetensors" - } -} diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-187/rng_state_0.pth b/metallama3_8b/limo_filtered_combined/checkpoint-187/rng_state_0.pth deleted file mode 100644 index 9c287de26f76b389db025ad109f0595b0b77fd22..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-187/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:92cc13315f24c28015d695b6cde08bb1cd6fea4cbc435998485ed6fbe4c91285 -size 15024 diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-187/rng_state_1.pth b/metallama3_8b/limo_filtered_combined/checkpoint-187/rng_state_1.pth deleted file mode 100644 index 132db267a0f5617620f48bc8eab9cc37a9aea13a..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-187/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f4c154b6a63e0b1f98f7d2847944398f99f1657d35e8eddf7fdf0ae2c24b0552 -size 15024 diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-187/rng_state_2.pth b/metallama3_8b/limo_filtered_combined/checkpoint-187/rng_state_2.pth deleted file mode 100644 index e85bf2eceab47cefd59df592648941c61c84eab1..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-187/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f784c6a9507b51189f2caffbd178ea9882103b75852e31c15f47fdae6a43af1d -size 15024 diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-187/rng_state_3.pth b/metallama3_8b/limo_filtered_combined/checkpoint-187/rng_state_3.pth deleted file mode 100644 index 423bb6c008eeb6875c659dd108c5f003758dbcb9..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-187/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:34b023e05bc2d12b91dc436d4922b990d50ec8dc56d40dc3e36b3bb34fc81341 -size 15024 diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-187/scheduler.pt b/metallama3_8b/limo_filtered_combined/checkpoint-187/scheduler.pt deleted file mode 100644 index 1c637a9a83a2f74786eed2b0cf163e428e1ea8fc..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-187/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6554a2a3c4103128dc9190b610a098c61c6e6e77e40cb297a9c00ff51382ca8f -size 1064 diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-187/special_tokens_map.json b/metallama3_8b/limo_filtered_combined/checkpoint-187/special_tokens_map.json deleted file mode 100644 index 14daf4588e61b4e4983af0fccaba4d5500c0977c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-187/special_tokens_map.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "additional_special_tokens": [ - { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } - ], - "bos_token": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": "<|eot_id|>" -} diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-187/tokenizer.json b/metallama3_8b/limo_filtered_combined/checkpoint-187/tokenizer.json deleted file mode 100644 index 172311123ab62378f1f6d90f3068a676b7d939ed..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-187/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c1dcab308e7cf5970ea38815e0a62887d705c5b436f869ca27a5dcdd40c36a6 -size 17210148 diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-187/tokenizer_config.json b/metallama3_8b/limo_filtered_combined/checkpoint-187/tokenizer_config.json deleted file mode 100644 index 6739fcd129e717b71b64001dcb25a03c143d66f5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-187/tokenizer_config.json +++ /dev/null @@ -1,2076 +0,0 @@ -{ - "added_tokens_decoder": { - "128000": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128001": { - "content": "<|end_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128002": { - "content": "<|reserved_special_token_0|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128003": { - "content": "<|reserved_special_token_1|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128004": { - "content": "<|reserved_special_token_2|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128005": { - "content": "<|reserved_special_token_3|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128006": { - "content": "<|start_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128007": { - "content": "<|end_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128008": { - "content": "<|reserved_special_token_4|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128009": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128010": { - "content": "<|reserved_special_token_5|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128011": { - "content": "<|reserved_special_token_6|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128012": { - "content": "<|reserved_special_token_7|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128013": { - "content": "<|reserved_special_token_8|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128014": { - "content": "<|reserved_special_token_9|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128015": { - "content": "<|reserved_special_token_10|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128016": { - "content": "<|reserved_special_token_11|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128017": { - "content": "<|reserved_special_token_12|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128018": { - "content": "<|reserved_special_token_13|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128019": { - "content": "<|reserved_special_token_14|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128020": { - "content": "<|reserved_special_token_15|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128021": { - "content": "<|reserved_special_token_16|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128022": { - "content": "<|reserved_special_token_17|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128023": { - "content": "<|reserved_special_token_18|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128024": { - "content": "<|reserved_special_token_19|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128025": { - "content": "<|reserved_special_token_20|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128026": { - "content": "<|reserved_special_token_21|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128027": { - "content": "<|reserved_special_token_22|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128028": { - "content": "<|reserved_special_token_23|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128029": { - "content": "<|reserved_special_token_24|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128030": { - "content": "<|reserved_special_token_25|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128031": { - "content": "<|reserved_special_token_26|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128032": { - "content": "<|reserved_special_token_27|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128033": { - "content": "<|reserved_special_token_28|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128034": { - "content": "<|reserved_special_token_29|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128035": { - "content": "<|reserved_special_token_30|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128036": { - "content": "<|reserved_special_token_31|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128037": { - "content": "<|reserved_special_token_32|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128038": { - "content": "<|reserved_special_token_33|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128039": { - "content": "<|reserved_special_token_34|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128040": { - "content": "<|reserved_special_token_35|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128041": { - "content": "<|reserved_special_token_36|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128042": { - "content": "<|reserved_special_token_37|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128043": { - "content": "<|reserved_special_token_38|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128044": { - "content": "<|reserved_special_token_39|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128045": { - "content": "<|reserved_special_token_40|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128046": { - "content": "<|reserved_special_token_41|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128047": { - "content": "<|reserved_special_token_42|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128048": { - "content": "<|reserved_special_token_43|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128049": { - "content": "<|reserved_special_token_44|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128050": { - "content": "<|reserved_special_token_45|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128051": { - "content": "<|reserved_special_token_46|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128052": { - "content": "<|reserved_special_token_47|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128053": { - "content": "<|reserved_special_token_48|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128054": { - "content": "<|reserved_special_token_49|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128055": { - "content": "<|reserved_special_token_50|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128056": { - "content": "<|reserved_special_token_51|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128057": { - "content": "<|reserved_special_token_52|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128058": { - "content": "<|reserved_special_token_53|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128059": { - "content": "<|reserved_special_token_54|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128060": { - "content": "<|reserved_special_token_55|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128061": { - "content": "<|reserved_special_token_56|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128062": { - "content": "<|reserved_special_token_57|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128063": { - "content": "<|reserved_special_token_58|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128064": { - "content": "<|reserved_special_token_59|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128065": { - "content": "<|reserved_special_token_60|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128066": { - "content": "<|reserved_special_token_61|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128067": { - "content": "<|reserved_special_token_62|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128068": { - "content": "<|reserved_special_token_63|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128069": { - "content": "<|reserved_special_token_64|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128070": { - "content": "<|reserved_special_token_65|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128071": { - "content": "<|reserved_special_token_66|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128072": { - "content": "<|reserved_special_token_67|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128073": { - "content": "<|reserved_special_token_68|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128074": { - "content": "<|reserved_special_token_69|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128075": { - "content": "<|reserved_special_token_70|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128076": { - "content": "<|reserved_special_token_71|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128077": { - "content": "<|reserved_special_token_72|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128078": { - "content": "<|reserved_special_token_73|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128079": { - "content": "<|reserved_special_token_74|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128080": { - "content": "<|reserved_special_token_75|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128081": { - "content": "<|reserved_special_token_76|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128082": { - "content": "<|reserved_special_token_77|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128083": { - "content": "<|reserved_special_token_78|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128084": { - "content": "<|reserved_special_token_79|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128085": { - "content": "<|reserved_special_token_80|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128086": { - "content": "<|reserved_special_token_81|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128087": { - "content": "<|reserved_special_token_82|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128088": { - "content": "<|reserved_special_token_83|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128089": { - "content": "<|reserved_special_token_84|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128090": { - "content": "<|reserved_special_token_85|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128091": { - "content": "<|reserved_special_token_86|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128092": { - "content": "<|reserved_special_token_87|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128093": { - "content": "<|reserved_special_token_88|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128094": { - "content": "<|reserved_special_token_89|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128095": { - "content": "<|reserved_special_token_90|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128096": { - "content": "<|reserved_special_token_91|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128097": { - "content": "<|reserved_special_token_92|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128098": { - "content": "<|reserved_special_token_93|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128099": { - "content": "<|reserved_special_token_94|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128100": { - "content": "<|reserved_special_token_95|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128101": { - "content": "<|reserved_special_token_96|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128102": { - "content": "<|reserved_special_token_97|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128103": { - "content": "<|reserved_special_token_98|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128104": { - "content": "<|reserved_special_token_99|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128105": { - "content": "<|reserved_special_token_100|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128106": { - "content": "<|reserved_special_token_101|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128107": { - "content": "<|reserved_special_token_102|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128108": { - "content": "<|reserved_special_token_103|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128109": { - "content": "<|reserved_special_token_104|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128110": { - "content": "<|reserved_special_token_105|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128111": { - "content": "<|reserved_special_token_106|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128112": { - "content": "<|reserved_special_token_107|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128113": { - "content": "<|reserved_special_token_108|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128114": { - "content": "<|reserved_special_token_109|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128115": { - "content": "<|reserved_special_token_110|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128116": { - "content": "<|reserved_special_token_111|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128117": { - "content": "<|reserved_special_token_112|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128118": { - "content": "<|reserved_special_token_113|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128119": { - "content": "<|reserved_special_token_114|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128120": { - "content": "<|reserved_special_token_115|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128121": { - "content": "<|reserved_special_token_116|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128122": { - "content": "<|reserved_special_token_117|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128123": { - "content": "<|reserved_special_token_118|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128124": { - "content": "<|reserved_special_token_119|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128125": { - "content": "<|reserved_special_token_120|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128126": { - "content": "<|reserved_special_token_121|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128127": { - "content": "<|reserved_special_token_122|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128128": { - "content": "<|reserved_special_token_123|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128129": { - "content": "<|reserved_special_token_124|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128130": { - "content": "<|reserved_special_token_125|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128131": { - "content": "<|reserved_special_token_126|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128132": { - "content": "<|reserved_special_token_127|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128133": { - "content": "<|reserved_special_token_128|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128134": { - "content": "<|reserved_special_token_129|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128135": { - "content": "<|reserved_special_token_130|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128136": { - "content": "<|reserved_special_token_131|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128137": { - "content": "<|reserved_special_token_132|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128138": { - "content": "<|reserved_special_token_133|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128139": { - "content": "<|reserved_special_token_134|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128140": { - "content": "<|reserved_special_token_135|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128141": { - "content": "<|reserved_special_token_136|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128142": { - "content": "<|reserved_special_token_137|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128143": { - "content": "<|reserved_special_token_138|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128144": { - "content": "<|reserved_special_token_139|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128145": { - "content": "<|reserved_special_token_140|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128146": { - "content": "<|reserved_special_token_141|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128147": { - "content": "<|reserved_special_token_142|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128148": { - "content": "<|reserved_special_token_143|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128149": { - "content": "<|reserved_special_token_144|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128150": { - "content": "<|reserved_special_token_145|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128151": { - "content": "<|reserved_special_token_146|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128152": { - "content": "<|reserved_special_token_147|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128153": { - "content": "<|reserved_special_token_148|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128154": { - "content": "<|reserved_special_token_149|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128155": { - "content": "<|reserved_special_token_150|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128156": { - "content": "<|reserved_special_token_151|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128157": { - "content": "<|reserved_special_token_152|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128158": { - "content": "<|reserved_special_token_153|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128159": { - "content": "<|reserved_special_token_154|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128160": { - "content": "<|reserved_special_token_155|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128161": { - "content": "<|reserved_special_token_156|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128162": { - "content": "<|reserved_special_token_157|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128163": { - "content": "<|reserved_special_token_158|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128164": { - "content": "<|reserved_special_token_159|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128165": { - "content": "<|reserved_special_token_160|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128166": { - "content": "<|reserved_special_token_161|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128167": { - "content": "<|reserved_special_token_162|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128168": { - "content": "<|reserved_special_token_163|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128169": { - "content": "<|reserved_special_token_164|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128170": { - "content": "<|reserved_special_token_165|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128171": { - "content": "<|reserved_special_token_166|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128172": { - "content": "<|reserved_special_token_167|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128173": { - "content": "<|reserved_special_token_168|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128174": { - "content": "<|reserved_special_token_169|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128175": { - "content": "<|reserved_special_token_170|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128176": { - "content": "<|reserved_special_token_171|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128177": { - "content": "<|reserved_special_token_172|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128178": { - "content": "<|reserved_special_token_173|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128179": { - "content": "<|reserved_special_token_174|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128180": { - "content": "<|reserved_special_token_175|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128181": { - "content": "<|reserved_special_token_176|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128182": { - "content": "<|reserved_special_token_177|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128183": { - "content": "<|reserved_special_token_178|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128184": { - "content": "<|reserved_special_token_179|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128185": { - "content": "<|reserved_special_token_180|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128186": { - "content": "<|reserved_special_token_181|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128187": { - "content": "<|reserved_special_token_182|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128188": { - "content": "<|reserved_special_token_183|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128189": { - "content": "<|reserved_special_token_184|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128190": { - "content": "<|reserved_special_token_185|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128191": { - "content": "<|reserved_special_token_186|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128192": { - "content": "<|reserved_special_token_187|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128193": { - "content": "<|reserved_special_token_188|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128194": { - "content": "<|reserved_special_token_189|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128195": { - "content": "<|reserved_special_token_190|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128196": { - "content": "<|reserved_special_token_191|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128197": { - "content": "<|reserved_special_token_192|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128198": { - "content": "<|reserved_special_token_193|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128199": { - "content": "<|reserved_special_token_194|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128200": { - "content": "<|reserved_special_token_195|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128201": { - "content": "<|reserved_special_token_196|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128202": { - "content": "<|reserved_special_token_197|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128203": { - "content": "<|reserved_special_token_198|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128204": { - "content": "<|reserved_special_token_199|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128205": { - "content": "<|reserved_special_token_200|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128206": { - "content": "<|reserved_special_token_201|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128207": { - "content": "<|reserved_special_token_202|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128208": { - "content": "<|reserved_special_token_203|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128209": { - "content": "<|reserved_special_token_204|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128210": { - "content": "<|reserved_special_token_205|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128211": { - "content": "<|reserved_special_token_206|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128212": { - "content": "<|reserved_special_token_207|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128213": { - "content": "<|reserved_special_token_208|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128214": { - "content": "<|reserved_special_token_209|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128215": { - "content": "<|reserved_special_token_210|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128216": { - "content": "<|reserved_special_token_211|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128217": { - "content": "<|reserved_special_token_212|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128218": { - "content": "<|reserved_special_token_213|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128219": { - "content": "<|reserved_special_token_214|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128220": { - "content": "<|reserved_special_token_215|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128221": { - "content": "<|reserved_special_token_216|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128222": { - "content": "<|reserved_special_token_217|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128223": { - "content": "<|reserved_special_token_218|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128224": { - "content": "<|reserved_special_token_219|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128225": { - "content": "<|reserved_special_token_220|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128226": { - "content": "<|reserved_special_token_221|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128227": { - "content": "<|reserved_special_token_222|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128228": { - "content": "<|reserved_special_token_223|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128229": { - "content": "<|reserved_special_token_224|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128230": { - "content": "<|reserved_special_token_225|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128231": { - "content": "<|reserved_special_token_226|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128232": { - "content": "<|reserved_special_token_227|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128233": { - "content": "<|reserved_special_token_228|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128234": { - "content": "<|reserved_special_token_229|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128235": { - "content": "<|reserved_special_token_230|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128236": { - "content": "<|reserved_special_token_231|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128237": { - "content": "<|reserved_special_token_232|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128238": { - "content": "<|reserved_special_token_233|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128239": { - "content": "<|reserved_special_token_234|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128240": { - "content": "<|reserved_special_token_235|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128241": { - "content": "<|reserved_special_token_236|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128242": { - "content": "<|reserved_special_token_237|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128243": { - "content": "<|reserved_special_token_238|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128244": { - "content": "<|reserved_special_token_239|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128245": { - "content": "<|reserved_special_token_240|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128246": { - "content": "<|reserved_special_token_241|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128247": { - "content": "<|reserved_special_token_242|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128248": { - "content": "<|reserved_special_token_243|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128249": { - "content": "<|reserved_special_token_244|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128250": { - "content": "<|reserved_special_token_245|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128251": { - "content": "<|reserved_special_token_246|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128252": { - "content": "<|reserved_special_token_247|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128253": { - "content": "<|reserved_special_token_248|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128254": { - "content": "<|reserved_special_token_249|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128255": { - "content": "<|reserved_special_token_250|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128256": { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - } - }, - "additional_special_tokens": [ - "<|eom_id|>" - ], - "bos_token": "<|begin_of_text|>", - "clean_up_tokenization_spaces": true, - "eos_token": "<|eot_id|>", - "extra_special_tokens": {}, - "model_input_names": [ - "input_ids", - "attention_mask" - ], - "model_max_length": 1000000000000000019884624838656, - "pad_token": "<|eot_id|>", - "padding_side": "right", - "split_special_tokens": false, - "tokenizer_class": "PreTrainedTokenizerFast" -} diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-187/trainer_state.json b/metallama3_8b/limo_filtered_combined/checkpoint-187/trainer_state.json deleted file mode 100644 index 8121dc73b9785dee6317a843d22bea2f42917744..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-187/trainer_state.json +++ /dev/null @@ -1,1343 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 1.0, - "eval_steps": 500, - "global_step": 187, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0053475935828877, - "grad_norm": 5.576223850250244, - "learning_rate": 5e-06, - "loss": 0.9394, - "step": 1 - }, - { - "epoch": 0.0106951871657754, - "grad_norm": 41.748443603515625, - "learning_rate": 4.99999647201733e-06, - "loss": 2.0122, - "step": 2 - }, - { - "epoch": 0.016042780748663103, - "grad_norm": 10.106061935424805, - "learning_rate": 4.999985888079276e-06, - "loss": 1.0092, - "step": 3 - }, - { - "epoch": 0.0213903743315508, - "grad_norm": 12.377921104431152, - "learning_rate": 4.999968248215712e-06, - "loss": 1.5196, - "step": 4 - }, - { - "epoch": 0.026737967914438502, - "grad_norm": 7.258418560028076, - "learning_rate": 4.999943552476422e-06, - "loss": 1.4586, - "step": 5 - }, - { - "epoch": 0.03208556149732621, - "grad_norm": 5.282329559326172, - "learning_rate": 4.999911800931108e-06, - "loss": 1.1068, - "step": 6 - }, - { - "epoch": 0.0374331550802139, - "grad_norm": 3.468794822692871, - "learning_rate": 4.999872993669387e-06, - "loss": 0.8997, - "step": 7 - }, - { - "epoch": 0.0427807486631016, - "grad_norm": 3.2200160026550293, - "learning_rate": 4.999827130800785e-06, - "loss": 1.075, - "step": 8 - }, - { - "epoch": 0.0481283422459893, - "grad_norm": 21.746450424194336, - "learning_rate": 4.999774212454746e-06, - "loss": 1.691, - "step": 9 - }, - { - "epoch": 0.053475935828877004, - "grad_norm": 11.011313438415527, - "learning_rate": 4.999714238780626e-06, - "loss": 1.3167, - "step": 10 - }, - { - "epoch": 0.058823529411764705, - "grad_norm": 5.002156734466553, - "learning_rate": 4.999647209947694e-06, - "loss": 0.9653, - "step": 11 - }, - { - "epoch": 0.06417112299465241, - "grad_norm": 12.305068016052246, - "learning_rate": 4.999573126145132e-06, - "loss": 1.2992, - "step": 12 - }, - { - "epoch": 0.06951871657754011, - "grad_norm": 5.660033702850342, - "learning_rate": 4.999491987582032e-06, - "loss": 0.9204, - "step": 13 - }, - { - "epoch": 0.0748663101604278, - "grad_norm": 5.366727828979492, - "learning_rate": 4.999403794487399e-06, - "loss": 1.3307, - "step": 14 - }, - { - "epoch": 0.08021390374331551, - "grad_norm": 3.265700578689575, - "learning_rate": 4.999308547110147e-06, - "loss": 0.8596, - "step": 15 - }, - { - "epoch": 0.0855614973262032, - "grad_norm": 3.0776281356811523, - "learning_rate": 4.9992062457191005e-06, - "loss": 0.9614, - "step": 16 - }, - { - "epoch": 0.09090909090909091, - "grad_norm": 2.8679943084716797, - "learning_rate": 4.999096890602996e-06, - "loss": 0.8, - "step": 17 - }, - { - "epoch": 0.0962566844919786, - "grad_norm": 2.785064935684204, - "learning_rate": 4.998980482070473e-06, - "loss": 0.7683, - "step": 18 - }, - { - "epoch": 0.10160427807486631, - "grad_norm": 4.619974613189697, - "learning_rate": 4.998857020450084e-06, - "loss": 1.2742, - "step": 19 - }, - { - "epoch": 0.10695187165775401, - "grad_norm": 2.894366979598999, - "learning_rate": 4.998726506090283e-06, - "loss": 0.8559, - "step": 20 - }, - { - "epoch": 0.11229946524064172, - "grad_norm": 3.4240734577178955, - "learning_rate": 4.998588939359435e-06, - "loss": 0.8223, - "step": 21 - }, - { - "epoch": 0.11764705882352941, - "grad_norm": 4.5151777267456055, - "learning_rate": 4.998444320645803e-06, - "loss": 1.1229, - "step": 22 - }, - { - "epoch": 0.12299465240641712, - "grad_norm": 2.7780518531799316, - "learning_rate": 4.998292650357558e-06, - "loss": 0.8936, - "step": 23 - }, - { - "epoch": 0.12834224598930483, - "grad_norm": 3.7252511978149414, - "learning_rate": 4.998133928922773e-06, - "loss": 1.2552, - "step": 24 - }, - { - "epoch": 0.13368983957219252, - "grad_norm": 4.296158313751221, - "learning_rate": 4.99796815678942e-06, - "loss": 0.7075, - "step": 25 - }, - { - "epoch": 0.13903743315508021, - "grad_norm": 5.5546956062316895, - "learning_rate": 4.997795334425372e-06, - "loss": 0.9781, - "step": 26 - }, - { - "epoch": 0.1443850267379679, - "grad_norm": 3.685818910598755, - "learning_rate": 4.997615462318403e-06, - "loss": 1.0657, - "step": 27 - }, - { - "epoch": 0.1497326203208556, - "grad_norm": 3.5500221252441406, - "learning_rate": 4.997428540976177e-06, - "loss": 0.951, - "step": 28 - }, - { - "epoch": 0.15508021390374332, - "grad_norm": 13.312395095825195, - "learning_rate": 4.997234570926263e-06, - "loss": 0.6788, - "step": 29 - }, - { - "epoch": 0.16042780748663102, - "grad_norm": 2.6344847679138184, - "learning_rate": 4.997033552716116e-06, - "loss": 0.8, - "step": 30 - }, - { - "epoch": 0.1657754010695187, - "grad_norm": 3.0757298469543457, - "learning_rate": 4.9968254869130885e-06, - "loss": 0.7625, - "step": 31 - }, - { - "epoch": 0.1711229946524064, - "grad_norm": 4.064891815185547, - "learning_rate": 4.996610374104422e-06, - "loss": 0.7381, - "step": 32 - }, - { - "epoch": 0.17647058823529413, - "grad_norm": 7.529796123504639, - "learning_rate": 4.9963882148972475e-06, - "loss": 1.3283, - "step": 33 - }, - { - "epoch": 0.18181818181818182, - "grad_norm": 3.2115354537963867, - "learning_rate": 4.996159009918586e-06, - "loss": 1.0002, - "step": 34 - }, - { - "epoch": 0.18716577540106952, - "grad_norm": 4.122320652008057, - "learning_rate": 4.9959227598153395e-06, - "loss": 0.9095, - "step": 35 - }, - { - "epoch": 0.1925133689839572, - "grad_norm": 54.98562240600586, - "learning_rate": 4.9956794652542994e-06, - "loss": 1.2191, - "step": 36 - }, - { - "epoch": 0.19786096256684493, - "grad_norm": 3.083123207092285, - "learning_rate": 4.9954291269221364e-06, - "loss": 0.7424, - "step": 37 - }, - { - "epoch": 0.20320855614973263, - "grad_norm": 15.99591064453125, - "learning_rate": 4.995171745525401e-06, - "loss": 0.9289, - "step": 38 - }, - { - "epoch": 0.20855614973262032, - "grad_norm": 5.214310169219971, - "learning_rate": 4.994907321790524e-06, - "loss": 0.991, - "step": 39 - }, - { - "epoch": 0.21390374331550802, - "grad_norm": 3.4376749992370605, - "learning_rate": 4.994635856463811e-06, - "loss": 0.6406, - "step": 40 - }, - { - "epoch": 0.2192513368983957, - "grad_norm": 4.30764102935791, - "learning_rate": 4.994357350311441e-06, - "loss": 1.2038, - "step": 41 - }, - { - "epoch": 0.22459893048128343, - "grad_norm": 3.5810039043426514, - "learning_rate": 4.994071804119467e-06, - "loss": 0.9696, - "step": 42 - }, - { - "epoch": 0.22994652406417113, - "grad_norm": 4.080881595611572, - "learning_rate": 4.993779218693811e-06, - "loss": 1.1579, - "step": 43 - }, - { - "epoch": 0.23529411764705882, - "grad_norm": 3.1389286518096924, - "learning_rate": 4.99347959486026e-06, - "loss": 0.7118, - "step": 44 - }, - { - "epoch": 0.24064171122994651, - "grad_norm": 2.6397321224212646, - "learning_rate": 4.99317293346447e-06, - "loss": 0.7579, - "step": 45 - }, - { - "epoch": 0.24598930481283424, - "grad_norm": 2.9469995498657227, - "learning_rate": 4.992859235371958e-06, - "loss": 0.7105, - "step": 46 - }, - { - "epoch": 0.25133689839572193, - "grad_norm": 2.669086456298828, - "learning_rate": 4.992538501468101e-06, - "loss": 0.6812, - "step": 47 - }, - { - "epoch": 0.25668449197860965, - "grad_norm": 5.421566009521484, - "learning_rate": 4.992210732658132e-06, - "loss": 0.9733, - "step": 48 - }, - { - "epoch": 0.2620320855614973, - "grad_norm": 4.413289546966553, - "learning_rate": 4.991875929867143e-06, - "loss": 1.1301, - "step": 49 - }, - { - "epoch": 0.26737967914438504, - "grad_norm": 3.1602351665496826, - "learning_rate": 4.991534094040077e-06, - "loss": 0.6706, - "step": 50 - }, - { - "epoch": 0.2727272727272727, - "grad_norm": 4.374372959136963, - "learning_rate": 4.991185226141726e-06, - "loss": 0.9462, - "step": 51 - }, - { - "epoch": 0.27807486631016043, - "grad_norm": 2.9649057388305664, - "learning_rate": 4.990829327156729e-06, - "loss": 1.0714, - "step": 52 - }, - { - "epoch": 0.28342245989304815, - "grad_norm": 3.1991283893585205, - "learning_rate": 4.990466398089571e-06, - "loss": 0.9175, - "step": 53 - }, - { - "epoch": 0.2887700534759358, - "grad_norm": 2.580082654953003, - "learning_rate": 4.99009643996458e-06, - "loss": 0.5164, - "step": 54 - }, - { - "epoch": 0.29411764705882354, - "grad_norm": 3.9115707874298096, - "learning_rate": 4.989719453825918e-06, - "loss": 0.7223, - "step": 55 - }, - { - "epoch": 0.2994652406417112, - "grad_norm": 2.825481414794922, - "learning_rate": 4.989335440737587e-06, - "loss": 0.7065, - "step": 56 - }, - { - "epoch": 0.3048128342245989, - "grad_norm": 2.8599696159362793, - "learning_rate": 4.9889444017834185e-06, - "loss": 0.8833, - "step": 57 - }, - { - "epoch": 0.31016042780748665, - "grad_norm": 2.885662078857422, - "learning_rate": 4.988546338067078e-06, - "loss": 0.8664, - "step": 58 - }, - { - "epoch": 0.3155080213903743, - "grad_norm": 3.187185764312744, - "learning_rate": 4.988141250712053e-06, - "loss": 0.884, - "step": 59 - }, - { - "epoch": 0.32085561497326204, - "grad_norm": 3.7545692920684814, - "learning_rate": 4.987729140861657e-06, - "loss": 0.9891, - "step": 60 - }, - { - "epoch": 0.32620320855614976, - "grad_norm": 3.0581002235412598, - "learning_rate": 4.987310009679023e-06, - "loss": 0.8838, - "step": 61 - }, - { - "epoch": 0.3315508021390374, - "grad_norm": 2.8039402961730957, - "learning_rate": 4.986883858347101e-06, - "loss": 0.8188, - "step": 62 - }, - { - "epoch": 0.33689839572192515, - "grad_norm": 3.01231050491333, - "learning_rate": 4.986450688068655e-06, - "loss": 0.6032, - "step": 63 - }, - { - "epoch": 0.3422459893048128, - "grad_norm": 2.7969677448272705, - "learning_rate": 4.986010500066258e-06, - "loss": 0.7623, - "step": 64 - }, - { - "epoch": 0.34759358288770054, - "grad_norm": 2.757786989212036, - "learning_rate": 4.985563295582292e-06, - "loss": 0.8051, - "step": 65 - }, - { - "epoch": 0.35294117647058826, - "grad_norm": 2.9582571983337402, - "learning_rate": 4.98510907587894e-06, - "loss": 0.7901, - "step": 66 - }, - { - "epoch": 0.3582887700534759, - "grad_norm": 3.104294776916504, - "learning_rate": 4.984647842238185e-06, - "loss": 1.0582, - "step": 67 - }, - { - "epoch": 0.36363636363636365, - "grad_norm": 2.7413785457611084, - "learning_rate": 4.984179595961806e-06, - "loss": 0.5912, - "step": 68 - }, - { - "epoch": 0.3689839572192513, - "grad_norm": 2.722858190536499, - "learning_rate": 4.983704338371375e-06, - "loss": 0.7855, - "step": 69 - }, - { - "epoch": 0.37433155080213903, - "grad_norm": 2.5095815658569336, - "learning_rate": 4.983222070808255e-06, - "loss": 0.6491, - "step": 70 - }, - { - "epoch": 0.37967914438502676, - "grad_norm": 2.97511887550354, - "learning_rate": 4.982732794633588e-06, - "loss": 0.9735, - "step": 71 - }, - { - "epoch": 0.3850267379679144, - "grad_norm": 3.5139546394348145, - "learning_rate": 4.982236511228301e-06, - "loss": 0.8495, - "step": 72 - }, - { - "epoch": 0.39037433155080214, - "grad_norm": 3.086568593978882, - "learning_rate": 4.981733221993099e-06, - "loss": 1.0891, - "step": 73 - }, - { - "epoch": 0.39572192513368987, - "grad_norm": 3.490666389465332, - "learning_rate": 4.981222928348456e-06, - "loss": 0.8013, - "step": 74 - }, - { - "epoch": 0.40106951871657753, - "grad_norm": 3.3275415897369385, - "learning_rate": 4.98070563173462e-06, - "loss": 0.8298, - "step": 75 - }, - { - "epoch": 0.40641711229946526, - "grad_norm": 2.7193403244018555, - "learning_rate": 4.980181333611601e-06, - "loss": 0.6989, - "step": 76 - }, - { - "epoch": 0.4117647058823529, - "grad_norm": 2.5338242053985596, - "learning_rate": 4.979650035459171e-06, - "loss": 0.6769, - "step": 77 - }, - { - "epoch": 0.41711229946524064, - "grad_norm": 2.7369015216827393, - "learning_rate": 4.9791117387768575e-06, - "loss": 1.0385, - "step": 78 - }, - { - "epoch": 0.42245989304812837, - "grad_norm": 2.6109988689422607, - "learning_rate": 4.978566445083942e-06, - "loss": 0.6498, - "step": 79 - }, - { - "epoch": 0.42780748663101603, - "grad_norm": 3.0895354747772217, - "learning_rate": 4.978014155919455e-06, - "loss": 0.7931, - "step": 80 - }, - { - "epoch": 0.43315508021390375, - "grad_norm": 2.6197807788848877, - "learning_rate": 4.977454872842169e-06, - "loss": 0.7322, - "step": 81 - }, - { - "epoch": 0.4385026737967914, - "grad_norm": 2.9248461723327637, - "learning_rate": 4.976888597430597e-06, - "loss": 0.9184, - "step": 82 - }, - { - "epoch": 0.44385026737967914, - "grad_norm": 2.7636630535125732, - "learning_rate": 4.976315331282985e-06, - "loss": 0.8258, - "step": 83 - }, - { - "epoch": 0.44919786096256686, - "grad_norm": 2.702061653137207, - "learning_rate": 4.9757350760173144e-06, - "loss": 0.7414, - "step": 84 - }, - { - "epoch": 0.45454545454545453, - "grad_norm": 2.3016257286071777, - "learning_rate": 4.975147833271288e-06, - "loss": 0.8573, - "step": 85 - }, - { - "epoch": 0.45989304812834225, - "grad_norm": 2.758795738220215, - "learning_rate": 4.974553604702332e-06, - "loss": 0.7271, - "step": 86 - }, - { - "epoch": 0.46524064171123, - "grad_norm": 3.0134952068328857, - "learning_rate": 4.973952391987589e-06, - "loss": 0.8976, - "step": 87 - }, - { - "epoch": 0.47058823529411764, - "grad_norm": 2.668630838394165, - "learning_rate": 4.9733441968239125e-06, - "loss": 1.0753, - "step": 88 - }, - { - "epoch": 0.47593582887700536, - "grad_norm": 2.5940303802490234, - "learning_rate": 4.972729020927866e-06, - "loss": 0.6903, - "step": 89 - }, - { - "epoch": 0.48128342245989303, - "grad_norm": 3.0423827171325684, - "learning_rate": 4.97210686603571e-06, - "loss": 0.9347, - "step": 90 - }, - { - "epoch": 0.48663101604278075, - "grad_norm": 2.5026450157165527, - "learning_rate": 4.97147773390341e-06, - "loss": 0.6738, - "step": 91 - }, - { - "epoch": 0.4919786096256685, - "grad_norm": 3.596545457839966, - "learning_rate": 4.970841626306617e-06, - "loss": 0.8356, - "step": 92 - }, - { - "epoch": 0.49732620320855614, - "grad_norm": 3.2207071781158447, - "learning_rate": 4.970198545040673e-06, - "loss": 0.9117, - "step": 93 - }, - { - "epoch": 0.5026737967914439, - "grad_norm": 2.858541965484619, - "learning_rate": 4.969548491920603e-06, - "loss": 0.8237, - "step": 94 - }, - { - "epoch": 0.5080213903743316, - "grad_norm": 2.896359920501709, - "learning_rate": 4.968891468781105e-06, - "loss": 0.8775, - "step": 95 - }, - { - "epoch": 0.5133689839572193, - "grad_norm": 3.6659083366394043, - "learning_rate": 4.968227477476554e-06, - "loss": 0.9068, - "step": 96 - }, - { - "epoch": 0.5187165775401069, - "grad_norm": 3.6469972133636475, - "learning_rate": 4.9675565198809905e-06, - "loss": 1.0435, - "step": 97 - }, - { - "epoch": 0.5240641711229946, - "grad_norm": 4.615362167358398, - "learning_rate": 4.966878597888114e-06, - "loss": 1.0084, - "step": 98 - }, - { - "epoch": 0.5294117647058824, - "grad_norm": 3.4075334072113037, - "learning_rate": 4.966193713411284e-06, - "loss": 0.7217, - "step": 99 - }, - { - "epoch": 0.5347593582887701, - "grad_norm": 2.8489456176757812, - "learning_rate": 4.965501868383507e-06, - "loss": 0.6594, - "step": 100 - }, - { - "epoch": 0.5401069518716578, - "grad_norm": 4.086977958679199, - "learning_rate": 4.964803064757438e-06, - "loss": 0.9249, - "step": 101 - }, - { - "epoch": 0.5454545454545454, - "grad_norm": 2.676903247833252, - "learning_rate": 4.964097304505371e-06, - "loss": 0.7776, - "step": 102 - }, - { - "epoch": 0.5508021390374331, - "grad_norm": 2.5098068714141846, - "learning_rate": 4.963384589619233e-06, - "loss": 0.6339, - "step": 103 - }, - { - "epoch": 0.5561497326203209, - "grad_norm": 4.064920902252197, - "learning_rate": 4.962664922110581e-06, - "loss": 1.0107, - "step": 104 - }, - { - "epoch": 0.5614973262032086, - "grad_norm": 2.6229960918426514, - "learning_rate": 4.9619383040105954e-06, - "loss": 1.0052, - "step": 105 - }, - { - "epoch": 0.5668449197860963, - "grad_norm": 2.857506275177002, - "learning_rate": 4.961204737370071e-06, - "loss": 0.8577, - "step": 106 - }, - { - "epoch": 0.5721925133689839, - "grad_norm": 3.9176764488220215, - "learning_rate": 4.960464224259418e-06, - "loss": 1.1237, - "step": 107 - }, - { - "epoch": 0.5775401069518716, - "grad_norm": 2.9063003063201904, - "learning_rate": 4.95971676676865e-06, - "loss": 0.6237, - "step": 108 - }, - { - "epoch": 0.5828877005347594, - "grad_norm": 3.1583969593048096, - "learning_rate": 4.958962367007381e-06, - "loss": 0.9135, - "step": 109 - }, - { - "epoch": 0.5882352941176471, - "grad_norm": 2.7559218406677246, - "learning_rate": 4.958201027104818e-06, - "loss": 0.7461, - "step": 110 - }, - { - "epoch": 0.5935828877005348, - "grad_norm": 11.086910247802734, - "learning_rate": 4.957432749209755e-06, - "loss": 0.69, - "step": 111 - }, - { - "epoch": 0.5989304812834224, - "grad_norm": 3.8109939098358154, - "learning_rate": 4.95665753549057e-06, - "loss": 0.8578, - "step": 112 - }, - { - "epoch": 0.6042780748663101, - "grad_norm": 3.3317348957061768, - "learning_rate": 4.9558753881352165e-06, - "loss": 1.3098, - "step": 113 - }, - { - "epoch": 0.6096256684491979, - "grad_norm": 2.715823173522949, - "learning_rate": 4.955086309351213e-06, - "loss": 0.9979, - "step": 114 - }, - { - "epoch": 0.6149732620320856, - "grad_norm": 2.798602819442749, - "learning_rate": 4.9542903013656485e-06, - "loss": 0.6298, - "step": 115 - }, - { - "epoch": 0.6203208556149733, - "grad_norm": 32.90562438964844, - "learning_rate": 4.953487366425163e-06, - "loss": 0.959, - "step": 116 - }, - { - "epoch": 0.6256684491978609, - "grad_norm": 4.012441158294678, - "learning_rate": 4.952677506795949e-06, - "loss": 0.6791, - "step": 117 - }, - { - "epoch": 0.6310160427807486, - "grad_norm": 3.548151731491089, - "learning_rate": 4.951860724763743e-06, - "loss": 0.7783, - "step": 118 - }, - { - "epoch": 0.6363636363636364, - "grad_norm": 3.4778249263763428, - "learning_rate": 4.95103702263382e-06, - "loss": 0.8085, - "step": 119 - }, - { - "epoch": 0.6417112299465241, - "grad_norm": 2.625532627105713, - "learning_rate": 4.950206402730984e-06, - "loss": 0.7702, - "step": 120 - }, - { - "epoch": 0.6470588235294118, - "grad_norm": 3.2743935585021973, - "learning_rate": 4.949368867399567e-06, - "loss": 0.602, - "step": 121 - }, - { - "epoch": 0.6524064171122995, - "grad_norm": 3.9576094150543213, - "learning_rate": 4.948524419003415e-06, - "loss": 1.2858, - "step": 122 - }, - { - "epoch": 0.6577540106951871, - "grad_norm": 3.233257532119751, - "learning_rate": 4.947673059925889e-06, - "loss": 0.7945, - "step": 123 - }, - { - "epoch": 0.6631016042780749, - "grad_norm": 2.6730406284332275, - "learning_rate": 4.9468147925698525e-06, - "loss": 0.959, - "step": 124 - }, - { - "epoch": 0.6684491978609626, - "grad_norm": 2.8612916469573975, - "learning_rate": 4.945949619357668e-06, - "loss": 0.7611, - "step": 125 - }, - { - "epoch": 0.6737967914438503, - "grad_norm": 2.9609551429748535, - "learning_rate": 4.945077542731188e-06, - "loss": 0.5753, - "step": 126 - }, - { - "epoch": 0.679144385026738, - "grad_norm": 3.7842485904693604, - "learning_rate": 4.94419856515175e-06, - "loss": 0.8995, - "step": 127 - }, - { - "epoch": 0.6844919786096256, - "grad_norm": 3.513170003890991, - "learning_rate": 4.943312689100166e-06, - "loss": 0.9623, - "step": 128 - }, - { - "epoch": 0.6898395721925134, - "grad_norm": 2.690305471420288, - "learning_rate": 4.942419917076723e-06, - "loss": 0.6657, - "step": 129 - }, - { - "epoch": 0.6951871657754011, - "grad_norm": 2.951237440109253, - "learning_rate": 4.941520251601167e-06, - "loss": 0.7711, - "step": 130 - }, - { - "epoch": 0.7005347593582888, - "grad_norm": 2.8285868167877197, - "learning_rate": 4.940613695212702e-06, - "loss": 0.5908, - "step": 131 - }, - { - "epoch": 0.7058823529411765, - "grad_norm": 2.6700541973114014, - "learning_rate": 4.939700250469979e-06, - "loss": 0.967, - "step": 132 - }, - { - "epoch": 0.7112299465240641, - "grad_norm": 3.229152202606201, - "learning_rate": 4.938779919951092e-06, - "loss": 0.9519, - "step": 133 - }, - { - "epoch": 0.7165775401069518, - "grad_norm": 2.403944730758667, - "learning_rate": 4.93785270625357e-06, - "loss": 0.5873, - "step": 134 - }, - { - "epoch": 0.7219251336898396, - "grad_norm": 3.8491666316986084, - "learning_rate": 4.936918611994368e-06, - "loss": 0.8148, - "step": 135 - }, - { - "epoch": 0.7272727272727273, - "grad_norm": 2.8255743980407715, - "learning_rate": 4.935977639809861e-06, - "loss": 0.8286, - "step": 136 - }, - { - "epoch": 0.732620320855615, - "grad_norm": 2.8479511737823486, - "learning_rate": 4.935029792355834e-06, - "loss": 0.6442, - "step": 137 - }, - { - "epoch": 0.7379679144385026, - "grad_norm": 2.585566759109497, - "learning_rate": 4.934075072307481e-06, - "loss": 1.0144, - "step": 138 - }, - { - "epoch": 0.7433155080213903, - "grad_norm": 2.8108413219451904, - "learning_rate": 4.933113482359388e-06, - "loss": 0.5922, - "step": 139 - }, - { - "epoch": 0.7486631016042781, - "grad_norm": 2.799546241760254, - "learning_rate": 4.932145025225535e-06, - "loss": 0.7546, - "step": 140 - }, - { - "epoch": 0.7540106951871658, - "grad_norm": 2.6492230892181396, - "learning_rate": 4.931169703639282e-06, - "loss": 0.8797, - "step": 141 - }, - { - "epoch": 0.7593582887700535, - "grad_norm": 4.130539417266846, - "learning_rate": 4.930187520353363e-06, - "loss": 0.865, - "step": 142 - }, - { - "epoch": 0.7647058823529411, - "grad_norm": 2.6537978649139404, - "learning_rate": 4.929198478139877e-06, - "loss": 0.6901, - "step": 143 - }, - { - "epoch": 0.7700534759358288, - "grad_norm": 2.488971710205078, - "learning_rate": 4.928202579790285e-06, - "loss": 0.5932, - "step": 144 - }, - { - "epoch": 0.7754010695187166, - "grad_norm": 2.4585540294647217, - "learning_rate": 4.927199828115395e-06, - "loss": 0.7742, - "step": 145 - }, - { - "epoch": 0.7807486631016043, - "grad_norm": 2.5525095462799072, - "learning_rate": 4.9261902259453616e-06, - "loss": 0.8475, - "step": 146 - }, - { - "epoch": 0.786096256684492, - "grad_norm": 3.032649040222168, - "learning_rate": 4.925173776129669e-06, - "loss": 1.0514, - "step": 147 - }, - { - "epoch": 0.7914438502673797, - "grad_norm": 2.4535398483276367, - "learning_rate": 4.9241504815371346e-06, - "loss": 0.5964, - "step": 148 - }, - { - "epoch": 0.7967914438502673, - "grad_norm": 2.2060890197753906, - "learning_rate": 4.923120345055887e-06, - "loss": 0.7615, - "step": 149 - }, - { - "epoch": 0.8021390374331551, - "grad_norm": 3.0113794803619385, - "learning_rate": 4.922083369593372e-06, - "loss": 0.6908, - "step": 150 - }, - { - "epoch": 0.8074866310160428, - "grad_norm": 2.6805336475372314, - "learning_rate": 4.921039558076335e-06, - "loss": 0.8661, - "step": 151 - }, - { - "epoch": 0.8128342245989305, - "grad_norm": 3.562213897705078, - "learning_rate": 4.919988913450812e-06, - "loss": 0.5267, - "step": 152 - }, - { - "epoch": 0.8181818181818182, - "grad_norm": 3.3453261852264404, - "learning_rate": 4.918931438682132e-06, - "loss": 0.9222, - "step": 153 - }, - { - "epoch": 0.8235294117647058, - "grad_norm": 2.7286977767944336, - "learning_rate": 4.917867136754894e-06, - "loss": 0.8865, - "step": 154 - }, - { - "epoch": 0.8288770053475936, - "grad_norm": 2.263981819152832, - "learning_rate": 4.916796010672969e-06, - "loss": 0.7262, - "step": 155 - }, - { - "epoch": 0.8342245989304813, - "grad_norm": 2.273568630218506, - "learning_rate": 4.91571806345949e-06, - "loss": 0.7611, - "step": 156 - }, - { - "epoch": 0.839572192513369, - "grad_norm": 3.0288827419281006, - "learning_rate": 4.91463329815684e-06, - "loss": 0.8745, - "step": 157 - }, - { - "epoch": 0.8449197860962567, - "grad_norm": 2.3675708770751953, - "learning_rate": 4.913541717826645e-06, - "loss": 0.6164, - "step": 158 - }, - { - "epoch": 0.8502673796791443, - "grad_norm": 2.2979559898376465, - "learning_rate": 4.912443325549767e-06, - "loss": 0.5549, - "step": 159 - }, - { - "epoch": 0.8556149732620321, - "grad_norm": 6.2421064376831055, - "learning_rate": 4.911338124426291e-06, - "loss": 0.9052, - "step": 160 - }, - { - "epoch": 0.8609625668449198, - "grad_norm": 2.125546932220459, - "learning_rate": 4.910226117575525e-06, - "loss": 0.7989, - "step": 161 - }, - { - "epoch": 0.8663101604278075, - "grad_norm": 2.8069941997528076, - "learning_rate": 4.909107308135978e-06, - "loss": 0.5915, - "step": 162 - }, - { - "epoch": 0.8716577540106952, - "grad_norm": 2.9329476356506348, - "learning_rate": 4.907981699265364e-06, - "loss": 0.6593, - "step": 163 - }, - { - "epoch": 0.8770053475935828, - "grad_norm": 3.8588013648986816, - "learning_rate": 4.906849294140587e-06, - "loss": 0.8739, - "step": 164 - }, - { - "epoch": 0.8823529411764706, - "grad_norm": 3.3252463340759277, - "learning_rate": 4.9057100959577285e-06, - "loss": 0.7314, - "step": 165 - }, - { - "epoch": 0.8877005347593583, - "grad_norm": 3.051591634750366, - "learning_rate": 4.904564107932048e-06, - "loss": 1.0109, - "step": 166 - }, - { - "epoch": 0.893048128342246, - "grad_norm": 2.8550548553466797, - "learning_rate": 4.903411333297966e-06, - "loss": 0.9092, - "step": 167 - }, - { - "epoch": 0.8983957219251337, - "grad_norm": 2.8500938415527344, - "learning_rate": 4.902251775309057e-06, - "loss": 0.7922, - "step": 168 - }, - { - "epoch": 0.9037433155080213, - "grad_norm": 3.3096566200256348, - "learning_rate": 4.901085437238041e-06, - "loss": 0.5955, - "step": 169 - }, - { - "epoch": 0.9090909090909091, - "grad_norm": 2.7365124225616455, - "learning_rate": 4.899912322376776e-06, - "loss": 1.0019, - "step": 170 - }, - { - "epoch": 0.9144385026737968, - "grad_norm": 2.3542861938476562, - "learning_rate": 4.8987324340362445e-06, - "loss": 0.8508, - "step": 171 - }, - { - "epoch": 0.9197860962566845, - "grad_norm": 2.822413921356201, - "learning_rate": 4.897545775546545e-06, - "loss": 0.8514, - "step": 172 - }, - { - "epoch": 0.9251336898395722, - "grad_norm": 2.528853416442871, - "learning_rate": 4.8963523502568886e-06, - "loss": 1.0263, - "step": 173 - }, - { - "epoch": 0.93048128342246, - "grad_norm": 3.7086899280548096, - "learning_rate": 4.895152161535582e-06, - "loss": 0.7929, - "step": 174 - }, - { - "epoch": 0.9358288770053476, - "grad_norm": 2.407613515853882, - "learning_rate": 4.893945212770019e-06, - "loss": 0.7227, - "step": 175 - }, - { - "epoch": 0.9411764705882353, - "grad_norm": 2.629978656768799, - "learning_rate": 4.892731507366678e-06, - "loss": 0.8923, - "step": 176 - }, - { - "epoch": 0.946524064171123, - "grad_norm": 2.281735897064209, - "learning_rate": 4.891511048751102e-06, - "loss": 0.7475, - "step": 177 - }, - { - "epoch": 0.9518716577540107, - "grad_norm": 2.8144044876098633, - "learning_rate": 4.890283840367898e-06, - "loss": 1.1405, - "step": 178 - }, - { - "epoch": 0.9572192513368984, - "grad_norm": 3.9945294857025146, - "learning_rate": 4.889049885680721e-06, - "loss": 0.8524, - "step": 179 - }, - { - "epoch": 0.9625668449197861, - "grad_norm": 2.9770278930664062, - "learning_rate": 4.887809188172268e-06, - "loss": 0.7617, - "step": 180 - }, - { - "epoch": 0.9679144385026738, - "grad_norm": 2.9451241493225098, - "learning_rate": 4.886561751344266e-06, - "loss": 0.8514, - "step": 181 - }, - { - "epoch": 0.9732620320855615, - "grad_norm": 2.670421600341797, - "learning_rate": 4.885307578717464e-06, - "loss": 0.8335, - "step": 182 - }, - { - "epoch": 0.9786096256684492, - "grad_norm": 2.565976858139038, - "learning_rate": 4.8840466738316216e-06, - "loss": 0.831, - "step": 183 - }, - { - "epoch": 0.983957219251337, - "grad_norm": 2.5326290130615234, - "learning_rate": 4.882779040245499e-06, - "loss": 0.7891, - "step": 184 - }, - { - "epoch": 0.9893048128342246, - "grad_norm": 2.524470090866089, - "learning_rate": 4.881504681536847e-06, - "loss": 0.6257, - "step": 185 - }, - { - "epoch": 0.9946524064171123, - "grad_norm": 2.3305137157440186, - "learning_rate": 4.880223601302398e-06, - "loss": 0.6008, - "step": 186 - }, - { - "epoch": 1.0, - "grad_norm": 3.0916237831115723, - "learning_rate": 4.878935803157856e-06, - "loss": 0.6061, - "step": 187 - } - ], - "logging_steps": 1, - "max_steps": 1870, - "num_input_tokens_seen": 0, - "num_train_epochs": 10, - "save_steps": 206, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 4.976503998434509e+16, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-374/chat_template.jinja b/metallama3_8b/limo_filtered_combined/checkpoint-374/chat_template.jinja deleted file mode 100644 index 39bd0c9f7fe30aea14eda194fee17703da4a4dbf..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-374/chat_template.jinja +++ /dev/null @@ -1,5 +0,0 @@ -{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> - -'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> - -' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-374/config.json b/metallama3_8b/limo_filtered_combined/checkpoint-374/config.json deleted file mode 100644 index ec5612543540085e09eed37e81b17ae51d1a6973..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-374/config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 128000, - "eos_token_id": 128009, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 8192, - "mlp_bias": false, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 500000.0, - "tie_word_embeddings": false, - "torch_dtype": "float32", - "transformers_version": "4.55.0", - "use_cache": false, - "vocab_size": 128256 -} diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-374/generation_config.json b/metallama3_8b/limo_filtered_combined/checkpoint-374/generation_config.json deleted file mode 100644 index f53ccb516e57388491adda6b9950bcfa872e93ae..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-374/generation_config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 128000, - "eos_token_id": 128009, - "transformers_version": "4.55.0", - "use_cache": false -} diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-374/model-00001-of-00007.safetensors b/metallama3_8b/limo_filtered_combined/checkpoint-374/model-00001-of-00007.safetensors deleted file mode 100644 index 8ac9569035cfcc8b0618c8d701a8d9c7bca93d9c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-374/model-00001-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2b88d99fa21c9685935c3af80da98131fd9a3ff840371aa5bb33f1d1139050cf -size 4886466168 diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-374/model-00002-of-00007.safetensors b/metallama3_8b/limo_filtered_combined/checkpoint-374/model-00002-of-00007.safetensors deleted file mode 100644 index 5f0c7edb770dfd6bb6f96c4575acb2fe502933de..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-374/model-00002-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0d8637c3fae3d0348cb453dd6264bf75db939434599a3377aa5142b48e76585e -size 4832007448 diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-374/model-00003-of-00007.safetensors b/metallama3_8b/limo_filtered_combined/checkpoint-374/model-00003-of-00007.safetensors deleted file mode 100644 index 641d1ad6cea9e4f2830082e64a39fdc8a8cf86d4..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-374/model-00003-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:27cba4f5f5b5149f11fb36b6146dd583fdfb69bf124dcd0f81e97c80c4873efb -size 4999813112 diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-374/model-00004-of-00007.safetensors b/metallama3_8b/limo_filtered_combined/checkpoint-374/model-00004-of-00007.safetensors deleted file mode 100644 index d23620f7db9cd1f9792d6f7a93893172d56da326..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-374/model-00004-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8b99ba1db48fa5c343a8a01a39a25d6cb4fd922ef2996460ed2c2e918427e8d0 -size 4999813128 diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-374/model-00005-of-00007.safetensors b/metallama3_8b/limo_filtered_combined/checkpoint-374/model-00005-of-00007.safetensors deleted file mode 100644 index 76f7a243965a58e607e2d8585e05efaa0cfdeff8..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-374/model-00005-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:07a8a74943cbb77a104e67a379b061ecae72e4652ecc4fefafe9a549b4585ef7 -size 4832007496 diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-374/model-00006-of-00007.safetensors b/metallama3_8b/limo_filtered_combined/checkpoint-374/model-00006-of-00007.safetensors deleted file mode 100644 index c723db20176496351ae7399b6d940c6849ca2f5e..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-374/model-00006-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0c2baced3100d4d909b506c951cbbeadb7654ae09e3dc6054b92d9f22038c070 -size 4999813120 diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-374/model-00007-of-00007.safetensors b/metallama3_8b/limo_filtered_combined/checkpoint-374/model-00007-of-00007.safetensors deleted file mode 100644 index 99b43f3a180f3763871168bacf1790061825c5a6..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-374/model-00007-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fcabab8bf5997f17db9acbbd223dc06ff6edf539aead5167002e1765bc48d782 -size 2571158184 diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-374/model.safetensors.index.json b/metallama3_8b/limo_filtered_combined/checkpoint-374/model.safetensors.index.json deleted file mode 100644 index 30d31d54f352f0c71ad48745af612a088822fa48..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-374/model.safetensors.index.json +++ /dev/null @@ -1,299 +0,0 @@ -{ - "metadata": { - "total_parameters": 2007565312, - "total_size": 32121044992 - }, - "weight_map": { - "lm_head.weight": "model-00007-of-00007.safetensors", - "model.embed_tokens.weight": "model-00001-of-00007.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.norm.weight": "model-00007-of-00007.safetensors" - } -} diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-374/rng_state_0.pth b/metallama3_8b/limo_filtered_combined/checkpoint-374/rng_state_0.pth deleted file mode 100644 index 37ac50652a3badbfb1bdeaccb8b1934575b584eb..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-374/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bbe0d720c4c75a6a04213fa3b64bacbe794718a53e2b56ebb67a1a795014dfad -size 15024 diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-374/rng_state_1.pth b/metallama3_8b/limo_filtered_combined/checkpoint-374/rng_state_1.pth deleted file mode 100644 index 0bc3650851dae439677613c9e23a5528de47b679..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-374/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:72452d3138d0ca2ff89429e3294a834ae7a68e8596fc757735ca56ae52509d57 -size 15024 diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-374/rng_state_2.pth b/metallama3_8b/limo_filtered_combined/checkpoint-374/rng_state_2.pth deleted file mode 100644 index 0e00a6e8b4b743026f68d749a8cb3bdd4b746838..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-374/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f36e306fb8ebcf53a167bfd6c9af74db410a269ada1e619e3e816f5269543b9d -size 15024 diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-374/rng_state_3.pth b/metallama3_8b/limo_filtered_combined/checkpoint-374/rng_state_3.pth deleted file mode 100644 index 5354141d42e077c356f9ca8c6b12bd7e5e41f2af..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-374/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bb47ce0c6f815a6f8302b0e3819b4c2315ca71dae3138d97fdceb765cdd0a039 -size 15024 diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-374/scheduler.pt b/metallama3_8b/limo_filtered_combined/checkpoint-374/scheduler.pt deleted file mode 100644 index 1be77b92015462c4e663f9571d63a051efdf5beb..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-374/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:29056be56d1d8721b22d164dabdb010f02300ef4d4e2ae9184157352fd3d4076 -size 1064 diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-374/special_tokens_map.json b/metallama3_8b/limo_filtered_combined/checkpoint-374/special_tokens_map.json deleted file mode 100644 index 14daf4588e61b4e4983af0fccaba4d5500c0977c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-374/special_tokens_map.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "additional_special_tokens": [ - { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } - ], - "bos_token": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": "<|eot_id|>" -} diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-374/tokenizer.json b/metallama3_8b/limo_filtered_combined/checkpoint-374/tokenizer.json deleted file mode 100644 index 172311123ab62378f1f6d90f3068a676b7d939ed..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-374/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c1dcab308e7cf5970ea38815e0a62887d705c5b436f869ca27a5dcdd40c36a6 -size 17210148 diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-374/tokenizer_config.json b/metallama3_8b/limo_filtered_combined/checkpoint-374/tokenizer_config.json deleted file mode 100644 index 6739fcd129e717b71b64001dcb25a03c143d66f5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-374/tokenizer_config.json +++ /dev/null @@ -1,2076 +0,0 @@ -{ - "added_tokens_decoder": { - "128000": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128001": { - "content": "<|end_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128002": { - "content": "<|reserved_special_token_0|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128003": { - "content": "<|reserved_special_token_1|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128004": { - "content": "<|reserved_special_token_2|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128005": { - "content": "<|reserved_special_token_3|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128006": { - "content": "<|start_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128007": { - "content": "<|end_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128008": { - "content": "<|reserved_special_token_4|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128009": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128010": { - "content": "<|reserved_special_token_5|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128011": { - "content": "<|reserved_special_token_6|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128012": { - "content": "<|reserved_special_token_7|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128013": { - "content": "<|reserved_special_token_8|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128014": { - "content": "<|reserved_special_token_9|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128015": { - "content": "<|reserved_special_token_10|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128016": { - "content": "<|reserved_special_token_11|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128017": { - "content": "<|reserved_special_token_12|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128018": { - "content": "<|reserved_special_token_13|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128019": { - "content": "<|reserved_special_token_14|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128020": { - "content": "<|reserved_special_token_15|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128021": { - "content": "<|reserved_special_token_16|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128022": { - "content": "<|reserved_special_token_17|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128023": { - "content": "<|reserved_special_token_18|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128024": { - "content": "<|reserved_special_token_19|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128025": { - "content": "<|reserved_special_token_20|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128026": { - "content": "<|reserved_special_token_21|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128027": { - "content": "<|reserved_special_token_22|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128028": { - "content": "<|reserved_special_token_23|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128029": { - "content": "<|reserved_special_token_24|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128030": { - "content": "<|reserved_special_token_25|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128031": { - "content": "<|reserved_special_token_26|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128032": { - "content": "<|reserved_special_token_27|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128033": { - "content": "<|reserved_special_token_28|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128034": { - "content": "<|reserved_special_token_29|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128035": { - "content": "<|reserved_special_token_30|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128036": { - "content": "<|reserved_special_token_31|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128037": { - "content": "<|reserved_special_token_32|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128038": { - "content": "<|reserved_special_token_33|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128039": { - "content": "<|reserved_special_token_34|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128040": { - "content": "<|reserved_special_token_35|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128041": { - "content": "<|reserved_special_token_36|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128042": { - "content": "<|reserved_special_token_37|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128043": { - "content": "<|reserved_special_token_38|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128044": { - "content": "<|reserved_special_token_39|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128045": { - "content": "<|reserved_special_token_40|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128046": { - "content": "<|reserved_special_token_41|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128047": { - "content": "<|reserved_special_token_42|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128048": { - "content": "<|reserved_special_token_43|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128049": { - "content": "<|reserved_special_token_44|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128050": { - "content": "<|reserved_special_token_45|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128051": { - "content": "<|reserved_special_token_46|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128052": { - "content": "<|reserved_special_token_47|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128053": { - "content": "<|reserved_special_token_48|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128054": { - "content": "<|reserved_special_token_49|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128055": { - "content": "<|reserved_special_token_50|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128056": { - "content": "<|reserved_special_token_51|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128057": { - "content": "<|reserved_special_token_52|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128058": { - "content": "<|reserved_special_token_53|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128059": { - "content": "<|reserved_special_token_54|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128060": { - "content": "<|reserved_special_token_55|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128061": { - "content": "<|reserved_special_token_56|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128062": { - "content": "<|reserved_special_token_57|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128063": { - "content": "<|reserved_special_token_58|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128064": { - "content": "<|reserved_special_token_59|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128065": { - "content": "<|reserved_special_token_60|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128066": { - "content": "<|reserved_special_token_61|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128067": { - "content": "<|reserved_special_token_62|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128068": { - "content": "<|reserved_special_token_63|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128069": { - "content": "<|reserved_special_token_64|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128070": { - "content": "<|reserved_special_token_65|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128071": { - "content": "<|reserved_special_token_66|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128072": { - "content": "<|reserved_special_token_67|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128073": { - "content": "<|reserved_special_token_68|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128074": { - "content": "<|reserved_special_token_69|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128075": { - "content": "<|reserved_special_token_70|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128076": { - "content": "<|reserved_special_token_71|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128077": { - "content": "<|reserved_special_token_72|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128078": { - "content": "<|reserved_special_token_73|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128079": { - "content": "<|reserved_special_token_74|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128080": { - "content": "<|reserved_special_token_75|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128081": { - "content": "<|reserved_special_token_76|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128082": { - "content": "<|reserved_special_token_77|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128083": { - "content": "<|reserved_special_token_78|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128084": { - "content": "<|reserved_special_token_79|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128085": { - "content": "<|reserved_special_token_80|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128086": { - "content": "<|reserved_special_token_81|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128087": { - "content": "<|reserved_special_token_82|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128088": { - "content": "<|reserved_special_token_83|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128089": { - "content": "<|reserved_special_token_84|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128090": { - "content": "<|reserved_special_token_85|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128091": { - "content": "<|reserved_special_token_86|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128092": { - "content": "<|reserved_special_token_87|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128093": { - "content": "<|reserved_special_token_88|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128094": { - "content": "<|reserved_special_token_89|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128095": { - "content": "<|reserved_special_token_90|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128096": { - "content": "<|reserved_special_token_91|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128097": { - "content": "<|reserved_special_token_92|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128098": { - "content": "<|reserved_special_token_93|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128099": { - "content": "<|reserved_special_token_94|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128100": { - "content": "<|reserved_special_token_95|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128101": { - "content": "<|reserved_special_token_96|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128102": { - "content": "<|reserved_special_token_97|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128103": { - "content": "<|reserved_special_token_98|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128104": { - "content": "<|reserved_special_token_99|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128105": { - "content": "<|reserved_special_token_100|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128106": { - "content": "<|reserved_special_token_101|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128107": { - "content": "<|reserved_special_token_102|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128108": { - "content": "<|reserved_special_token_103|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128109": { - "content": "<|reserved_special_token_104|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128110": { - "content": "<|reserved_special_token_105|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128111": { - "content": "<|reserved_special_token_106|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128112": { - "content": "<|reserved_special_token_107|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128113": { - "content": "<|reserved_special_token_108|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128114": { - "content": "<|reserved_special_token_109|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128115": { - "content": "<|reserved_special_token_110|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128116": { - "content": "<|reserved_special_token_111|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128117": { - "content": "<|reserved_special_token_112|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128118": { - "content": "<|reserved_special_token_113|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128119": { - "content": "<|reserved_special_token_114|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128120": { - "content": "<|reserved_special_token_115|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128121": { - "content": "<|reserved_special_token_116|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128122": { - "content": "<|reserved_special_token_117|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128123": { - "content": "<|reserved_special_token_118|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128124": { - "content": "<|reserved_special_token_119|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128125": { - "content": "<|reserved_special_token_120|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128126": { - "content": "<|reserved_special_token_121|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128127": { - "content": "<|reserved_special_token_122|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128128": { - "content": "<|reserved_special_token_123|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128129": { - "content": "<|reserved_special_token_124|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128130": { - "content": "<|reserved_special_token_125|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128131": { - "content": "<|reserved_special_token_126|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128132": { - "content": "<|reserved_special_token_127|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128133": { - "content": "<|reserved_special_token_128|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128134": { - "content": "<|reserved_special_token_129|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128135": { - "content": "<|reserved_special_token_130|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128136": { - "content": "<|reserved_special_token_131|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128137": { - "content": "<|reserved_special_token_132|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128138": { - "content": "<|reserved_special_token_133|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128139": { - "content": "<|reserved_special_token_134|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128140": { - "content": "<|reserved_special_token_135|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128141": { - "content": "<|reserved_special_token_136|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128142": { - "content": "<|reserved_special_token_137|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128143": { - "content": "<|reserved_special_token_138|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128144": { - "content": "<|reserved_special_token_139|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128145": { - "content": "<|reserved_special_token_140|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128146": { - "content": "<|reserved_special_token_141|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128147": { - "content": "<|reserved_special_token_142|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128148": { - "content": "<|reserved_special_token_143|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128149": { - "content": "<|reserved_special_token_144|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128150": { - "content": "<|reserved_special_token_145|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128151": { - "content": "<|reserved_special_token_146|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128152": { - "content": "<|reserved_special_token_147|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128153": { - "content": "<|reserved_special_token_148|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128154": { - "content": "<|reserved_special_token_149|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128155": { - "content": "<|reserved_special_token_150|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128156": { - "content": "<|reserved_special_token_151|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128157": { - "content": "<|reserved_special_token_152|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128158": { - "content": "<|reserved_special_token_153|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128159": { - "content": "<|reserved_special_token_154|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128160": { - "content": "<|reserved_special_token_155|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128161": { - "content": "<|reserved_special_token_156|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128162": { - "content": "<|reserved_special_token_157|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128163": { - "content": "<|reserved_special_token_158|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128164": { - "content": "<|reserved_special_token_159|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128165": { - "content": "<|reserved_special_token_160|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128166": { - "content": "<|reserved_special_token_161|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128167": { - "content": "<|reserved_special_token_162|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128168": { - "content": "<|reserved_special_token_163|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128169": { - "content": "<|reserved_special_token_164|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128170": { - "content": "<|reserved_special_token_165|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128171": { - "content": "<|reserved_special_token_166|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128172": { - "content": "<|reserved_special_token_167|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128173": { - "content": "<|reserved_special_token_168|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128174": { - "content": "<|reserved_special_token_169|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128175": { - "content": "<|reserved_special_token_170|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128176": { - "content": "<|reserved_special_token_171|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128177": { - "content": "<|reserved_special_token_172|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128178": { - "content": "<|reserved_special_token_173|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128179": { - "content": "<|reserved_special_token_174|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128180": { - "content": "<|reserved_special_token_175|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128181": { - "content": "<|reserved_special_token_176|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128182": { - "content": "<|reserved_special_token_177|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128183": { - "content": "<|reserved_special_token_178|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128184": { - "content": "<|reserved_special_token_179|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128185": { - "content": "<|reserved_special_token_180|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128186": { - "content": "<|reserved_special_token_181|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128187": { - "content": "<|reserved_special_token_182|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128188": { - "content": "<|reserved_special_token_183|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128189": { - "content": "<|reserved_special_token_184|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128190": { - "content": "<|reserved_special_token_185|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128191": { - "content": "<|reserved_special_token_186|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128192": { - "content": "<|reserved_special_token_187|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128193": { - "content": "<|reserved_special_token_188|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128194": { - "content": "<|reserved_special_token_189|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128195": { - "content": "<|reserved_special_token_190|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128196": { - "content": "<|reserved_special_token_191|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128197": { - "content": "<|reserved_special_token_192|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128198": { - "content": "<|reserved_special_token_193|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128199": { - "content": "<|reserved_special_token_194|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128200": { - "content": "<|reserved_special_token_195|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128201": { - "content": "<|reserved_special_token_196|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128202": { - "content": "<|reserved_special_token_197|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128203": { - "content": "<|reserved_special_token_198|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128204": { - "content": "<|reserved_special_token_199|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128205": { - "content": "<|reserved_special_token_200|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128206": { - "content": "<|reserved_special_token_201|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128207": { - "content": "<|reserved_special_token_202|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128208": { - "content": "<|reserved_special_token_203|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128209": { - "content": "<|reserved_special_token_204|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128210": { - "content": "<|reserved_special_token_205|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128211": { - "content": "<|reserved_special_token_206|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128212": { - "content": "<|reserved_special_token_207|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128213": { - "content": "<|reserved_special_token_208|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128214": { - "content": "<|reserved_special_token_209|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128215": { - "content": "<|reserved_special_token_210|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128216": { - "content": "<|reserved_special_token_211|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128217": { - "content": "<|reserved_special_token_212|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128218": { - "content": "<|reserved_special_token_213|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128219": { - "content": "<|reserved_special_token_214|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128220": { - "content": "<|reserved_special_token_215|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128221": { - "content": "<|reserved_special_token_216|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128222": { - "content": "<|reserved_special_token_217|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128223": { - "content": "<|reserved_special_token_218|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128224": { - "content": "<|reserved_special_token_219|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128225": { - "content": "<|reserved_special_token_220|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128226": { - "content": "<|reserved_special_token_221|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128227": { - "content": "<|reserved_special_token_222|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128228": { - "content": "<|reserved_special_token_223|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128229": { - "content": "<|reserved_special_token_224|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128230": { - "content": "<|reserved_special_token_225|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128231": { - "content": "<|reserved_special_token_226|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128232": { - "content": "<|reserved_special_token_227|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128233": { - "content": "<|reserved_special_token_228|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128234": { - "content": "<|reserved_special_token_229|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128235": { - "content": "<|reserved_special_token_230|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128236": { - "content": "<|reserved_special_token_231|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128237": { - "content": "<|reserved_special_token_232|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128238": { - "content": "<|reserved_special_token_233|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128239": { - "content": "<|reserved_special_token_234|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128240": { - "content": "<|reserved_special_token_235|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128241": { - "content": "<|reserved_special_token_236|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128242": { - "content": "<|reserved_special_token_237|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128243": { - "content": "<|reserved_special_token_238|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128244": { - "content": "<|reserved_special_token_239|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128245": { - "content": "<|reserved_special_token_240|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128246": { - "content": "<|reserved_special_token_241|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128247": { - "content": "<|reserved_special_token_242|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128248": { - "content": "<|reserved_special_token_243|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128249": { - "content": "<|reserved_special_token_244|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128250": { - "content": "<|reserved_special_token_245|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128251": { - "content": "<|reserved_special_token_246|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128252": { - "content": "<|reserved_special_token_247|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128253": { - "content": "<|reserved_special_token_248|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128254": { - "content": "<|reserved_special_token_249|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128255": { - "content": "<|reserved_special_token_250|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128256": { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - } - }, - "additional_special_tokens": [ - "<|eom_id|>" - ], - "bos_token": "<|begin_of_text|>", - "clean_up_tokenization_spaces": true, - "eos_token": "<|eot_id|>", - "extra_special_tokens": {}, - "model_input_names": [ - "input_ids", - "attention_mask" - ], - "model_max_length": 1000000000000000019884624838656, - "pad_token": "<|eot_id|>", - "padding_side": "right", - "split_special_tokens": false, - "tokenizer_class": "PreTrainedTokenizerFast" -} diff --git a/metallama3_8b/limo_filtered_combined/checkpoint-374/trainer_state.json b/metallama3_8b/limo_filtered_combined/checkpoint-374/trainer_state.json deleted file mode 100644 index f6fe9f22d589fea1b113374cd4672ce50cc34304..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/checkpoint-374/trainer_state.json +++ /dev/null @@ -1,2652 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 2.0, - "eval_steps": 500, - "global_step": 374, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0053475935828877, - "grad_norm": 5.576223850250244, - "learning_rate": 5e-06, - "loss": 0.9394, - "step": 1 - }, - { - "epoch": 0.0106951871657754, - "grad_norm": 41.748443603515625, - "learning_rate": 4.99999647201733e-06, - "loss": 2.0122, - "step": 2 - }, - { - "epoch": 0.016042780748663103, - "grad_norm": 10.106061935424805, - "learning_rate": 4.999985888079276e-06, - "loss": 1.0092, - "step": 3 - }, - { - "epoch": 0.0213903743315508, - "grad_norm": 12.377921104431152, - "learning_rate": 4.999968248215712e-06, - "loss": 1.5196, - "step": 4 - }, - { - "epoch": 0.026737967914438502, - "grad_norm": 7.258418560028076, - "learning_rate": 4.999943552476422e-06, - "loss": 1.4586, - "step": 5 - }, - { - "epoch": 0.03208556149732621, - "grad_norm": 5.282329559326172, - "learning_rate": 4.999911800931108e-06, - "loss": 1.1068, - "step": 6 - }, - { - "epoch": 0.0374331550802139, - "grad_norm": 3.468794822692871, - "learning_rate": 4.999872993669387e-06, - "loss": 0.8997, - "step": 7 - }, - { - "epoch": 0.0427807486631016, - "grad_norm": 3.2200160026550293, - "learning_rate": 4.999827130800785e-06, - "loss": 1.075, - "step": 8 - }, - { - "epoch": 0.0481283422459893, - "grad_norm": 21.746450424194336, - "learning_rate": 4.999774212454746e-06, - "loss": 1.691, - "step": 9 - }, - { - "epoch": 0.053475935828877004, - "grad_norm": 11.011313438415527, - "learning_rate": 4.999714238780626e-06, - "loss": 1.3167, - "step": 10 - }, - { - "epoch": 0.058823529411764705, - "grad_norm": 5.002156734466553, - "learning_rate": 4.999647209947694e-06, - "loss": 0.9653, - "step": 11 - }, - { - "epoch": 0.06417112299465241, - "grad_norm": 12.305068016052246, - "learning_rate": 4.999573126145132e-06, - "loss": 1.2992, - "step": 12 - }, - { - "epoch": 0.06951871657754011, - "grad_norm": 5.660033702850342, - "learning_rate": 4.999491987582032e-06, - "loss": 0.9204, - "step": 13 - }, - { - "epoch": 0.0748663101604278, - "grad_norm": 5.366727828979492, - "learning_rate": 4.999403794487399e-06, - "loss": 1.3307, - "step": 14 - }, - { - "epoch": 0.08021390374331551, - "grad_norm": 3.265700578689575, - "learning_rate": 4.999308547110147e-06, - "loss": 0.8596, - "step": 15 - }, - { - "epoch": 0.0855614973262032, - "grad_norm": 3.0776281356811523, - "learning_rate": 4.9992062457191005e-06, - "loss": 0.9614, - "step": 16 - }, - { - "epoch": 0.09090909090909091, - "grad_norm": 2.8679943084716797, - "learning_rate": 4.999096890602996e-06, - "loss": 0.8, - "step": 17 - }, - { - "epoch": 0.0962566844919786, - "grad_norm": 2.785064935684204, - "learning_rate": 4.998980482070473e-06, - "loss": 0.7683, - "step": 18 - }, - { - "epoch": 0.10160427807486631, - "grad_norm": 4.619974613189697, - "learning_rate": 4.998857020450084e-06, - "loss": 1.2742, - "step": 19 - }, - { - "epoch": 0.10695187165775401, - "grad_norm": 2.894366979598999, - "learning_rate": 4.998726506090283e-06, - "loss": 0.8559, - "step": 20 - }, - { - "epoch": 0.11229946524064172, - "grad_norm": 3.4240734577178955, - "learning_rate": 4.998588939359435e-06, - "loss": 0.8223, - "step": 21 - }, - { - "epoch": 0.11764705882352941, - "grad_norm": 4.5151777267456055, - "learning_rate": 4.998444320645803e-06, - "loss": 1.1229, - "step": 22 - }, - { - "epoch": 0.12299465240641712, - "grad_norm": 2.7780518531799316, - "learning_rate": 4.998292650357558e-06, - "loss": 0.8936, - "step": 23 - }, - { - "epoch": 0.12834224598930483, - "grad_norm": 3.7252511978149414, - "learning_rate": 4.998133928922773e-06, - "loss": 1.2552, - "step": 24 - }, - { - "epoch": 0.13368983957219252, - "grad_norm": 4.296158313751221, - "learning_rate": 4.99796815678942e-06, - "loss": 0.7075, - "step": 25 - }, - { - "epoch": 0.13903743315508021, - "grad_norm": 5.5546956062316895, - "learning_rate": 4.997795334425372e-06, - "loss": 0.9781, - "step": 26 - }, - { - "epoch": 0.1443850267379679, - "grad_norm": 3.685818910598755, - "learning_rate": 4.997615462318403e-06, - "loss": 1.0657, - "step": 27 - }, - { - "epoch": 0.1497326203208556, - "grad_norm": 3.5500221252441406, - "learning_rate": 4.997428540976177e-06, - "loss": 0.951, - "step": 28 - }, - { - "epoch": 0.15508021390374332, - "grad_norm": 13.312395095825195, - "learning_rate": 4.997234570926263e-06, - "loss": 0.6788, - "step": 29 - }, - { - "epoch": 0.16042780748663102, - "grad_norm": 2.6344847679138184, - "learning_rate": 4.997033552716116e-06, - "loss": 0.8, - "step": 30 - }, - { - "epoch": 0.1657754010695187, - "grad_norm": 3.0757298469543457, - "learning_rate": 4.9968254869130885e-06, - "loss": 0.7625, - "step": 31 - }, - { - "epoch": 0.1711229946524064, - "grad_norm": 4.064891815185547, - "learning_rate": 4.996610374104422e-06, - "loss": 0.7381, - "step": 32 - }, - { - "epoch": 0.17647058823529413, - "grad_norm": 7.529796123504639, - "learning_rate": 4.9963882148972475e-06, - "loss": 1.3283, - "step": 33 - }, - { - "epoch": 0.18181818181818182, - "grad_norm": 3.2115354537963867, - "learning_rate": 4.996159009918586e-06, - "loss": 1.0002, - "step": 34 - }, - { - "epoch": 0.18716577540106952, - "grad_norm": 4.122320652008057, - "learning_rate": 4.9959227598153395e-06, - "loss": 0.9095, - "step": 35 - }, - { - "epoch": 0.1925133689839572, - "grad_norm": 54.98562240600586, - "learning_rate": 4.9956794652542994e-06, - "loss": 1.2191, - "step": 36 - }, - { - "epoch": 0.19786096256684493, - "grad_norm": 3.083123207092285, - "learning_rate": 4.9954291269221364e-06, - "loss": 0.7424, - "step": 37 - }, - { - "epoch": 0.20320855614973263, - "grad_norm": 15.99591064453125, - "learning_rate": 4.995171745525401e-06, - "loss": 0.9289, - "step": 38 - }, - { - "epoch": 0.20855614973262032, - "grad_norm": 5.214310169219971, - "learning_rate": 4.994907321790524e-06, - "loss": 0.991, - "step": 39 - }, - { - "epoch": 0.21390374331550802, - "grad_norm": 3.4376749992370605, - "learning_rate": 4.994635856463811e-06, - "loss": 0.6406, - "step": 40 - }, - { - "epoch": 0.2192513368983957, - "grad_norm": 4.30764102935791, - "learning_rate": 4.994357350311441e-06, - "loss": 1.2038, - "step": 41 - }, - { - "epoch": 0.22459893048128343, - "grad_norm": 3.5810039043426514, - "learning_rate": 4.994071804119467e-06, - "loss": 0.9696, - "step": 42 - }, - { - "epoch": 0.22994652406417113, - "grad_norm": 4.080881595611572, - "learning_rate": 4.993779218693811e-06, - "loss": 1.1579, - "step": 43 - }, - { - "epoch": 0.23529411764705882, - "grad_norm": 3.1389286518096924, - "learning_rate": 4.99347959486026e-06, - "loss": 0.7118, - "step": 44 - }, - { - "epoch": 0.24064171122994651, - "grad_norm": 2.6397321224212646, - "learning_rate": 4.99317293346447e-06, - "loss": 0.7579, - "step": 45 - }, - { - "epoch": 0.24598930481283424, - "grad_norm": 2.9469995498657227, - "learning_rate": 4.992859235371958e-06, - "loss": 0.7105, - "step": 46 - }, - { - "epoch": 0.25133689839572193, - "grad_norm": 2.669086456298828, - "learning_rate": 4.992538501468101e-06, - "loss": 0.6812, - "step": 47 - }, - { - "epoch": 0.25668449197860965, - "grad_norm": 5.421566009521484, - "learning_rate": 4.992210732658132e-06, - "loss": 0.9733, - "step": 48 - }, - { - "epoch": 0.2620320855614973, - "grad_norm": 4.413289546966553, - "learning_rate": 4.991875929867143e-06, - "loss": 1.1301, - "step": 49 - }, - { - "epoch": 0.26737967914438504, - "grad_norm": 3.1602351665496826, - "learning_rate": 4.991534094040077e-06, - "loss": 0.6706, - "step": 50 - }, - { - "epoch": 0.2727272727272727, - "grad_norm": 4.374372959136963, - "learning_rate": 4.991185226141726e-06, - "loss": 0.9462, - "step": 51 - }, - { - "epoch": 0.27807486631016043, - "grad_norm": 2.9649057388305664, - "learning_rate": 4.990829327156729e-06, - "loss": 1.0714, - "step": 52 - }, - { - "epoch": 0.28342245989304815, - "grad_norm": 3.1991283893585205, - "learning_rate": 4.990466398089571e-06, - "loss": 0.9175, - "step": 53 - }, - { - "epoch": 0.2887700534759358, - "grad_norm": 2.580082654953003, - "learning_rate": 4.99009643996458e-06, - "loss": 0.5164, - "step": 54 - }, - { - "epoch": 0.29411764705882354, - "grad_norm": 3.9115707874298096, - "learning_rate": 4.989719453825918e-06, - "loss": 0.7223, - "step": 55 - }, - { - "epoch": 0.2994652406417112, - "grad_norm": 2.825481414794922, - "learning_rate": 4.989335440737587e-06, - "loss": 0.7065, - "step": 56 - }, - { - "epoch": 0.3048128342245989, - "grad_norm": 2.8599696159362793, - "learning_rate": 4.9889444017834185e-06, - "loss": 0.8833, - "step": 57 - }, - { - "epoch": 0.31016042780748665, - "grad_norm": 2.885662078857422, - "learning_rate": 4.988546338067078e-06, - "loss": 0.8664, - "step": 58 - }, - { - "epoch": 0.3155080213903743, - "grad_norm": 3.187185764312744, - "learning_rate": 4.988141250712053e-06, - "loss": 0.884, - "step": 59 - }, - { - "epoch": 0.32085561497326204, - "grad_norm": 3.7545692920684814, - "learning_rate": 4.987729140861657e-06, - "loss": 0.9891, - "step": 60 - }, - { - "epoch": 0.32620320855614976, - "grad_norm": 3.0581002235412598, - "learning_rate": 4.987310009679023e-06, - "loss": 0.8838, - "step": 61 - }, - { - "epoch": 0.3315508021390374, - "grad_norm": 2.8039402961730957, - "learning_rate": 4.986883858347101e-06, - "loss": 0.8188, - "step": 62 - }, - { - "epoch": 0.33689839572192515, - "grad_norm": 3.01231050491333, - "learning_rate": 4.986450688068655e-06, - "loss": 0.6032, - "step": 63 - }, - { - "epoch": 0.3422459893048128, - "grad_norm": 2.7969677448272705, - "learning_rate": 4.986010500066258e-06, - "loss": 0.7623, - "step": 64 - }, - { - "epoch": 0.34759358288770054, - "grad_norm": 2.757786989212036, - "learning_rate": 4.985563295582292e-06, - "loss": 0.8051, - "step": 65 - }, - { - "epoch": 0.35294117647058826, - "grad_norm": 2.9582571983337402, - "learning_rate": 4.98510907587894e-06, - "loss": 0.7901, - "step": 66 - }, - { - "epoch": 0.3582887700534759, - "grad_norm": 3.104294776916504, - "learning_rate": 4.984647842238185e-06, - "loss": 1.0582, - "step": 67 - }, - { - "epoch": 0.36363636363636365, - "grad_norm": 2.7413785457611084, - "learning_rate": 4.984179595961806e-06, - "loss": 0.5912, - "step": 68 - }, - { - "epoch": 0.3689839572192513, - "grad_norm": 2.722858190536499, - "learning_rate": 4.983704338371375e-06, - "loss": 0.7855, - "step": 69 - }, - { - "epoch": 0.37433155080213903, - "grad_norm": 2.5095815658569336, - "learning_rate": 4.983222070808255e-06, - "loss": 0.6491, - "step": 70 - }, - { - "epoch": 0.37967914438502676, - "grad_norm": 2.97511887550354, - "learning_rate": 4.982732794633588e-06, - "loss": 0.9735, - "step": 71 - }, - { - "epoch": 0.3850267379679144, - "grad_norm": 3.5139546394348145, - "learning_rate": 4.982236511228301e-06, - "loss": 0.8495, - "step": 72 - }, - { - "epoch": 0.39037433155080214, - "grad_norm": 3.086568593978882, - "learning_rate": 4.981733221993099e-06, - "loss": 1.0891, - "step": 73 - }, - { - "epoch": 0.39572192513368987, - "grad_norm": 3.490666389465332, - "learning_rate": 4.981222928348456e-06, - "loss": 0.8013, - "step": 74 - }, - { - "epoch": 0.40106951871657753, - "grad_norm": 3.3275415897369385, - "learning_rate": 4.98070563173462e-06, - "loss": 0.8298, - "step": 75 - }, - { - "epoch": 0.40641711229946526, - "grad_norm": 2.7193403244018555, - "learning_rate": 4.980181333611601e-06, - "loss": 0.6989, - "step": 76 - }, - { - "epoch": 0.4117647058823529, - "grad_norm": 2.5338242053985596, - "learning_rate": 4.979650035459171e-06, - "loss": 0.6769, - "step": 77 - }, - { - "epoch": 0.41711229946524064, - "grad_norm": 2.7369015216827393, - "learning_rate": 4.9791117387768575e-06, - "loss": 1.0385, - "step": 78 - }, - { - "epoch": 0.42245989304812837, - "grad_norm": 2.6109988689422607, - "learning_rate": 4.978566445083942e-06, - "loss": 0.6498, - "step": 79 - }, - { - "epoch": 0.42780748663101603, - "grad_norm": 3.0895354747772217, - "learning_rate": 4.978014155919455e-06, - "loss": 0.7931, - "step": 80 - }, - { - "epoch": 0.43315508021390375, - "grad_norm": 2.6197807788848877, - "learning_rate": 4.977454872842169e-06, - "loss": 0.7322, - "step": 81 - }, - { - "epoch": 0.4385026737967914, - "grad_norm": 2.9248461723327637, - "learning_rate": 4.976888597430597e-06, - "loss": 0.9184, - "step": 82 - }, - { - "epoch": 0.44385026737967914, - "grad_norm": 2.7636630535125732, - "learning_rate": 4.976315331282985e-06, - "loss": 0.8258, - "step": 83 - }, - { - "epoch": 0.44919786096256686, - "grad_norm": 2.702061653137207, - "learning_rate": 4.9757350760173144e-06, - "loss": 0.7414, - "step": 84 - }, - { - "epoch": 0.45454545454545453, - "grad_norm": 2.3016257286071777, - "learning_rate": 4.975147833271288e-06, - "loss": 0.8573, - "step": 85 - }, - { - "epoch": 0.45989304812834225, - "grad_norm": 2.758795738220215, - "learning_rate": 4.974553604702332e-06, - "loss": 0.7271, - "step": 86 - }, - { - "epoch": 0.46524064171123, - "grad_norm": 3.0134952068328857, - "learning_rate": 4.973952391987589e-06, - "loss": 0.8976, - "step": 87 - }, - { - "epoch": 0.47058823529411764, - "grad_norm": 2.668630838394165, - "learning_rate": 4.9733441968239125e-06, - "loss": 1.0753, - "step": 88 - }, - { - "epoch": 0.47593582887700536, - "grad_norm": 2.5940303802490234, - "learning_rate": 4.972729020927866e-06, - "loss": 0.6903, - "step": 89 - }, - { - "epoch": 0.48128342245989303, - "grad_norm": 3.0423827171325684, - "learning_rate": 4.97210686603571e-06, - "loss": 0.9347, - "step": 90 - }, - { - "epoch": 0.48663101604278075, - "grad_norm": 2.5026450157165527, - "learning_rate": 4.97147773390341e-06, - "loss": 0.6738, - "step": 91 - }, - { - "epoch": 0.4919786096256685, - "grad_norm": 3.596545457839966, - "learning_rate": 4.970841626306617e-06, - "loss": 0.8356, - "step": 92 - }, - { - "epoch": 0.49732620320855614, - "grad_norm": 3.2207071781158447, - "learning_rate": 4.970198545040673e-06, - "loss": 0.9117, - "step": 93 - }, - { - "epoch": 0.5026737967914439, - "grad_norm": 2.858541965484619, - "learning_rate": 4.969548491920603e-06, - "loss": 0.8237, - "step": 94 - }, - { - "epoch": 0.5080213903743316, - "grad_norm": 2.896359920501709, - "learning_rate": 4.968891468781105e-06, - "loss": 0.8775, - "step": 95 - }, - { - "epoch": 0.5133689839572193, - "grad_norm": 3.6659083366394043, - "learning_rate": 4.968227477476554e-06, - "loss": 0.9068, - "step": 96 - }, - { - "epoch": 0.5187165775401069, - "grad_norm": 3.6469972133636475, - "learning_rate": 4.9675565198809905e-06, - "loss": 1.0435, - "step": 97 - }, - { - "epoch": 0.5240641711229946, - "grad_norm": 4.615362167358398, - "learning_rate": 4.966878597888114e-06, - "loss": 1.0084, - "step": 98 - }, - { - "epoch": 0.5294117647058824, - "grad_norm": 3.4075334072113037, - "learning_rate": 4.966193713411284e-06, - "loss": 0.7217, - "step": 99 - }, - { - "epoch": 0.5347593582887701, - "grad_norm": 2.8489456176757812, - "learning_rate": 4.965501868383507e-06, - "loss": 0.6594, - "step": 100 - }, - { - "epoch": 0.5401069518716578, - "grad_norm": 4.086977958679199, - "learning_rate": 4.964803064757438e-06, - "loss": 0.9249, - "step": 101 - }, - { - "epoch": 0.5454545454545454, - "grad_norm": 2.676903247833252, - "learning_rate": 4.964097304505371e-06, - "loss": 0.7776, - "step": 102 - }, - { - "epoch": 0.5508021390374331, - "grad_norm": 2.5098068714141846, - "learning_rate": 4.963384589619233e-06, - "loss": 0.6339, - "step": 103 - }, - { - "epoch": 0.5561497326203209, - "grad_norm": 4.064920902252197, - "learning_rate": 4.962664922110581e-06, - "loss": 1.0107, - "step": 104 - }, - { - "epoch": 0.5614973262032086, - "grad_norm": 2.6229960918426514, - "learning_rate": 4.9619383040105954e-06, - "loss": 1.0052, - "step": 105 - }, - { - "epoch": 0.5668449197860963, - "grad_norm": 2.857506275177002, - "learning_rate": 4.961204737370071e-06, - "loss": 0.8577, - "step": 106 - }, - { - "epoch": 0.5721925133689839, - "grad_norm": 3.9176764488220215, - "learning_rate": 4.960464224259418e-06, - "loss": 1.1237, - "step": 107 - }, - { - "epoch": 0.5775401069518716, - "grad_norm": 2.9063003063201904, - "learning_rate": 4.95971676676865e-06, - "loss": 0.6237, - "step": 108 - }, - { - "epoch": 0.5828877005347594, - "grad_norm": 3.1583969593048096, - "learning_rate": 4.958962367007381e-06, - "loss": 0.9135, - "step": 109 - }, - { - "epoch": 0.5882352941176471, - "grad_norm": 2.7559218406677246, - "learning_rate": 4.958201027104818e-06, - "loss": 0.7461, - "step": 110 - }, - { - "epoch": 0.5935828877005348, - "grad_norm": 11.086910247802734, - "learning_rate": 4.957432749209755e-06, - "loss": 0.69, - "step": 111 - }, - { - "epoch": 0.5989304812834224, - "grad_norm": 3.8109939098358154, - "learning_rate": 4.95665753549057e-06, - "loss": 0.8578, - "step": 112 - }, - { - "epoch": 0.6042780748663101, - "grad_norm": 3.3317348957061768, - "learning_rate": 4.9558753881352165e-06, - "loss": 1.3098, - "step": 113 - }, - { - "epoch": 0.6096256684491979, - "grad_norm": 2.715823173522949, - "learning_rate": 4.955086309351213e-06, - "loss": 0.9979, - "step": 114 - }, - { - "epoch": 0.6149732620320856, - "grad_norm": 2.798602819442749, - "learning_rate": 4.9542903013656485e-06, - "loss": 0.6298, - "step": 115 - }, - { - "epoch": 0.6203208556149733, - "grad_norm": 32.90562438964844, - "learning_rate": 4.953487366425163e-06, - "loss": 0.959, - "step": 116 - }, - { - "epoch": 0.6256684491978609, - "grad_norm": 4.012441158294678, - "learning_rate": 4.952677506795949e-06, - "loss": 0.6791, - "step": 117 - }, - { - "epoch": 0.6310160427807486, - "grad_norm": 3.548151731491089, - "learning_rate": 4.951860724763743e-06, - "loss": 0.7783, - "step": 118 - }, - { - "epoch": 0.6363636363636364, - "grad_norm": 3.4778249263763428, - "learning_rate": 4.95103702263382e-06, - "loss": 0.8085, - "step": 119 - }, - { - "epoch": 0.6417112299465241, - "grad_norm": 2.625532627105713, - "learning_rate": 4.950206402730984e-06, - "loss": 0.7702, - "step": 120 - }, - { - "epoch": 0.6470588235294118, - "grad_norm": 3.2743935585021973, - "learning_rate": 4.949368867399567e-06, - "loss": 0.602, - "step": 121 - }, - { - "epoch": 0.6524064171122995, - "grad_norm": 3.9576094150543213, - "learning_rate": 4.948524419003415e-06, - "loss": 1.2858, - "step": 122 - }, - { - "epoch": 0.6577540106951871, - "grad_norm": 3.233257532119751, - "learning_rate": 4.947673059925889e-06, - "loss": 0.7945, - "step": 123 - }, - { - "epoch": 0.6631016042780749, - "grad_norm": 2.6730406284332275, - "learning_rate": 4.9468147925698525e-06, - "loss": 0.959, - "step": 124 - }, - { - "epoch": 0.6684491978609626, - "grad_norm": 2.8612916469573975, - "learning_rate": 4.945949619357668e-06, - "loss": 0.7611, - "step": 125 - }, - { - "epoch": 0.6737967914438503, - "grad_norm": 2.9609551429748535, - "learning_rate": 4.945077542731188e-06, - "loss": 0.5753, - "step": 126 - }, - { - "epoch": 0.679144385026738, - "grad_norm": 3.7842485904693604, - "learning_rate": 4.94419856515175e-06, - "loss": 0.8995, - "step": 127 - }, - { - "epoch": 0.6844919786096256, - "grad_norm": 3.513170003890991, - "learning_rate": 4.943312689100166e-06, - "loss": 0.9623, - "step": 128 - }, - { - "epoch": 0.6898395721925134, - "grad_norm": 2.690305471420288, - "learning_rate": 4.942419917076723e-06, - "loss": 0.6657, - "step": 129 - }, - { - "epoch": 0.6951871657754011, - "grad_norm": 2.951237440109253, - "learning_rate": 4.941520251601167e-06, - "loss": 0.7711, - "step": 130 - }, - { - "epoch": 0.7005347593582888, - "grad_norm": 2.8285868167877197, - "learning_rate": 4.940613695212702e-06, - "loss": 0.5908, - "step": 131 - }, - { - "epoch": 0.7058823529411765, - "grad_norm": 2.6700541973114014, - "learning_rate": 4.939700250469979e-06, - "loss": 0.967, - "step": 132 - }, - { - "epoch": 0.7112299465240641, - "grad_norm": 3.229152202606201, - "learning_rate": 4.938779919951092e-06, - "loss": 0.9519, - "step": 133 - }, - { - "epoch": 0.7165775401069518, - "grad_norm": 2.403944730758667, - "learning_rate": 4.93785270625357e-06, - "loss": 0.5873, - "step": 134 - }, - { - "epoch": 0.7219251336898396, - "grad_norm": 3.8491666316986084, - "learning_rate": 4.936918611994368e-06, - "loss": 0.8148, - "step": 135 - }, - { - "epoch": 0.7272727272727273, - "grad_norm": 2.8255743980407715, - "learning_rate": 4.935977639809861e-06, - "loss": 0.8286, - "step": 136 - }, - { - "epoch": 0.732620320855615, - "grad_norm": 2.8479511737823486, - "learning_rate": 4.935029792355834e-06, - "loss": 0.6442, - "step": 137 - }, - { - "epoch": 0.7379679144385026, - "grad_norm": 2.585566759109497, - "learning_rate": 4.934075072307481e-06, - "loss": 1.0144, - "step": 138 - }, - { - "epoch": 0.7433155080213903, - "grad_norm": 2.8108413219451904, - "learning_rate": 4.933113482359388e-06, - "loss": 0.5922, - "step": 139 - }, - { - "epoch": 0.7486631016042781, - "grad_norm": 2.799546241760254, - "learning_rate": 4.932145025225535e-06, - "loss": 0.7546, - "step": 140 - }, - { - "epoch": 0.7540106951871658, - "grad_norm": 2.6492230892181396, - "learning_rate": 4.931169703639282e-06, - "loss": 0.8797, - "step": 141 - }, - { - "epoch": 0.7593582887700535, - "grad_norm": 4.130539417266846, - "learning_rate": 4.930187520353363e-06, - "loss": 0.865, - "step": 142 - }, - { - "epoch": 0.7647058823529411, - "grad_norm": 2.6537978649139404, - "learning_rate": 4.929198478139877e-06, - "loss": 0.6901, - "step": 143 - }, - { - "epoch": 0.7700534759358288, - "grad_norm": 2.488971710205078, - "learning_rate": 4.928202579790285e-06, - "loss": 0.5932, - "step": 144 - }, - { - "epoch": 0.7754010695187166, - "grad_norm": 2.4585540294647217, - "learning_rate": 4.927199828115395e-06, - "loss": 0.7742, - "step": 145 - }, - { - "epoch": 0.7807486631016043, - "grad_norm": 2.5525095462799072, - "learning_rate": 4.9261902259453616e-06, - "loss": 0.8475, - "step": 146 - }, - { - "epoch": 0.786096256684492, - "grad_norm": 3.032649040222168, - "learning_rate": 4.925173776129669e-06, - "loss": 1.0514, - "step": 147 - }, - { - "epoch": 0.7914438502673797, - "grad_norm": 2.4535398483276367, - "learning_rate": 4.9241504815371346e-06, - "loss": 0.5964, - "step": 148 - }, - { - "epoch": 0.7967914438502673, - "grad_norm": 2.2060890197753906, - "learning_rate": 4.923120345055887e-06, - "loss": 0.7615, - "step": 149 - }, - { - "epoch": 0.8021390374331551, - "grad_norm": 3.0113794803619385, - "learning_rate": 4.922083369593372e-06, - "loss": 0.6908, - "step": 150 - }, - { - "epoch": 0.8074866310160428, - "grad_norm": 2.6805336475372314, - "learning_rate": 4.921039558076335e-06, - "loss": 0.8661, - "step": 151 - }, - { - "epoch": 0.8128342245989305, - "grad_norm": 3.562213897705078, - "learning_rate": 4.919988913450812e-06, - "loss": 0.5267, - "step": 152 - }, - { - "epoch": 0.8181818181818182, - "grad_norm": 3.3453261852264404, - "learning_rate": 4.918931438682132e-06, - "loss": 0.9222, - "step": 153 - }, - { - "epoch": 0.8235294117647058, - "grad_norm": 2.7286977767944336, - "learning_rate": 4.917867136754894e-06, - "loss": 0.8865, - "step": 154 - }, - { - "epoch": 0.8288770053475936, - "grad_norm": 2.263981819152832, - "learning_rate": 4.916796010672969e-06, - "loss": 0.7262, - "step": 155 - }, - { - "epoch": 0.8342245989304813, - "grad_norm": 2.273568630218506, - "learning_rate": 4.91571806345949e-06, - "loss": 0.7611, - "step": 156 - }, - { - "epoch": 0.839572192513369, - "grad_norm": 3.0288827419281006, - "learning_rate": 4.91463329815684e-06, - "loss": 0.8745, - "step": 157 - }, - { - "epoch": 0.8449197860962567, - "grad_norm": 2.3675708770751953, - "learning_rate": 4.913541717826645e-06, - "loss": 0.6164, - "step": 158 - }, - { - "epoch": 0.8502673796791443, - "grad_norm": 2.2979559898376465, - "learning_rate": 4.912443325549767e-06, - "loss": 0.5549, - "step": 159 - }, - { - "epoch": 0.8556149732620321, - "grad_norm": 6.2421064376831055, - "learning_rate": 4.911338124426291e-06, - "loss": 0.9052, - "step": 160 - }, - { - "epoch": 0.8609625668449198, - "grad_norm": 2.125546932220459, - "learning_rate": 4.910226117575525e-06, - "loss": 0.7989, - "step": 161 - }, - { - "epoch": 0.8663101604278075, - "grad_norm": 2.8069941997528076, - "learning_rate": 4.909107308135978e-06, - "loss": 0.5915, - "step": 162 - }, - { - "epoch": 0.8716577540106952, - "grad_norm": 2.9329476356506348, - "learning_rate": 4.907981699265364e-06, - "loss": 0.6593, - "step": 163 - }, - { - "epoch": 0.8770053475935828, - "grad_norm": 3.8588013648986816, - "learning_rate": 4.906849294140587e-06, - "loss": 0.8739, - "step": 164 - }, - { - "epoch": 0.8823529411764706, - "grad_norm": 3.3252463340759277, - "learning_rate": 4.9057100959577285e-06, - "loss": 0.7314, - "step": 165 - }, - { - "epoch": 0.8877005347593583, - "grad_norm": 3.051591634750366, - "learning_rate": 4.904564107932048e-06, - "loss": 1.0109, - "step": 166 - }, - { - "epoch": 0.893048128342246, - "grad_norm": 2.8550548553466797, - "learning_rate": 4.903411333297966e-06, - "loss": 0.9092, - "step": 167 - }, - { - "epoch": 0.8983957219251337, - "grad_norm": 2.8500938415527344, - "learning_rate": 4.902251775309057e-06, - "loss": 0.7922, - "step": 168 - }, - { - "epoch": 0.9037433155080213, - "grad_norm": 3.3096566200256348, - "learning_rate": 4.901085437238041e-06, - "loss": 0.5955, - "step": 169 - }, - { - "epoch": 0.9090909090909091, - "grad_norm": 2.7365124225616455, - "learning_rate": 4.899912322376776e-06, - "loss": 1.0019, - "step": 170 - }, - { - "epoch": 0.9144385026737968, - "grad_norm": 2.3542861938476562, - "learning_rate": 4.8987324340362445e-06, - "loss": 0.8508, - "step": 171 - }, - { - "epoch": 0.9197860962566845, - "grad_norm": 2.822413921356201, - "learning_rate": 4.897545775546545e-06, - "loss": 0.8514, - "step": 172 - }, - { - "epoch": 0.9251336898395722, - "grad_norm": 2.528853416442871, - "learning_rate": 4.8963523502568886e-06, - "loss": 1.0263, - "step": 173 - }, - { - "epoch": 0.93048128342246, - "grad_norm": 3.7086899280548096, - "learning_rate": 4.895152161535582e-06, - "loss": 0.7929, - "step": 174 - }, - { - "epoch": 0.9358288770053476, - "grad_norm": 2.407613515853882, - "learning_rate": 4.893945212770019e-06, - "loss": 0.7227, - "step": 175 - }, - { - "epoch": 0.9411764705882353, - "grad_norm": 2.629978656768799, - "learning_rate": 4.892731507366678e-06, - "loss": 0.8923, - "step": 176 - }, - { - "epoch": 0.946524064171123, - "grad_norm": 2.281735897064209, - "learning_rate": 4.891511048751102e-06, - "loss": 0.7475, - "step": 177 - }, - { - "epoch": 0.9518716577540107, - "grad_norm": 2.8144044876098633, - "learning_rate": 4.890283840367898e-06, - "loss": 1.1405, - "step": 178 - }, - { - "epoch": 0.9572192513368984, - "grad_norm": 3.9945294857025146, - "learning_rate": 4.889049885680721e-06, - "loss": 0.8524, - "step": 179 - }, - { - "epoch": 0.9625668449197861, - "grad_norm": 2.9770278930664062, - "learning_rate": 4.887809188172268e-06, - "loss": 0.7617, - "step": 180 - }, - { - "epoch": 0.9679144385026738, - "grad_norm": 2.9451241493225098, - "learning_rate": 4.886561751344266e-06, - "loss": 0.8514, - "step": 181 - }, - { - "epoch": 0.9732620320855615, - "grad_norm": 2.670421600341797, - "learning_rate": 4.885307578717464e-06, - "loss": 0.8335, - "step": 182 - }, - { - "epoch": 0.9786096256684492, - "grad_norm": 2.565976858139038, - "learning_rate": 4.8840466738316216e-06, - "loss": 0.831, - "step": 183 - }, - { - "epoch": 0.983957219251337, - "grad_norm": 2.5326290130615234, - "learning_rate": 4.882779040245499e-06, - "loss": 0.7891, - "step": 184 - }, - { - "epoch": 0.9893048128342246, - "grad_norm": 2.524470090866089, - "learning_rate": 4.881504681536847e-06, - "loss": 0.6257, - "step": 185 - }, - { - "epoch": 0.9946524064171123, - "grad_norm": 2.3305137157440186, - "learning_rate": 4.880223601302398e-06, - "loss": 0.6008, - "step": 186 - }, - { - "epoch": 1.0, - "grad_norm": 3.0916237831115723, - "learning_rate": 4.878935803157856e-06, - "loss": 0.6061, - "step": 187 - }, - { - "epoch": 1.0053475935828877, - "grad_norm": 3.003761053085327, - "learning_rate": 4.8776412907378845e-06, - "loss": 0.6628, - "step": 188 - }, - { - "epoch": 1.0106951871657754, - "grad_norm": 2.674351692199707, - "learning_rate": 4.876340067696097e-06, - "loss": 0.6124, - "step": 189 - }, - { - "epoch": 1.0160427807486632, - "grad_norm": 3.9263675212860107, - "learning_rate": 4.875032137705047e-06, - "loss": 0.7186, - "step": 190 - }, - { - "epoch": 1.0213903743315509, - "grad_norm": 3.006312370300293, - "learning_rate": 4.873717504456219e-06, - "loss": 0.7723, - "step": 191 - }, - { - "epoch": 1.0267379679144386, - "grad_norm": 2.5927529335021973, - "learning_rate": 4.872396171660014e-06, - "loss": 0.4069, - "step": 192 - }, - { - "epoch": 1.032085561497326, - "grad_norm": 3.193277597427368, - "learning_rate": 4.8710681430457466e-06, - "loss": 0.6705, - "step": 193 - }, - { - "epoch": 1.0374331550802138, - "grad_norm": 4.224829196929932, - "learning_rate": 4.8697334223616226e-06, - "loss": 0.8276, - "step": 194 - }, - { - "epoch": 1.0427807486631016, - "grad_norm": 3.008603096008301, - "learning_rate": 4.8683920133747405e-06, - "loss": 0.5913, - "step": 195 - }, - { - "epoch": 1.0481283422459893, - "grad_norm": 2.7365758419036865, - "learning_rate": 4.867043919871076e-06, - "loss": 0.5244, - "step": 196 - }, - { - "epoch": 1.053475935828877, - "grad_norm": 3.109424352645874, - "learning_rate": 4.865689145655467e-06, - "loss": 0.5962, - "step": 197 - }, - { - "epoch": 1.0588235294117647, - "grad_norm": 2.6860733032226562, - "learning_rate": 4.864327694551612e-06, - "loss": 0.5601, - "step": 198 - }, - { - "epoch": 1.0641711229946524, - "grad_norm": 3.3604085445404053, - "learning_rate": 4.86295957040205e-06, - "loss": 0.953, - "step": 199 - }, - { - "epoch": 1.0695187165775402, - "grad_norm": 3.981157064437866, - "learning_rate": 4.861584777068154e-06, - "loss": 0.7394, - "step": 200 - }, - { - "epoch": 1.0748663101604279, - "grad_norm": 3.687598943710327, - "learning_rate": 4.860203318430126e-06, - "loss": 0.3851, - "step": 201 - }, - { - "epoch": 1.0802139037433156, - "grad_norm": 2.9157185554504395, - "learning_rate": 4.858815198386973e-06, - "loss": 0.6595, - "step": 202 - }, - { - "epoch": 1.085561497326203, - "grad_norm": 2.805755853652954, - "learning_rate": 4.8574204208565056e-06, - "loss": 0.5308, - "step": 203 - }, - { - "epoch": 1.0909090909090908, - "grad_norm": 2.6051762104034424, - "learning_rate": 4.856018989775326e-06, - "loss": 0.5401, - "step": 204 - }, - { - "epoch": 1.0962566844919786, - "grad_norm": 2.8916220664978027, - "learning_rate": 4.854610909098813e-06, - "loss": 0.6365, - "step": 205 - }, - { - "epoch": 1.1016042780748663, - "grad_norm": 3.389765977859497, - "learning_rate": 4.853196182801112e-06, - "loss": 1.0949, - "step": 206 - }, - { - "epoch": 1.106951871657754, - "grad_norm": 2.910980701446533, - "learning_rate": 4.851774814875131e-06, - "loss": 0.6629, - "step": 207 - }, - { - "epoch": 1.1122994652406417, - "grad_norm": 2.8479011058807373, - "learning_rate": 4.850346809332515e-06, - "loss": 0.7166, - "step": 208 - }, - { - "epoch": 1.1176470588235294, - "grad_norm": 2.242565155029297, - "learning_rate": 4.8489121702036515e-06, - "loss": 0.7077, - "step": 209 - }, - { - "epoch": 1.1229946524064172, - "grad_norm": 2.833369731903076, - "learning_rate": 4.847470901537642e-06, - "loss": 0.6319, - "step": 210 - }, - { - "epoch": 1.1283422459893049, - "grad_norm": 3.053952217102051, - "learning_rate": 4.846023007402305e-06, - "loss": 0.5327, - "step": 211 - }, - { - "epoch": 1.1336898395721926, - "grad_norm": 3.0862181186676025, - "learning_rate": 4.844568491884156e-06, - "loss": 0.414, - "step": 212 - }, - { - "epoch": 1.1390374331550803, - "grad_norm": 2.6374268531799316, - "learning_rate": 4.843107359088402e-06, - "loss": 0.5933, - "step": 213 - }, - { - "epoch": 1.1443850267379678, - "grad_norm": 8.499526023864746, - "learning_rate": 4.84163961313892e-06, - "loss": 0.6844, - "step": 214 - }, - { - "epoch": 1.1497326203208555, - "grad_norm": 2.2556655406951904, - "learning_rate": 4.840165258178259e-06, - "loss": 0.5242, - "step": 215 - }, - { - "epoch": 1.1550802139037433, - "grad_norm": 2.8057925701141357, - "learning_rate": 4.838684298367616e-06, - "loss": 0.747, - "step": 216 - }, - { - "epoch": 1.160427807486631, - "grad_norm": 2.6920077800750732, - "learning_rate": 4.837196737886834e-06, - "loss": 0.7602, - "step": 217 - }, - { - "epoch": 1.1657754010695187, - "grad_norm": 3.1757941246032715, - "learning_rate": 4.83570258093438e-06, - "loss": 0.7525, - "step": 218 - }, - { - "epoch": 1.1711229946524064, - "grad_norm": 2.869535446166992, - "learning_rate": 4.834201831727343e-06, - "loss": 0.5111, - "step": 219 - }, - { - "epoch": 1.1764705882352942, - "grad_norm": 2.853529930114746, - "learning_rate": 4.832694494501417e-06, - "loss": 0.6215, - "step": 220 - }, - { - "epoch": 1.1818181818181819, - "grad_norm": 2.854609727859497, - "learning_rate": 4.83118057351089e-06, - "loss": 0.3931, - "step": 221 - }, - { - "epoch": 1.1871657754010696, - "grad_norm": 3.3581626415252686, - "learning_rate": 4.829660073028631e-06, - "loss": 0.6418, - "step": 222 - }, - { - "epoch": 1.192513368983957, - "grad_norm": 3.3372135162353516, - "learning_rate": 4.82813299734608e-06, - "loss": 0.5028, - "step": 223 - }, - { - "epoch": 1.1978609625668448, - "grad_norm": 3.1315996646881104, - "learning_rate": 4.826599350773234e-06, - "loss": 0.4452, - "step": 224 - }, - { - "epoch": 1.2032085561497325, - "grad_norm": 2.9624111652374268, - "learning_rate": 4.825059137638636e-06, - "loss": 0.7803, - "step": 225 - }, - { - "epoch": 1.2085561497326203, - "grad_norm": 3.0918056964874268, - "learning_rate": 4.823512362289362e-06, - "loss": 0.5968, - "step": 226 - }, - { - "epoch": 1.213903743315508, - "grad_norm": 2.905611276626587, - "learning_rate": 4.821959029091009e-06, - "loss": 0.5724, - "step": 227 - }, - { - "epoch": 1.2192513368983957, - "grad_norm": 2.967761278152466, - "learning_rate": 4.820399142427684e-06, - "loss": 0.5357, - "step": 228 - }, - { - "epoch": 1.2245989304812834, - "grad_norm": 3.3968875408172607, - "learning_rate": 4.818832706701989e-06, - "loss": 0.5743, - "step": 229 - }, - { - "epoch": 1.2299465240641712, - "grad_norm": 3.2088563442230225, - "learning_rate": 4.817259726335009e-06, - "loss": 0.8447, - "step": 230 - }, - { - "epoch": 1.2352941176470589, - "grad_norm": 2.8846428394317627, - "learning_rate": 4.815680205766304e-06, - "loss": 0.8136, - "step": 231 - }, - { - "epoch": 1.2406417112299466, - "grad_norm": 2.198012351989746, - "learning_rate": 4.814094149453891e-06, - "loss": 0.4073, - "step": 232 - }, - { - "epoch": 1.2459893048128343, - "grad_norm": 3.148988962173462, - "learning_rate": 4.812501561874232e-06, - "loss": 0.6625, - "step": 233 - }, - { - "epoch": 1.251336898395722, - "grad_norm": 2.770563840866089, - "learning_rate": 4.8109024475222255e-06, - "loss": 0.6403, - "step": 234 - }, - { - "epoch": 1.2566844919786098, - "grad_norm": 3.157482147216797, - "learning_rate": 4.809296810911188e-06, - "loss": 0.8436, - "step": 235 - }, - { - "epoch": 1.2620320855614973, - "grad_norm": 3.0236425399780273, - "learning_rate": 4.8076846565728475e-06, - "loss": 0.8578, - "step": 236 - }, - { - "epoch": 1.267379679144385, - "grad_norm": 2.508145570755005, - "learning_rate": 4.806065989057326e-06, - "loss": 0.5431, - "step": 237 - }, - { - "epoch": 1.2727272727272727, - "grad_norm": 3.171482563018799, - "learning_rate": 4.8044408129331266e-06, - "loss": 0.4613, - "step": 238 - }, - { - "epoch": 1.2780748663101604, - "grad_norm": 3.209517240524292, - "learning_rate": 4.802809132787125e-06, - "loss": 0.6743, - "step": 239 - }, - { - "epoch": 1.2834224598930482, - "grad_norm": 2.8249428272247314, - "learning_rate": 4.801170953224554e-06, - "loss": 0.8116, - "step": 240 - }, - { - "epoch": 1.2887700534759359, - "grad_norm": 2.2719192504882812, - "learning_rate": 4.7995262788689865e-06, - "loss": 0.4008, - "step": 241 - }, - { - "epoch": 1.2941176470588236, - "grad_norm": 3.2883615493774414, - "learning_rate": 4.797875114362331e-06, - "loss": 0.5953, - "step": 242 - }, - { - "epoch": 1.299465240641711, - "grad_norm": 17.095844268798828, - "learning_rate": 4.796217464364808e-06, - "loss": 0.8779, - "step": 243 - }, - { - "epoch": 1.3048128342245988, - "grad_norm": 3.6116573810577393, - "learning_rate": 4.794553333554949e-06, - "loss": 0.7568, - "step": 244 - }, - { - "epoch": 1.3101604278074865, - "grad_norm": 2.622695207595825, - "learning_rate": 4.792882726629572e-06, - "loss": 0.5016, - "step": 245 - }, - { - "epoch": 1.3155080213903743, - "grad_norm": 8.820343017578125, - "learning_rate": 4.791205648303775e-06, - "loss": 0.8415, - "step": 246 - }, - { - "epoch": 1.320855614973262, - "grad_norm": 2.8980658054351807, - "learning_rate": 4.789522103310922e-06, - "loss": 0.6032, - "step": 247 - }, - { - "epoch": 1.3262032085561497, - "grad_norm": 2.6704914569854736, - "learning_rate": 4.787832096402626e-06, - "loss": 0.6548, - "step": 248 - }, - { - "epoch": 1.3315508021390374, - "grad_norm": 3.3483593463897705, - "learning_rate": 4.786135632348738e-06, - "loss": 0.6212, - "step": 249 - }, - { - "epoch": 1.3368983957219251, - "grad_norm": 2.6832988262176514, - "learning_rate": 4.7844327159373365e-06, - "loss": 0.8052, - "step": 250 - }, - { - "epoch": 1.3422459893048129, - "grad_norm": 2.599897623062134, - "learning_rate": 4.782723351974708e-06, - "loss": 0.589, - "step": 251 - }, - { - "epoch": 1.3475935828877006, - "grad_norm": 3.2921037673950195, - "learning_rate": 4.7810075452853385e-06, - "loss": 0.63, - "step": 252 - }, - { - "epoch": 1.3529411764705883, - "grad_norm": 2.5389389991760254, - "learning_rate": 4.779285300711897e-06, - "loss": 0.6727, - "step": 253 - }, - { - "epoch": 1.358288770053476, - "grad_norm": 2.817018985748291, - "learning_rate": 4.7775566231152216e-06, - "loss": 0.4158, - "step": 254 - }, - { - "epoch": 1.3636363636363638, - "grad_norm": 2.749091863632202, - "learning_rate": 4.775821517374308e-06, - "loss": 0.8809, - "step": 255 - }, - { - "epoch": 1.3689839572192513, - "grad_norm": 2.599484443664551, - "learning_rate": 4.7740799883862966e-06, - "loss": 0.5157, - "step": 256 - }, - { - "epoch": 1.374331550802139, - "grad_norm": 2.412386417388916, - "learning_rate": 4.772332041066452e-06, - "loss": 0.4467, - "step": 257 - }, - { - "epoch": 1.3796791443850267, - "grad_norm": 2.713000774383545, - "learning_rate": 4.770577680348159e-06, - "loss": 0.9125, - "step": 258 - }, - { - "epoch": 1.3850267379679144, - "grad_norm": 3.22122859954834, - "learning_rate": 4.768816911182899e-06, - "loss": 0.4665, - "step": 259 - }, - { - "epoch": 1.3903743315508021, - "grad_norm": 2.9274754524230957, - "learning_rate": 4.767049738540244e-06, - "loss": 0.5404, - "step": 260 - }, - { - "epoch": 1.3957219251336899, - "grad_norm": 2.2020022869110107, - "learning_rate": 4.765276167407836e-06, - "loss": 0.4575, - "step": 261 - }, - { - "epoch": 1.4010695187165776, - "grad_norm": 3.0807480812072754, - "learning_rate": 4.7634962027913784e-06, - "loss": 0.8227, - "step": 262 - }, - { - "epoch": 1.4064171122994653, - "grad_norm": 2.655407667160034, - "learning_rate": 4.761709849714619e-06, - "loss": 0.5813, - "step": 263 - }, - { - "epoch": 1.4117647058823528, - "grad_norm": 2.580695152282715, - "learning_rate": 4.7599171132193355e-06, - "loss": 0.6333, - "step": 264 - }, - { - "epoch": 1.4171122994652405, - "grad_norm": 2.8121836185455322, - "learning_rate": 4.7581179983653224e-06, - "loss": 0.6368, - "step": 265 - }, - { - "epoch": 1.4224598930481283, - "grad_norm": 3.2582831382751465, - "learning_rate": 4.756312510230377e-06, - "loss": 0.4146, - "step": 266 - }, - { - "epoch": 1.427807486631016, - "grad_norm": 3.0589146614074707, - "learning_rate": 4.754500653910284e-06, - "loss": 0.6066, - "step": 267 - }, - { - "epoch": 1.4331550802139037, - "grad_norm": 3.0196666717529297, - "learning_rate": 4.752682434518801e-06, - "loss": 0.6254, - "step": 268 - }, - { - "epoch": 1.4385026737967914, - "grad_norm": 2.9189376831054688, - "learning_rate": 4.750857857187645e-06, - "loss": 0.4853, - "step": 269 - }, - { - "epoch": 1.4438502673796791, - "grad_norm": 2.299985885620117, - "learning_rate": 4.749026927066479e-06, - "loss": 0.7066, - "step": 270 - }, - { - "epoch": 1.4491978609625669, - "grad_norm": 2.0745482444763184, - "learning_rate": 4.747189649322894e-06, - "loss": 0.5224, - "step": 271 - }, - { - "epoch": 1.4545454545454546, - "grad_norm": 3.8428823947906494, - "learning_rate": 4.745346029142397e-06, - "loss": 0.7391, - "step": 272 - }, - { - "epoch": 1.4598930481283423, - "grad_norm": 2.409541368484497, - "learning_rate": 4.743496071728396e-06, - "loss": 0.6529, - "step": 273 - }, - { - "epoch": 1.46524064171123, - "grad_norm": 2.810421943664551, - "learning_rate": 4.741639782302187e-06, - "loss": 0.453, - "step": 274 - }, - { - "epoch": 1.4705882352941178, - "grad_norm": 2.9112162590026855, - "learning_rate": 4.739777166102933e-06, - "loss": 0.5275, - "step": 275 - }, - { - "epoch": 1.4759358288770055, - "grad_norm": 2.653869390487671, - "learning_rate": 4.737908228387656e-06, - "loss": 0.5838, - "step": 276 - }, - { - "epoch": 1.481283422459893, - "grad_norm": 2.7957050800323486, - "learning_rate": 4.736032974431222e-06, - "loss": 0.5719, - "step": 277 - }, - { - "epoch": 1.4866310160427807, - "grad_norm": 2.4398281574249268, - "learning_rate": 4.7341514095263214e-06, - "loss": 0.4318, - "step": 278 - }, - { - "epoch": 1.4919786096256684, - "grad_norm": 3.5739479064941406, - "learning_rate": 4.732263538983456e-06, - "loss": 0.6388, - "step": 279 - }, - { - "epoch": 1.4973262032085561, - "grad_norm": 3.433971405029297, - "learning_rate": 4.730369368130925e-06, - "loss": 0.6673, - "step": 280 - }, - { - "epoch": 1.5026737967914439, - "grad_norm": 3.205761432647705, - "learning_rate": 4.728468902314811e-06, - "loss": 1.2311, - "step": 281 - }, - { - "epoch": 1.5080213903743316, - "grad_norm": 2.8073904514312744, - "learning_rate": 4.726562146898963e-06, - "loss": 0.6467, - "step": 282 - }, - { - "epoch": 1.5133689839572193, - "grad_norm": 3.282175064086914, - "learning_rate": 4.72464910726498e-06, - "loss": 0.6265, - "step": 283 - }, - { - "epoch": 1.5187165775401068, - "grad_norm": 3.5575335025787354, - "learning_rate": 4.7227297888121985e-06, - "loss": 0.8415, - "step": 284 - }, - { - "epoch": 1.5240641711229945, - "grad_norm": 2.851593255996704, - "learning_rate": 4.720804196957676e-06, - "loss": 0.6441, - "step": 285 - }, - { - "epoch": 1.5294117647058822, - "grad_norm": 2.8091742992401123, - "learning_rate": 4.718872337136176e-06, - "loss": 0.8297, - "step": 286 - }, - { - "epoch": 1.53475935828877, - "grad_norm": 2.456247091293335, - "learning_rate": 4.716934214800155e-06, - "loss": 0.9988, - "step": 287 - }, - { - "epoch": 1.5401069518716577, - "grad_norm": 2.6044399738311768, - "learning_rate": 4.714989835419741e-06, - "loss": 0.5931, - "step": 288 - }, - { - "epoch": 1.5454545454545454, - "grad_norm": 3.5424976348876953, - "learning_rate": 4.713039204482723e-06, - "loss": 0.5902, - "step": 289 - }, - { - "epoch": 1.5508021390374331, - "grad_norm": 3.1387109756469727, - "learning_rate": 4.711082327494536e-06, - "loss": 0.7356, - "step": 290 - }, - { - "epoch": 1.5561497326203209, - "grad_norm": 3.1310863494873047, - "learning_rate": 4.709119209978242e-06, - "loss": 0.529, - "step": 291 - }, - { - "epoch": 1.5614973262032086, - "grad_norm": 2.778148651123047, - "learning_rate": 4.707149857474516e-06, - "loss": 0.4536, - "step": 292 - }, - { - "epoch": 1.5668449197860963, - "grad_norm": 2.308875560760498, - "learning_rate": 4.705174275541632e-06, - "loss": 0.5565, - "step": 293 - }, - { - "epoch": 1.572192513368984, - "grad_norm": 2.531953811645508, - "learning_rate": 4.703192469755444e-06, - "loss": 0.728, - "step": 294 - }, - { - "epoch": 1.5775401069518717, - "grad_norm": 2.6498258113861084, - "learning_rate": 4.701204445709375e-06, - "loss": 0.6269, - "step": 295 - }, - { - "epoch": 1.5828877005347595, - "grad_norm": 2.500495195388794, - "learning_rate": 4.699210209014394e-06, - "loss": 0.658, - "step": 296 - }, - { - "epoch": 1.5882352941176472, - "grad_norm": 2.733893394470215, - "learning_rate": 4.69720976529901e-06, - "loss": 0.5184, - "step": 297 - }, - { - "epoch": 1.593582887700535, - "grad_norm": 2.8712120056152344, - "learning_rate": 4.695203120209245e-06, - "loss": 0.5321, - "step": 298 - }, - { - "epoch": 1.5989304812834224, - "grad_norm": 2.467778205871582, - "learning_rate": 4.693190279408628e-06, - "loss": 0.4647, - "step": 299 - }, - { - "epoch": 1.6042780748663101, - "grad_norm": 2.4705379009246826, - "learning_rate": 4.691171248578172e-06, - "loss": 0.4889, - "step": 300 - }, - { - "epoch": 1.6096256684491979, - "grad_norm": 2.4136300086975098, - "learning_rate": 4.689146033416362e-06, - "loss": 0.6621, - "step": 301 - }, - { - "epoch": 1.6149732620320856, - "grad_norm": 2.042703151702881, - "learning_rate": 4.687114639639136e-06, - "loss": 0.4009, - "step": 302 - }, - { - "epoch": 1.6203208556149733, - "grad_norm": 3.224032402038574, - "learning_rate": 4.685077072979874e-06, - "loss": 0.5065, - "step": 303 - }, - { - "epoch": 1.6256684491978608, - "grad_norm": 3.0109472274780273, - "learning_rate": 4.683033339189375e-06, - "loss": 0.5289, - "step": 304 - }, - { - "epoch": 1.6310160427807485, - "grad_norm": 2.7306134700775146, - "learning_rate": 4.680983444035843e-06, - "loss": 0.7078, - "step": 305 - }, - { - "epoch": 1.6363636363636362, - "grad_norm": 3.4351847171783447, - "learning_rate": 4.678927393304877e-06, - "loss": 0.4003, - "step": 306 - }, - { - "epoch": 1.641711229946524, - "grad_norm": 2.6287615299224854, - "learning_rate": 4.676865192799443e-06, - "loss": 0.4802, - "step": 307 - }, - { - "epoch": 1.6470588235294117, - "grad_norm": 2.7532455921173096, - "learning_rate": 4.6747968483398695e-06, - "loss": 0.8128, - "step": 308 - }, - { - "epoch": 1.6524064171122994, - "grad_norm": 2.49472975730896, - "learning_rate": 4.672722365763821e-06, - "loss": 0.4085, - "step": 309 - }, - { - "epoch": 1.6577540106951871, - "grad_norm": 2.805548667907715, - "learning_rate": 4.6706417509262905e-06, - "loss": 0.5707, - "step": 310 - }, - { - "epoch": 1.6631016042780749, - "grad_norm": 3.333185911178589, - "learning_rate": 4.668555009699575e-06, - "loss": 0.481, - "step": 311 - }, - { - "epoch": 1.6684491978609626, - "grad_norm": 2.704253673553467, - "learning_rate": 4.666462147973264e-06, - "loss": 0.6021, - "step": 312 - }, - { - "epoch": 1.6737967914438503, - "grad_norm": 3.070093870162964, - "learning_rate": 4.664363171654223e-06, - "loss": 0.7208, - "step": 313 - }, - { - "epoch": 1.679144385026738, - "grad_norm": 3.5783073902130127, - "learning_rate": 4.662258086666571e-06, - "loss": 0.9136, - "step": 314 - }, - { - "epoch": 1.6844919786096257, - "grad_norm": 2.5549259185791016, - "learning_rate": 4.660146898951674e-06, - "loss": 0.7375, - "step": 315 - }, - { - "epoch": 1.6898395721925135, - "grad_norm": 3.192612886428833, - "learning_rate": 4.6580296144681155e-06, - "loss": 0.6786, - "step": 316 - }, - { - "epoch": 1.6951871657754012, - "grad_norm": 4.031966209411621, - "learning_rate": 4.655906239191693e-06, - "loss": 0.789, - "step": 317 - }, - { - "epoch": 1.700534759358289, - "grad_norm": 2.8713667392730713, - "learning_rate": 4.653776779115389e-06, - "loss": 0.7104, - "step": 318 - }, - { - "epoch": 1.7058823529411766, - "grad_norm": 7.210184097290039, - "learning_rate": 4.651641240249364e-06, - "loss": 0.5165, - "step": 319 - }, - { - "epoch": 1.7112299465240641, - "grad_norm": 2.636258602142334, - "learning_rate": 4.649499628620931e-06, - "loss": 0.4081, - "step": 320 - }, - { - "epoch": 1.7165775401069518, - "grad_norm": 2.4294848442077637, - "learning_rate": 4.647351950274548e-06, - "loss": 0.6536, - "step": 321 - }, - { - "epoch": 1.7219251336898396, - "grad_norm": 2.551454544067383, - "learning_rate": 4.6451982112717896e-06, - "loss": 0.6597, - "step": 322 - }, - { - "epoch": 1.7272727272727273, - "grad_norm": 8.412546157836914, - "learning_rate": 4.643038417691341e-06, - "loss": 0.7608, - "step": 323 - }, - { - "epoch": 1.732620320855615, - "grad_norm": 2.47556734085083, - "learning_rate": 4.640872575628973e-06, - "loss": 0.4597, - "step": 324 - }, - { - "epoch": 1.7379679144385025, - "grad_norm": 3.3347442150115967, - "learning_rate": 4.6387006911975275e-06, - "loss": 0.7241, - "step": 325 - }, - { - "epoch": 1.7433155080213902, - "grad_norm": 3.182422637939453, - "learning_rate": 4.6365227705269026e-06, - "loss": 0.7654, - "step": 326 - }, - { - "epoch": 1.748663101604278, - "grad_norm": 2.947328805923462, - "learning_rate": 4.634338819764029e-06, - "loss": 0.6391, - "step": 327 - }, - { - "epoch": 1.7540106951871657, - "grad_norm": 3.1109538078308105, - "learning_rate": 4.632148845072861e-06, - "loss": 0.5501, - "step": 328 - }, - { - "epoch": 1.7593582887700534, - "grad_norm": 3.0903382301330566, - "learning_rate": 4.6299528526343525e-06, - "loss": 0.6117, - "step": 329 - }, - { - "epoch": 1.7647058823529411, - "grad_norm": 3.745351552963257, - "learning_rate": 4.627750848646443e-06, - "loss": 0.8534, - "step": 330 - }, - { - "epoch": 1.7700534759358288, - "grad_norm": 3.4808154106140137, - "learning_rate": 4.625542839324036e-06, - "loss": 0.6352, - "step": 331 - }, - { - "epoch": 1.7754010695187166, - "grad_norm": 2.984961748123169, - "learning_rate": 4.6233288308989874e-06, - "loss": 0.4188, - "step": 332 - }, - { - "epoch": 1.7807486631016043, - "grad_norm": 2.6888809204101562, - "learning_rate": 4.6211088296200834e-06, - "loss": 0.4464, - "step": 333 - }, - { - "epoch": 1.786096256684492, - "grad_norm": 2.868077039718628, - "learning_rate": 4.618882841753026e-06, - "loss": 0.6833, - "step": 334 - }, - { - "epoch": 1.7914438502673797, - "grad_norm": 2.7746901512145996, - "learning_rate": 4.616650873580411e-06, - "loss": 0.6356, - "step": 335 - }, - { - "epoch": 1.7967914438502675, - "grad_norm": 3.0901777744293213, - "learning_rate": 4.614412931401715e-06, - "loss": 0.5413, - "step": 336 - }, - { - "epoch": 1.8021390374331552, - "grad_norm": 3.2670090198516846, - "learning_rate": 4.612169021533276e-06, - "loss": 0.5275, - "step": 337 - }, - { - "epoch": 1.807486631016043, - "grad_norm": 2.9879071712493896, - "learning_rate": 4.609919150308273e-06, - "loss": 0.6292, - "step": 338 - }, - { - "epoch": 1.8128342245989306, - "grad_norm": 2.9089176654815674, - "learning_rate": 4.607663324076711e-06, - "loss": 0.5315, - "step": 339 - }, - { - "epoch": 1.8181818181818183, - "grad_norm": 2.698115587234497, - "learning_rate": 4.605401549205404e-06, - "loss": 0.7492, - "step": 340 - }, - { - "epoch": 1.8235294117647058, - "grad_norm": 3.423445463180542, - "learning_rate": 4.603133832077953e-06, - "loss": 0.6453, - "step": 341 - }, - { - "epoch": 1.8288770053475936, - "grad_norm": 2.504528045654297, - "learning_rate": 4.600860179094732e-06, - "loss": 0.6502, - "step": 342 - }, - { - "epoch": 1.8342245989304813, - "grad_norm": 3.743797540664673, - "learning_rate": 4.5985805966728675e-06, - "loss": 0.6807, - "step": 343 - }, - { - "epoch": 1.839572192513369, - "grad_norm": 2.732316732406616, - "learning_rate": 4.596295091246221e-06, - "loss": 0.5235, - "step": 344 - }, - { - "epoch": 1.8449197860962567, - "grad_norm": 2.681244134902954, - "learning_rate": 4.594003669265371e-06, - "loss": 0.5847, - "step": 345 - }, - { - "epoch": 1.8502673796791442, - "grad_norm": 2.7608835697174072, - "learning_rate": 4.591706337197597e-06, - "loss": 0.6266, - "step": 346 - }, - { - "epoch": 1.855614973262032, - "grad_norm": 3.0770840644836426, - "learning_rate": 4.589403101526854e-06, - "loss": 0.5021, - "step": 347 - }, - { - "epoch": 1.8609625668449197, - "grad_norm": 2.7511236667633057, - "learning_rate": 4.587093968753765e-06, - "loss": 0.6426, - "step": 348 - }, - { - "epoch": 1.8663101604278074, - "grad_norm": 2.199262857437134, - "learning_rate": 4.584778945395594e-06, - "loss": 0.41, - "step": 349 - }, - { - "epoch": 1.8716577540106951, - "grad_norm": 4.125847816467285, - "learning_rate": 4.582458037986231e-06, - "loss": 0.6775, - "step": 350 - }, - { - "epoch": 1.8770053475935828, - "grad_norm": 3.585446834564209, - "learning_rate": 4.580131253076171e-06, - "loss": 0.9407, - "step": 351 - }, - { - "epoch": 1.8823529411764706, - "grad_norm": 3.3022289276123047, - "learning_rate": 4.5777985972325016e-06, - "loss": 0.6412, - "step": 352 - }, - { - "epoch": 1.8877005347593583, - "grad_norm": 2.9012153148651123, - "learning_rate": 4.575460077038877e-06, - "loss": 0.4353, - "step": 353 - }, - { - "epoch": 1.893048128342246, - "grad_norm": 3.134577989578247, - "learning_rate": 4.573115699095505e-06, - "loss": 0.934, - "step": 354 - }, - { - "epoch": 1.8983957219251337, - "grad_norm": 2.8544585704803467, - "learning_rate": 4.570765470019125e-06, - "loss": 0.472, - "step": 355 - }, - { - "epoch": 1.9037433155080214, - "grad_norm": 3.232541084289551, - "learning_rate": 4.5684093964429906e-06, - "loss": 0.6079, - "step": 356 - }, - { - "epoch": 1.9090909090909092, - "grad_norm": 2.760040044784546, - "learning_rate": 4.566047485016853e-06, - "loss": 0.4644, - "step": 357 - }, - { - "epoch": 1.914438502673797, - "grad_norm": 3.5607728958129883, - "learning_rate": 4.563679742406935e-06, - "loss": 0.721, - "step": 358 - }, - { - "epoch": 1.9197860962566846, - "grad_norm": 2.364783763885498, - "learning_rate": 4.5613061752959236e-06, - "loss": 1.0296, - "step": 359 - }, - { - "epoch": 1.9251336898395723, - "grad_norm": 3.172856092453003, - "learning_rate": 4.558926790382941e-06, - "loss": 0.892, - "step": 360 - }, - { - "epoch": 1.93048128342246, - "grad_norm": 3.1738357543945312, - "learning_rate": 4.556541594383528e-06, - "loss": 0.6153, - "step": 361 - }, - { - "epoch": 1.9358288770053476, - "grad_norm": 2.396540880203247, - "learning_rate": 4.554150594029631e-06, - "loss": 0.3246, - "step": 362 - }, - { - "epoch": 1.9411764705882353, - "grad_norm": 2.347179651260376, - "learning_rate": 4.551753796069577e-06, - "loss": 0.5986, - "step": 363 - }, - { - "epoch": 1.946524064171123, - "grad_norm": 2.559436082839966, - "learning_rate": 4.5493512072680535e-06, - "loss": 0.5642, - "step": 364 - }, - { - "epoch": 1.9518716577540107, - "grad_norm": 2.5733461380004883, - "learning_rate": 4.546942834406094e-06, - "loss": 0.7661, - "step": 365 - }, - { - "epoch": 1.9572192513368984, - "grad_norm": 2.9867851734161377, - "learning_rate": 4.544528684281056e-06, - "loss": 0.4739, - "step": 366 - }, - { - "epoch": 1.962566844919786, - "grad_norm": 2.3558244705200195, - "learning_rate": 4.5421087637066065e-06, - "loss": 0.4551, - "step": 367 - }, - { - "epoch": 1.9679144385026737, - "grad_norm": 2.438739061355591, - "learning_rate": 4.539683079512692e-06, - "loss": 0.7336, - "step": 368 - }, - { - "epoch": 1.9732620320855614, - "grad_norm": 2.9113192558288574, - "learning_rate": 4.537251638545532e-06, - "loss": 0.5833, - "step": 369 - }, - { - "epoch": 1.9786096256684491, - "grad_norm": 2.915750741958618, - "learning_rate": 4.534814447667591e-06, - "loss": 0.3305, - "step": 370 - }, - { - "epoch": 1.9839572192513368, - "grad_norm": 2.14119815826416, - "learning_rate": 4.532371513757564e-06, - "loss": 0.4912, - "step": 371 - }, - { - "epoch": 1.9893048128342246, - "grad_norm": 2.589812994003296, - "learning_rate": 4.529922843710354e-06, - "loss": 0.611, - "step": 372 - }, - { - "epoch": 1.9946524064171123, - "grad_norm": 2.771989345550537, - "learning_rate": 4.52746844443705e-06, - "loss": 0.6487, - "step": 373 - }, - { - "epoch": 2.0, - "grad_norm": 2.7459375858306885, - "learning_rate": 4.525008322864917e-06, - "loss": 0.607, - "step": 374 - } - ], - "logging_steps": 1, - "max_steps": 1870, - "num_input_tokens_seen": 0, - "num_train_epochs": 10, - "save_steps": 206, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 9.940141544046592e+16, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/metallama3_8b/limo_filtered_combined/trainer_log.jsonl b/metallama3_8b/limo_filtered_combined/trainer_log.jsonl deleted file mode 100644 index e2743dae8b346b54055f6f24552031f52db4b035..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_combined/trainer_log.jsonl +++ /dev/null @@ -1,480 +0,0 @@ -{"current_steps": 1, "total_steps": 1870, "loss": 0.9394, "lr": 5e-06, "epoch": 0.0053475935828877, "percentage": 0.05, "elapsed_time": "0:00:02", "remaining_time": "1:15:32"} -{"current_steps": 2, "total_steps": 1870, "loss": 2.0122, "lr": 4.99999647201733e-06, "epoch": 0.0106951871657754, "percentage": 0.11, "elapsed_time": "0:00:05", "remaining_time": "1:28:59"} -{"current_steps": 3, "total_steps": 1870, "loss": 1.0092, "lr": 4.999985888079276e-06, "epoch": 0.016042780748663103, "percentage": 0.16, "elapsed_time": "0:00:07", "remaining_time": "1:14:30"} -{"current_steps": 4, "total_steps": 1870, "loss": 1.5196, "lr": 4.999968248215712e-06, "epoch": 0.0213903743315508, "percentage": 0.21, "elapsed_time": "0:00:12", "remaining_time": "1:33:26"} -{"current_steps": 5, "total_steps": 1870, "loss": 1.4586, "lr": 4.999943552476422e-06, "epoch": 0.026737967914438502, "percentage": 0.27, "elapsed_time": "0:00:15", "remaining_time": "1:37:56"} -{"current_steps": 6, "total_steps": 1870, "loss": 1.1068, "lr": 4.999911800931108e-06, "epoch": 0.03208556149732621, "percentage": 0.32, "elapsed_time": "0:00:17", "remaining_time": "1:32:10"} -{"current_steps": 7, "total_steps": 1870, "loss": 0.8997, "lr": 4.999872993669387e-06, "epoch": 0.0374331550802139, "percentage": 0.37, "elapsed_time": "0:00:19", "remaining_time": "1:25:14"} -{"current_steps": 8, "total_steps": 1870, "loss": 1.075, "lr": 4.999827130800785e-06, "epoch": 0.0427807486631016, "percentage": 0.43, "elapsed_time": "0:00:21", "remaining_time": "1:23:32"} -{"current_steps": 9, "total_steps": 1870, "loss": 1.691, "lr": 4.999774212454746e-06, "epoch": 0.0481283422459893, "percentage": 0.48, "elapsed_time": "0:00:24", "remaining_time": "1:25:50"} -{"current_steps": 10, "total_steps": 1870, "loss": 1.3167, "lr": 4.999714238780626e-06, "epoch": 0.053475935828877004, "percentage": 0.53, "elapsed_time": "0:00:32", "remaining_time": "1:39:43"} -{"current_steps": 11, "total_steps": 1870, "loss": 0.9653, "lr": 4.999647209947694e-06, "epoch": 0.058823529411764705, "percentage": 0.59, "elapsed_time": "0:00:35", "remaining_time": "1:39:46"} -{"current_steps": 12, "total_steps": 1870, "loss": 1.2992, "lr": 4.999573126145132e-06, "epoch": 0.06417112299465241, "percentage": 0.64, "elapsed_time": "0:00:38", "remaining_time": "1:40:28"} -{"current_steps": 13, "total_steps": 1870, "loss": 0.9204, "lr": 4.999491987582032e-06, "epoch": 0.06951871657754011, "percentage": 0.7, "elapsed_time": "0:00:43", "remaining_time": "1:42:58"} -{"current_steps": 14, "total_steps": 1870, "loss": 1.3307, "lr": 4.999403794487399e-06, "epoch": 0.0748663101604278, "percentage": 0.75, "elapsed_time": "0:00:48", "remaining_time": "1:48:07"} -{"current_steps": 15, "total_steps": 1870, "loss": 0.8596, "lr": 4.999308547110147e-06, "epoch": 0.08021390374331551, "percentage": 0.8, "elapsed_time": "0:00:50", "remaining_time": "1:45:02"} -{"current_steps": 16, "total_steps": 1870, "loss": 0.9614, "lr": 4.9992062457191005e-06, "epoch": 0.0855614973262032, "percentage": 0.86, "elapsed_time": "0:00:53", "remaining_time": "1:44:08"} -{"current_steps": 17, "total_steps": 1870, "loss": 0.8, "lr": 4.999096890602996e-06, "epoch": 0.09090909090909091, "percentage": 0.91, "elapsed_time": "0:00:57", "remaining_time": "1:45:20"} -{"current_steps": 18, "total_steps": 1870, "loss": 0.7683, "lr": 4.998980482070473e-06, "epoch": 0.0962566844919786, "percentage": 0.96, "elapsed_time": "0:01:01", "remaining_time": "1:45:59"} -{"current_steps": 19, "total_steps": 1870, "loss": 1.2742, "lr": 4.998857020450084e-06, "epoch": 0.10160427807486631, "percentage": 1.02, "elapsed_time": "0:01:04", "remaining_time": "1:45:01"} -{"current_steps": 20, "total_steps": 1870, "loss": 0.8559, "lr": 4.998726506090283e-06, "epoch": 0.10695187165775401, "percentage": 1.07, "elapsed_time": "0:01:06", "remaining_time": "1:43:14"} -{"current_steps": 21, "total_steps": 1870, "loss": 0.8223, "lr": 4.998588939359435e-06, "epoch": 0.11229946524064172, "percentage": 1.12, "elapsed_time": "0:01:08", "remaining_time": "1:40:41"} -{"current_steps": 22, "total_steps": 1870, "loss": 1.1229, "lr": 4.998444320645803e-06, "epoch": 0.11764705882352941, "percentage": 1.18, "elapsed_time": "0:01:14", "remaining_time": "1:44:29"} -{"current_steps": 23, "total_steps": 1870, "loss": 0.8936, "lr": 4.998292650357558e-06, "epoch": 0.12299465240641712, "percentage": 1.23, "elapsed_time": "0:01:18", "remaining_time": "1:44:27"} -{"current_steps": 24, "total_steps": 1870, "loss": 1.2552, "lr": 4.998133928922773e-06, "epoch": 0.12834224598930483, "percentage": 1.28, "elapsed_time": "0:01:24", "remaining_time": "1:48:01"} -{"current_steps": 25, "total_steps": 1870, "loss": 0.7075, "lr": 4.99796815678942e-06, "epoch": 0.13368983957219252, "percentage": 1.34, "elapsed_time": "0:01:27", "remaining_time": "1:47:26"} -{"current_steps": 26, "total_steps": 1870, "loss": 0.9781, "lr": 4.997795334425372e-06, "epoch": 0.13903743315508021, "percentage": 1.39, "elapsed_time": "0:01:33", "remaining_time": "1:50:57"} -{"current_steps": 27, "total_steps": 1870, "loss": 1.0657, "lr": 4.997615462318403e-06, "epoch": 0.1443850267379679, "percentage": 1.44, "elapsed_time": "0:01:40", "remaining_time": "1:54:43"} -{"current_steps": 28, "total_steps": 1870, "loss": 0.951, "lr": 4.997428540976177e-06, "epoch": 0.1497326203208556, "percentage": 1.5, "elapsed_time": "0:01:45", "remaining_time": "1:55:17"} -{"current_steps": 29, "total_steps": 1870, "loss": 0.6788, "lr": 4.997234570926263e-06, "epoch": 0.15508021390374332, "percentage": 1.55, "elapsed_time": "0:01:48", "remaining_time": "1:54:49"} -{"current_steps": 30, "total_steps": 1870, "loss": 0.8, "lr": 4.997033552716116e-06, "epoch": 0.16042780748663102, "percentage": 1.6, "elapsed_time": "0:01:50", "remaining_time": "1:52:46"} -{"current_steps": 31, "total_steps": 1870, "loss": 0.7625, "lr": 4.9968254869130885e-06, "epoch": 0.1657754010695187, "percentage": 1.66, "elapsed_time": "0:01:52", "remaining_time": "1:51:09"} -{"current_steps": 32, "total_steps": 1870, "loss": 0.7381, "lr": 4.996610374104422e-06, "epoch": 0.1711229946524064, "percentage": 1.71, "elapsed_time": "0:01:53", "remaining_time": "1:48:19"} -{"current_steps": 33, "total_steps": 1870, "loss": 1.3283, "lr": 4.9963882148972475e-06, "epoch": 0.17647058823529413, "percentage": 1.76, "elapsed_time": "0:01:54", "remaining_time": "1:46:20"} -{"current_steps": 34, "total_steps": 1870, "loss": 1.0002, "lr": 4.996159009918586e-06, "epoch": 0.18181818181818182, "percentage": 1.82, "elapsed_time": "0:01:55", "remaining_time": "1:44:00"} -{"current_steps": 35, "total_steps": 1870, "loss": 0.9095, "lr": 4.9959227598153395e-06, "epoch": 0.18716577540106952, "percentage": 1.87, "elapsed_time": "0:01:59", "remaining_time": "1:44:51"} -{"current_steps": 36, "total_steps": 1870, "loss": 1.2191, "lr": 4.9956794652542994e-06, "epoch": 0.1925133689839572, "percentage": 1.93, "elapsed_time": "0:02:03", "remaining_time": "1:44:39"} -{"current_steps": 37, "total_steps": 1870, "loss": 0.7424, "lr": 4.9954291269221364e-06, "epoch": 0.19786096256684493, "percentage": 1.98, "elapsed_time": "0:02:06", "remaining_time": "1:44:04"} -{"current_steps": 38, "total_steps": 1870, "loss": 0.9289, "lr": 4.995171745525401e-06, "epoch": 0.20320855614973263, "percentage": 2.03, "elapsed_time": "0:02:09", "remaining_time": "1:44:06"} -{"current_steps": 39, "total_steps": 1870, "loss": 0.991, "lr": 4.994907321790524e-06, "epoch": 0.20855614973262032, "percentage": 2.09, "elapsed_time": "0:02:16", "remaining_time": "1:47:11"} -{"current_steps": 40, "total_steps": 1870, "loss": 0.6406, "lr": 4.994635856463811e-06, "epoch": 0.21390374331550802, "percentage": 2.14, "elapsed_time": "0:02:20", "remaining_time": "1:47:07"} -{"current_steps": 41, "total_steps": 1870, "loss": 1.2038, "lr": 4.994357350311441e-06, "epoch": 0.2192513368983957, "percentage": 2.19, "elapsed_time": "0:02:26", "remaining_time": "1:49:16"} -{"current_steps": 42, "total_steps": 1870, "loss": 0.9696, "lr": 4.994071804119467e-06, "epoch": 0.22459893048128343, "percentage": 2.25, "elapsed_time": "0:02:29", "remaining_time": "1:48:48"} -{"current_steps": 43, "total_steps": 1870, "loss": 1.1579, "lr": 4.993779218693811e-06, "epoch": 0.22994652406417113, "percentage": 2.3, "elapsed_time": "0:02:33", "remaining_time": "1:48:32"} -{"current_steps": 44, "total_steps": 1870, "loss": 0.7118, "lr": 4.99347959486026e-06, "epoch": 0.23529411764705882, "percentage": 2.35, "elapsed_time": "0:02:37", "remaining_time": "1:48:48"} -{"current_steps": 45, "total_steps": 1870, "loss": 0.7579, "lr": 4.99317293346447e-06, "epoch": 0.24064171122994651, "percentage": 2.41, "elapsed_time": "0:02:39", "remaining_time": "1:48:07"} -{"current_steps": 46, "total_steps": 1870, "loss": 0.7105, "lr": 4.992859235371958e-06, "epoch": 0.24598930481283424, "percentage": 2.46, "elapsed_time": "0:02:42", "remaining_time": "1:47:04"} -{"current_steps": 47, "total_steps": 1870, "loss": 0.6812, "lr": 4.992538501468101e-06, "epoch": 0.25133689839572193, "percentage": 2.51, "elapsed_time": "0:02:45", "remaining_time": "1:46:54"} -{"current_steps": 48, "total_steps": 1870, "loss": 0.9733, "lr": 4.992210732658132e-06, "epoch": 0.25668449197860965, "percentage": 2.57, "elapsed_time": "0:02:52", "remaining_time": "1:49:21"} -{"current_steps": 49, "total_steps": 1870, "loss": 1.1301, "lr": 4.991875929867143e-06, "epoch": 0.2620320855614973, "percentage": 2.62, "elapsed_time": "0:02:59", "remaining_time": "1:50:52"} -{"current_steps": 50, "total_steps": 1870, "loss": 0.6706, "lr": 4.991534094040077e-06, "epoch": 0.26737967914438504, "percentage": 2.67, "elapsed_time": "0:03:02", "remaining_time": "1:50:51"} -{"current_steps": 51, "total_steps": 1870, "loss": 0.9462, "lr": 4.991185226141726e-06, "epoch": 0.2727272727272727, "percentage": 2.73, "elapsed_time": "0:03:03", "remaining_time": "1:49:17"} -{"current_steps": 52, "total_steps": 1870, "loss": 1.0714, "lr": 4.990829327156729e-06, "epoch": 0.27807486631016043, "percentage": 2.78, "elapsed_time": "0:03:06", "remaining_time": "1:48:35"} -{"current_steps": 53, "total_steps": 1870, "loss": 0.9175, "lr": 4.990466398089571e-06, "epoch": 0.28342245989304815, "percentage": 2.83, "elapsed_time": "0:03:07", "remaining_time": "1:47:14"} -{"current_steps": 54, "total_steps": 1870, "loss": 0.5164, "lr": 4.99009643996458e-06, "epoch": 0.2887700534759358, "percentage": 2.89, "elapsed_time": "0:03:10", "remaining_time": "1:46:35"} -{"current_steps": 55, "total_steps": 1870, "loss": 0.7223, "lr": 4.989719453825918e-06, "epoch": 0.29411764705882354, "percentage": 2.94, "elapsed_time": "0:03:12", "remaining_time": "1:45:36"} -{"current_steps": 56, "total_steps": 1870, "loss": 0.7065, "lr": 4.989335440737587e-06, "epoch": 0.2994652406417112, "percentage": 2.99, "elapsed_time": "0:03:13", "remaining_time": "1:44:33"} -{"current_steps": 57, "total_steps": 1870, "loss": 0.8833, "lr": 4.9889444017834185e-06, "epoch": 0.3048128342245989, "percentage": 3.05, "elapsed_time": "0:03:15", "remaining_time": "1:43:41"} -{"current_steps": 58, "total_steps": 1870, "loss": 0.8664, "lr": 4.988546338067078e-06, "epoch": 0.31016042780748665, "percentage": 3.1, "elapsed_time": "0:03:18", "remaining_time": "1:43:15"} -{"current_steps": 59, "total_steps": 1870, "loss": 0.884, "lr": 4.988141250712053e-06, "epoch": 0.3155080213903743, "percentage": 3.16, "elapsed_time": "0:03:23", "remaining_time": "1:44:06"} -{"current_steps": 60, "total_steps": 1870, "loss": 0.9891, "lr": 4.987729140861657e-06, "epoch": 0.32085561497326204, "percentage": 3.21, "elapsed_time": "0:03:24", "remaining_time": "1:43:02"} -{"current_steps": 61, "total_steps": 1870, "loss": 0.8838, "lr": 4.987310009679023e-06, "epoch": 0.32620320855614976, "percentage": 3.26, "elapsed_time": "0:03:27", "remaining_time": "1:42:22"} -{"current_steps": 62, "total_steps": 1870, "loss": 0.8188, "lr": 4.986883858347101e-06, "epoch": 0.3315508021390374, "percentage": 3.32, "elapsed_time": "0:03:29", "remaining_time": "1:41:36"} -{"current_steps": 63, "total_steps": 1870, "loss": 0.6032, "lr": 4.986450688068655e-06, "epoch": 0.33689839572192515, "percentage": 3.37, "elapsed_time": "0:03:31", "remaining_time": "1:41:03"} -{"current_steps": 64, "total_steps": 1870, "loss": 0.7623, "lr": 4.986010500066258e-06, "epoch": 0.3422459893048128, "percentage": 3.42, "elapsed_time": "0:03:34", "remaining_time": "1:41:03"} -{"current_steps": 65, "total_steps": 1870, "loss": 0.8051, "lr": 4.985563295582292e-06, "epoch": 0.34759358288770054, "percentage": 3.48, "elapsed_time": "0:03:36", "remaining_time": "1:40:17"} -{"current_steps": 66, "total_steps": 1870, "loss": 0.7901, "lr": 4.98510907587894e-06, "epoch": 0.35294117647058826, "percentage": 3.53, "elapsed_time": "0:03:38", "remaining_time": "1:39:37"} -{"current_steps": 67, "total_steps": 1870, "loss": 1.0582, "lr": 4.984647842238185e-06, "epoch": 0.3582887700534759, "percentage": 3.58, "elapsed_time": "0:03:45", "remaining_time": "1:40:58"} -{"current_steps": 68, "total_steps": 1870, "loss": 0.5912, "lr": 4.984179595961806e-06, "epoch": 0.36363636363636365, "percentage": 3.64, "elapsed_time": "0:03:48", "remaining_time": "1:40:51"} -{"current_steps": 69, "total_steps": 1870, "loss": 0.7855, "lr": 4.983704338371375e-06, "epoch": 0.3689839572192513, "percentage": 3.69, "elapsed_time": "0:03:49", "remaining_time": "1:40:01"} -{"current_steps": 70, "total_steps": 1870, "loss": 0.6491, "lr": 4.983222070808255e-06, "epoch": 0.37433155080213903, "percentage": 3.74, "elapsed_time": "0:03:52", "remaining_time": "1:39:34"} -{"current_steps": 71, "total_steps": 1870, "loss": 0.9735, "lr": 4.982732794633588e-06, "epoch": 0.37967914438502676, "percentage": 3.8, "elapsed_time": "0:03:54", "remaining_time": "1:39:03"} -{"current_steps": 72, "total_steps": 1870, "loss": 0.8495, "lr": 4.982236511228301e-06, "epoch": 0.3850267379679144, "percentage": 3.85, "elapsed_time": "0:03:55", "remaining_time": "1:38:02"} -{"current_steps": 73, "total_steps": 1870, "loss": 1.0891, "lr": 4.981733221993099e-06, "epoch": 0.39037433155080214, "percentage": 3.9, "elapsed_time": "0:04:01", "remaining_time": "1:39:00"} -{"current_steps": 74, "total_steps": 1870, "loss": 0.8013, "lr": 4.981222928348456e-06, "epoch": 0.39572192513368987, "percentage": 3.96, "elapsed_time": "0:04:04", "remaining_time": "1:38:56"} -{"current_steps": 75, "total_steps": 1870, "loss": 0.8298, "lr": 4.98070563173462e-06, "epoch": 0.40106951871657753, "percentage": 4.01, "elapsed_time": "0:04:09", "remaining_time": "1:39:41"} -{"current_steps": 76, "total_steps": 1870, "loss": 0.6989, "lr": 4.980181333611601e-06, "epoch": 0.40641711229946526, "percentage": 4.06, "elapsed_time": "0:04:13", "remaining_time": "1:39:54"} -{"current_steps": 77, "total_steps": 1870, "loss": 0.6769, "lr": 4.979650035459171e-06, "epoch": 0.4117647058823529, "percentage": 4.12, "elapsed_time": "0:04:16", "remaining_time": "1:39:36"} -{"current_steps": 78, "total_steps": 1870, "loss": 1.0385, "lr": 4.9791117387768575e-06, "epoch": 0.41711229946524064, "percentage": 4.17, "elapsed_time": "0:04:20", "remaining_time": "1:39:34"} -{"current_steps": 79, "total_steps": 1870, "loss": 0.6498, "lr": 4.978566445083942e-06, "epoch": 0.42245989304812837, "percentage": 4.22, "elapsed_time": "0:04:22", "remaining_time": "1:39:10"} -{"current_steps": 80, "total_steps": 1870, "loss": 0.7931, "lr": 4.978014155919455e-06, "epoch": 0.42780748663101603, "percentage": 4.28, "elapsed_time": "0:04:23", "remaining_time": "1:38:13"} -{"current_steps": 81, "total_steps": 1870, "loss": 0.7322, "lr": 4.977454872842169e-06, "epoch": 0.43315508021390375, "percentage": 4.33, "elapsed_time": "0:04:26", "remaining_time": "1:37:58"} -{"current_steps": 82, "total_steps": 1870, "loss": 0.9184, "lr": 4.976888597430597e-06, "epoch": 0.4385026737967914, "percentage": 4.39, "elapsed_time": "0:04:30", "remaining_time": "1:38:16"} -{"current_steps": 83, "total_steps": 1870, "loss": 0.8258, "lr": 4.976315331282985e-06, "epoch": 0.44385026737967914, "percentage": 4.44, "elapsed_time": "0:04:34", "remaining_time": "1:38:29"} -{"current_steps": 84, "total_steps": 1870, "loss": 0.7414, "lr": 4.9757350760173144e-06, "epoch": 0.44919786096256686, "percentage": 4.49, "elapsed_time": "0:04:37", "remaining_time": "1:38:21"} -{"current_steps": 85, "total_steps": 1870, "loss": 0.8573, "lr": 4.975147833271288e-06, "epoch": 0.45454545454545453, "percentage": 4.55, "elapsed_time": "0:04:41", "remaining_time": "1:38:31"} -{"current_steps": 86, "total_steps": 1870, "loss": 0.7271, "lr": 4.974553604702332e-06, "epoch": 0.45989304812834225, "percentage": 4.6, "elapsed_time": "0:04:44", "remaining_time": "1:38:23"} -{"current_steps": 87, "total_steps": 1870, "loss": 0.8976, "lr": 4.973952391987589e-06, "epoch": 0.46524064171123, "percentage": 4.65, "elapsed_time": "0:04:45", "remaining_time": "1:37:36"} -{"current_steps": 88, "total_steps": 1870, "loss": 1.0753, "lr": 4.9733441968239125e-06, "epoch": 0.47058823529411764, "percentage": 4.71, "elapsed_time": "0:04:47", "remaining_time": "1:37:08"} -{"current_steps": 89, "total_steps": 1870, "loss": 0.6903, "lr": 4.972729020927866e-06, "epoch": 0.47593582887700536, "percentage": 4.76, "elapsed_time": "0:04:50", "remaining_time": "1:36:45"} -{"current_steps": 90, "total_steps": 1870, "loss": 0.9347, "lr": 4.97210686603571e-06, "epoch": 0.48128342245989303, "percentage": 4.81, "elapsed_time": "0:04:51", "remaining_time": "1:36:10"} -{"current_steps": 91, "total_steps": 1870, "loss": 0.6738, "lr": 4.97147773390341e-06, "epoch": 0.48663101604278075, "percentage": 4.87, "elapsed_time": "0:04:55", "remaining_time": "1:36:12"} -{"current_steps": 92, "total_steps": 1870, "loss": 0.8356, "lr": 4.970841626306617e-06, "epoch": 0.4919786096256685, "percentage": 4.92, "elapsed_time": "0:05:01", "remaining_time": "1:37:03"} -{"current_steps": 93, "total_steps": 1870, "loss": 0.9117, "lr": 4.970198545040673e-06, "epoch": 0.49732620320855614, "percentage": 4.97, "elapsed_time": "0:05:07", "remaining_time": "1:38:03"} -{"current_steps": 94, "total_steps": 1870, "loss": 0.8237, "lr": 4.969548491920603e-06, "epoch": 0.5026737967914439, "percentage": 5.03, "elapsed_time": "0:05:09", "remaining_time": "1:37:35"} -{"current_steps": 95, "total_steps": 1870, "loss": 0.8775, "lr": 4.968891468781105e-06, "epoch": 0.5080213903743316, "percentage": 5.08, "elapsed_time": "0:05:13", "remaining_time": "1:37:35"} -{"current_steps": 96, "total_steps": 1870, "loss": 0.9068, "lr": 4.968227477476554e-06, "epoch": 0.5133689839572193, "percentage": 5.13, "elapsed_time": "0:05:14", "remaining_time": "1:36:51"} -{"current_steps": 97, "total_steps": 1870, "loss": 1.0435, "lr": 4.9675565198809905e-06, "epoch": 0.5187165775401069, "percentage": 5.19, "elapsed_time": "0:05:19", "remaining_time": "1:37:15"} -{"current_steps": 98, "total_steps": 1870, "loss": 1.0084, "lr": 4.966878597888114e-06, "epoch": 0.5240641711229946, "percentage": 5.24, "elapsed_time": "0:05:22", "remaining_time": "1:37:12"} -{"current_steps": 99, "total_steps": 1870, "loss": 0.7217, "lr": 4.966193713411284e-06, "epoch": 0.5294117647058824, "percentage": 5.29, "elapsed_time": "0:05:24", "remaining_time": "1:36:44"} -{"current_steps": 100, "total_steps": 1870, "loss": 0.6594, "lr": 4.965501868383507e-06, "epoch": 0.5347593582887701, "percentage": 5.35, "elapsed_time": "0:05:28", "remaining_time": "1:36:51"} -{"current_steps": 101, "total_steps": 1870, "loss": 0.9249, "lr": 4.964803064757438e-06, "epoch": 0.5401069518716578, "percentage": 5.4, "elapsed_time": "0:05:35", "remaining_time": "1:37:54"} -{"current_steps": 102, "total_steps": 1870, "loss": 0.7776, "lr": 4.964097304505371e-06, "epoch": 0.5454545454545454, "percentage": 5.45, "elapsed_time": "0:05:38", "remaining_time": "1:37:51"} -{"current_steps": 103, "total_steps": 1870, "loss": 0.6339, "lr": 4.963384589619233e-06, "epoch": 0.5508021390374331, "percentage": 5.51, "elapsed_time": "0:05:41", "remaining_time": "1:37:32"} -{"current_steps": 104, "total_steps": 1870, "loss": 1.0107, "lr": 4.962664922110581e-06, "epoch": 0.5561497326203209, "percentage": 5.56, "elapsed_time": "0:05:42", "remaining_time": "1:36:47"} -{"current_steps": 105, "total_steps": 1870, "loss": 1.0052, "lr": 4.9619383040105954e-06, "epoch": 0.5614973262032086, "percentage": 5.61, "elapsed_time": "0:05:43", "remaining_time": "1:36:14"} -{"current_steps": 106, "total_steps": 1870, "loss": 0.8577, "lr": 4.961204737370071e-06, "epoch": 0.5668449197860963, "percentage": 5.67, "elapsed_time": "0:05:47", "remaining_time": "1:36:24"} -{"current_steps": 107, "total_steps": 1870, "loss": 1.1237, "lr": 4.960464224259418e-06, "epoch": 0.5721925133689839, "percentage": 5.72, "elapsed_time": "0:05:50", "remaining_time": "1:36:17"} -{"current_steps": 108, "total_steps": 1870, "loss": 0.6237, "lr": 4.95971676676865e-06, "epoch": 0.5775401069518716, "percentage": 5.78, "elapsed_time": "0:05:52", "remaining_time": "1:35:57"} -{"current_steps": 109, "total_steps": 1870, "loss": 0.9135, "lr": 4.958962367007381e-06, "epoch": 0.5828877005347594, "percentage": 5.83, "elapsed_time": "0:05:54", "remaining_time": "1:35:28"} -{"current_steps": 110, "total_steps": 1870, "loss": 0.7461, "lr": 4.958201027104818e-06, "epoch": 0.5882352941176471, "percentage": 5.88, "elapsed_time": "0:05:57", "remaining_time": "1:35:19"} -{"current_steps": 111, "total_steps": 1870, "loss": 0.69, "lr": 4.957432749209755e-06, "epoch": 0.5935828877005348, "percentage": 5.94, "elapsed_time": "0:06:03", "remaining_time": "1:36:00"} -{"current_steps": 112, "total_steps": 1870, "loss": 0.8578, "lr": 4.95665753549057e-06, "epoch": 0.5989304812834224, "percentage": 5.99, "elapsed_time": "0:06:07", "remaining_time": "1:36:14"} -{"current_steps": 113, "total_steps": 1870, "loss": 1.3098, "lr": 4.9558753881352165e-06, "epoch": 0.6042780748663101, "percentage": 6.04, "elapsed_time": "0:06:10", "remaining_time": "1:35:58"} -{"current_steps": 114, "total_steps": 1870, "loss": 0.9979, "lr": 4.955086309351213e-06, "epoch": 0.6096256684491979, "percentage": 6.1, "elapsed_time": "0:06:13", "remaining_time": "1:35:46"} -{"current_steps": 115, "total_steps": 1870, "loss": 0.6298, "lr": 4.9542903013656485e-06, "epoch": 0.6149732620320856, "percentage": 6.15, "elapsed_time": "0:06:14", "remaining_time": "1:35:15"} -{"current_steps": 116, "total_steps": 1870, "loss": 0.959, "lr": 4.953487366425163e-06, "epoch": 0.6203208556149733, "percentage": 6.2, "elapsed_time": "0:06:18", "remaining_time": "1:35:27"} -{"current_steps": 117, "total_steps": 1870, "loss": 0.6791, "lr": 4.952677506795949e-06, "epoch": 0.6256684491978609, "percentage": 6.26, "elapsed_time": "0:06:20", "remaining_time": "1:34:58"} -{"current_steps": 118, "total_steps": 1870, "loss": 0.7783, "lr": 4.951860724763743e-06, "epoch": 0.6310160427807486, "percentage": 6.31, "elapsed_time": "0:06:23", "remaining_time": "1:34:49"} -{"current_steps": 119, "total_steps": 1870, "loss": 0.8085, "lr": 4.95103702263382e-06, "epoch": 0.6363636363636364, "percentage": 6.36, "elapsed_time": "0:06:25", "remaining_time": "1:34:28"} -{"current_steps": 120, "total_steps": 1870, "loss": 0.7702, "lr": 4.950206402730984e-06, "epoch": 0.6417112299465241, "percentage": 6.42, "elapsed_time": "0:06:28", "remaining_time": "1:34:32"} -{"current_steps": 121, "total_steps": 1870, "loss": 0.602, "lr": 4.949368867399567e-06, "epoch": 0.6470588235294118, "percentage": 6.47, "elapsed_time": "0:06:30", "remaining_time": "1:33:59"} -{"current_steps": 122, "total_steps": 1870, "loss": 1.2858, "lr": 4.948524419003415e-06, "epoch": 0.6524064171122995, "percentage": 6.52, "elapsed_time": "0:06:32", "remaining_time": "1:33:49"} -{"current_steps": 123, "total_steps": 1870, "loss": 0.7945, "lr": 4.947673059925889e-06, "epoch": 0.6577540106951871, "percentage": 6.58, "elapsed_time": "0:06:38", "remaining_time": "1:34:17"} -{"current_steps": 124, "total_steps": 1870, "loss": 0.959, "lr": 4.9468147925698525e-06, "epoch": 0.6631016042780749, "percentage": 6.63, "elapsed_time": "0:06:41", "remaining_time": "1:34:14"} -{"current_steps": 125, "total_steps": 1870, "loss": 0.7611, "lr": 4.945949619357668e-06, "epoch": 0.6684491978609626, "percentage": 6.68, "elapsed_time": "0:06:42", "remaining_time": "1:33:41"} -{"current_steps": 126, "total_steps": 1870, "loss": 0.5753, "lr": 4.945077542731188e-06, "epoch": 0.6737967914438503, "percentage": 6.74, "elapsed_time": "0:06:44", "remaining_time": "1:33:16"} -{"current_steps": 127, "total_steps": 1870, "loss": 0.8995, "lr": 4.94419856515175e-06, "epoch": 0.679144385026738, "percentage": 6.79, "elapsed_time": "0:06:49", "remaining_time": "1:33:43"} -{"current_steps": 128, "total_steps": 1870, "loss": 0.9623, "lr": 4.943312689100166e-06, "epoch": 0.6844919786096256, "percentage": 6.84, "elapsed_time": "0:06:51", "remaining_time": "1:33:20"} -{"current_steps": 129, "total_steps": 1870, "loss": 0.6657, "lr": 4.942419917076723e-06, "epoch": 0.6898395721925134, "percentage": 6.9, "elapsed_time": "0:06:55", "remaining_time": "1:33:21"} -{"current_steps": 130, "total_steps": 1870, "loss": 0.7711, "lr": 4.941520251601167e-06, "epoch": 0.6951871657754011, "percentage": 6.95, "elapsed_time": "0:06:57", "remaining_time": "1:33:05"} -{"current_steps": 131, "total_steps": 1870, "loss": 0.5908, "lr": 4.940613695212702e-06, "epoch": 0.7005347593582888, "percentage": 7.01, "elapsed_time": "0:06:58", "remaining_time": "1:32:34"} -{"current_steps": 132, "total_steps": 1870, "loss": 0.967, "lr": 4.939700250469979e-06, "epoch": 0.7058823529411765, "percentage": 7.06, "elapsed_time": "0:07:02", "remaining_time": "1:32:46"} -{"current_steps": 133, "total_steps": 1870, "loss": 0.9519, "lr": 4.938779919951092e-06, "epoch": 0.7112299465240641, "percentage": 7.11, "elapsed_time": "0:07:04", "remaining_time": "1:32:18"} -{"current_steps": 134, "total_steps": 1870, "loss": 0.5873, "lr": 4.93785270625357e-06, "epoch": 0.7165775401069518, "percentage": 7.17, "elapsed_time": "0:07:05", "remaining_time": "1:31:54"} -{"current_steps": 135, "total_steps": 1870, "loss": 0.8148, "lr": 4.936918611994368e-06, "epoch": 0.7219251336898396, "percentage": 7.22, "elapsed_time": "0:07:12", "remaining_time": "1:32:34"} -{"current_steps": 136, "total_steps": 1870, "loss": 0.8286, "lr": 4.935977639809861e-06, "epoch": 0.7272727272727273, "percentage": 7.27, "elapsed_time": "0:07:15", "remaining_time": "1:32:27"} -{"current_steps": 137, "total_steps": 1870, "loss": 0.6442, "lr": 4.935029792355834e-06, "epoch": 0.732620320855615, "percentage": 7.33, "elapsed_time": "0:07:17", "remaining_time": "1:32:13"} -{"current_steps": 138, "total_steps": 1870, "loss": 1.0144, "lr": 4.934075072307481e-06, "epoch": 0.7379679144385026, "percentage": 7.38, "elapsed_time": "0:07:20", "remaining_time": "1:32:07"} -{"current_steps": 139, "total_steps": 1870, "loss": 0.5922, "lr": 4.933113482359388e-06, "epoch": 0.7433155080213903, "percentage": 7.43, "elapsed_time": "0:07:22", "remaining_time": "1:31:45"} -{"current_steps": 140, "total_steps": 1870, "loss": 0.7546, "lr": 4.932145025225535e-06, "epoch": 0.7486631016042781, "percentage": 7.49, "elapsed_time": "0:07:23", "remaining_time": "1:31:17"} -{"current_steps": 141, "total_steps": 1870, "loss": 0.8797, "lr": 4.931169703639282e-06, "epoch": 0.7540106951871658, "percentage": 7.54, "elapsed_time": "0:07:25", "remaining_time": "1:31:02"} -{"current_steps": 142, "total_steps": 1870, "loss": 0.865, "lr": 4.930187520353363e-06, "epoch": 0.7593582887700535, "percentage": 7.59, "elapsed_time": "0:07:28", "remaining_time": "1:31:00"} -{"current_steps": 143, "total_steps": 1870, "loss": 0.6901, "lr": 4.929198478139877e-06, "epoch": 0.7647058823529411, "percentage": 7.65, "elapsed_time": "0:07:31", "remaining_time": "1:30:47"} -{"current_steps": 144, "total_steps": 1870, "loss": 0.5932, "lr": 4.928202579790285e-06, "epoch": 0.7700534759358288, "percentage": 7.7, "elapsed_time": "0:07:34", "remaining_time": "1:30:48"} -{"current_steps": 145, "total_steps": 1870, "loss": 0.7742, "lr": 4.927199828115395e-06, "epoch": 0.7754010695187166, "percentage": 7.75, "elapsed_time": "0:07:36", "remaining_time": "1:30:25"} -{"current_steps": 146, "total_steps": 1870, "loss": 0.8475, "lr": 4.9261902259453616e-06, "epoch": 0.7807486631016043, "percentage": 7.81, "elapsed_time": "0:07:39", "remaining_time": "1:30:29"} -{"current_steps": 147, "total_steps": 1870, "loss": 1.0514, "lr": 4.925173776129669e-06, "epoch": 0.786096256684492, "percentage": 7.86, "elapsed_time": "0:07:45", "remaining_time": "1:30:56"} -{"current_steps": 148, "total_steps": 1870, "loss": 0.5964, "lr": 4.9241504815371346e-06, "epoch": 0.7914438502673797, "percentage": 7.91, "elapsed_time": "0:07:46", "remaining_time": "1:30:29"} -{"current_steps": 149, "total_steps": 1870, "loss": 0.7615, "lr": 4.923120345055887e-06, "epoch": 0.7967914438502673, "percentage": 7.97, "elapsed_time": "0:07:49", "remaining_time": "1:30:28"} -{"current_steps": 150, "total_steps": 1870, "loss": 0.6908, "lr": 4.922083369593372e-06, "epoch": 0.8021390374331551, "percentage": 8.02, "elapsed_time": "0:07:51", "remaining_time": "1:30:09"} -{"current_steps": 151, "total_steps": 1870, "loss": 0.8661, "lr": 4.921039558076335e-06, "epoch": 0.8074866310160428, "percentage": 8.07, "elapsed_time": "0:07:56", "remaining_time": "1:30:24"} -{"current_steps": 152, "total_steps": 1870, "loss": 0.5267, "lr": 4.919988913450812e-06, "epoch": 0.8128342245989305, "percentage": 8.13, "elapsed_time": "0:07:59", "remaining_time": "1:30:18"} -{"current_steps": 153, "total_steps": 1870, "loss": 0.9222, "lr": 4.918931438682132e-06, "epoch": 0.8181818181818182, "percentage": 8.18, "elapsed_time": "0:08:02", "remaining_time": "1:30:16"} -{"current_steps": 154, "total_steps": 1870, "loss": 0.8865, "lr": 4.917867136754894e-06, "epoch": 0.8235294117647058, "percentage": 8.24, "elapsed_time": "0:08:04", "remaining_time": "1:29:55"} -{"current_steps": 155, "total_steps": 1870, "loss": 0.7262, "lr": 4.916796010672969e-06, "epoch": 0.8288770053475936, "percentage": 8.29, "elapsed_time": "0:08:07", "remaining_time": "1:29:57"} -{"current_steps": 156, "total_steps": 1870, "loss": 0.7611, "lr": 4.91571806345949e-06, "epoch": 0.8342245989304813, "percentage": 8.34, "elapsed_time": "0:08:10", "remaining_time": "1:29:44"} -{"current_steps": 157, "total_steps": 1870, "loss": 0.8745, "lr": 4.91463329815684e-06, "epoch": 0.839572192513369, "percentage": 8.4, "elapsed_time": "0:08:11", "remaining_time": "1:29:26"} -{"current_steps": 158, "total_steps": 1870, "loss": 0.6164, "lr": 4.913541717826645e-06, "epoch": 0.8449197860962567, "percentage": 8.45, "elapsed_time": "0:08:14", "remaining_time": "1:29:19"} -{"current_steps": 159, "total_steps": 1870, "loss": 0.5549, "lr": 4.912443325549767e-06, "epoch": 0.8502673796791443, "percentage": 8.5, "elapsed_time": "0:08:18", "remaining_time": "1:29:25"} -{"current_steps": 160, "total_steps": 1870, "loss": 0.9052, "lr": 4.911338124426291e-06, "epoch": 0.8556149732620321, "percentage": 8.56, "elapsed_time": "0:08:21", "remaining_time": "1:29:23"} -{"current_steps": 161, "total_steps": 1870, "loss": 0.7989, "lr": 4.910226117575525e-06, "epoch": 0.8609625668449198, "percentage": 8.61, "elapsed_time": "0:08:26", "remaining_time": "1:29:35"} -{"current_steps": 162, "total_steps": 1870, "loss": 0.5915, "lr": 4.909107308135978e-06, "epoch": 0.8663101604278075, "percentage": 8.66, "elapsed_time": "0:08:28", "remaining_time": "1:29:22"} -{"current_steps": 163, "total_steps": 1870, "loss": 0.6593, "lr": 4.907981699265364e-06, "epoch": 0.8716577540106952, "percentage": 8.72, "elapsed_time": "0:08:30", "remaining_time": "1:29:02"} -{"current_steps": 164, "total_steps": 1870, "loss": 0.8739, "lr": 4.906849294140587e-06, "epoch": 0.8770053475935828, "percentage": 8.77, "elapsed_time": "0:08:33", "remaining_time": "1:29:00"} -{"current_steps": 165, "total_steps": 1870, "loss": 0.7314, "lr": 4.9057100959577285e-06, "epoch": 0.8823529411764706, "percentage": 8.82, "elapsed_time": "0:08:37", "remaining_time": "1:29:12"} -{"current_steps": 166, "total_steps": 1870, "loss": 1.0109, "lr": 4.904564107932048e-06, "epoch": 0.8877005347593583, "percentage": 8.88, "elapsed_time": "0:08:40", "remaining_time": "1:28:59"} -{"current_steps": 167, "total_steps": 1870, "loss": 0.9092, "lr": 4.903411333297966e-06, "epoch": 0.893048128342246, "percentage": 8.93, "elapsed_time": "0:08:43", "remaining_time": "1:28:54"} -{"current_steps": 168, "total_steps": 1870, "loss": 0.7922, "lr": 4.902251775309057e-06, "epoch": 0.8983957219251337, "percentage": 8.98, "elapsed_time": "0:08:46", "remaining_time": "1:28:53"} -{"current_steps": 169, "total_steps": 1870, "loss": 0.5955, "lr": 4.901085437238041e-06, "epoch": 0.9037433155080213, "percentage": 9.04, "elapsed_time": "0:08:52", "remaining_time": "1:29:17"} -{"current_steps": 170, "total_steps": 1870, "loss": 1.0019, "lr": 4.899912322376776e-06, "epoch": 0.9090909090909091, "percentage": 9.09, "elapsed_time": "0:08:54", "remaining_time": "1:29:01"} -{"current_steps": 171, "total_steps": 1870, "loss": 0.8508, "lr": 4.8987324340362445e-06, "epoch": 0.9144385026737968, "percentage": 9.14, "elapsed_time": "0:08:58", "remaining_time": "1:29:10"} -{"current_steps": 172, "total_steps": 1870, "loss": 0.8514, "lr": 4.897545775546545e-06, "epoch": 0.9197860962566845, "percentage": 9.2, "elapsed_time": "0:08:59", "remaining_time": "1:28:48"} -{"current_steps": 173, "total_steps": 1870, "loss": 1.0263, "lr": 4.8963523502568886e-06, "epoch": 0.9251336898395722, "percentage": 9.25, "elapsed_time": "0:09:03", "remaining_time": "1:28:46"} -{"current_steps": 174, "total_steps": 1870, "loss": 0.7929, "lr": 4.895152161535582e-06, "epoch": 0.93048128342246, "percentage": 9.3, "elapsed_time": "0:09:04", "remaining_time": "1:28:30"} -{"current_steps": 175, "total_steps": 1870, "loss": 0.7227, "lr": 4.893945212770019e-06, "epoch": 0.9358288770053476, "percentage": 9.36, "elapsed_time": "0:09:08", "remaining_time": "1:28:30"} -{"current_steps": 176, "total_steps": 1870, "loss": 0.8923, "lr": 4.892731507366678e-06, "epoch": 0.9411764705882353, "percentage": 9.41, "elapsed_time": "0:09:12", "remaining_time": "1:28:34"} -{"current_steps": 177, "total_steps": 1870, "loss": 0.7475, "lr": 4.891511048751102e-06, "epoch": 0.946524064171123, "percentage": 9.47, "elapsed_time": "0:09:15", "remaining_time": "1:28:30"} -{"current_steps": 178, "total_steps": 1870, "loss": 1.1405, "lr": 4.890283840367898e-06, "epoch": 0.9518716577540107, "percentage": 9.52, "elapsed_time": "0:09:20", "remaining_time": "1:28:51"} -{"current_steps": 179, "total_steps": 1870, "loss": 0.8524, "lr": 4.889049885680721e-06, "epoch": 0.9572192513368984, "percentage": 9.57, "elapsed_time": "0:09:24", "remaining_time": "1:28:49"} -{"current_steps": 180, "total_steps": 1870, "loss": 0.7617, "lr": 4.887809188172268e-06, "epoch": 0.9625668449197861, "percentage": 9.63, "elapsed_time": "0:09:25", "remaining_time": "1:28:32"} -{"current_steps": 181, "total_steps": 1870, "loss": 0.8514, "lr": 4.886561751344266e-06, "epoch": 0.9679144385026738, "percentage": 9.68, "elapsed_time": "0:09:27", "remaining_time": "1:28:18"} -{"current_steps": 182, "total_steps": 1870, "loss": 0.8335, "lr": 4.885307578717464e-06, "epoch": 0.9732620320855615, "percentage": 9.73, "elapsed_time": "0:09:33", "remaining_time": "1:28:38"} -{"current_steps": 183, "total_steps": 1870, "loss": 0.831, "lr": 4.8840466738316216e-06, "epoch": 0.9786096256684492, "percentage": 9.79, "elapsed_time": "0:09:37", "remaining_time": "1:28:43"} -{"current_steps": 184, "total_steps": 1870, "loss": 0.7891, "lr": 4.882779040245499e-06, "epoch": 0.983957219251337, "percentage": 9.84, "elapsed_time": "0:09:39", "remaining_time": "1:28:28"} -{"current_steps": 185, "total_steps": 1870, "loss": 0.6257, "lr": 4.881504681536847e-06, "epoch": 0.9893048128342246, "percentage": 9.89, "elapsed_time": "0:09:44", "remaining_time": "1:28:40"} -{"current_steps": 186, "total_steps": 1870, "loss": 0.6008, "lr": 4.880223601302398e-06, "epoch": 0.9946524064171123, "percentage": 9.95, "elapsed_time": "0:09:48", "remaining_time": "1:28:50"} -{"current_steps": 187, "total_steps": 1870, "loss": 0.6061, "lr": 4.878935803157856e-06, "epoch": 1.0, "percentage": 10.0, "elapsed_time": "0:09:50", "remaining_time": "1:28:35"} -{"current_steps": 188, "total_steps": 1870, "loss": 0.6628, "lr": 4.8776412907378845e-06, "epoch": 1.0053475935828877, "percentage": 10.05, "elapsed_time": "1:48:13", "remaining_time": "16:08:15"} -{"current_steps": 189, "total_steps": 1870, "loss": 0.6124, "lr": 4.876340067696097e-06, "epoch": 1.0106951871657754, "percentage": 10.11, "elapsed_time": "1:48:19", "remaining_time": "16:03:27"} -{"current_steps": 190, "total_steps": 1870, "loss": 0.7186, "lr": 4.875032137705047e-06, "epoch": 1.0160427807486632, "percentage": 10.16, "elapsed_time": "1:48:22", "remaining_time": "15:58:17"} -{"current_steps": 191, "total_steps": 1870, "loss": 0.7723, "lr": 4.873717504456219e-06, "epoch": 1.0213903743315509, "percentage": 10.21, "elapsed_time": "1:48:28", "remaining_time": "15:53:34"} -{"current_steps": 192, "total_steps": 1870, "loss": 0.4069, "lr": 4.872396171660014e-06, "epoch": 1.0267379679144386, "percentage": 10.27, "elapsed_time": "1:48:31", "remaining_time": "15:48:26"} -{"current_steps": 193, "total_steps": 1870, "loss": 0.6705, "lr": 4.8710681430457466e-06, "epoch": 1.032085561497326, "percentage": 10.32, "elapsed_time": "1:48:37", "remaining_time": "15:43:47"} -{"current_steps": 194, "total_steps": 1870, "loss": 0.8276, "lr": 4.8697334223616226e-06, "epoch": 1.0374331550802138, "percentage": 10.37, "elapsed_time": "1:48:41", "remaining_time": "15:38:56"} -{"current_steps": 195, "total_steps": 1870, "loss": 0.5913, "lr": 4.8683920133747405e-06, "epoch": 1.0427807486631016, "percentage": 10.43, "elapsed_time": "1:48:43", "remaining_time": "15:33:54"} -{"current_steps": 196, "total_steps": 1870, "loss": 0.5244, "lr": 4.867043919871076e-06, "epoch": 1.0481283422459893, "percentage": 10.48, "elapsed_time": "1:48:45", "remaining_time": "15:28:55"} -{"current_steps": 197, "total_steps": 1870, "loss": 0.5962, "lr": 4.865689145655467e-06, "epoch": 1.053475935828877, "percentage": 10.53, "elapsed_time": "1:48:51", "remaining_time": "15:24:28"} -{"current_steps": 198, "total_steps": 1870, "loss": 0.5601, "lr": 4.864327694551612e-06, "epoch": 1.0588235294117647, "percentage": 10.59, "elapsed_time": "1:48:53", "remaining_time": "15:19:33"} -{"current_steps": 199, "total_steps": 1870, "loss": 0.953, "lr": 4.86295957040205e-06, "epoch": 1.0641711229946524, "percentage": 10.64, "elapsed_time": "1:48:55", "remaining_time": "15:14:37"} -{"current_steps": 200, "total_steps": 1870, "loss": 0.7394, "lr": 4.861584777068154e-06, "epoch": 1.0695187165775402, "percentage": 10.7, "elapsed_time": "1:49:01", "remaining_time": "15:10:24"} -{"current_steps": 201, "total_steps": 1870, "loss": 0.3851, "lr": 4.860203318430126e-06, "epoch": 1.0748663101604279, "percentage": 10.75, "elapsed_time": "1:49:03", "remaining_time": "15:05:30"} -{"current_steps": 202, "total_steps": 1870, "loss": 0.6595, "lr": 4.858815198386973e-06, "epoch": 1.0802139037433156, "percentage": 10.8, "elapsed_time": "1:49:05", "remaining_time": "15:00:46"} -{"current_steps": 203, "total_steps": 1870, "loss": 0.5308, "lr": 4.8574204208565056e-06, "epoch": 1.085561497326203, "percentage": 10.86, "elapsed_time": "1:49:07", "remaining_time": "14:56:03"} -{"current_steps": 204, "total_steps": 1870, "loss": 0.5401, "lr": 4.856018989775326e-06, "epoch": 1.0909090909090908, "percentage": 10.91, "elapsed_time": "1:49:10", "remaining_time": "14:51:38"} -{"current_steps": 205, "total_steps": 1870, "loss": 0.6365, "lr": 4.854610909098813e-06, "epoch": 1.0962566844919786, "percentage": 10.96, "elapsed_time": "1:49:12", "remaining_time": "14:47:02"} -{"current_steps": 206, "total_steps": 1870, "loss": 1.0949, "lr": 4.853196182801112e-06, "epoch": 1.1016042780748663, "percentage": 11.02, "elapsed_time": "1:49:14", "remaining_time": "14:42:28"} -{"current_steps": 207, "total_steps": 1870, "loss": 0.6629, "lr": 4.851774814875131e-06, "epoch": 1.106951871657754, "percentage": 11.07, "elapsed_time": "1:49:21", "remaining_time": "14:38:32"} -{"current_steps": 208, "total_steps": 1870, "loss": 0.7166, "lr": 4.850346809332515e-06, "epoch": 1.1122994652406417, "percentage": 11.12, "elapsed_time": "1:49:23", "remaining_time": "14:34:03"} -{"current_steps": 209, "total_steps": 1870, "loss": 0.7077, "lr": 4.8489121702036515e-06, "epoch": 1.1176470588235294, "percentage": 11.18, "elapsed_time": "1:49:28", "remaining_time": "14:30:02"} -{"current_steps": 210, "total_steps": 1870, "loss": 0.6319, "lr": 4.847470901537642e-06, "epoch": 1.1229946524064172, "percentage": 11.23, "elapsed_time": "1:49:34", "remaining_time": "14:26:13"} -{"current_steps": 211, "total_steps": 1870, "loss": 0.5327, "lr": 4.846023007402305e-06, "epoch": 1.1283422459893049, "percentage": 11.28, "elapsed_time": "1:49:36", "remaining_time": "14:21:49"} -{"current_steps": 212, "total_steps": 1870, "loss": 0.414, "lr": 4.844568491884156e-06, "epoch": 1.1336898395721926, "percentage": 11.34, "elapsed_time": "1:49:38", "remaining_time": "14:17:24"} -{"current_steps": 213, "total_steps": 1870, "loss": 0.5933, "lr": 4.843107359088402e-06, "epoch": 1.1390374331550803, "percentage": 11.39, "elapsed_time": "1:49:40", "remaining_time": "14:13:09"} -{"current_steps": 214, "total_steps": 1870, "loss": 0.6844, "lr": 4.84163961313892e-06, "epoch": 1.1443850267379678, "percentage": 11.44, "elapsed_time": "1:49:44", "remaining_time": "14:09:13"} -{"current_steps": 215, "total_steps": 1870, "loss": 0.5242, "lr": 4.840165258178259e-06, "epoch": 1.1497326203208555, "percentage": 11.5, "elapsed_time": "1:49:46", "remaining_time": "14:05:03"} -{"current_steps": 216, "total_steps": 1870, "loss": 0.747, "lr": 4.838684298367616e-06, "epoch": 1.1550802139037433, "percentage": 11.55, "elapsed_time": "1:49:48", "remaining_time": "14:00:50"} -{"current_steps": 217, "total_steps": 1870, "loss": 0.7602, "lr": 4.837196737886834e-06, "epoch": 1.160427807486631, "percentage": 11.6, "elapsed_time": "1:49:51", "remaining_time": "13:56:47"} -{"current_steps": 218, "total_steps": 1870, "loss": 0.7525, "lr": 4.83570258093438e-06, "epoch": 1.1657754010695187, "percentage": 11.66, "elapsed_time": "1:49:55", "remaining_time": "13:53:00"} -{"current_steps": 219, "total_steps": 1870, "loss": 0.5111, "lr": 4.834201831727343e-06, "epoch": 1.1711229946524064, "percentage": 11.71, "elapsed_time": "1:50:02", "remaining_time": "13:49:36"} -{"current_steps": 220, "total_steps": 1870, "loss": 0.6215, "lr": 4.832694494501417e-06, "epoch": 1.1764705882352942, "percentage": 11.76, "elapsed_time": "1:50:04", "remaining_time": "13:45:32"} -{"current_steps": 221, "total_steps": 1870, "loss": 0.3931, "lr": 4.83118057351089e-06, "epoch": 1.1818181818181819, "percentage": 11.82, "elapsed_time": "1:50:05", "remaining_time": "13:41:28"} -{"current_steps": 222, "total_steps": 1870, "loss": 0.6418, "lr": 4.829660073028631e-06, "epoch": 1.1871657754010696, "percentage": 11.87, "elapsed_time": "1:50:09", "remaining_time": "13:37:41"} -{"current_steps": 223, "total_steps": 1870, "loss": 0.5028, "lr": 4.82813299734608e-06, "epoch": 1.192513368983957, "percentage": 11.93, "elapsed_time": "1:50:11", "remaining_time": "13:33:52"} -{"current_steps": 224, "total_steps": 1870, "loss": 0.4452, "lr": 4.826599350773234e-06, "epoch": 1.1978609625668448, "percentage": 11.98, "elapsed_time": "1:50:14", "remaining_time": "13:30:01"} -{"current_steps": 225, "total_steps": 1870, "loss": 0.7803, "lr": 4.825059137638636e-06, "epoch": 1.2032085561497325, "percentage": 12.03, "elapsed_time": "1:50:17", "remaining_time": "13:26:24"} -{"current_steps": 226, "total_steps": 1870, "loss": 0.5968, "lr": 4.823512362289362e-06, "epoch": 1.2085561497326203, "percentage": 12.09, "elapsed_time": "1:50:20", "remaining_time": "13:22:36"} -{"current_steps": 227, "total_steps": 1870, "loss": 0.5724, "lr": 4.821959029091009e-06, "epoch": 1.213903743315508, "percentage": 12.14, "elapsed_time": "1:50:25", "remaining_time": "13:19:13"} -{"current_steps": 228, "total_steps": 1870, "loss": 0.5357, "lr": 4.820399142427684e-06, "epoch": 1.2192513368983957, "percentage": 12.19, "elapsed_time": "1:50:27", "remaining_time": "13:15:29"} -{"current_steps": 229, "total_steps": 1870, "loss": 0.5743, "lr": 4.818832706701989e-06, "epoch": 1.2245989304812834, "percentage": 12.25, "elapsed_time": "1:50:31", "remaining_time": "13:11:58"} -{"current_steps": 230, "total_steps": 1870, "loss": 0.8447, "lr": 4.817259726335009e-06, "epoch": 1.2299465240641712, "percentage": 12.3, "elapsed_time": "1:50:32", "remaining_time": "13:08:14"} -{"current_steps": 231, "total_steps": 1870, "loss": 0.8136, "lr": 4.815680205766304e-06, "epoch": 1.2352941176470589, "percentage": 12.35, "elapsed_time": "1:50:40", "remaining_time": "13:05:13"} -{"current_steps": 232, "total_steps": 1870, "loss": 0.4073, "lr": 4.814094149453891e-06, "epoch": 1.2406417112299466, "percentage": 12.41, "elapsed_time": "1:50:42", "remaining_time": "13:01:37"} -{"current_steps": 233, "total_steps": 1870, "loss": 0.6625, "lr": 4.812501561874232e-06, "epoch": 1.2459893048128343, "percentage": 12.46, "elapsed_time": "1:50:45", "remaining_time": "12:58:08"} -{"current_steps": 234, "total_steps": 1870, "loss": 0.6403, "lr": 4.8109024475222255e-06, "epoch": 1.251336898395722, "percentage": 12.51, "elapsed_time": "1:50:49", "remaining_time": "12:54:47"} -{"current_steps": 235, "total_steps": 1870, "loss": 0.8436, "lr": 4.809296810911188e-06, "epoch": 1.2566844919786098, "percentage": 12.57, "elapsed_time": "1:50:51", "remaining_time": "12:51:15"} -{"current_steps": 236, "total_steps": 1870, "loss": 0.8578, "lr": 4.8076846565728475e-06, "epoch": 1.2620320855614973, "percentage": 12.62, "elapsed_time": "1:50:57", "remaining_time": "12:48:16"} -{"current_steps": 237, "total_steps": 1870, "loss": 0.5431, "lr": 4.806065989057326e-06, "epoch": 1.267379679144385, "percentage": 12.67, "elapsed_time": "1:51:01", "remaining_time": "12:44:58"} -{"current_steps": 238, "total_steps": 1870, "loss": 0.4613, "lr": 4.8044408129331266e-06, "epoch": 1.2727272727272727, "percentage": 12.73, "elapsed_time": "1:51:03", "remaining_time": "12:41:32"} -{"current_steps": 239, "total_steps": 1870, "loss": 0.6743, "lr": 4.802809132787125e-06, "epoch": 1.2780748663101604, "percentage": 12.78, "elapsed_time": "1:51:05", "remaining_time": "12:38:08"} -{"current_steps": 240, "total_steps": 1870, "loss": 0.8116, "lr": 4.801170953224554e-06, "epoch": 1.2834224598930482, "percentage": 12.83, "elapsed_time": "1:51:07", "remaining_time": "12:34:42"} -{"current_steps": 241, "total_steps": 1870, "loss": 0.4008, "lr": 4.7995262788689865e-06, "epoch": 1.2887700534759359, "percentage": 12.89, "elapsed_time": "1:51:11", "remaining_time": "12:31:31"} -{"current_steps": 242, "total_steps": 1870, "loss": 0.5953, "lr": 4.797875114362331e-06, "epoch": 1.2941176470588236, "percentage": 12.94, "elapsed_time": "1:51:12", "remaining_time": "12:28:07"} -{"current_steps": 243, "total_steps": 1870, "loss": 0.8779, "lr": 4.796217464364808e-06, "epoch": 1.299465240641711, "percentage": 12.99, "elapsed_time": "1:51:15", "remaining_time": "12:24:55"} -{"current_steps": 244, "total_steps": 1870, "loss": 0.7568, "lr": 4.794553333554949e-06, "epoch": 1.3048128342245988, "percentage": 13.05, "elapsed_time": "1:51:18", "remaining_time": "12:21:47"} -{"current_steps": 245, "total_steps": 1870, "loss": 0.5016, "lr": 4.792882726629572e-06, "epoch": 1.3101604278074865, "percentage": 13.1, "elapsed_time": "1:51:22", "remaining_time": "12:18:45"} -{"current_steps": 246, "total_steps": 1870, "loss": 0.8415, "lr": 4.791205648303775e-06, "epoch": 1.3155080213903743, "percentage": 13.16, "elapsed_time": "1:51:27", "remaining_time": "12:15:45"} -{"current_steps": 247, "total_steps": 1870, "loss": 0.6032, "lr": 4.789522103310922e-06, "epoch": 1.320855614973262, "percentage": 13.21, "elapsed_time": "1:51:31", "remaining_time": "12:12:50"} -{"current_steps": 248, "total_steps": 1870, "loss": 0.6548, "lr": 4.787832096402626e-06, "epoch": 1.3262032085561497, "percentage": 13.26, "elapsed_time": "1:51:36", "remaining_time": "12:09:57"} -{"current_steps": 249, "total_steps": 1870, "loss": 0.6212, "lr": 4.786135632348738e-06, "epoch": 1.3315508021390374, "percentage": 13.32, "elapsed_time": "1:51:39", "remaining_time": "12:06:56"} -{"current_steps": 250, "total_steps": 1870, "loss": 0.8052, "lr": 4.7844327159373365e-06, "epoch": 1.3368983957219251, "percentage": 13.37, "elapsed_time": "1:51:43", "remaining_time": "12:03:55"} -{"current_steps": 251, "total_steps": 1870, "loss": 0.589, "lr": 4.782723351974708e-06, "epoch": 1.3422459893048129, "percentage": 13.42, "elapsed_time": "1:51:46", "remaining_time": "12:00:57"} -{"current_steps": 252, "total_steps": 1870, "loss": 0.63, "lr": 4.7810075452853385e-06, "epoch": 1.3475935828877006, "percentage": 13.48, "elapsed_time": "1:51:49", "remaining_time": "11:57:58"} -{"current_steps": 253, "total_steps": 1870, "loss": 0.6727, "lr": 4.779285300711897e-06, "epoch": 1.3529411764705883, "percentage": 13.53, "elapsed_time": "1:51:53", "remaining_time": "11:55:09"} -{"current_steps": 254, "total_steps": 1870, "loss": 0.4158, "lr": 4.7775566231152216e-06, "epoch": 1.358288770053476, "percentage": 13.58, "elapsed_time": "1:51:55", "remaining_time": "11:52:03"} -{"current_steps": 255, "total_steps": 1870, "loss": 0.8809, "lr": 4.775821517374308e-06, "epoch": 1.3636363636363638, "percentage": 13.64, "elapsed_time": "1:51:56", "remaining_time": "11:48:58"} -{"current_steps": 256, "total_steps": 1870, "loss": 0.5157, "lr": 4.7740799883862966e-06, "epoch": 1.3689839572192513, "percentage": 13.69, "elapsed_time": "1:51:58", "remaining_time": "11:45:55"} -{"current_steps": 257, "total_steps": 1870, "loss": 0.4467, "lr": 4.772332041066452e-06, "epoch": 1.374331550802139, "percentage": 13.74, "elapsed_time": "1:52:00", "remaining_time": "11:42:59"} -{"current_steps": 258, "total_steps": 1870, "loss": 0.9125, "lr": 4.770577680348159e-06, "epoch": 1.3796791443850267, "percentage": 13.8, "elapsed_time": "1:52:02", "remaining_time": "11:40:01"} -{"current_steps": 259, "total_steps": 1870, "loss": 0.4665, "lr": 4.768816911182899e-06, "epoch": 1.3850267379679144, "percentage": 13.85, "elapsed_time": "1:52:03", "remaining_time": "11:37:03"} -{"current_steps": 260, "total_steps": 1870, "loss": 0.5404, "lr": 4.767049738540244e-06, "epoch": 1.3903743315508021, "percentage": 13.9, "elapsed_time": "1:52:05", "remaining_time": "11:34:09"} -{"current_steps": 261, "total_steps": 1870, "loss": 0.4575, "lr": 4.765276167407836e-06, "epoch": 1.3957219251336899, "percentage": 13.96, "elapsed_time": "1:52:11", "remaining_time": "11:31:37"} -{"current_steps": 262, "total_steps": 1870, "loss": 0.8227, "lr": 4.7634962027913784e-06, "epoch": 1.4010695187165776, "percentage": 14.01, "elapsed_time": "1:52:18", "remaining_time": "11:29:16"} -{"current_steps": 263, "total_steps": 1870, "loss": 0.5813, "lr": 4.761709849714619e-06, "epoch": 1.4064171122994653, "percentage": 14.06, "elapsed_time": "1:52:20", "remaining_time": "11:26:23"} -{"current_steps": 264, "total_steps": 1870, "loss": 0.6333, "lr": 4.7599171132193355e-06, "epoch": 1.4117647058823528, "percentage": 14.12, "elapsed_time": "1:52:22", "remaining_time": "11:23:35"} -{"current_steps": 265, "total_steps": 1870, "loss": 0.6368, "lr": 4.7581179983653224e-06, "epoch": 1.4171122994652405, "percentage": 14.17, "elapsed_time": "1:52:26", "remaining_time": "11:21:01"} -{"current_steps": 266, "total_steps": 1870, "loss": 0.4146, "lr": 4.756312510230377e-06, "epoch": 1.4224598930481283, "percentage": 14.22, "elapsed_time": "1:52:27", "remaining_time": "11:18:09"} -{"current_steps": 267, "total_steps": 1870, "loss": 0.6066, "lr": 4.754500653910284e-06, "epoch": 1.427807486631016, "percentage": 14.28, "elapsed_time": "1:52:29", "remaining_time": "11:15:24"} -{"current_steps": 268, "total_steps": 1870, "loss": 0.6254, "lr": 4.752682434518801e-06, "epoch": 1.4331550802139037, "percentage": 14.33, "elapsed_time": "1:52:31", "remaining_time": "11:12:37"} -{"current_steps": 269, "total_steps": 1870, "loss": 0.4853, "lr": 4.750857857187645e-06, "epoch": 1.4385026737967914, "percentage": 14.39, "elapsed_time": "1:52:33", "remaining_time": "11:09:54"} -{"current_steps": 270, "total_steps": 1870, "loss": 0.7066, "lr": 4.749026927066479e-06, "epoch": 1.4438502673796791, "percentage": 14.44, "elapsed_time": "1:52:37", "remaining_time": "11:07:25"} -{"current_steps": 271, "total_steps": 1870, "loss": 0.5224, "lr": 4.747189649322894e-06, "epoch": 1.4491978609625669, "percentage": 14.49, "elapsed_time": "1:52:41", "remaining_time": "11:04:55"} -{"current_steps": 272, "total_steps": 1870, "loss": 0.7391, "lr": 4.745346029142397e-06, "epoch": 1.4545454545454546, "percentage": 14.55, "elapsed_time": "1:52:44", "remaining_time": "11:02:23"} -{"current_steps": 273, "total_steps": 1870, "loss": 0.6529, "lr": 4.743496071728396e-06, "epoch": 1.4598930481283423, "percentage": 14.6, "elapsed_time": "1:52:48", "remaining_time": "10:59:57"} -{"current_steps": 274, "total_steps": 1870, "loss": 0.453, "lr": 4.741639782302187e-06, "epoch": 1.46524064171123, "percentage": 14.65, "elapsed_time": "1:52:52", "remaining_time": "10:57:25"} -{"current_steps": 275, "total_steps": 1870, "loss": 0.5275, "lr": 4.739777166102933e-06, "epoch": 1.4705882352941178, "percentage": 14.71, "elapsed_time": "1:52:53", "remaining_time": "10:54:45"} -{"current_steps": 276, "total_steps": 1870, "loss": 0.5838, "lr": 4.737908228387656e-06, "epoch": 1.4759358288770055, "percentage": 14.76, "elapsed_time": "1:52:57", "remaining_time": "10:52:21"} -{"current_steps": 277, "total_steps": 1870, "loss": 0.5719, "lr": 4.736032974431222e-06, "epoch": 1.481283422459893, "percentage": 14.81, "elapsed_time": "1:53:04", "remaining_time": "10:50:18"} -{"current_steps": 278, "total_steps": 1870, "loss": 0.4318, "lr": 4.7341514095263214e-06, "epoch": 1.4866310160427807, "percentage": 14.87, "elapsed_time": "1:53:07", "remaining_time": "10:47:47"} -{"current_steps": 279, "total_steps": 1870, "loss": 0.6388, "lr": 4.732263538983456e-06, "epoch": 1.4919786096256684, "percentage": 14.92, "elapsed_time": "1:53:08", "remaining_time": "10:45:10"} -{"current_steps": 280, "total_steps": 1870, "loss": 0.6673, "lr": 4.730369368130925e-06, "epoch": 1.4973262032085561, "percentage": 14.97, "elapsed_time": "1:53:09", "remaining_time": "10:42:34"} -{"current_steps": 281, "total_steps": 1870, "loss": 1.2311, "lr": 4.728468902314811e-06, "epoch": 1.5026737967914439, "percentage": 15.03, "elapsed_time": "1:53:13", "remaining_time": "10:40:13"} -{"current_steps": 282, "total_steps": 1870, "loss": 0.6467, "lr": 4.726562146898963e-06, "epoch": 1.5080213903743316, "percentage": 15.08, "elapsed_time": "1:53:15", "remaining_time": "10:37:47"} -{"current_steps": 283, "total_steps": 1870, "loss": 0.6265, "lr": 4.72464910726498e-06, "epoch": 1.5133689839572193, "percentage": 15.13, "elapsed_time": "1:53:22", "remaining_time": "10:35:45"} -{"current_steps": 284, "total_steps": 1870, "loss": 0.8415, "lr": 4.7227297888121985e-06, "epoch": 1.5187165775401068, "percentage": 15.19, "elapsed_time": "1:53:25", "remaining_time": "10:33:25"} -{"current_steps": 285, "total_steps": 1870, "loss": 0.6441, "lr": 4.720804196957676e-06, "epoch": 1.5240641711229945, "percentage": 15.24, "elapsed_time": "1:53:27", "remaining_time": "10:30:57"} -{"current_steps": 286, "total_steps": 1870, "loss": 0.8297, "lr": 4.718872337136176e-06, "epoch": 1.5294117647058822, "percentage": 15.29, "elapsed_time": "1:53:28", "remaining_time": "10:28:29"} -{"current_steps": 287, "total_steps": 1870, "loss": 0.9988, "lr": 4.716934214800155e-06, "epoch": 1.53475935828877, "percentage": 15.35, "elapsed_time": "1:53:31", "remaining_time": "10:26:09"} -{"current_steps": 288, "total_steps": 1870, "loss": 0.5931, "lr": 4.714989835419741e-06, "epoch": 1.5401069518716577, "percentage": 15.4, "elapsed_time": "1:53:35", "remaining_time": "10:23:59"} -{"current_steps": 289, "total_steps": 1870, "loss": 0.5902, "lr": 4.713039204482723e-06, "epoch": 1.5454545454545454, "percentage": 15.45, "elapsed_time": "1:53:37", "remaining_time": "10:21:34"} -{"current_steps": 290, "total_steps": 1870, "loss": 0.7356, "lr": 4.711082327494536e-06, "epoch": 1.5508021390374331, "percentage": 15.51, "elapsed_time": "1:53:38", "remaining_time": "10:19:08"} -{"current_steps": 291, "total_steps": 1870, "loss": 0.529, "lr": 4.709119209978242e-06, "epoch": 1.5561497326203209, "percentage": 15.56, "elapsed_time": "1:53:40", "remaining_time": "10:16:46"} -{"current_steps": 292, "total_steps": 1870, "loss": 0.4536, "lr": 4.707149857474516e-06, "epoch": 1.5614973262032086, "percentage": 15.61, "elapsed_time": "1:53:41", "remaining_time": "10:14:25"} -{"current_steps": 293, "total_steps": 1870, "loss": 0.5565, "lr": 4.705174275541632e-06, "epoch": 1.5668449197860963, "percentage": 15.67, "elapsed_time": "1:53:45", "remaining_time": "10:12:15"} -{"current_steps": 294, "total_steps": 1870, "loss": 0.728, "lr": 4.703192469755444e-06, "epoch": 1.572192513368984, "percentage": 15.72, "elapsed_time": "1:53:50", "remaining_time": "10:10:12"} -{"current_steps": 295, "total_steps": 1870, "loss": 0.6269, "lr": 4.701204445709375e-06, "epoch": 1.5775401069518717, "percentage": 15.78, "elapsed_time": "1:53:56", "remaining_time": "10:08:18"} -{"current_steps": 296, "total_steps": 1870, "loss": 0.658, "lr": 4.699210209014394e-06, "epoch": 1.5828877005347595, "percentage": 15.83, "elapsed_time": "1:53:58", "remaining_time": "10:06:05"} -{"current_steps": 297, "total_steps": 1870, "loss": 0.5184, "lr": 4.69720976529901e-06, "epoch": 1.5882352941176472, "percentage": 15.88, "elapsed_time": "1:54:02", "remaining_time": "10:03:57"} -{"current_steps": 298, "total_steps": 1870, "loss": 0.5321, "lr": 4.695203120209245e-06, "epoch": 1.593582887700535, "percentage": 15.94, "elapsed_time": "1:54:03", "remaining_time": "10:01:40"} -{"current_steps": 299, "total_steps": 1870, "loss": 0.4647, "lr": 4.693190279408628e-06, "epoch": 1.5989304812834224, "percentage": 15.99, "elapsed_time": "1:54:07", "remaining_time": "9:59:39"} -{"current_steps": 300, "total_steps": 1870, "loss": 0.4889, "lr": 4.691171248578172e-06, "epoch": 1.6042780748663101, "percentage": 16.04, "elapsed_time": "1:54:10", "remaining_time": "9:57:31"} -{"current_steps": 301, "total_steps": 1870, "loss": 0.6621, "lr": 4.689146033416362e-06, "epoch": 1.6096256684491979, "percentage": 16.1, "elapsed_time": "1:54:13", "remaining_time": "9:55:24"} -{"current_steps": 302, "total_steps": 1870, "loss": 0.4009, "lr": 4.687114639639136e-06, "epoch": 1.6149732620320856, "percentage": 16.15, "elapsed_time": "1:54:16", "remaining_time": "9:53:21"} -{"current_steps": 303, "total_steps": 1870, "loss": 0.5065, "lr": 4.685077072979874e-06, "epoch": 1.6203208556149733, "percentage": 16.2, "elapsed_time": "1:54:18", "remaining_time": "9:51:09"} -{"current_steps": 304, "total_steps": 1870, "loss": 0.5289, "lr": 4.683033339189375e-06, "epoch": 1.6256684491978608, "percentage": 16.26, "elapsed_time": "1:54:20", "remaining_time": "9:49:01"} -{"current_steps": 305, "total_steps": 1870, "loss": 0.7078, "lr": 4.680983444035843e-06, "epoch": 1.6310160427807485, "percentage": 16.31, "elapsed_time": "1:54:26", "remaining_time": "9:47:11"} -{"current_steps": 306, "total_steps": 1870, "loss": 0.4003, "lr": 4.678927393304877e-06, "epoch": 1.6363636363636362, "percentage": 16.36, "elapsed_time": "1:54:27", "remaining_time": "9:44:58"} -{"current_steps": 307, "total_steps": 1870, "loss": 0.4802, "lr": 4.676865192799443e-06, "epoch": 1.641711229946524, "percentage": 16.42, "elapsed_time": "1:54:29", "remaining_time": "9:42:53"} -{"current_steps": 308, "total_steps": 1870, "loss": 0.8128, "lr": 4.6747968483398695e-06, "epoch": 1.6470588235294117, "percentage": 16.47, "elapsed_time": "1:54:30", "remaining_time": "9:40:44"} -{"current_steps": 309, "total_steps": 1870, "loss": 0.4085, "lr": 4.672722365763821e-06, "epoch": 1.6524064171122994, "percentage": 16.52, "elapsed_time": "1:54:32", "remaining_time": "9:38:40"} -{"current_steps": 310, "total_steps": 1870, "loss": 0.5707, "lr": 4.6706417509262905e-06, "epoch": 1.6577540106951871, "percentage": 16.58, "elapsed_time": "1:54:34", "remaining_time": "9:36:34"} -{"current_steps": 311, "total_steps": 1870, "loss": 0.481, "lr": 4.668555009699575e-06, "epoch": 1.6631016042780749, "percentage": 16.63, "elapsed_time": "1:54:37", "remaining_time": "9:34:36"} -{"current_steps": 312, "total_steps": 1870, "loss": 0.6021, "lr": 4.666462147973264e-06, "epoch": 1.6684491978609626, "percentage": 16.68, "elapsed_time": "1:54:39", "remaining_time": "9:32:35"} -{"current_steps": 313, "total_steps": 1870, "loss": 0.7208, "lr": 4.664363171654223e-06, "epoch": 1.6737967914438503, "percentage": 16.74, "elapsed_time": "1:54:42", "remaining_time": "9:30:38"} -{"current_steps": 314, "total_steps": 1870, "loss": 0.9136, "lr": 4.662258086666571e-06, "epoch": 1.679144385026738, "percentage": 16.79, "elapsed_time": "1:54:45", "remaining_time": "9:28:40"} -{"current_steps": 315, "total_steps": 1870, "loss": 0.7375, "lr": 4.660146898951674e-06, "epoch": 1.6844919786096257, "percentage": 16.84, "elapsed_time": "1:54:49", "remaining_time": "9:26:52"} -{"current_steps": 316, "total_steps": 1870, "loss": 0.6786, "lr": 4.6580296144681155e-06, "epoch": 1.6898395721925135, "percentage": 16.9, "elapsed_time": "1:54:55", "remaining_time": "9:25:10"} -{"current_steps": 317, "total_steps": 1870, "loss": 0.789, "lr": 4.655906239191693e-06, "epoch": 1.6951871657754012, "percentage": 16.95, "elapsed_time": "1:54:57", "remaining_time": "9:23:10"} -{"current_steps": 318, "total_steps": 1870, "loss": 0.7104, "lr": 4.653776779115389e-06, "epoch": 1.700534759358289, "percentage": 17.01, "elapsed_time": "1:55:00", "remaining_time": "9:21:16"} -{"current_steps": 319, "total_steps": 1870, "loss": 0.5165, "lr": 4.651641240249364e-06, "epoch": 1.7058823529411766, "percentage": 17.06, "elapsed_time": "1:55:06", "remaining_time": "9:19:39"} -{"current_steps": 320, "total_steps": 1870, "loss": 0.4081, "lr": 4.649499628620931e-06, "epoch": 1.7112299465240641, "percentage": 17.11, "elapsed_time": "1:55:08", "remaining_time": "9:17:43"} -{"current_steps": 321, "total_steps": 1870, "loss": 0.6536, "lr": 4.647351950274548e-06, "epoch": 1.7165775401069518, "percentage": 17.17, "elapsed_time": "1:55:11", "remaining_time": "9:15:52"} -{"current_steps": 322, "total_steps": 1870, "loss": 0.6597, "lr": 4.6451982112717896e-06, "epoch": 1.7219251336898396, "percentage": 17.22, "elapsed_time": "1:55:13", "remaining_time": "9:13:57"} -{"current_steps": 323, "total_steps": 1870, "loss": 0.7608, "lr": 4.643038417691341e-06, "epoch": 1.7272727272727273, "percentage": 17.27, "elapsed_time": "1:55:18", "remaining_time": "9:12:15"} -{"current_steps": 324, "total_steps": 1870, "loss": 0.4597, "lr": 4.640872575628973e-06, "epoch": 1.732620320855615, "percentage": 17.33, "elapsed_time": "1:55:22", "remaining_time": "9:10:33"} -{"current_steps": 325, "total_steps": 1870, "loss": 0.7241, "lr": 4.6387006911975275e-06, "epoch": 1.7379679144385025, "percentage": 17.38, "elapsed_time": "1:55:27", "remaining_time": "9:08:51"} -{"current_steps": 326, "total_steps": 1870, "loss": 0.7654, "lr": 4.6365227705269026e-06, "epoch": 1.7433155080213902, "percentage": 17.43, "elapsed_time": "1:55:29", "remaining_time": "9:06:57"} -{"current_steps": 327, "total_steps": 1870, "loss": 0.6391, "lr": 4.634338819764029e-06, "epoch": 1.748663101604278, "percentage": 17.49, "elapsed_time": "1:55:34", "remaining_time": "9:05:23"} -{"current_steps": 328, "total_steps": 1870, "loss": 0.5501, "lr": 4.632148845072861e-06, "epoch": 1.7540106951871657, "percentage": 17.54, "elapsed_time": "1:55:35", "remaining_time": "9:03:26"} -{"current_steps": 329, "total_steps": 1870, "loss": 0.6117, "lr": 4.6299528526343525e-06, "epoch": 1.7593582887700534, "percentage": 17.59, "elapsed_time": "1:55:39", "remaining_time": "9:01:42"} -{"current_steps": 330, "total_steps": 1870, "loss": 0.8534, "lr": 4.627750848646443e-06, "epoch": 1.7647058823529411, "percentage": 17.65, "elapsed_time": "1:55:41", "remaining_time": "8:59:53"} -{"current_steps": 331, "total_steps": 1870, "loss": 0.6352, "lr": 4.625542839324036e-06, "epoch": 1.7700534759358288, "percentage": 17.7, "elapsed_time": "1:55:47", "remaining_time": "8:58:21"} -{"current_steps": 332, "total_steps": 1870, "loss": 0.4188, "lr": 4.6233288308989874e-06, "epoch": 1.7754010695187166, "percentage": 17.75, "elapsed_time": "1:55:53", "remaining_time": "8:56:50"} -{"current_steps": 333, "total_steps": 1870, "loss": 0.4464, "lr": 4.6211088296200834e-06, "epoch": 1.7807486631016043, "percentage": 17.81, "elapsed_time": "1:55:54", "remaining_time": "8:55:01"} -{"current_steps": 334, "total_steps": 1870, "loss": 0.6833, "lr": 4.618882841753026e-06, "epoch": 1.786096256684492, "percentage": 17.86, "elapsed_time": "1:55:57", "remaining_time": "8:53:14"} -{"current_steps": 335, "total_steps": 1870, "loss": 0.6356, "lr": 4.616650873580411e-06, "epoch": 1.7914438502673797, "percentage": 17.91, "elapsed_time": "1:56:00", "remaining_time": "8:51:31"} -{"current_steps": 336, "total_steps": 1870, "loss": 0.5413, "lr": 4.614412931401715e-06, "epoch": 1.7967914438502675, "percentage": 17.97, "elapsed_time": "1:56:03", "remaining_time": "8:49:51"} -{"current_steps": 337, "total_steps": 1870, "loss": 0.5275, "lr": 4.612169021533276e-06, "epoch": 1.8021390374331552, "percentage": 18.02, "elapsed_time": "1:56:06", "remaining_time": "8:48:12"} -{"current_steps": 338, "total_steps": 1870, "loss": 0.6292, "lr": 4.609919150308273e-06, "epoch": 1.807486631016043, "percentage": 18.07, "elapsed_time": "1:56:09", "remaining_time": "8:46:28"} -{"current_steps": 339, "total_steps": 1870, "loss": 0.5315, "lr": 4.607663324076711e-06, "epoch": 1.8128342245989306, "percentage": 18.13, "elapsed_time": "1:56:10", "remaining_time": "8:44:38"} -{"current_steps": 340, "total_steps": 1870, "loss": 0.7492, "lr": 4.605401549205404e-06, "epoch": 1.8181818181818183, "percentage": 18.18, "elapsed_time": "1:56:12", "remaining_time": "8:42:55"} -{"current_steps": 341, "total_steps": 1870, "loss": 0.6453, "lr": 4.603133832077953e-06, "epoch": 1.8235294117647058, "percentage": 18.24, "elapsed_time": "1:56:13", "remaining_time": "8:41:08"} -{"current_steps": 342, "total_steps": 1870, "loss": 0.6502, "lr": 4.600860179094732e-06, "epoch": 1.8288770053475936, "percentage": 18.29, "elapsed_time": "1:56:17", "remaining_time": "8:39:36"} -{"current_steps": 343, "total_steps": 1870, "loss": 0.6807, "lr": 4.5985805966728675e-06, "epoch": 1.8342245989304813, "percentage": 18.34, "elapsed_time": "1:56:19", "remaining_time": "8:37:49"} -{"current_steps": 344, "total_steps": 1870, "loss": 0.5235, "lr": 4.596295091246221e-06, "epoch": 1.839572192513369, "percentage": 18.4, "elapsed_time": "1:56:20", "remaining_time": "8:36:05"} -{"current_steps": 345, "total_steps": 1870, "loss": 0.5847, "lr": 4.594003669265371e-06, "epoch": 1.8449197860962567, "percentage": 18.45, "elapsed_time": "1:56:22", "remaining_time": "8:34:26"} -{"current_steps": 346, "total_steps": 1870, "loss": 0.6266, "lr": 4.591706337197597e-06, "epoch": 1.8502673796791442, "percentage": 18.5, "elapsed_time": "1:56:29", "remaining_time": "8:33:07"} -{"current_steps": 347, "total_steps": 1870, "loss": 0.5021, "lr": 4.589403101526854e-06, "epoch": 1.855614973262032, "percentage": 18.56, "elapsed_time": "1:56:32", "remaining_time": "8:31:30"} -{"current_steps": 348, "total_steps": 1870, "loss": 0.6426, "lr": 4.587093968753765e-06, "epoch": 1.8609625668449197, "percentage": 18.61, "elapsed_time": "1:56:34", "remaining_time": "8:29:53"} -{"current_steps": 349, "total_steps": 1870, "loss": 0.41, "lr": 4.584778945395594e-06, "epoch": 1.8663101604278074, "percentage": 18.66, "elapsed_time": "1:56:39", "remaining_time": "8:28:25"} -{"current_steps": 350, "total_steps": 1870, "loss": 0.6775, "lr": 4.582458037986231e-06, "epoch": 1.8716577540106951, "percentage": 18.72, "elapsed_time": "1:56:40", "remaining_time": "8:26:44"} -{"current_steps": 351, "total_steps": 1870, "loss": 0.9407, "lr": 4.580131253076171e-06, "epoch": 1.8770053475935828, "percentage": 18.77, "elapsed_time": "1:56:44", "remaining_time": "8:25:10"} -{"current_steps": 352, "total_steps": 1870, "loss": 0.6412, "lr": 4.5777985972325016e-06, "epoch": 1.8823529411764706, "percentage": 18.82, "elapsed_time": "1:56:47", "remaining_time": "8:23:38"} -{"current_steps": 353, "total_steps": 1870, "loss": 0.4353, "lr": 4.575460077038877e-06, "epoch": 1.8877005347593583, "percentage": 18.88, "elapsed_time": "1:56:49", "remaining_time": "8:22:02"} -{"current_steps": 354, "total_steps": 1870, "loss": 0.934, "lr": 4.573115699095505e-06, "epoch": 1.893048128342246, "percentage": 18.93, "elapsed_time": "1:56:52", "remaining_time": "8:20:32"} -{"current_steps": 355, "total_steps": 1870, "loss": 0.472, "lr": 4.570765470019125e-06, "epoch": 1.8983957219251337, "percentage": 18.98, "elapsed_time": "1:56:54", "remaining_time": "8:18:55"} -{"current_steps": 356, "total_steps": 1870, "loss": 0.6079, "lr": 4.5684093964429906e-06, "epoch": 1.9037433155080214, "percentage": 19.04, "elapsed_time": "1:56:58", "remaining_time": "8:17:26"} -{"current_steps": 357, "total_steps": 1870, "loss": 0.4644, "lr": 4.566047485016853e-06, "epoch": 1.9090909090909092, "percentage": 19.09, "elapsed_time": "1:57:01", "remaining_time": "8:15:59"} -{"current_steps": 358, "total_steps": 1870, "loss": 0.721, "lr": 4.563679742406935e-06, "epoch": 1.914438502673797, "percentage": 19.14, "elapsed_time": "1:57:03", "remaining_time": "8:14:22"} -{"current_steps": 359, "total_steps": 1870, "loss": 1.0296, "lr": 4.5613061752959236e-06, "epoch": 1.9197860962566846, "percentage": 19.2, "elapsed_time": "1:57:06", "remaining_time": "8:12:53"} -{"current_steps": 360, "total_steps": 1870, "loss": 0.892, "lr": 4.558926790382941e-06, "epoch": 1.9251336898395723, "percentage": 19.25, "elapsed_time": "1:57:09", "remaining_time": "8:11:24"} -{"current_steps": 361, "total_steps": 1870, "loss": 0.6153, "lr": 4.556541594383528e-06, "epoch": 1.93048128342246, "percentage": 19.3, "elapsed_time": "1:57:12", "remaining_time": "8:09:56"} -{"current_steps": 362, "total_steps": 1870, "loss": 0.3246, "lr": 4.554150594029631e-06, "epoch": 1.9358288770053476, "percentage": 19.36, "elapsed_time": "1:57:14", "remaining_time": "8:08:23"} -{"current_steps": 363, "total_steps": 1870, "loss": 0.5986, "lr": 4.551753796069577e-06, "epoch": 1.9411764705882353, "percentage": 19.41, "elapsed_time": "1:57:17", "remaining_time": "8:06:57"} -{"current_steps": 364, "total_steps": 1870, "loss": 0.5642, "lr": 4.5493512072680535e-06, "epoch": 1.946524064171123, "percentage": 19.47, "elapsed_time": "1:57:22", "remaining_time": "8:05:37"} -{"current_steps": 365, "total_steps": 1870, "loss": 0.7661, "lr": 4.546942834406094e-06, "epoch": 1.9518716577540107, "percentage": 19.52, "elapsed_time": "1:57:25", "remaining_time": "8:04:10"} -{"current_steps": 366, "total_steps": 1870, "loss": 0.4739, "lr": 4.544528684281056e-06, "epoch": 1.9572192513368984, "percentage": 19.57, "elapsed_time": "1:57:26", "remaining_time": "8:02:36"} -{"current_steps": 367, "total_steps": 1870, "loss": 0.4551, "lr": 4.5421087637066065e-06, "epoch": 1.962566844919786, "percentage": 19.63, "elapsed_time": "1:57:30", "remaining_time": "8:01:13"} -{"current_steps": 368, "total_steps": 1870, "loss": 0.7336, "lr": 4.539683079512692e-06, "epoch": 1.9679144385026737, "percentage": 19.68, "elapsed_time": "1:57:33", "remaining_time": "7:59:50"} -{"current_steps": 369, "total_steps": 1870, "loss": 0.5833, "lr": 4.537251638545532e-06, "epoch": 1.9732620320855614, "percentage": 19.73, "elapsed_time": "1:57:36", "remaining_time": "7:58:24"} -{"current_steps": 370, "total_steps": 1870, "loss": 0.3305, "lr": 4.534814447667591e-06, "epoch": 1.9786096256684491, "percentage": 19.79, "elapsed_time": "1:57:38", "remaining_time": "7:56:55"} -{"current_steps": 371, "total_steps": 1870, "loss": 0.4912, "lr": 4.532371513757564e-06, "epoch": 1.9839572192513368, "percentage": 19.84, "elapsed_time": "1:57:42", "remaining_time": "7:55:35"} -{"current_steps": 372, "total_steps": 1870, "loss": 0.611, "lr": 4.529922843710354e-06, "epoch": 1.9893048128342246, "percentage": 19.89, "elapsed_time": "1:57:46", "remaining_time": "7:54:14"} -{"current_steps": 373, "total_steps": 1870, "loss": 0.6487, "lr": 4.52746844443705e-06, "epoch": 1.9946524064171123, "percentage": 19.95, "elapsed_time": "1:57:52", "remaining_time": "7:53:04"} -{"current_steps": 374, "total_steps": 1870, "loss": 0.607, "lr": 4.525008322864917e-06, "epoch": 2.0, "percentage": 20.0, "elapsed_time": "1:57:55", "remaining_time": "7:51:42"} -{"current_steps": 375, "total_steps": 1870, "loss": 0.4111, "lr": 4.522542485937369e-06, "epoch": 2.0053475935828877, "percentage": 20.05, "elapsed_time": "2:52:28", "remaining_time": "11:27:37"} -{"current_steps": 376, "total_steps": 1870, "loss": 0.3418, "lr": 4.520070940613948e-06, "epoch": 2.0106951871657754, "percentage": 20.11, "elapsed_time": "2:52:32", "remaining_time": "11:25:36"} -{"current_steps": 377, "total_steps": 1870, "loss": 0.3853, "lr": 4.51759369387031e-06, "epoch": 2.016042780748663, "percentage": 20.16, "elapsed_time": "2:52:36", "remaining_time": "11:23:33"} -{"current_steps": 378, "total_steps": 1870, "loss": 0.4586, "lr": 4.515110752698203e-06, "epoch": 2.021390374331551, "percentage": 20.21, "elapsed_time": "2:52:39", "remaining_time": "11:21:30"} -{"current_steps": 379, "total_steps": 1870, "loss": 0.5457, "lr": 4.512622124105444e-06, "epoch": 2.0267379679144386, "percentage": 20.27, "elapsed_time": "2:52:42", "remaining_time": "11:19:28"} -{"current_steps": 380, "total_steps": 1870, "loss": 0.5248, "lr": 4.510127815115904e-06, "epoch": 2.0320855614973263, "percentage": 20.32, "elapsed_time": "2:52:46", "remaining_time": "11:17:28"} -{"current_steps": 381, "total_steps": 1870, "loss": 0.3257, "lr": 4.507627832769486e-06, "epoch": 2.037433155080214, "percentage": 20.37, "elapsed_time": "2:52:49", "remaining_time": "11:15:23"} -{"current_steps": 382, "total_steps": 1870, "loss": 0.6607, "lr": 4.505122184122107e-06, "epoch": 2.0427807486631018, "percentage": 20.43, "elapsed_time": "2:52:51", "remaining_time": "11:13:18"} -{"current_steps": 383, "total_steps": 1870, "loss": 0.285, "lr": 4.502610876245674e-06, "epoch": 2.0481283422459895, "percentage": 20.48, "elapsed_time": "2:52:55", "remaining_time": "11:11:21"} -{"current_steps": 384, "total_steps": 1870, "loss": 0.2656, "lr": 4.500093916228068e-06, "epoch": 2.053475935828877, "percentage": 20.53, "elapsed_time": "2:52:57", "remaining_time": "11:09:18"} -{"current_steps": 385, "total_steps": 1870, "loss": 0.7756, "lr": 4.4975713111731206e-06, "epoch": 2.0588235294117645, "percentage": 20.59, "elapsed_time": "2:52:59", "remaining_time": "11:07:13"} -{"current_steps": 386, "total_steps": 1870, "loss": 0.2841, "lr": 4.4950430682005995e-06, "epoch": 2.064171122994652, "percentage": 20.64, "elapsed_time": "2:53:02", "remaining_time": "11:05:17"} -{"current_steps": 387, "total_steps": 1870, "loss": 0.375, "lr": 4.49250919444618e-06, "epoch": 2.06951871657754, "percentage": 20.7, "elapsed_time": "2:53:06", "remaining_time": "11:03:21"} -{"current_steps": 388, "total_steps": 1870, "loss": 0.4506, "lr": 4.489969697061436e-06, "epoch": 2.0748663101604277, "percentage": 20.75, "elapsed_time": "2:53:08", "remaining_time": "11:01:21"} -{"current_steps": 389, "total_steps": 1870, "loss": 0.3308, "lr": 4.487424583213807e-06, "epoch": 2.0802139037433154, "percentage": 20.8, "elapsed_time": "2:53:11", "remaining_time": "10:59:21"} -{"current_steps": 390, "total_steps": 1870, "loss": 0.3271, "lr": 4.484873860086586e-06, "epoch": 2.085561497326203, "percentage": 20.86, "elapsed_time": "2:53:14", "remaining_time": "10:57:26"} -{"current_steps": 391, "total_steps": 1870, "loss": 0.22, "lr": 4.482317534878901e-06, "epoch": 2.090909090909091, "percentage": 20.91, "elapsed_time": "2:53:16", "remaining_time": "10:55:24"} -{"current_steps": 392, "total_steps": 1870, "loss": 0.4323, "lr": 4.4797556148056884e-06, "epoch": 2.0962566844919786, "percentage": 20.96, "elapsed_time": "2:53:20", "remaining_time": "10:53:33"} -{"current_steps": 393, "total_steps": 1870, "loss": 0.5702, "lr": 4.477188107097675e-06, "epoch": 2.1016042780748663, "percentage": 21.02, "elapsed_time": "2:53:22", "remaining_time": "10:51:35"} -{"current_steps": 394, "total_steps": 1870, "loss": 0.2821, "lr": 4.474615019001359e-06, "epoch": 2.106951871657754, "percentage": 21.07, "elapsed_time": "2:53:27", "remaining_time": "10:49:47"} -{"current_steps": 395, "total_steps": 1870, "loss": 0.6381, "lr": 4.47203635777899e-06, "epoch": 2.1122994652406417, "percentage": 21.12, "elapsed_time": "2:53:30", "remaining_time": "10:47:54"} -{"current_steps": 396, "total_steps": 1870, "loss": 0.5999, "lr": 4.469452130708544e-06, "epoch": 2.1176470588235294, "percentage": 21.18, "elapsed_time": "2:53:32", "remaining_time": "10:45:57"} -{"current_steps": 397, "total_steps": 1870, "loss": 0.2416, "lr": 4.4668623450837085e-06, "epoch": 2.122994652406417, "percentage": 21.23, "elapsed_time": "2:53:33", "remaining_time": "10:43:59"} -{"current_steps": 398, "total_steps": 1870, "loss": 0.6149, "lr": 4.464267008213858e-06, "epoch": 2.128342245989305, "percentage": 21.28, "elapsed_time": "2:53:36", "remaining_time": "10:42:05"} -{"current_steps": 399, "total_steps": 1870, "loss": 0.4445, "lr": 4.461666127424036e-06, "epoch": 2.1336898395721926, "percentage": 21.34, "elapsed_time": "2:53:38", "remaining_time": "10:40:09"} -{"current_steps": 400, "total_steps": 1870, "loss": 0.4432, "lr": 4.459059710054933e-06, "epoch": 2.1390374331550803, "percentage": 21.39, "elapsed_time": "2:53:42", "remaining_time": "10:38:22"} -{"current_steps": 401, "total_steps": 1870, "loss": 0.6437, "lr": 4.456447763462863e-06, "epoch": 2.144385026737968, "percentage": 21.44, "elapsed_time": "2:53:44", "remaining_time": "10:36:27"} -{"current_steps": 402, "total_steps": 1870, "loss": 0.3432, "lr": 4.453830295019749e-06, "epoch": 2.1497326203208558, "percentage": 21.5, "elapsed_time": "2:53:46", "remaining_time": "10:34:34"} -{"current_steps": 403, "total_steps": 1870, "loss": 0.5478, "lr": 4.4512073121130985e-06, "epoch": 2.1550802139037435, "percentage": 21.55, "elapsed_time": "2:53:53", "remaining_time": "10:33:00"} -{"current_steps": 404, "total_steps": 1870, "loss": 0.2706, "lr": 4.448578822145982e-06, "epoch": 2.160427807486631, "percentage": 21.6, "elapsed_time": "2:53:54", "remaining_time": "10:31:04"} -{"current_steps": 405, "total_steps": 1870, "loss": 0.2337, "lr": 4.445944832537011e-06, "epoch": 2.165775401069519, "percentage": 21.66, "elapsed_time": "2:53:56", "remaining_time": "10:29:12"} -{"current_steps": 406, "total_steps": 1870, "loss": 0.551, "lr": 4.443305350720324e-06, "epoch": 2.171122994652406, "percentage": 21.71, "elapsed_time": "2:53:58", "remaining_time": "10:27:19"} -{"current_steps": 407, "total_steps": 1870, "loss": 0.4896, "lr": 4.440660384145557e-06, "epoch": 2.176470588235294, "percentage": 21.76, "elapsed_time": "2:54:01", "remaining_time": "10:25:31"} -{"current_steps": 408, "total_steps": 1870, "loss": 0.4094, "lr": 4.438009940277825e-06, "epoch": 2.1818181818181817, "percentage": 21.82, "elapsed_time": "2:54:06", "remaining_time": "10:23:51"} -{"current_steps": 409, "total_steps": 1870, "loss": 0.2985, "lr": 4.435354026597707e-06, "epoch": 2.1871657754010694, "percentage": 21.87, "elapsed_time": "2:54:07", "remaining_time": "10:21:59"} -{"current_steps": 410, "total_steps": 1870, "loss": 0.6087, "lr": 4.432692650601215e-06, "epoch": 2.192513368983957, "percentage": 21.93, "elapsed_time": "2:54:09", "remaining_time": "10:20:10"} -{"current_steps": 411, "total_steps": 1870, "loss": 0.2324, "lr": 4.43002581979978e-06, "epoch": 2.197860962566845, "percentage": 21.98, "elapsed_time": "2:54:11", "remaining_time": "10:18:19"} -{"current_steps": 412, "total_steps": 1870, "loss": 0.2497, "lr": 4.42735354172023e-06, "epoch": 2.2032085561497325, "percentage": 22.03, "elapsed_time": "2:54:12", "remaining_time": "10:16:30"} -{"current_steps": 413, "total_steps": 1870, "loss": 0.5508, "lr": 4.4246758239047636e-06, "epoch": 2.2085561497326203, "percentage": 22.09, "elapsed_time": "2:54:16", "remaining_time": "10:14:49"} -{"current_steps": 414, "total_steps": 1870, "loss": 0.248, "lr": 4.421992673910934e-06, "epoch": 2.213903743315508, "percentage": 22.14, "elapsed_time": "2:54:17", "remaining_time": "10:12:59"} -{"current_steps": 415, "total_steps": 1870, "loss": 0.3047, "lr": 4.4193040993116284e-06, "epoch": 2.2192513368983957, "percentage": 22.19, "elapsed_time": "2:54:20", "remaining_time": "10:11:13"} -{"current_steps": 416, "total_steps": 1870, "loss": 0.2304, "lr": 4.416610107695043e-06, "epoch": 2.2245989304812834, "percentage": 22.25, "elapsed_time": "2:54:23", "remaining_time": "10:09:33"} -{"current_steps": 417, "total_steps": 1870, "loss": 0.4514, "lr": 4.413910706664659e-06, "epoch": 2.229946524064171, "percentage": 22.3, "elapsed_time": "2:54:29", "remaining_time": "10:08:01"} -{"current_steps": 418, "total_steps": 1870, "loss": 0.626, "lr": 4.411205903839232e-06, "epoch": 2.235294117647059, "percentage": 22.35, "elapsed_time": "2:54:31", "remaining_time": "10:06:15"} -{"current_steps": 419, "total_steps": 1870, "loss": 0.4973, "lr": 4.408495706852758e-06, "epoch": 2.2406417112299466, "percentage": 22.41, "elapsed_time": "2:54:34", "remaining_time": "10:04:33"} -{"current_steps": 420, "total_steps": 1870, "loss": 0.6283, "lr": 4.40578012335446e-06, "epoch": 2.2459893048128343, "percentage": 22.46, "elapsed_time": "2:54:36", "remaining_time": "10:02:48"} -{"current_steps": 421, "total_steps": 1870, "loss": 0.4237, "lr": 4.403059161008762e-06, "epoch": 2.251336898395722, "percentage": 22.51, "elapsed_time": "2:54:39", "remaining_time": "10:01:07"} -{"current_steps": 422, "total_steps": 1870, "loss": 0.4727, "lr": 4.4003328274952735e-06, "epoch": 2.2566844919786098, "percentage": 22.57, "elapsed_time": "2:54:40", "remaining_time": "9:59:21"} -{"current_steps": 423, "total_steps": 1870, "loss": 0.488, "lr": 4.397601130508757e-06, "epoch": 2.2620320855614975, "percentage": 22.62, "elapsed_time": "2:54:43", "remaining_time": "9:57:41"} -{"current_steps": 424, "total_steps": 1870, "loss": 0.4013, "lr": 4.394864077759119e-06, "epoch": 2.267379679144385, "percentage": 22.67, "elapsed_time": "2:54:45", "remaining_time": "9:55:58"} -{"current_steps": 425, "total_steps": 1870, "loss": 0.338, "lr": 4.392121676971377e-06, "epoch": 2.2727272727272725, "percentage": 22.73, "elapsed_time": "2:54:48", "remaining_time": "9:54:20"} -{"current_steps": 426, "total_steps": 1870, "loss": 0.647, "lr": 4.3893739358856465e-06, "epoch": 2.2780748663101607, "percentage": 22.78, "elapsed_time": "2:54:50", "remaining_time": "9:52:40"} -{"current_steps": 427, "total_steps": 1870, "loss": 0.5964, "lr": 4.386620862257113e-06, "epoch": 2.283422459893048, "percentage": 22.83, "elapsed_time": "2:54:52", "remaining_time": "9:50:59"} -{"current_steps": 428, "total_steps": 1870, "loss": 0.4262, "lr": 4.383862463856013e-06, "epoch": 2.2887700534759357, "percentage": 22.89, "elapsed_time": "2:54:54", "remaining_time": "9:49:17"} -{"current_steps": 429, "total_steps": 1870, "loss": 0.5824, "lr": 4.3810987484676126e-06, "epoch": 2.2941176470588234, "percentage": 22.94, "elapsed_time": "2:55:00", "remaining_time": "9:47:51"} -{"current_steps": 430, "total_steps": 1870, "loss": 0.4735, "lr": 4.378329723892184e-06, "epoch": 2.299465240641711, "percentage": 22.99, "elapsed_time": "2:55:02", "remaining_time": "9:46:09"} -{"current_steps": 431, "total_steps": 1870, "loss": 0.2989, "lr": 4.375555397944983e-06, "epoch": 2.304812834224599, "percentage": 23.05, "elapsed_time": "2:55:05", "remaining_time": "9:44:35"} -{"current_steps": 432, "total_steps": 1870, "loss": 0.3582, "lr": 4.37277577845623e-06, "epoch": 2.3101604278074865, "percentage": 23.1, "elapsed_time": "2:55:12", "remaining_time": "9:43:14"} -{"current_steps": 433, "total_steps": 1870, "loss": 0.3717, "lr": 4.369990873271082e-06, "epoch": 2.3155080213903743, "percentage": 23.16, "elapsed_time": "2:55:13", "remaining_time": "9:41:32"} -{"current_steps": 434, "total_steps": 1870, "loss": 0.3929, "lr": 4.36720069024962e-06, "epoch": 2.320855614973262, "percentage": 23.21, "elapsed_time": "2:55:15", "remaining_time": "9:39:53"} -{"current_steps": 435, "total_steps": 1870, "loss": 0.2747, "lr": 4.364405237266816e-06, "epoch": 2.3262032085561497, "percentage": 23.26, "elapsed_time": "2:55:18", "remaining_time": "9:38:19"} -{"current_steps": 436, "total_steps": 1870, "loss": 0.3981, "lr": 4.361604522212517e-06, "epoch": 2.3315508021390374, "percentage": 23.32, "elapsed_time": "2:55:21", "remaining_time": "9:36:45"} -{"current_steps": 437, "total_steps": 1870, "loss": 0.5474, "lr": 4.358798552991424e-06, "epoch": 2.336898395721925, "percentage": 23.37, "elapsed_time": "2:55:24", "remaining_time": "9:35:13"} -{"current_steps": 438, "total_steps": 1870, "loss": 0.3569, "lr": 4.355987337523065e-06, "epoch": 2.342245989304813, "percentage": 23.42, "elapsed_time": "2:55:28", "remaining_time": "9:33:42"} -{"current_steps": 439, "total_steps": 1870, "loss": 0.3298, "lr": 4.353170883741776e-06, "epoch": 2.3475935828877006, "percentage": 23.48, "elapsed_time": "2:55:33", "remaining_time": "9:32:15"} -{"current_steps": 440, "total_steps": 1870, "loss": 0.3982, "lr": 4.350349199596676e-06, "epoch": 2.3529411764705883, "percentage": 23.53, "elapsed_time": "2:55:37", "remaining_time": "9:30:47"} -{"current_steps": 441, "total_steps": 1870, "loss": 0.6064, "lr": 4.3475222930516484e-06, "epoch": 2.358288770053476, "percentage": 23.58, "elapsed_time": "2:55:43", "remaining_time": "9:29:25"} -{"current_steps": 442, "total_steps": 1870, "loss": 0.4624, "lr": 4.3446901720853144e-06, "epoch": 2.3636363636363638, "percentage": 23.64, "elapsed_time": "2:55:47", "remaining_time": "9:27:57"} -{"current_steps": 443, "total_steps": 1870, "loss": 0.4464, "lr": 4.341852844691012e-06, "epoch": 2.3689839572192515, "percentage": 23.69, "elapsed_time": "2:55:49", "remaining_time": "9:26:22"} -{"current_steps": 444, "total_steps": 1870, "loss": 0.4206, "lr": 4.339010318876777e-06, "epoch": 2.374331550802139, "percentage": 23.74, "elapsed_time": "2:55:54", "remaining_time": "9:24:57"} -{"current_steps": 445, "total_steps": 1870, "loss": 0.5911, "lr": 4.336162602665314e-06, "epoch": 2.379679144385027, "percentage": 23.8, "elapsed_time": "2:56:01", "remaining_time": "9:23:40"} -{"current_steps": 446, "total_steps": 1870, "loss": 0.4042, "lr": 4.333309704093977e-06, "epoch": 2.385026737967914, "percentage": 23.85, "elapsed_time": "2:56:05", "remaining_time": "9:22:14"} -{"current_steps": 447, "total_steps": 1870, "loss": 0.202, "lr": 4.330451631214747e-06, "epoch": 2.3903743315508024, "percentage": 23.9, "elapsed_time": "2:56:08", "remaining_time": "9:20:42"} -{"current_steps": 448, "total_steps": 1870, "loss": 0.3801, "lr": 4.3275883920942105e-06, "epoch": 2.3957219251336896, "percentage": 23.96, "elapsed_time": "2:56:09", "remaining_time": "9:19:10"} -{"current_steps": 449, "total_steps": 1870, "loss": 0.4677, "lr": 4.324719994813533e-06, "epoch": 2.4010695187165774, "percentage": 24.01, "elapsed_time": "2:56:16", "remaining_time": "9:17:52"} -{"current_steps": 450, "total_steps": 1870, "loss": 0.4263, "lr": 4.321846447468441e-06, "epoch": 2.406417112299465, "percentage": 24.06, "elapsed_time": "2:56:18", "remaining_time": "9:16:21"} -{"current_steps": 451, "total_steps": 1870, "loss": 0.6155, "lr": 4.318967758169192e-06, "epoch": 2.411764705882353, "percentage": 24.12, "elapsed_time": "2:56:22", "remaining_time": "9:14:56"} -{"current_steps": 452, "total_steps": 1870, "loss": 0.6256, "lr": 4.316083935040561e-06, "epoch": 2.4171122994652405, "percentage": 24.17, "elapsed_time": "2:56:25", "remaining_time": "9:13:27"} -{"current_steps": 453, "total_steps": 1870, "loss": 0.3739, "lr": 4.313194986221809e-06, "epoch": 2.4224598930481283, "percentage": 24.22, "elapsed_time": "2:56:29", "remaining_time": "9:12:03"} -{"current_steps": 454, "total_steps": 1870, "loss": 0.3881, "lr": 4.310300919866666e-06, "epoch": 2.427807486631016, "percentage": 24.28, "elapsed_time": "2:56:35", "remaining_time": "9:10:45"} -{"current_steps": 455, "total_steps": 1870, "loss": 0.6139, "lr": 4.307401744143304e-06, "epoch": 2.4331550802139037, "percentage": 24.33, "elapsed_time": "2:56:38", "remaining_time": "9:09:19"} -{"current_steps": 456, "total_steps": 1870, "loss": 0.2351, "lr": 4.304497467234317e-06, "epoch": 2.4385026737967914, "percentage": 24.39, "elapsed_time": "2:56:39", "remaining_time": "9:07:46"} -{"current_steps": 457, "total_steps": 1870, "loss": 0.4977, "lr": 4.3015880973366955e-06, "epoch": 2.443850267379679, "percentage": 24.44, "elapsed_time": "2:56:40", "remaining_time": "9:06:15"} -{"current_steps": 458, "total_steps": 1870, "loss": 0.5975, "lr": 4.2986736426618045e-06, "epoch": 2.449197860962567, "percentage": 24.49, "elapsed_time": "2:56:42", "remaining_time": "9:04:47"} -{"current_steps": 459, "total_steps": 1870, "loss": 0.2682, "lr": 4.295754111435361e-06, "epoch": 2.4545454545454546, "percentage": 24.55, "elapsed_time": "2:56:44", "remaining_time": "9:03:18"} -{"current_steps": 460, "total_steps": 1870, "loss": 0.4815, "lr": 4.292829511897409e-06, "epoch": 2.4598930481283423, "percentage": 24.6, "elapsed_time": "2:56:50", "remaining_time": "9:02:03"} -{"current_steps": 461, "total_steps": 1870, "loss": 0.4931, "lr": 4.2898998523022985e-06, "epoch": 2.46524064171123, "percentage": 24.65, "elapsed_time": "2:56:52", "remaining_time": "9:00:35"} -{"current_steps": 462, "total_steps": 1870, "loss": 0.3389, "lr": 4.28696514091866e-06, "epoch": 2.4705882352941178, "percentage": 24.71, "elapsed_time": "2:56:53", "remaining_time": "8:59:05"} -{"current_steps": 463, "total_steps": 1870, "loss": 0.6479, "lr": 4.284025386029381e-06, "epoch": 2.4759358288770055, "percentage": 24.76, "elapsed_time": "2:56:55", "remaining_time": "8:57:38"} -{"current_steps": 464, "total_steps": 1870, "loss": 0.4811, "lr": 4.281080595931587e-06, "epoch": 2.481283422459893, "percentage": 24.81, "elapsed_time": "2:56:58", "remaining_time": "8:56:15"} -{"current_steps": 465, "total_steps": 1870, "loss": 0.7251, "lr": 4.27813077893661e-06, "epoch": 2.486631016042781, "percentage": 24.87, "elapsed_time": "2:57:02", "remaining_time": "8:54:56"} -{"current_steps": 466, "total_steps": 1870, "loss": 0.4036, "lr": 4.2751759433699745e-06, "epoch": 2.4919786096256686, "percentage": 24.92, "elapsed_time": "2:57:04", "remaining_time": "8:53:28"} -{"current_steps": 467, "total_steps": 1870, "loss": 0.6547, "lr": 4.2722160975713675e-06, "epoch": 2.497326203208556, "percentage": 24.97, "elapsed_time": "2:57:07", "remaining_time": "8:52:07"} -{"current_steps": 468, "total_steps": 1870, "loss": 0.5507, "lr": 4.269251249894617e-06, "epoch": 2.502673796791444, "percentage": 25.03, "elapsed_time": "2:57:09", "remaining_time": "8:50:44"} -{"current_steps": 469, "total_steps": 1870, "loss": 0.4882, "lr": 4.266281408707667e-06, "epoch": 2.5080213903743314, "percentage": 25.08, "elapsed_time": "2:57:15", "remaining_time": "8:49:31"} -{"current_steps": 470, "total_steps": 1870, "loss": 0.2273, "lr": 4.263306582392556e-06, "epoch": 2.5133689839572195, "percentage": 25.13, "elapsed_time": "2:57:16", "remaining_time": "8:48:04"} -{"current_steps": 471, "total_steps": 1870, "loss": 0.6084, "lr": 4.2603267793453925e-06, "epoch": 2.518716577540107, "percentage": 25.19, "elapsed_time": "2:57:18", "remaining_time": "8:46:39"} -{"current_steps": 472, "total_steps": 1870, "loss": 0.283, "lr": 4.257342007976332e-06, "epoch": 2.5240641711229945, "percentage": 25.24, "elapsed_time": "2:57:22", "remaining_time": "8:45:22"} -{"current_steps": 473, "total_steps": 1870, "loss": 0.2611, "lr": 4.254352276709552e-06, "epoch": 2.5294117647058822, "percentage": 25.29, "elapsed_time": "2:57:24", "remaining_time": "8:43:57"} -{"current_steps": 474, "total_steps": 1870, "loss": 0.7555, "lr": 4.251357593983228e-06, "epoch": 2.53475935828877, "percentage": 25.35, "elapsed_time": "2:57:29", "remaining_time": "8:42:45"} -{"current_steps": 475, "total_steps": 1870, "loss": 0.2796, "lr": 4.24835796824951e-06, "epoch": 2.5401069518716577, "percentage": 25.4, "elapsed_time": "2:57:31", "remaining_time": "8:41:22"} -{"current_steps": 476, "total_steps": 1870, "loss": 0.8278, "lr": 4.245353407974503e-06, "epoch": 2.5454545454545454, "percentage": 25.45, "elapsed_time": "2:57:35", "remaining_time": "8:40:04"} -{"current_steps": 477, "total_steps": 1870, "loss": 0.2478, "lr": 4.242343921638235e-06, "epoch": 2.550802139037433, "percentage": 25.51, "elapsed_time": "2:57:37", "remaining_time": "8:38:42"} -{"current_steps": 478, "total_steps": 1870, "loss": 0.5353, "lr": 4.239329517734636e-06, "epoch": 2.556149732620321, "percentage": 25.56, "elapsed_time": "2:57:40", "remaining_time": "8:37:25"} -{"current_steps": 479, "total_steps": 1870, "loss": 0.3595, "lr": 4.2363102047715205e-06, "epoch": 2.5614973262032086, "percentage": 25.61, "elapsed_time": "2:57:43", "remaining_time": "8:36:05"} -{"current_steps": 480, "total_steps": 1870, "loss": 0.4868, "lr": 4.2332859912705545e-06, "epoch": 2.5668449197860963, "percentage": 25.67, "elapsed_time": "2:57:45", "remaining_time": "8:34:45"} diff --git a/metallama3_8b/limo_filtered_correct/README.md b/metallama3_8b/limo_filtered_correct/README.md deleted file mode 100644 index f612b282cc74311e4c96c0c51046c75a62fb59b7..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/README.md +++ /dev/null @@ -1,59 +0,0 @@ ---- -library_name: transformers -license: other -base_model: meta-llama/Meta-Llama-3-8B-Instruct -tags: -- llama-factory -- full -- generated_from_trainer -model-index: -- name: limo_filtered_correct - results: [] ---- - - - -# limo_filtered_correct - -This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) on the limo_filtered_correct dataset. - -## Model description - -More information needed - -## Intended uses & limitations - -More information needed - -## Training and evaluation data - -More information needed - -## Training procedure - -### Training hyperparameters - -The following hyperparameters were used during training: -- learning_rate: 5e-06 -- train_batch_size: 1 -- eval_batch_size: 8 -- seed: 42 -- distributed_type: multi-GPU -- num_devices: 4 -- total_train_batch_size: 4 -- total_eval_batch_size: 32 -- optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments -- lr_scheduler_type: cosine -- num_epochs: 10 - -### Training results - - - -### Framework versions - -- Transformers 4.55.0 -- Pytorch 2.5.1+cu124 -- Datasets 3.6.0 -- Tokenizers 0.21.1 diff --git a/metallama3_8b/limo_filtered_correct/all_results.json b/metallama3_8b/limo_filtered_correct/all_results.json deleted file mode 100644 index e1a6b96221a02b32dc2712f312419265a6f74078..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/all_results.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "epoch": 10.0, - "total_flos": 4.036761107572982e+17, - "train_loss": 0.23243108529037226, - "train_runtime": 7070.6548, - "train_samples_per_second": 0.921, - "train_steps_per_second": 0.231 -} \ No newline at end of file diff --git a/metallama3_8b/limo_filtered_correct/chat_template.jinja b/metallama3_8b/limo_filtered_correct/chat_template.jinja deleted file mode 100644 index 39bd0c9f7fe30aea14eda194fee17703da4a4dbf..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/chat_template.jinja +++ /dev/null @@ -1,5 +0,0 @@ -{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> - -'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> - -' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1141/chat_template.jinja b/metallama3_8b/limo_filtered_correct/checkpoint-1141/chat_template.jinja deleted file mode 100644 index 39bd0c9f7fe30aea14eda194fee17703da4a4dbf..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1141/chat_template.jinja +++ /dev/null @@ -1,5 +0,0 @@ -{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> - -'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> - -' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1141/config.json b/metallama3_8b/limo_filtered_correct/checkpoint-1141/config.json deleted file mode 100644 index ec5612543540085e09eed37e81b17ae51d1a6973..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1141/config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 128000, - "eos_token_id": 128009, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 8192, - "mlp_bias": false, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 500000.0, - "tie_word_embeddings": false, - "torch_dtype": "float32", - "transformers_version": "4.55.0", - "use_cache": false, - "vocab_size": 128256 -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1141/generation_config.json b/metallama3_8b/limo_filtered_correct/checkpoint-1141/generation_config.json deleted file mode 100644 index f53ccb516e57388491adda6b9950bcfa872e93ae..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1141/generation_config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 128000, - "eos_token_id": 128009, - "transformers_version": "4.55.0", - "use_cache": false -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1141/model-00001-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-1141/model-00001-of-00007.safetensors deleted file mode 100644 index 19a54fef447d0a25953ccbf6fc8e5781d0f232ca..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1141/model-00001-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:220f3b9a643f709c2492ff989a76c1d23e685f58f3c86f9fd78dc18a3ba9717c -size 4886466168 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1141/model-00002-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-1141/model-00002-of-00007.safetensors deleted file mode 100644 index 74fe1431f392d2139fae62a6b1c8a64b34514012..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1141/model-00002-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:debbc3c49794a100db11497d2e9b8a9d5b44dea8e0ec63e313fe61c8e1538a98 -size 4832007448 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1141/model-00003-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-1141/model-00003-of-00007.safetensors deleted file mode 100644 index 191b915268886c2ff00ef1e0e8d345beb10b3d4b..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1141/model-00003-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:54fb827ef4bf6934d39c12d38ff22e66ea5ba45f8e686f68f1f06a73c1fad26e -size 4999813112 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1141/model-00004-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-1141/model-00004-of-00007.safetensors deleted file mode 100644 index a8bd874dfcfafff2f4391d72a23dc4eabc46ee4c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1141/model-00004-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:849ec34de5041e2d465487a2f40bdab10bfcc5f31588e8bb046320319c536b1d -size 4999813128 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1141/model-00005-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-1141/model-00005-of-00007.safetensors deleted file mode 100644 index f64b584aeca81f769751277a990483c62727ed2e..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1141/model-00005-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:65c8ea7c071831603660502c8773543cf6cb87bde3b93d862baa1278058e46bd -size 4832007496 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1141/model-00006-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-1141/model-00006-of-00007.safetensors deleted file mode 100644 index 6a598401e3d95b1d80ff12ef9d883cbcc6182706..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1141/model-00006-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8290ca18577228759dce85ff927e16394116a43fcad7a680f4ad27d0e8b81ee6 -size 4999813120 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1141/model-00007-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-1141/model-00007-of-00007.safetensors deleted file mode 100644 index aec91b45518352110bedfcdcc1f462db02f97e69..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1141/model-00007-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4921048b8b437837a24d634f003258f3ce6e572b367733221da93ddc26b5678d -size 2571158184 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1141/model.safetensors.index.json b/metallama3_8b/limo_filtered_correct/checkpoint-1141/model.safetensors.index.json deleted file mode 100644 index 30d31d54f352f0c71ad48745af612a088822fa48..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1141/model.safetensors.index.json +++ /dev/null @@ -1,299 +0,0 @@ -{ - "metadata": { - "total_parameters": 2007565312, - "total_size": 32121044992 - }, - "weight_map": { - "lm_head.weight": "model-00007-of-00007.safetensors", - "model.embed_tokens.weight": "model-00001-of-00007.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.norm.weight": "model-00007-of-00007.safetensors" - } -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1141/rng_state_0.pth b/metallama3_8b/limo_filtered_correct/checkpoint-1141/rng_state_0.pth deleted file mode 100644 index be2e24cc9d9ef8857272cec1451c810e205ec4e9..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1141/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ef002048764051a71fb00f8f978e9ec32b780dc850bdb059af362cc56494234b -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1141/rng_state_1.pth b/metallama3_8b/limo_filtered_correct/checkpoint-1141/rng_state_1.pth deleted file mode 100644 index efcf4dd2e74596ac28af81f9f8bd0be9a807deb3..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1141/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:37194a6d48612e1a46a2d5d317ead97c70d9fc4569b0118fcd5f84c3dc9daa5a -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1141/rng_state_2.pth b/metallama3_8b/limo_filtered_correct/checkpoint-1141/rng_state_2.pth deleted file mode 100644 index 4c9222e37d4e9d1745c0e126e0fe0c4a348e298d..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1141/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:17c179483659a784aa1ace2427daff48c556a6bcc3c330e6f3274e4dc95e4b49 -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1141/rng_state_3.pth b/metallama3_8b/limo_filtered_correct/checkpoint-1141/rng_state_3.pth deleted file mode 100644 index 7821bf0f5f0621fd0159152432f0a7bc66aa6823..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1141/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b56857c9b117629f35af2c3d64f522d33a9d8aa94faa81ec6956380a895118c4 -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1141/scheduler.pt b/metallama3_8b/limo_filtered_correct/checkpoint-1141/scheduler.pt deleted file mode 100644 index 3b5efd501ee5ae89b16f36192023e86c99961bd1..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1141/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:11e37553b285d0f04753364095a261611dc467ed4a4f9554324d06fdaeaa7ddc -size 1064 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1141/special_tokens_map.json b/metallama3_8b/limo_filtered_correct/checkpoint-1141/special_tokens_map.json deleted file mode 100644 index 14daf4588e61b4e4983af0fccaba4d5500c0977c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1141/special_tokens_map.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "additional_special_tokens": [ - { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } - ], - "bos_token": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": "<|eot_id|>" -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1141/tokenizer.json b/metallama3_8b/limo_filtered_correct/checkpoint-1141/tokenizer.json deleted file mode 100644 index 172311123ab62378f1f6d90f3068a676b7d939ed..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1141/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c1dcab308e7cf5970ea38815e0a62887d705c5b436f869ca27a5dcdd40c36a6 -size 17210148 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1141/tokenizer_config.json b/metallama3_8b/limo_filtered_correct/checkpoint-1141/tokenizer_config.json deleted file mode 100644 index 6739fcd129e717b71b64001dcb25a03c143d66f5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1141/tokenizer_config.json +++ /dev/null @@ -1,2076 +0,0 @@ -{ - "added_tokens_decoder": { - "128000": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128001": { - "content": "<|end_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128002": { - "content": "<|reserved_special_token_0|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128003": { - "content": "<|reserved_special_token_1|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128004": { - "content": "<|reserved_special_token_2|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128005": { - "content": "<|reserved_special_token_3|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128006": { - "content": "<|start_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128007": { - "content": "<|end_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128008": { - "content": "<|reserved_special_token_4|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128009": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128010": { - "content": "<|reserved_special_token_5|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128011": { - "content": "<|reserved_special_token_6|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128012": { - "content": "<|reserved_special_token_7|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128013": { - "content": "<|reserved_special_token_8|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128014": { - "content": "<|reserved_special_token_9|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128015": { - "content": "<|reserved_special_token_10|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128016": { - "content": "<|reserved_special_token_11|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128017": { - "content": "<|reserved_special_token_12|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128018": { - "content": "<|reserved_special_token_13|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128019": { - "content": "<|reserved_special_token_14|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128020": { - "content": "<|reserved_special_token_15|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128021": { - "content": "<|reserved_special_token_16|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128022": { - "content": "<|reserved_special_token_17|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128023": { - "content": "<|reserved_special_token_18|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128024": { - "content": "<|reserved_special_token_19|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128025": { - "content": "<|reserved_special_token_20|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128026": { - "content": "<|reserved_special_token_21|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128027": { - "content": "<|reserved_special_token_22|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128028": { - "content": "<|reserved_special_token_23|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128029": { - "content": "<|reserved_special_token_24|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128030": { - "content": "<|reserved_special_token_25|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128031": { - "content": "<|reserved_special_token_26|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128032": { - "content": "<|reserved_special_token_27|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128033": { - "content": "<|reserved_special_token_28|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128034": { - "content": "<|reserved_special_token_29|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128035": { - "content": "<|reserved_special_token_30|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128036": { - "content": "<|reserved_special_token_31|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128037": { - "content": "<|reserved_special_token_32|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128038": { - "content": "<|reserved_special_token_33|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128039": { - "content": "<|reserved_special_token_34|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128040": { - "content": "<|reserved_special_token_35|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128041": { - "content": "<|reserved_special_token_36|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128042": { - "content": "<|reserved_special_token_37|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128043": { - "content": "<|reserved_special_token_38|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128044": { - "content": "<|reserved_special_token_39|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128045": { - "content": "<|reserved_special_token_40|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128046": { - "content": "<|reserved_special_token_41|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128047": { - "content": "<|reserved_special_token_42|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128048": { - "content": "<|reserved_special_token_43|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128049": { - "content": "<|reserved_special_token_44|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128050": { - "content": "<|reserved_special_token_45|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128051": { - "content": "<|reserved_special_token_46|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128052": { - "content": "<|reserved_special_token_47|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128053": { - "content": "<|reserved_special_token_48|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128054": { - "content": "<|reserved_special_token_49|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128055": { - "content": "<|reserved_special_token_50|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128056": { - "content": "<|reserved_special_token_51|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128057": { - "content": "<|reserved_special_token_52|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128058": { - "content": "<|reserved_special_token_53|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128059": { - "content": "<|reserved_special_token_54|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128060": { - "content": "<|reserved_special_token_55|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128061": { - "content": "<|reserved_special_token_56|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128062": { - "content": "<|reserved_special_token_57|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128063": { - "content": "<|reserved_special_token_58|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128064": { - "content": "<|reserved_special_token_59|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128065": { - "content": "<|reserved_special_token_60|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128066": { - "content": "<|reserved_special_token_61|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128067": { - "content": "<|reserved_special_token_62|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128068": { - "content": "<|reserved_special_token_63|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128069": { - "content": "<|reserved_special_token_64|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128070": { - "content": "<|reserved_special_token_65|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128071": { - "content": "<|reserved_special_token_66|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128072": { - "content": "<|reserved_special_token_67|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128073": { - "content": "<|reserved_special_token_68|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128074": { - "content": "<|reserved_special_token_69|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128075": { - "content": "<|reserved_special_token_70|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128076": { - "content": "<|reserved_special_token_71|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128077": { - "content": "<|reserved_special_token_72|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128078": { - "content": "<|reserved_special_token_73|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128079": { - "content": "<|reserved_special_token_74|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128080": { - "content": "<|reserved_special_token_75|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128081": { - "content": "<|reserved_special_token_76|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128082": { - "content": "<|reserved_special_token_77|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128083": { - "content": "<|reserved_special_token_78|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128084": { - "content": "<|reserved_special_token_79|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128085": { - "content": "<|reserved_special_token_80|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128086": { - "content": "<|reserved_special_token_81|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128087": { - "content": "<|reserved_special_token_82|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128088": { - "content": "<|reserved_special_token_83|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128089": { - "content": "<|reserved_special_token_84|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128090": { - "content": "<|reserved_special_token_85|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128091": { - "content": "<|reserved_special_token_86|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128092": { - "content": "<|reserved_special_token_87|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128093": { - "content": "<|reserved_special_token_88|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128094": { - "content": "<|reserved_special_token_89|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128095": { - "content": "<|reserved_special_token_90|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128096": { - "content": "<|reserved_special_token_91|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128097": { - "content": "<|reserved_special_token_92|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128098": { - "content": "<|reserved_special_token_93|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128099": { - "content": "<|reserved_special_token_94|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128100": { - "content": "<|reserved_special_token_95|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128101": { - "content": "<|reserved_special_token_96|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128102": { - "content": "<|reserved_special_token_97|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128103": { - "content": "<|reserved_special_token_98|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128104": { - "content": "<|reserved_special_token_99|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128105": { - "content": "<|reserved_special_token_100|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128106": { - "content": "<|reserved_special_token_101|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128107": { - "content": "<|reserved_special_token_102|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128108": { - "content": "<|reserved_special_token_103|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128109": { - "content": "<|reserved_special_token_104|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128110": { - "content": "<|reserved_special_token_105|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128111": { - "content": "<|reserved_special_token_106|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128112": { - "content": "<|reserved_special_token_107|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128113": { - "content": "<|reserved_special_token_108|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128114": { - "content": "<|reserved_special_token_109|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128115": { - "content": "<|reserved_special_token_110|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128116": { - "content": "<|reserved_special_token_111|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128117": { - "content": "<|reserved_special_token_112|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128118": { - "content": "<|reserved_special_token_113|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128119": { - "content": "<|reserved_special_token_114|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128120": { - "content": "<|reserved_special_token_115|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128121": { - "content": "<|reserved_special_token_116|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128122": { - "content": "<|reserved_special_token_117|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128123": { - "content": "<|reserved_special_token_118|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128124": { - "content": "<|reserved_special_token_119|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128125": { - "content": "<|reserved_special_token_120|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128126": { - "content": "<|reserved_special_token_121|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128127": { - "content": "<|reserved_special_token_122|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128128": { - "content": "<|reserved_special_token_123|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128129": { - "content": "<|reserved_special_token_124|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128130": { - "content": "<|reserved_special_token_125|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128131": { - "content": "<|reserved_special_token_126|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128132": { - "content": "<|reserved_special_token_127|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128133": { - "content": "<|reserved_special_token_128|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128134": { - "content": "<|reserved_special_token_129|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128135": { - "content": "<|reserved_special_token_130|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128136": { - "content": "<|reserved_special_token_131|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128137": { - "content": "<|reserved_special_token_132|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128138": { - "content": "<|reserved_special_token_133|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128139": { - "content": "<|reserved_special_token_134|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128140": { - "content": "<|reserved_special_token_135|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128141": { - "content": "<|reserved_special_token_136|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128142": { - "content": "<|reserved_special_token_137|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128143": { - "content": "<|reserved_special_token_138|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128144": { - "content": "<|reserved_special_token_139|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128145": { - "content": "<|reserved_special_token_140|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128146": { - "content": "<|reserved_special_token_141|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128147": { - "content": "<|reserved_special_token_142|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128148": { - "content": "<|reserved_special_token_143|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128149": { - "content": "<|reserved_special_token_144|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128150": { - "content": "<|reserved_special_token_145|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128151": { - "content": "<|reserved_special_token_146|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128152": { - "content": "<|reserved_special_token_147|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128153": { - "content": "<|reserved_special_token_148|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128154": { - "content": "<|reserved_special_token_149|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128155": { - "content": "<|reserved_special_token_150|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128156": { - "content": "<|reserved_special_token_151|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128157": { - "content": "<|reserved_special_token_152|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128158": { - "content": "<|reserved_special_token_153|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128159": { - "content": "<|reserved_special_token_154|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128160": { - "content": "<|reserved_special_token_155|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128161": { - "content": "<|reserved_special_token_156|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128162": { - "content": "<|reserved_special_token_157|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128163": { - "content": "<|reserved_special_token_158|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128164": { - "content": "<|reserved_special_token_159|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128165": { - "content": "<|reserved_special_token_160|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128166": { - "content": "<|reserved_special_token_161|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128167": { - "content": "<|reserved_special_token_162|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128168": { - "content": "<|reserved_special_token_163|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128169": { - "content": "<|reserved_special_token_164|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128170": { - "content": "<|reserved_special_token_165|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128171": { - "content": "<|reserved_special_token_166|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128172": { - "content": "<|reserved_special_token_167|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128173": { - "content": "<|reserved_special_token_168|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128174": { - "content": "<|reserved_special_token_169|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128175": { - "content": "<|reserved_special_token_170|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128176": { - "content": "<|reserved_special_token_171|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128177": { - "content": "<|reserved_special_token_172|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128178": { - "content": "<|reserved_special_token_173|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128179": { - "content": "<|reserved_special_token_174|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128180": { - "content": "<|reserved_special_token_175|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128181": { - "content": "<|reserved_special_token_176|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128182": { - "content": "<|reserved_special_token_177|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128183": { - "content": "<|reserved_special_token_178|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128184": { - "content": "<|reserved_special_token_179|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128185": { - "content": "<|reserved_special_token_180|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128186": { - "content": "<|reserved_special_token_181|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128187": { - "content": "<|reserved_special_token_182|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128188": { - "content": "<|reserved_special_token_183|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128189": { - "content": "<|reserved_special_token_184|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128190": { - "content": "<|reserved_special_token_185|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128191": { - "content": "<|reserved_special_token_186|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128192": { - "content": "<|reserved_special_token_187|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128193": { - "content": "<|reserved_special_token_188|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128194": { - "content": "<|reserved_special_token_189|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128195": { - "content": "<|reserved_special_token_190|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128196": { - "content": "<|reserved_special_token_191|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128197": { - "content": "<|reserved_special_token_192|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128198": { - "content": "<|reserved_special_token_193|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128199": { - "content": "<|reserved_special_token_194|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128200": { - "content": "<|reserved_special_token_195|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128201": { - "content": "<|reserved_special_token_196|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128202": { - "content": "<|reserved_special_token_197|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128203": { - "content": "<|reserved_special_token_198|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128204": { - "content": "<|reserved_special_token_199|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128205": { - "content": "<|reserved_special_token_200|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128206": { - "content": "<|reserved_special_token_201|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128207": { - "content": "<|reserved_special_token_202|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128208": { - "content": "<|reserved_special_token_203|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128209": { - "content": "<|reserved_special_token_204|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128210": { - "content": "<|reserved_special_token_205|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128211": { - "content": "<|reserved_special_token_206|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128212": { - "content": "<|reserved_special_token_207|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128213": { - "content": "<|reserved_special_token_208|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128214": { - "content": "<|reserved_special_token_209|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128215": { - "content": "<|reserved_special_token_210|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128216": { - "content": "<|reserved_special_token_211|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128217": { - "content": "<|reserved_special_token_212|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128218": { - "content": "<|reserved_special_token_213|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128219": { - "content": "<|reserved_special_token_214|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128220": { - "content": "<|reserved_special_token_215|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128221": { - "content": "<|reserved_special_token_216|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128222": { - "content": "<|reserved_special_token_217|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128223": { - "content": "<|reserved_special_token_218|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128224": { - "content": "<|reserved_special_token_219|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128225": { - "content": "<|reserved_special_token_220|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128226": { - "content": "<|reserved_special_token_221|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128227": { - "content": "<|reserved_special_token_222|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128228": { - "content": "<|reserved_special_token_223|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128229": { - "content": "<|reserved_special_token_224|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128230": { - "content": "<|reserved_special_token_225|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128231": { - "content": "<|reserved_special_token_226|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128232": { - "content": "<|reserved_special_token_227|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128233": { - "content": "<|reserved_special_token_228|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128234": { - "content": "<|reserved_special_token_229|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128235": { - "content": "<|reserved_special_token_230|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128236": { - "content": "<|reserved_special_token_231|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128237": { - "content": "<|reserved_special_token_232|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128238": { - "content": "<|reserved_special_token_233|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128239": { - "content": "<|reserved_special_token_234|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128240": { - "content": "<|reserved_special_token_235|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128241": { - "content": "<|reserved_special_token_236|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128242": { - "content": "<|reserved_special_token_237|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128243": { - "content": "<|reserved_special_token_238|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128244": { - "content": "<|reserved_special_token_239|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128245": { - "content": "<|reserved_special_token_240|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128246": { - "content": "<|reserved_special_token_241|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128247": { - "content": "<|reserved_special_token_242|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128248": { - "content": "<|reserved_special_token_243|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128249": { - "content": "<|reserved_special_token_244|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128250": { - "content": "<|reserved_special_token_245|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128251": { - "content": "<|reserved_special_token_246|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128252": { - "content": "<|reserved_special_token_247|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128253": { - "content": "<|reserved_special_token_248|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128254": { - "content": "<|reserved_special_token_249|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128255": { - "content": "<|reserved_special_token_250|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128256": { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - } - }, - "additional_special_tokens": [ - "<|eom_id|>" - ], - "bos_token": "<|begin_of_text|>", - "clean_up_tokenization_spaces": true, - "eos_token": "<|eot_id|>", - "extra_special_tokens": {}, - "model_input_names": [ - "input_ids", - "attention_mask" - ], - "model_max_length": 1000000000000000019884624838656, - "pad_token": "<|eot_id|>", - "padding_side": "right", - "split_special_tokens": false, - "tokenizer_class": "PreTrainedTokenizerFast" -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1141/trainer_state.json b/metallama3_8b/limo_filtered_correct/checkpoint-1141/trainer_state.json deleted file mode 100644 index be98996ada7a73bb6a5783d649ed2a1bef7a994e..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1141/trainer_state.json +++ /dev/null @@ -1,8021 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 7.0, - "eval_steps": 500, - "global_step": 1141, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.006134969325153374, - "grad_norm": 5.908512115478516, - "learning_rate": 5e-06, - "loss": 0.9606, - "step": 1 - }, - { - "epoch": 0.012269938650306749, - "grad_norm": 4.304474353790283, - "learning_rate": 4.999995356617983e-06, - "loss": 0.8609, - "step": 2 - }, - { - "epoch": 0.018404907975460124, - "grad_norm": 5.63697624206543, - "learning_rate": 4.999981426489179e-06, - "loss": 1.3543, - "step": 3 - }, - { - "epoch": 0.024539877300613498, - "grad_norm": 3.6674246788024902, - "learning_rate": 4.999958209665336e-06, - "loss": 0.787, - "step": 4 - }, - { - "epoch": 0.03067484662576687, - "grad_norm": 48.14854431152344, - "learning_rate": 4.999925706232695e-06, - "loss": 1.7786, - "step": 5 - }, - { - "epoch": 0.03680981595092025, - "grad_norm": 7.8689866065979, - "learning_rate": 4.999883916312e-06, - "loss": 1.2175, - "step": 6 - }, - { - "epoch": 0.04294478527607362, - "grad_norm": 5.119968891143799, - "learning_rate": 4.9998328400584864e-06, - "loss": 0.8998, - "step": 7 - }, - { - "epoch": 0.049079754601226995, - "grad_norm": 3.730757713317871, - "learning_rate": 4.999772477661888e-06, - "loss": 0.8419, - "step": 8 - }, - { - "epoch": 0.05521472392638037, - "grad_norm": 27.314565658569336, - "learning_rate": 4.999702829346432e-06, - "loss": 1.7948, - "step": 9 - }, - { - "epoch": 0.06134969325153374, - "grad_norm": 3.822697162628174, - "learning_rate": 4.999623895370843e-06, - "loss": 1.0461, - "step": 10 - }, - { - "epoch": 0.06748466257668712, - "grad_norm": 4.71220588684082, - "learning_rate": 4.999535676028338e-06, - "loss": 1.0, - "step": 11 - }, - { - "epoch": 0.0736196319018405, - "grad_norm": 3.2378087043762207, - "learning_rate": 4.999438171646624e-06, - "loss": 0.9475, - "step": 12 - }, - { - "epoch": 0.07975460122699386, - "grad_norm": 3.475543737411499, - "learning_rate": 4.999331382587901e-06, - "loss": 0.8654, - "step": 13 - }, - { - "epoch": 0.08588957055214724, - "grad_norm": 10.06365966796875, - "learning_rate": 4.999215309248861e-06, - "loss": 1.2042, - "step": 14 - }, - { - "epoch": 0.09202453987730061, - "grad_norm": 3.785153865814209, - "learning_rate": 4.999089952060681e-06, - "loss": 0.8846, - "step": 15 - }, - { - "epoch": 0.09815950920245399, - "grad_norm": 2.944488048553467, - "learning_rate": 4.998955311489025e-06, - "loss": 0.8805, - "step": 16 - }, - { - "epoch": 0.10429447852760736, - "grad_norm": 39.89304733276367, - "learning_rate": 4.998811388034046e-06, - "loss": 1.5882, - "step": 17 - }, - { - "epoch": 0.11042944785276074, - "grad_norm": 3.5883963108062744, - "learning_rate": 4.9986581822303746e-06, - "loss": 0.9222, - "step": 18 - }, - { - "epoch": 0.1165644171779141, - "grad_norm": 6.972247123718262, - "learning_rate": 4.998495694647127e-06, - "loss": 1.4088, - "step": 19 - }, - { - "epoch": 0.12269938650306748, - "grad_norm": 3.948991298675537, - "learning_rate": 4.998323925887895e-06, - "loss": 1.454, - "step": 20 - }, - { - "epoch": 0.12883435582822086, - "grad_norm": 3.8690035343170166, - "learning_rate": 4.998142876590749e-06, - "loss": 0.6335, - "step": 21 - }, - { - "epoch": 0.13496932515337423, - "grad_norm": 5.243765830993652, - "learning_rate": 4.997952547428236e-06, - "loss": 0.6725, - "step": 22 - }, - { - "epoch": 0.1411042944785276, - "grad_norm": 3.5994043350219727, - "learning_rate": 4.997752939107372e-06, - "loss": 0.7814, - "step": 23 - }, - { - "epoch": 0.147239263803681, - "grad_norm": 4.06965970993042, - "learning_rate": 4.997544052369642e-06, - "loss": 0.9683, - "step": 24 - }, - { - "epoch": 0.15337423312883436, - "grad_norm": 3.3247246742248535, - "learning_rate": 4.997325887990999e-06, - "loss": 0.9414, - "step": 25 - }, - { - "epoch": 0.15950920245398773, - "grad_norm": 5.811742782592773, - "learning_rate": 4.997098446781861e-06, - "loss": 0.8894, - "step": 26 - }, - { - "epoch": 0.1656441717791411, - "grad_norm": 2.661334753036499, - "learning_rate": 4.996861729587103e-06, - "loss": 0.7708, - "step": 27 - }, - { - "epoch": 0.17177914110429449, - "grad_norm": 2.863943576812744, - "learning_rate": 4.996615737286061e-06, - "loss": 0.6995, - "step": 28 - }, - { - "epoch": 0.17791411042944785, - "grad_norm": 20.376733779907227, - "learning_rate": 4.996360470792524e-06, - "loss": 1.2563, - "step": 29 - }, - { - "epoch": 0.18404907975460122, - "grad_norm": 3.62265682220459, - "learning_rate": 4.996095931054731e-06, - "loss": 0.7266, - "step": 30 - }, - { - "epoch": 0.1901840490797546, - "grad_norm": 3.915076732635498, - "learning_rate": 4.9958221190553705e-06, - "loss": 0.9227, - "step": 31 - }, - { - "epoch": 0.19631901840490798, - "grad_norm": 3.129855155944824, - "learning_rate": 4.995539035811572e-06, - "loss": 0.701, - "step": 32 - }, - { - "epoch": 0.20245398773006135, - "grad_norm": 2.7532224655151367, - "learning_rate": 4.9952466823749076e-06, - "loss": 0.6491, - "step": 33 - }, - { - "epoch": 0.2085889570552147, - "grad_norm": 2.8444128036499023, - "learning_rate": 4.9949450598313835e-06, - "loss": 0.8029, - "step": 34 - }, - { - "epoch": 0.2147239263803681, - "grad_norm": 2.57743239402771, - "learning_rate": 4.994634169301439e-06, - "loss": 0.8785, - "step": 35 - }, - { - "epoch": 0.22085889570552147, - "grad_norm": 3.280055284500122, - "learning_rate": 4.994314011939941e-06, - "loss": 1.034, - "step": 36 - }, - { - "epoch": 0.22699386503067484, - "grad_norm": 2.455838680267334, - "learning_rate": 4.99398458893618e-06, - "loss": 0.8557, - "step": 37 - }, - { - "epoch": 0.2331288343558282, - "grad_norm": 4.72681188583374, - "learning_rate": 4.993645901513865e-06, - "loss": 1.1904, - "step": 38 - }, - { - "epoch": 0.2392638036809816, - "grad_norm": 3.0585641860961914, - "learning_rate": 4.993297950931121e-06, - "loss": 0.7668, - "step": 39 - }, - { - "epoch": 0.24539877300613497, - "grad_norm": 2.4603540897369385, - "learning_rate": 4.9929407384804806e-06, - "loss": 0.8812, - "step": 40 - }, - { - "epoch": 0.25153374233128833, - "grad_norm": 2.9702436923980713, - "learning_rate": 4.992574265488883e-06, - "loss": 0.8878, - "step": 41 - }, - { - "epoch": 0.25766871165644173, - "grad_norm": 2.6973602771759033, - "learning_rate": 4.9921985333176694e-06, - "loss": 0.7251, - "step": 42 - }, - { - "epoch": 0.26380368098159507, - "grad_norm": 2.5542335510253906, - "learning_rate": 4.991813543362572e-06, - "loss": 0.6638, - "step": 43 - }, - { - "epoch": 0.26993865030674846, - "grad_norm": 3.7530782222747803, - "learning_rate": 4.991419297053716e-06, - "loss": 1.0725, - "step": 44 - }, - { - "epoch": 0.27607361963190186, - "grad_norm": 2.6483025550842285, - "learning_rate": 4.991015795855611e-06, - "loss": 0.7238, - "step": 45 - }, - { - "epoch": 0.2822085889570552, - "grad_norm": 3.434422492980957, - "learning_rate": 4.990603041267144e-06, - "loss": 0.9188, - "step": 46 - }, - { - "epoch": 0.2883435582822086, - "grad_norm": 2.914340019226074, - "learning_rate": 4.990181034821578e-06, - "loss": 0.6158, - "step": 47 - }, - { - "epoch": 0.294478527607362, - "grad_norm": 2.7211625576019287, - "learning_rate": 4.98974977808654e-06, - "loss": 0.7165, - "step": 48 - }, - { - "epoch": 0.3006134969325153, - "grad_norm": 2.8414249420166016, - "learning_rate": 4.989309272664026e-06, - "loss": 0.7277, - "step": 49 - }, - { - "epoch": 0.3067484662576687, - "grad_norm": 3.683204412460327, - "learning_rate": 4.988859520190381e-06, - "loss": 0.9793, - "step": 50 - }, - { - "epoch": 0.3128834355828221, - "grad_norm": 3.1732583045959473, - "learning_rate": 4.988400522336304e-06, - "loss": 0.8966, - "step": 51 - }, - { - "epoch": 0.31901840490797545, - "grad_norm": 2.7789194583892822, - "learning_rate": 4.9879322808068365e-06, - "loss": 0.8191, - "step": 52 - }, - { - "epoch": 0.32515337423312884, - "grad_norm": 2.754816770553589, - "learning_rate": 4.987454797341358e-06, - "loss": 0.6308, - "step": 53 - }, - { - "epoch": 0.3312883435582822, - "grad_norm": 2.730104684829712, - "learning_rate": 4.98696807371358e-06, - "loss": 0.8226, - "step": 54 - }, - { - "epoch": 0.3374233128834356, - "grad_norm": 3.2225449085235596, - "learning_rate": 4.986472111731536e-06, - "loss": 0.9184, - "step": 55 - }, - { - "epoch": 0.34355828220858897, - "grad_norm": 3.2684760093688965, - "learning_rate": 4.985966913237581e-06, - "loss": 0.6593, - "step": 56 - }, - { - "epoch": 0.3496932515337423, - "grad_norm": 2.43105411529541, - "learning_rate": 4.985452480108376e-06, - "loss": 0.6994, - "step": 57 - }, - { - "epoch": 0.3558282208588957, - "grad_norm": 7.366360664367676, - "learning_rate": 4.984928814254889e-06, - "loss": 1.1374, - "step": 58 - }, - { - "epoch": 0.3619631901840491, - "grad_norm": 2.81864333152771, - "learning_rate": 4.984395917622387e-06, - "loss": 0.8097, - "step": 59 - }, - { - "epoch": 0.36809815950920244, - "grad_norm": 3.1107730865478516, - "learning_rate": 4.9838537921904206e-06, - "loss": 0.8511, - "step": 60 - }, - { - "epoch": 0.37423312883435583, - "grad_norm": 2.460545301437378, - "learning_rate": 4.9833024399728295e-06, - "loss": 0.898, - "step": 61 - }, - { - "epoch": 0.3803680981595092, - "grad_norm": 2.921992778778076, - "learning_rate": 4.982741863017722e-06, - "loss": 0.6671, - "step": 62 - }, - { - "epoch": 0.38650306748466257, - "grad_norm": 3.3006443977355957, - "learning_rate": 4.982172063407479e-06, - "loss": 1.0559, - "step": 63 - }, - { - "epoch": 0.39263803680981596, - "grad_norm": 2.642587661743164, - "learning_rate": 4.9815930432587365e-06, - "loss": 0.6663, - "step": 64 - }, - { - "epoch": 0.3987730061349693, - "grad_norm": 2.905898094177246, - "learning_rate": 4.981004804722384e-06, - "loss": 0.6895, - "step": 65 - }, - { - "epoch": 0.4049079754601227, - "grad_norm": 2.9174182415008545, - "learning_rate": 4.980407349983556e-06, - "loss": 0.7982, - "step": 66 - }, - { - "epoch": 0.4110429447852761, - "grad_norm": 2.214322805404663, - "learning_rate": 4.979800681261619e-06, - "loss": 0.6808, - "step": 67 - }, - { - "epoch": 0.4171779141104294, - "grad_norm": 2.7152462005615234, - "learning_rate": 4.9791848008101705e-06, - "loss": 0.567, - "step": 68 - }, - { - "epoch": 0.4233128834355828, - "grad_norm": 2.5657734870910645, - "learning_rate": 4.978559710917024e-06, - "loss": 0.7745, - "step": 69 - }, - { - "epoch": 0.4294478527607362, - "grad_norm": 3.9103832244873047, - "learning_rate": 4.977925413904205e-06, - "loss": 0.9815, - "step": 70 - }, - { - "epoch": 0.43558282208588955, - "grad_norm": 4.610236644744873, - "learning_rate": 4.9772819121279395e-06, - "loss": 1.164, - "step": 71 - }, - { - "epoch": 0.44171779141104295, - "grad_norm": 3.01170015335083, - "learning_rate": 4.976629207978648e-06, - "loss": 0.7587, - "step": 72 - }, - { - "epoch": 0.44785276073619634, - "grad_norm": 3.175889253616333, - "learning_rate": 4.975967303880933e-06, - "loss": 0.58, - "step": 73 - }, - { - "epoch": 0.4539877300613497, - "grad_norm": 2.503741502761841, - "learning_rate": 4.975296202293575e-06, - "loss": 0.7253, - "step": 74 - }, - { - "epoch": 0.4601226993865031, - "grad_norm": 2.6778078079223633, - "learning_rate": 4.974615905709518e-06, - "loss": 0.7352, - "step": 75 - }, - { - "epoch": 0.4662576687116564, - "grad_norm": 5.950812816619873, - "learning_rate": 4.973926416655863e-06, - "loss": 1.0643, - "step": 76 - }, - { - "epoch": 0.4723926380368098, - "grad_norm": 3.0165305137634277, - "learning_rate": 4.973227737693858e-06, - "loss": 0.6699, - "step": 77 - }, - { - "epoch": 0.4785276073619632, - "grad_norm": 4.793259620666504, - "learning_rate": 4.972519871418894e-06, - "loss": 1.0315, - "step": 78 - }, - { - "epoch": 0.48466257668711654, - "grad_norm": 3.632815361022949, - "learning_rate": 4.971802820460481e-06, - "loss": 0.7003, - "step": 79 - }, - { - "epoch": 0.49079754601226994, - "grad_norm": 3.077507734298706, - "learning_rate": 4.971076587482254e-06, - "loss": 0.6776, - "step": 80 - }, - { - "epoch": 0.49693251533742333, - "grad_norm": 3.3886241912841797, - "learning_rate": 4.970341175181957e-06, - "loss": 0.7422, - "step": 81 - }, - { - "epoch": 0.5030674846625767, - "grad_norm": 2.71288800239563, - "learning_rate": 4.969596586291425e-06, - "loss": 0.7471, - "step": 82 - }, - { - "epoch": 0.50920245398773, - "grad_norm": 2.777920961380005, - "learning_rate": 4.968842823576592e-06, - "loss": 0.8111, - "step": 83 - }, - { - "epoch": 0.5153374233128835, - "grad_norm": 6.496985912322998, - "learning_rate": 4.968079889837461e-06, - "loss": 0.9965, - "step": 84 - }, - { - "epoch": 0.5214723926380368, - "grad_norm": 2.6163430213928223, - "learning_rate": 4.967307787908108e-06, - "loss": 0.6833, - "step": 85 - }, - { - "epoch": 0.5276073619631901, - "grad_norm": 3.244098663330078, - "learning_rate": 4.966526520656663e-06, - "loss": 0.8373, - "step": 86 - }, - { - "epoch": 0.5337423312883436, - "grad_norm": 2.9027860164642334, - "learning_rate": 4.965736090985305e-06, - "loss": 0.8529, - "step": 87 - }, - { - "epoch": 0.5398773006134969, - "grad_norm": 2.3786230087280273, - "learning_rate": 4.964936501830246e-06, - "loss": 0.6577, - "step": 88 - }, - { - "epoch": 0.5460122699386503, - "grad_norm": 7.3099045753479, - "learning_rate": 4.964127756161727e-06, - "loss": 1.1184, - "step": 89 - }, - { - "epoch": 0.5521472392638037, - "grad_norm": 3.068873167037964, - "learning_rate": 4.963309856983998e-06, - "loss": 0.7906, - "step": 90 - }, - { - "epoch": 0.558282208588957, - "grad_norm": 3.082547426223755, - "learning_rate": 4.9624828073353144e-06, - "loss": 0.8107, - "step": 91 - }, - { - "epoch": 0.5644171779141104, - "grad_norm": 2.4586973190307617, - "learning_rate": 4.961646610287922e-06, - "loss": 0.7421, - "step": 92 - }, - { - "epoch": 0.5705521472392638, - "grad_norm": 2.779277801513672, - "learning_rate": 4.960801268948047e-06, - "loss": 0.7134, - "step": 93 - }, - { - "epoch": 0.5766871165644172, - "grad_norm": 3.2255213260650635, - "learning_rate": 4.959946786455882e-06, - "loss": 0.5875, - "step": 94 - }, - { - "epoch": 0.5828220858895705, - "grad_norm": 2.783395528793335, - "learning_rate": 4.959083165985581e-06, - "loss": 0.6595, - "step": 95 - }, - { - "epoch": 0.588957055214724, - "grad_norm": 2.240114212036133, - "learning_rate": 4.958210410745237e-06, - "loss": 0.793, - "step": 96 - }, - { - "epoch": 0.5950920245398773, - "grad_norm": 2.9399421215057373, - "learning_rate": 4.957328523976879e-06, - "loss": 0.5896, - "step": 97 - }, - { - "epoch": 0.6012269938650306, - "grad_norm": 3.4449355602264404, - "learning_rate": 4.956437508956458e-06, - "loss": 0.8658, - "step": 98 - }, - { - "epoch": 0.6073619631901841, - "grad_norm": 4.273710250854492, - "learning_rate": 4.9555373689938325e-06, - "loss": 0.8316, - "step": 99 - }, - { - "epoch": 0.6134969325153374, - "grad_norm": 3.4222047328948975, - "learning_rate": 4.954628107432757e-06, - "loss": 1.0613, - "step": 100 - }, - { - "epoch": 0.6196319018404908, - "grad_norm": 2.5318963527679443, - "learning_rate": 4.95370972765087e-06, - "loss": 0.7194, - "step": 101 - }, - { - "epoch": 0.6257668711656442, - "grad_norm": 2.7852585315704346, - "learning_rate": 4.952782233059683e-06, - "loss": 0.5927, - "step": 102 - }, - { - "epoch": 0.6319018404907976, - "grad_norm": 2.6532323360443115, - "learning_rate": 4.951845627104565e-06, - "loss": 0.8505, - "step": 103 - }, - { - "epoch": 0.6380368098159509, - "grad_norm": 2.3213467597961426, - "learning_rate": 4.95089991326473e-06, - "loss": 0.8682, - "step": 104 - }, - { - "epoch": 0.6441717791411042, - "grad_norm": 2.607992649078369, - "learning_rate": 4.9499450950532305e-06, - "loss": 0.8735, - "step": 105 - }, - { - "epoch": 0.6503067484662577, - "grad_norm": 3.9820072650909424, - "learning_rate": 4.94898117601693e-06, - "loss": 1.0571, - "step": 106 - }, - { - "epoch": 0.656441717791411, - "grad_norm": 3.3878824710845947, - "learning_rate": 4.948008159736507e-06, - "loss": 0.7831, - "step": 107 - }, - { - "epoch": 0.6625766871165644, - "grad_norm": 2.6935670375823975, - "learning_rate": 4.94702604982643e-06, - "loss": 0.5968, - "step": 108 - }, - { - "epoch": 0.6687116564417178, - "grad_norm": 2.78190016746521, - "learning_rate": 4.9460348499349485e-06, - "loss": 0.7504, - "step": 109 - }, - { - "epoch": 0.6748466257668712, - "grad_norm": 2.973083972930908, - "learning_rate": 4.945034563744077e-06, - "loss": 0.6728, - "step": 110 - }, - { - "epoch": 0.6809815950920245, - "grad_norm": 2.631803512573242, - "learning_rate": 4.944025194969586e-06, - "loss": 0.609, - "step": 111 - }, - { - "epoch": 0.6871165644171779, - "grad_norm": 2.7443883419036865, - "learning_rate": 4.9430067473609825e-06, - "loss": 0.8713, - "step": 112 - }, - { - "epoch": 0.6932515337423313, - "grad_norm": 2.543769121170044, - "learning_rate": 4.941979224701499e-06, - "loss": 0.8035, - "step": 113 - }, - { - "epoch": 0.6993865030674846, - "grad_norm": 3.7799901962280273, - "learning_rate": 4.94094263080808e-06, - "loss": 0.9341, - "step": 114 - }, - { - "epoch": 0.7055214723926381, - "grad_norm": 3.1234734058380127, - "learning_rate": 4.939896969531367e-06, - "loss": 1.1066, - "step": 115 - }, - { - "epoch": 0.7116564417177914, - "grad_norm": 2.356036424636841, - "learning_rate": 4.938842244755683e-06, - "loss": 0.853, - "step": 116 - }, - { - "epoch": 0.7177914110429447, - "grad_norm": 3.6231274604797363, - "learning_rate": 4.937778460399022e-06, - "loss": 0.9116, - "step": 117 - }, - { - "epoch": 0.7239263803680982, - "grad_norm": 3.1277005672454834, - "learning_rate": 4.936705620413028e-06, - "loss": 0.5888, - "step": 118 - }, - { - "epoch": 0.7300613496932515, - "grad_norm": 2.7338361740112305, - "learning_rate": 4.935623728782986e-06, - "loss": 0.592, - "step": 119 - }, - { - "epoch": 0.7361963190184049, - "grad_norm": 2.748363733291626, - "learning_rate": 4.934532789527805e-06, - "loss": 0.8713, - "step": 120 - }, - { - "epoch": 0.7423312883435583, - "grad_norm": 4.460031986236572, - "learning_rate": 4.933432806700004e-06, - "loss": 0.6791, - "step": 121 - }, - { - "epoch": 0.7484662576687117, - "grad_norm": 2.392911911010742, - "learning_rate": 4.932323784385693e-06, - "loss": 0.7531, - "step": 122 - }, - { - "epoch": 0.754601226993865, - "grad_norm": 2.7804384231567383, - "learning_rate": 4.931205726704566e-06, - "loss": 0.7547, - "step": 123 - }, - { - "epoch": 0.7607361963190185, - "grad_norm": 2.7664780616760254, - "learning_rate": 4.930078637809878e-06, - "loss": 0.7849, - "step": 124 - }, - { - "epoch": 0.7668711656441718, - "grad_norm": 2.592808723449707, - "learning_rate": 4.928942521888431e-06, - "loss": 0.7015, - "step": 125 - }, - { - "epoch": 0.7730061349693251, - "grad_norm": 2.7080585956573486, - "learning_rate": 4.927797383160561e-06, - "loss": 1.0028, - "step": 126 - }, - { - "epoch": 0.7791411042944786, - "grad_norm": 2.7941503524780273, - "learning_rate": 4.926643225880123e-06, - "loss": 0.602, - "step": 127 - }, - { - "epoch": 0.7852760736196319, - "grad_norm": 3.2796623706817627, - "learning_rate": 4.925480054334471e-06, - "loss": 0.7473, - "step": 128 - }, - { - "epoch": 0.7914110429447853, - "grad_norm": 2.7623610496520996, - "learning_rate": 4.924307872844444e-06, - "loss": 1.0573, - "step": 129 - }, - { - "epoch": 0.7975460122699386, - "grad_norm": 2.6224453449249268, - "learning_rate": 4.923126685764351e-06, - "loss": 0.7399, - "step": 130 - }, - { - "epoch": 0.803680981595092, - "grad_norm": 17.736326217651367, - "learning_rate": 4.921936497481956e-06, - "loss": 0.9548, - "step": 131 - }, - { - "epoch": 0.8098159509202454, - "grad_norm": 2.504213333129883, - "learning_rate": 4.920737312418456e-06, - "loss": 0.6748, - "step": 132 - }, - { - "epoch": 0.8159509202453987, - "grad_norm": 3.617077350616455, - "learning_rate": 4.919529135028473e-06, - "loss": 0.8431, - "step": 133 - }, - { - "epoch": 0.8220858895705522, - "grad_norm": 2.6559832096099854, - "learning_rate": 4.918311969800027e-06, - "loss": 0.7243, - "step": 134 - }, - { - "epoch": 0.8282208588957055, - "grad_norm": 2.7539305686950684, - "learning_rate": 4.917085821254532e-06, - "loss": 0.7845, - "step": 135 - }, - { - "epoch": 0.8343558282208589, - "grad_norm": 3.3587615489959717, - "learning_rate": 4.915850693946766e-06, - "loss": 0.4891, - "step": 136 - }, - { - "epoch": 0.8404907975460123, - "grad_norm": 3.064354181289673, - "learning_rate": 4.914606592464865e-06, - "loss": 0.7917, - "step": 137 - }, - { - "epoch": 0.8466257668711656, - "grad_norm": 3.2505199909210205, - "learning_rate": 4.9133535214303e-06, - "loss": 0.9681, - "step": 138 - }, - { - "epoch": 0.852760736196319, - "grad_norm": 3.8027830123901367, - "learning_rate": 4.91209148549786e-06, - "loss": 0.9275, - "step": 139 - }, - { - "epoch": 0.8588957055214724, - "grad_norm": 2.4154372215270996, - "learning_rate": 4.910820489355637e-06, - "loss": 0.7259, - "step": 140 - }, - { - "epoch": 0.8650306748466258, - "grad_norm": 2.892462968826294, - "learning_rate": 4.909540537725007e-06, - "loss": 0.6061, - "step": 141 - }, - { - "epoch": 0.8711656441717791, - "grad_norm": 3.3398196697235107, - "learning_rate": 4.908251635360616e-06, - "loss": 1.0559, - "step": 142 - }, - { - "epoch": 0.8773006134969326, - "grad_norm": 3.022512197494507, - "learning_rate": 4.906953787050354e-06, - "loss": 0.7372, - "step": 143 - }, - { - "epoch": 0.8834355828220859, - "grad_norm": 2.658661365509033, - "learning_rate": 4.905646997615347e-06, - "loss": 0.6234, - "step": 144 - }, - { - "epoch": 0.8895705521472392, - "grad_norm": 3.454400062561035, - "learning_rate": 4.904331271909932e-06, - "loss": 0.8066, - "step": 145 - }, - { - "epoch": 0.8957055214723927, - "grad_norm": 3.1300277709960938, - "learning_rate": 4.903006614821645e-06, - "loss": 0.6861, - "step": 146 - }, - { - "epoch": 0.901840490797546, - "grad_norm": 2.362537145614624, - "learning_rate": 4.901673031271194e-06, - "loss": 0.6112, - "step": 147 - }, - { - "epoch": 0.9079754601226994, - "grad_norm": 3.375577688217163, - "learning_rate": 4.900330526212451e-06, - "loss": 0.6314, - "step": 148 - }, - { - "epoch": 0.9141104294478528, - "grad_norm": 2.955656051635742, - "learning_rate": 4.898979104632427e-06, - "loss": 0.889, - "step": 149 - }, - { - "epoch": 0.9202453987730062, - "grad_norm": 2.9285926818847656, - "learning_rate": 4.897618771551255e-06, - "loss": 0.6406, - "step": 150 - }, - { - "epoch": 0.9263803680981595, - "grad_norm": 2.131819725036621, - "learning_rate": 4.8962495320221714e-06, - "loss": 0.6368, - "step": 151 - }, - { - "epoch": 0.9325153374233128, - "grad_norm": 2.780649185180664, - "learning_rate": 4.8948713911315e-06, - "loss": 0.8642, - "step": 152 - }, - { - "epoch": 0.9386503067484663, - "grad_norm": 2.941500186920166, - "learning_rate": 4.8934843539986266e-06, - "loss": 0.714, - "step": 153 - }, - { - "epoch": 0.9447852760736196, - "grad_norm": 2.7729203701019287, - "learning_rate": 4.892088425775986e-06, - "loss": 0.8365, - "step": 154 - }, - { - "epoch": 0.950920245398773, - "grad_norm": 2.6887171268463135, - "learning_rate": 4.890683611649041e-06, - "loss": 0.7937, - "step": 155 - }, - { - "epoch": 0.9570552147239264, - "grad_norm": 3.7638463973999023, - "learning_rate": 4.8892699168362626e-06, - "loss": 0.7485, - "step": 156 - }, - { - "epoch": 0.9631901840490797, - "grad_norm": 2.8132755756378174, - "learning_rate": 4.887847346589111e-06, - "loss": 0.6467, - "step": 157 - }, - { - "epoch": 0.9693251533742331, - "grad_norm": 2.652247190475464, - "learning_rate": 4.886415906192015e-06, - "loss": 0.4651, - "step": 158 - }, - { - "epoch": 0.9754601226993865, - "grad_norm": 2.5854647159576416, - "learning_rate": 4.884975600962355e-06, - "loss": 0.8756, - "step": 159 - }, - { - "epoch": 0.9815950920245399, - "grad_norm": 3.1630544662475586, - "learning_rate": 4.883526436250441e-06, - "loss": 0.7339, - "step": 160 - }, - { - "epoch": 0.9877300613496932, - "grad_norm": 2.84452748298645, - "learning_rate": 4.8820684174394935e-06, - "loss": 0.7808, - "step": 161 - }, - { - "epoch": 0.9938650306748467, - "grad_norm": 3.604048490524292, - "learning_rate": 4.880601549945622e-06, - "loss": 0.96, - "step": 162 - }, - { - "epoch": 1.0, - "grad_norm": 2.302924871444702, - "learning_rate": 4.879125839217808e-06, - "loss": 0.8122, - "step": 163 - }, - { - "epoch": 1.0061349693251533, - "grad_norm": 3.1254405975341797, - "learning_rate": 4.8776412907378845e-06, - "loss": 0.7307, - "step": 164 - }, - { - "epoch": 1.0122699386503067, - "grad_norm": 2.745603322982788, - "learning_rate": 4.8761479100205085e-06, - "loss": 0.7554, - "step": 165 - }, - { - "epoch": 1.01840490797546, - "grad_norm": 2.494840145111084, - "learning_rate": 4.874645702613152e-06, - "loss": 0.4372, - "step": 166 - }, - { - "epoch": 1.0245398773006136, - "grad_norm": 2.3526735305786133, - "learning_rate": 4.873134674096072e-06, - "loss": 0.3597, - "step": 167 - }, - { - "epoch": 1.030674846625767, - "grad_norm": 2.945887804031372, - "learning_rate": 4.871614830082297e-06, - "loss": 0.5854, - "step": 168 - }, - { - "epoch": 1.0368098159509203, - "grad_norm": 3.5723934173583984, - "learning_rate": 4.870086176217597e-06, - "loss": 0.7978, - "step": 169 - }, - { - "epoch": 1.0429447852760736, - "grad_norm": 3.2997145652770996, - "learning_rate": 4.868548718180473e-06, - "loss": 0.5593, - "step": 170 - }, - { - "epoch": 1.049079754601227, - "grad_norm": 3.4120635986328125, - "learning_rate": 4.867002461682129e-06, - "loss": 0.4083, - "step": 171 - }, - { - "epoch": 1.0552147239263803, - "grad_norm": 2.697617292404175, - "learning_rate": 4.8654474124664505e-06, - "loss": 0.4752, - "step": 172 - }, - { - "epoch": 1.0613496932515338, - "grad_norm": 5.082247734069824, - "learning_rate": 4.863883576309991e-06, - "loss": 0.7435, - "step": 173 - }, - { - "epoch": 1.0674846625766872, - "grad_norm": 2.773864984512329, - "learning_rate": 4.8623109590219395e-06, - "loss": 0.4612, - "step": 174 - }, - { - "epoch": 1.0736196319018405, - "grad_norm": 3.429703712463379, - "learning_rate": 4.860729566444106e-06, - "loss": 0.4644, - "step": 175 - }, - { - "epoch": 1.0797546012269938, - "grad_norm": 2.997938394546509, - "learning_rate": 4.8591394044508985e-06, - "loss": 0.4852, - "step": 176 - }, - { - "epoch": 1.0858895705521472, - "grad_norm": 2.549513339996338, - "learning_rate": 4.857540478949302e-06, - "loss": 0.4574, - "step": 177 - }, - { - "epoch": 1.0920245398773005, - "grad_norm": 3.459400177001953, - "learning_rate": 4.855932795878852e-06, - "loss": 0.8095, - "step": 178 - }, - { - "epoch": 1.098159509202454, - "grad_norm": 2.8103644847869873, - "learning_rate": 4.854316361211619e-06, - "loss": 0.4578, - "step": 179 - }, - { - "epoch": 1.1042944785276074, - "grad_norm": 2.631221055984497, - "learning_rate": 4.852691180952183e-06, - "loss": 0.5473, - "step": 180 - }, - { - "epoch": 1.1104294478527608, - "grad_norm": 3.189946174621582, - "learning_rate": 4.851057261137608e-06, - "loss": 0.4313, - "step": 181 - }, - { - "epoch": 1.116564417177914, - "grad_norm": 2.891418933868408, - "learning_rate": 4.8494146078374274e-06, - "loss": 0.4197, - "step": 182 - }, - { - "epoch": 1.1226993865030674, - "grad_norm": 3.239637613296509, - "learning_rate": 4.847763227153612e-06, - "loss": 0.5865, - "step": 183 - }, - { - "epoch": 1.1288343558282208, - "grad_norm": 2.484644651412964, - "learning_rate": 4.846103125220557e-06, - "loss": 0.3866, - "step": 184 - }, - { - "epoch": 1.1349693251533743, - "grad_norm": 3.1045992374420166, - "learning_rate": 4.844434308205052e-06, - "loss": 0.5357, - "step": 185 - }, - { - "epoch": 1.1411042944785277, - "grad_norm": 2.648472309112549, - "learning_rate": 4.842756782306261e-06, - "loss": 0.4783, - "step": 186 - }, - { - "epoch": 1.147239263803681, - "grad_norm": 2.5685644149780273, - "learning_rate": 4.841070553755697e-06, - "loss": 0.3733, - "step": 187 - }, - { - "epoch": 1.1533742331288344, - "grad_norm": 3.7727200984954834, - "learning_rate": 4.839375628817205e-06, - "loss": 0.6039, - "step": 188 - }, - { - "epoch": 1.1595092024539877, - "grad_norm": 2.8237369060516357, - "learning_rate": 4.837672013786931e-06, - "loss": 0.5372, - "step": 189 - }, - { - "epoch": 1.165644171779141, - "grad_norm": 3.0312252044677734, - "learning_rate": 4.835959714993305e-06, - "loss": 0.5162, - "step": 190 - }, - { - "epoch": 1.1717791411042944, - "grad_norm": 2.821498394012451, - "learning_rate": 4.8342387387970105e-06, - "loss": 0.4537, - "step": 191 - }, - { - "epoch": 1.177914110429448, - "grad_norm": 2.7834129333496094, - "learning_rate": 4.832509091590968e-06, - "loss": 0.6165, - "step": 192 - }, - { - "epoch": 1.1840490797546013, - "grad_norm": 2.9274091720581055, - "learning_rate": 4.830770779800309e-06, - "loss": 0.7475, - "step": 193 - }, - { - "epoch": 1.1901840490797546, - "grad_norm": 2.813945770263672, - "learning_rate": 4.829023809882349e-06, - "loss": 0.4629, - "step": 194 - }, - { - "epoch": 1.196319018404908, - "grad_norm": 2.27876877784729, - "learning_rate": 4.827268188326567e-06, - "loss": 0.5208, - "step": 195 - }, - { - "epoch": 1.2024539877300613, - "grad_norm": 2.8444204330444336, - "learning_rate": 4.825503921654582e-06, - "loss": 0.6521, - "step": 196 - }, - { - "epoch": 1.2085889570552146, - "grad_norm": 3.3730578422546387, - "learning_rate": 4.823731016420122e-06, - "loss": 0.7491, - "step": 197 - }, - { - "epoch": 1.2147239263803682, - "grad_norm": 2.9717822074890137, - "learning_rate": 4.821949479209011e-06, - "loss": 0.3866, - "step": 198 - }, - { - "epoch": 1.2208588957055215, - "grad_norm": 2.6570653915405273, - "learning_rate": 4.820159316639133e-06, - "loss": 0.499, - "step": 199 - }, - { - "epoch": 1.2269938650306749, - "grad_norm": 2.819960117340088, - "learning_rate": 4.818360535360418e-06, - "loss": 0.556, - "step": 200 - }, - { - "epoch": 1.2331288343558282, - "grad_norm": 2.7912111282348633, - "learning_rate": 4.816553142054806e-06, - "loss": 0.3433, - "step": 201 - }, - { - "epoch": 1.2392638036809815, - "grad_norm": 2.6427981853485107, - "learning_rate": 4.814737143436232e-06, - "loss": 0.8808, - "step": 202 - }, - { - "epoch": 1.2453987730061349, - "grad_norm": 2.5917580127716064, - "learning_rate": 4.812912546250595e-06, - "loss": 0.5718, - "step": 203 - }, - { - "epoch": 1.2515337423312882, - "grad_norm": 3.770759344100952, - "learning_rate": 4.81107935727574e-06, - "loss": 0.9743, - "step": 204 - }, - { - "epoch": 1.2576687116564418, - "grad_norm": 2.558248996734619, - "learning_rate": 4.809237583321421e-06, - "loss": 0.2821, - "step": 205 - }, - { - "epoch": 1.2638036809815951, - "grad_norm": 2.692087173461914, - "learning_rate": 4.807387231229287e-06, - "loss": 0.7524, - "step": 206 - }, - { - "epoch": 1.2699386503067485, - "grad_norm": 2.661738157272339, - "learning_rate": 4.8055283078728525e-06, - "loss": 0.4304, - "step": 207 - }, - { - "epoch": 1.2760736196319018, - "grad_norm": 2.9232122898101807, - "learning_rate": 4.803660820157468e-06, - "loss": 0.6986, - "step": 208 - }, - { - "epoch": 1.2822085889570551, - "grad_norm": 2.665097951889038, - "learning_rate": 4.801784775020303e-06, - "loss": 0.7112, - "step": 209 - }, - { - "epoch": 1.2883435582822087, - "grad_norm": 2.4504497051239014, - "learning_rate": 4.799900179430312e-06, - "loss": 0.4125, - "step": 210 - }, - { - "epoch": 1.294478527607362, - "grad_norm": 3.076204538345337, - "learning_rate": 4.798007040388212e-06, - "loss": 0.7057, - "step": 211 - }, - { - "epoch": 1.3006134969325154, - "grad_norm": 2.406977653503418, - "learning_rate": 4.7961053649264585e-06, - "loss": 0.708, - "step": 212 - }, - { - "epoch": 1.3067484662576687, - "grad_norm": 2.6545324325561523, - "learning_rate": 4.794195160109215e-06, - "loss": 0.7608, - "step": 213 - }, - { - "epoch": 1.312883435582822, - "grad_norm": 4.3817033767700195, - "learning_rate": 4.7922764330323315e-06, - "loss": 0.4779, - "step": 214 - }, - { - "epoch": 1.3190184049079754, - "grad_norm": 3.534566879272461, - "learning_rate": 4.790349190823313e-06, - "loss": 0.5464, - "step": 215 - }, - { - "epoch": 1.3251533742331287, - "grad_norm": 3.0323140621185303, - "learning_rate": 4.788413440641297e-06, - "loss": 0.6198, - "step": 216 - }, - { - "epoch": 1.331288343558282, - "grad_norm": 2.612746238708496, - "learning_rate": 4.786469189677026e-06, - "loss": 0.6695, - "step": 217 - }, - { - "epoch": 1.3374233128834356, - "grad_norm": 3.0299434661865234, - "learning_rate": 4.784516445152821e-06, - "loss": 0.4902, - "step": 218 - }, - { - "epoch": 1.343558282208589, - "grad_norm": 3.4521942138671875, - "learning_rate": 4.78255521432255e-06, - "loss": 0.7411, - "step": 219 - }, - { - "epoch": 1.3496932515337423, - "grad_norm": 2.6712653636932373, - "learning_rate": 4.780585504471612e-06, - "loss": 0.8767, - "step": 220 - }, - { - "epoch": 1.3558282208588956, - "grad_norm": 2.5099475383758545, - "learning_rate": 4.778607322916896e-06, - "loss": 0.4266, - "step": 221 - }, - { - "epoch": 1.3619631901840492, - "grad_norm": 2.641799211502075, - "learning_rate": 4.776620677006766e-06, - "loss": 0.4982, - "step": 222 - }, - { - "epoch": 1.3680981595092025, - "grad_norm": 3.1119771003723145, - "learning_rate": 4.7746255741210256e-06, - "loss": 0.6012, - "step": 223 - }, - { - "epoch": 1.3742331288343559, - "grad_norm": 3.9957170486450195, - "learning_rate": 4.772622021670897e-06, - "loss": 0.7585, - "step": 224 - }, - { - "epoch": 1.3803680981595092, - "grad_norm": 3.1070823669433594, - "learning_rate": 4.770610027098983e-06, - "loss": 0.5266, - "step": 225 - }, - { - "epoch": 1.3865030674846626, - "grad_norm": 2.7630460262298584, - "learning_rate": 4.7685895978792564e-06, - "loss": 0.6261, - "step": 226 - }, - { - "epoch": 1.392638036809816, - "grad_norm": 2.6509556770324707, - "learning_rate": 4.766560741517014e-06, - "loss": 0.7081, - "step": 227 - }, - { - "epoch": 1.3987730061349692, - "grad_norm": 3.0212976932525635, - "learning_rate": 4.76452346554886e-06, - "loss": 0.5041, - "step": 228 - }, - { - "epoch": 1.4049079754601226, - "grad_norm": 3.0454728603363037, - "learning_rate": 4.762477777542676e-06, - "loss": 0.49, - "step": 229 - }, - { - "epoch": 1.4110429447852761, - "grad_norm": 3.4296791553497314, - "learning_rate": 4.7604236850975905e-06, - "loss": 0.7056, - "step": 230 - }, - { - "epoch": 1.4171779141104295, - "grad_norm": 4.1885600090026855, - "learning_rate": 4.7583611958439514e-06, - "loss": 0.7762, - "step": 231 - }, - { - "epoch": 1.4233128834355828, - "grad_norm": 3.065854072570801, - "learning_rate": 4.7562903174433e-06, - "loss": 0.5347, - "step": 232 - }, - { - "epoch": 1.4294478527607362, - "grad_norm": 2.793851852416992, - "learning_rate": 4.75421105758834e-06, - "loss": 0.503, - "step": 233 - }, - { - "epoch": 1.4355828220858895, - "grad_norm": 3.123730421066284, - "learning_rate": 4.752123424002908e-06, - "loss": 0.5081, - "step": 234 - }, - { - "epoch": 1.441717791411043, - "grad_norm": 3.230161666870117, - "learning_rate": 4.750027424441949e-06, - "loss": 0.7523, - "step": 235 - }, - { - "epoch": 1.4478527607361964, - "grad_norm": 2.4970247745513916, - "learning_rate": 4.747923066691487e-06, - "loss": 0.5575, - "step": 236 - }, - { - "epoch": 1.4539877300613497, - "grad_norm": 2.9880685806274414, - "learning_rate": 4.745810358568588e-06, - "loss": 0.7264, - "step": 237 - }, - { - "epoch": 1.460122699386503, - "grad_norm": 2.555328369140625, - "learning_rate": 4.743689307921342e-06, - "loss": 0.4545, - "step": 238 - }, - { - "epoch": 1.4662576687116564, - "grad_norm": 3.144932746887207, - "learning_rate": 4.741559922628828e-06, - "loss": 0.5429, - "step": 239 - }, - { - "epoch": 1.4723926380368098, - "grad_norm": 3.059807062149048, - "learning_rate": 4.739422210601085e-06, - "loss": 0.5086, - "step": 240 - }, - { - "epoch": 1.478527607361963, - "grad_norm": 3.374303102493286, - "learning_rate": 4.7372761797790836e-06, - "loss": 0.6109, - "step": 241 - }, - { - "epoch": 1.4846625766871164, - "grad_norm": 2.4506947994232178, - "learning_rate": 4.735121838134697e-06, - "loss": 0.4317, - "step": 242 - }, - { - "epoch": 1.49079754601227, - "grad_norm": 2.9039974212646484, - "learning_rate": 4.732959193670672e-06, - "loss": 0.6414, - "step": 243 - }, - { - "epoch": 1.4969325153374233, - "grad_norm": 2.9412453174591064, - "learning_rate": 4.730788254420593e-06, - "loss": 0.5166, - "step": 244 - }, - { - "epoch": 1.5030674846625767, - "grad_norm": 2.500716209411621, - "learning_rate": 4.728609028448862e-06, - "loss": 0.4982, - "step": 245 - }, - { - "epoch": 1.50920245398773, - "grad_norm": 2.4233803749084473, - "learning_rate": 4.726421523850662e-06, - "loss": 0.7552, - "step": 246 - }, - { - "epoch": 1.5153374233128836, - "grad_norm": 2.357003688812256, - "learning_rate": 4.7242257487519275e-06, - "loss": 0.4365, - "step": 247 - }, - { - "epoch": 1.521472392638037, - "grad_norm": 2.6406495571136475, - "learning_rate": 4.722021711309317e-06, - "loss": 0.6002, - "step": 248 - }, - { - "epoch": 1.5276073619631902, - "grad_norm": 2.736884832382202, - "learning_rate": 4.7198094197101826e-06, - "loss": 0.4993, - "step": 249 - }, - { - "epoch": 1.5337423312883436, - "grad_norm": 3.5238845348358154, - "learning_rate": 4.7175888821725335e-06, - "loss": 0.4637, - "step": 250 - }, - { - "epoch": 1.539877300613497, - "grad_norm": 3.3783695697784424, - "learning_rate": 4.715360106945015e-06, - "loss": 0.9711, - "step": 251 - }, - { - "epoch": 1.5460122699386503, - "grad_norm": 2.9685862064361572, - "learning_rate": 4.713123102306869e-06, - "loss": 0.5452, - "step": 252 - }, - { - "epoch": 1.5521472392638036, - "grad_norm": 3.143733263015747, - "learning_rate": 4.710877876567912e-06, - "loss": 0.5034, - "step": 253 - }, - { - "epoch": 1.558282208588957, - "grad_norm": 2.8005623817443848, - "learning_rate": 4.708624438068494e-06, - "loss": 0.4236, - "step": 254 - }, - { - "epoch": 1.5644171779141103, - "grad_norm": 2.66581130027771, - "learning_rate": 4.706362795179476e-06, - "loss": 0.6095, - "step": 255 - }, - { - "epoch": 1.5705521472392638, - "grad_norm": 4.598043441772461, - "learning_rate": 4.7040929563021975e-06, - "loss": 0.738, - "step": 256 - }, - { - "epoch": 1.5766871165644172, - "grad_norm": 3.5643506050109863, - "learning_rate": 4.70181492986844e-06, - "loss": 0.6726, - "step": 257 - }, - { - "epoch": 1.5828220858895705, - "grad_norm": 2.865339994430542, - "learning_rate": 4.699528724340401e-06, - "loss": 0.4862, - "step": 258 - }, - { - "epoch": 1.588957055214724, - "grad_norm": 2.95529842376709, - "learning_rate": 4.6972343482106615e-06, - "loss": 0.5003, - "step": 259 - }, - { - "epoch": 1.5950920245398774, - "grad_norm": 2.45206356048584, - "learning_rate": 4.6949318100021546e-06, - "loss": 0.6734, - "step": 260 - }, - { - "epoch": 1.6012269938650308, - "grad_norm": 2.6789939403533936, - "learning_rate": 4.6926211182681295e-06, - "loss": 0.5639, - "step": 261 - }, - { - "epoch": 1.607361963190184, - "grad_norm": 3.307732582092285, - "learning_rate": 4.690302281592128e-06, - "loss": 0.7032, - "step": 262 - }, - { - "epoch": 1.6134969325153374, - "grad_norm": 2.8950445652008057, - "learning_rate": 4.687975308587944e-06, - "loss": 0.4937, - "step": 263 - }, - { - "epoch": 1.6196319018404908, - "grad_norm": 2.969377040863037, - "learning_rate": 4.685640207899598e-06, - "loss": 0.5829, - "step": 264 - }, - { - "epoch": 1.6257668711656441, - "grad_norm": 3.106433391571045, - "learning_rate": 4.683296988201301e-06, - "loss": 0.3805, - "step": 265 - }, - { - "epoch": 1.6319018404907975, - "grad_norm": 3.5599050521850586, - "learning_rate": 4.680945658197425e-06, - "loss": 0.7939, - "step": 266 - }, - { - "epoch": 1.6380368098159508, - "grad_norm": 5.008603096008301, - "learning_rate": 4.6785862266224695e-06, - "loss": 0.7511, - "step": 267 - }, - { - "epoch": 1.6441717791411041, - "grad_norm": 3.1393773555755615, - "learning_rate": 4.676218702241026e-06, - "loss": 0.8984, - "step": 268 - }, - { - "epoch": 1.6503067484662577, - "grad_norm": 3.0241408348083496, - "learning_rate": 4.673843093847753e-06, - "loss": 0.5473, - "step": 269 - }, - { - "epoch": 1.656441717791411, - "grad_norm": 2.9029417037963867, - "learning_rate": 4.6714594102673355e-06, - "loss": 0.6626, - "step": 270 - }, - { - "epoch": 1.6625766871165644, - "grad_norm": 3.4709246158599854, - "learning_rate": 4.669067660354456e-06, - "loss": 0.5015, - "step": 271 - }, - { - "epoch": 1.668711656441718, - "grad_norm": 2.988635778427124, - "learning_rate": 4.666667852993761e-06, - "loss": 0.5384, - "step": 272 - }, - { - "epoch": 1.6748466257668713, - "grad_norm": 3.418140411376953, - "learning_rate": 4.664259997099829e-06, - "loss": 0.7491, - "step": 273 - }, - { - "epoch": 1.6809815950920246, - "grad_norm": 2.592416763305664, - "learning_rate": 4.661844101617135e-06, - "loss": 0.6451, - "step": 274 - }, - { - "epoch": 1.687116564417178, - "grad_norm": 3.1174306869506836, - "learning_rate": 4.6594201755200205e-06, - "loss": 0.6299, - "step": 275 - }, - { - "epoch": 1.6932515337423313, - "grad_norm": 2.6569998264312744, - "learning_rate": 4.656988227812658e-06, - "loss": 0.4477, - "step": 276 - }, - { - "epoch": 1.6993865030674846, - "grad_norm": 3.5733959674835205, - "learning_rate": 4.654548267529015e-06, - "loss": 0.5473, - "step": 277 - }, - { - "epoch": 1.705521472392638, - "grad_norm": 2.7240824699401855, - "learning_rate": 4.652100303732827e-06, - "loss": 0.496, - "step": 278 - }, - { - "epoch": 1.7116564417177913, - "grad_norm": 4.1965460777282715, - "learning_rate": 4.64964434551756e-06, - "loss": 0.932, - "step": 279 - }, - { - "epoch": 1.7177914110429446, - "grad_norm": 2.3237173557281494, - "learning_rate": 4.647180402006372e-06, - "loss": 0.4648, - "step": 280 - }, - { - "epoch": 1.7239263803680982, - "grad_norm": 3.395045042037964, - "learning_rate": 4.644708482352093e-06, - "loss": 0.7237, - "step": 281 - }, - { - "epoch": 1.7300613496932515, - "grad_norm": 3.238593816757202, - "learning_rate": 4.6422285957371735e-06, - "loss": 0.5531, - "step": 282 - }, - { - "epoch": 1.7361963190184049, - "grad_norm": 3.9651403427124023, - "learning_rate": 4.639740751373663e-06, - "loss": 0.6706, - "step": 283 - }, - { - "epoch": 1.7423312883435584, - "grad_norm": 3.0042061805725098, - "learning_rate": 4.63724495850317e-06, - "loss": 0.56, - "step": 284 - }, - { - "epoch": 1.7484662576687118, - "grad_norm": 3.094310760498047, - "learning_rate": 4.634741226396832e-06, - "loss": 0.6138, - "step": 285 - }, - { - "epoch": 1.7546012269938651, - "grad_norm": 2.838168144226074, - "learning_rate": 4.632229564355275e-06, - "loss": 0.4908, - "step": 286 - }, - { - "epoch": 1.7607361963190185, - "grad_norm": 3.3452796936035156, - "learning_rate": 4.629709981708586e-06, - "loss": 0.8181, - "step": 287 - }, - { - "epoch": 1.7668711656441718, - "grad_norm": 2.6630783081054688, - "learning_rate": 4.6271824878162704e-06, - "loss": 0.5625, - "step": 288 - }, - { - "epoch": 1.7730061349693251, - "grad_norm": 2.583650588989258, - "learning_rate": 4.624647092067226e-06, - "loss": 0.3416, - "step": 289 - }, - { - "epoch": 1.7791411042944785, - "grad_norm": 2.73132586479187, - "learning_rate": 4.622103803879702e-06, - "loss": 0.3889, - "step": 290 - }, - { - "epoch": 1.7852760736196318, - "grad_norm": 4.1010260581970215, - "learning_rate": 4.619552632701263e-06, - "loss": 0.611, - "step": 291 - }, - { - "epoch": 1.7914110429447851, - "grad_norm": 4.53068208694458, - "learning_rate": 4.61699358800876e-06, - "loss": 0.7219, - "step": 292 - }, - { - "epoch": 1.7975460122699385, - "grad_norm": 3.4877254962921143, - "learning_rate": 4.614426679308291e-06, - "loss": 0.6402, - "step": 293 - }, - { - "epoch": 1.803680981595092, - "grad_norm": 2.9445226192474365, - "learning_rate": 4.611851916135166e-06, - "loss": 0.509, - "step": 294 - }, - { - "epoch": 1.8098159509202454, - "grad_norm": 2.6622228622436523, - "learning_rate": 4.609269308053872e-06, - "loss": 0.6167, - "step": 295 - }, - { - "epoch": 1.8159509202453987, - "grad_norm": 3.131530523300171, - "learning_rate": 4.606678864658039e-06, - "loss": 0.8039, - "step": 296 - }, - { - "epoch": 1.8220858895705523, - "grad_norm": 3.212188482284546, - "learning_rate": 4.604080595570399e-06, - "loss": 0.5754, - "step": 297 - }, - { - "epoch": 1.8282208588957056, - "grad_norm": 3.522850275039673, - "learning_rate": 4.601474510442759e-06, - "loss": 0.4432, - "step": 298 - }, - { - "epoch": 1.834355828220859, - "grad_norm": 2.5877151489257812, - "learning_rate": 4.598860618955957e-06, - "loss": 0.6541, - "step": 299 - }, - { - "epoch": 1.8404907975460123, - "grad_norm": 2.803833484649658, - "learning_rate": 4.596238930819832e-06, - "loss": 0.5824, - "step": 300 - }, - { - "epoch": 1.8466257668711656, - "grad_norm": 2.7125494480133057, - "learning_rate": 4.5936094557731815e-06, - "loss": 0.6976, - "step": 301 - }, - { - "epoch": 1.852760736196319, - "grad_norm": 3.6549370288848877, - "learning_rate": 4.590972203583732e-06, - "loss": 0.7105, - "step": 302 - }, - { - "epoch": 1.8588957055214723, - "grad_norm": 3.3241944313049316, - "learning_rate": 4.588327184048099e-06, - "loss": 0.7446, - "step": 303 - }, - { - "epoch": 1.8650306748466257, - "grad_norm": 2.8388822078704834, - "learning_rate": 4.585674406991752e-06, - "loss": 0.4926, - "step": 304 - }, - { - "epoch": 1.871165644171779, - "grad_norm": 2.9760420322418213, - "learning_rate": 4.5830138822689755e-06, - "loss": 0.7368, - "step": 305 - }, - { - "epoch": 1.8773006134969326, - "grad_norm": 2.5437633991241455, - "learning_rate": 4.5803456197628374e-06, - "loss": 0.4678, - "step": 306 - }, - { - "epoch": 1.883435582822086, - "grad_norm": 3.0044775009155273, - "learning_rate": 4.577669629385145e-06, - "loss": 0.4241, - "step": 307 - }, - { - "epoch": 1.8895705521472392, - "grad_norm": 2.6150901317596436, - "learning_rate": 4.574985921076418e-06, - "loss": 0.5327, - "step": 308 - }, - { - "epoch": 1.8957055214723928, - "grad_norm": 2.4425182342529297, - "learning_rate": 4.572294504805841e-06, - "loss": 0.7504, - "step": 309 - }, - { - "epoch": 1.9018404907975461, - "grad_norm": 2.9920194149017334, - "learning_rate": 4.569595390571232e-06, - "loss": 0.5194, - "step": 310 - }, - { - "epoch": 1.9079754601226995, - "grad_norm": 2.701087713241577, - "learning_rate": 4.566888588399007e-06, - "loss": 0.6862, - "step": 311 - }, - { - "epoch": 1.9141104294478528, - "grad_norm": 7.628893852233887, - "learning_rate": 4.564174108344139e-06, - "loss": 0.6867, - "step": 312 - }, - { - "epoch": 1.9202453987730062, - "grad_norm": 2.712947130203247, - "learning_rate": 4.561451960490123e-06, - "loss": 0.6942, - "step": 313 - }, - { - "epoch": 1.9263803680981595, - "grad_norm": 3.0063202381134033, - "learning_rate": 4.558722154948937e-06, - "loss": 0.6346, - "step": 314 - }, - { - "epoch": 1.9325153374233128, - "grad_norm": 2.957218647003174, - "learning_rate": 4.5559847018610034e-06, - "loss": 0.464, - "step": 315 - }, - { - "epoch": 1.9386503067484662, - "grad_norm": 3.322282552719116, - "learning_rate": 4.553239611395156e-06, - "loss": 0.6334, - "step": 316 - }, - { - "epoch": 1.9447852760736195, - "grad_norm": 3.0638647079467773, - "learning_rate": 4.550486893748596e-06, - "loss": 0.4227, - "step": 317 - }, - { - "epoch": 1.9509202453987728, - "grad_norm": 3.079087257385254, - "learning_rate": 4.547726559146862e-06, - "loss": 0.3719, - "step": 318 - }, - { - "epoch": 1.9570552147239264, - "grad_norm": 2.409914255142212, - "learning_rate": 4.544958617843782e-06, - "loss": 0.3331, - "step": 319 - }, - { - "epoch": 1.9631901840490797, - "grad_norm": 3.3441262245178223, - "learning_rate": 4.542183080121444e-06, - "loss": 0.6931, - "step": 320 - }, - { - "epoch": 1.969325153374233, - "grad_norm": 2.6624436378479004, - "learning_rate": 4.539399956290152e-06, - "loss": 0.6578, - "step": 321 - }, - { - "epoch": 1.9754601226993866, - "grad_norm": 3.463789224624634, - "learning_rate": 4.536609256688396e-06, - "loss": 0.5748, - "step": 322 - }, - { - "epoch": 1.98159509202454, - "grad_norm": 3.6827807426452637, - "learning_rate": 4.533810991682799e-06, - "loss": 0.5249, - "step": 323 - }, - { - "epoch": 1.9877300613496933, - "grad_norm": 4.125547409057617, - "learning_rate": 4.531005171668093e-06, - "loss": 0.3065, - "step": 324 - }, - { - "epoch": 1.9938650306748467, - "grad_norm": 2.935978412628174, - "learning_rate": 4.528191807067074e-06, - "loss": 0.5523, - "step": 325 - }, - { - "epoch": 2.0, - "grad_norm": 2.654388427734375, - "learning_rate": 4.525370908330564e-06, - "loss": 0.4157, - "step": 326 - }, - { - "epoch": 2.0061349693251533, - "grad_norm": 3.213925838470459, - "learning_rate": 4.522542485937369e-06, - "loss": 0.4243, - "step": 327 - }, - { - "epoch": 2.0122699386503067, - "grad_norm": 3.5483286380767822, - "learning_rate": 4.519706550394248e-06, - "loss": 0.4137, - "step": 328 - }, - { - "epoch": 2.01840490797546, - "grad_norm": 3.32084059715271, - "learning_rate": 4.516863112235864e-06, - "loss": 0.5389, - "step": 329 - }, - { - "epoch": 2.0245398773006134, - "grad_norm": 3.427666425704956, - "learning_rate": 4.514012182024756e-06, - "loss": 0.285, - "step": 330 - }, - { - "epoch": 2.0306748466257667, - "grad_norm": 3.3269975185394287, - "learning_rate": 4.511153770351288e-06, - "loss": 0.4877, - "step": 331 - }, - { - "epoch": 2.03680981595092, - "grad_norm": 5.258850574493408, - "learning_rate": 4.508287887833619e-06, - "loss": 0.5168, - "step": 332 - }, - { - "epoch": 2.042944785276074, - "grad_norm": 4.316092491149902, - "learning_rate": 4.505414545117658e-06, - "loss": 0.4791, - "step": 333 - }, - { - "epoch": 2.049079754601227, - "grad_norm": 3.952056884765625, - "learning_rate": 4.502533752877028e-06, - "loss": 0.3014, - "step": 334 - }, - { - "epoch": 2.0552147239263805, - "grad_norm": 4.0617194175720215, - "learning_rate": 4.499645521813024e-06, - "loss": 0.4313, - "step": 335 - }, - { - "epoch": 2.061349693251534, - "grad_norm": 3.7869274616241455, - "learning_rate": 4.496749862654574e-06, - "loss": 0.4807, - "step": 336 - }, - { - "epoch": 2.067484662576687, - "grad_norm": 3.8181991577148438, - "learning_rate": 4.4938467861582e-06, - "loss": 0.4002, - "step": 337 - }, - { - "epoch": 2.0736196319018405, - "grad_norm": 3.8289854526519775, - "learning_rate": 4.490936303107975e-06, - "loss": 0.618, - "step": 338 - }, - { - "epoch": 2.079754601226994, - "grad_norm": 3.121443271636963, - "learning_rate": 4.488018424315488e-06, - "loss": 0.4203, - "step": 339 - }, - { - "epoch": 2.085889570552147, - "grad_norm": 3.141782283782959, - "learning_rate": 4.4850931606198e-06, - "loss": 0.3618, - "step": 340 - }, - { - "epoch": 2.0920245398773005, - "grad_norm": 3.1279287338256836, - "learning_rate": 4.482160522887404e-06, - "loss": 0.4571, - "step": 341 - }, - { - "epoch": 2.098159509202454, - "grad_norm": 3.2418482303619385, - "learning_rate": 4.479220522012185e-06, - "loss": 0.2674, - "step": 342 - }, - { - "epoch": 2.104294478527607, - "grad_norm": 10.230683326721191, - "learning_rate": 4.476273168915382e-06, - "loss": 0.5479, - "step": 343 - }, - { - "epoch": 2.1104294478527605, - "grad_norm": 3.588361978530884, - "learning_rate": 4.473318474545544e-06, - "loss": 0.3654, - "step": 344 - }, - { - "epoch": 2.116564417177914, - "grad_norm": 3.0913164615631104, - "learning_rate": 4.470356449878489e-06, - "loss": 0.2704, - "step": 345 - }, - { - "epoch": 2.1226993865030677, - "grad_norm": 3.972447633743286, - "learning_rate": 4.467387105917269e-06, - "loss": 0.3029, - "step": 346 - }, - { - "epoch": 2.128834355828221, - "grad_norm": 3.7174713611602783, - "learning_rate": 4.464410453692122e-06, - "loss": 0.6536, - "step": 347 - }, - { - "epoch": 2.1349693251533743, - "grad_norm": 3.9333994388580322, - "learning_rate": 4.461426504260434e-06, - "loss": 0.3806, - "step": 348 - }, - { - "epoch": 2.1411042944785277, - "grad_norm": 4.752816200256348, - "learning_rate": 4.458435268706699e-06, - "loss": 0.4019, - "step": 349 - }, - { - "epoch": 2.147239263803681, - "grad_norm": 2.505603790283203, - "learning_rate": 4.455436758142477e-06, - "loss": 0.2348, - "step": 350 - }, - { - "epoch": 2.1533742331288344, - "grad_norm": 3.3050570487976074, - "learning_rate": 4.452430983706351e-06, - "loss": 0.505, - "step": 351 - }, - { - "epoch": 2.1595092024539877, - "grad_norm": 5.387442588806152, - "learning_rate": 4.44941795656389e-06, - "loss": 0.399, - "step": 352 - }, - { - "epoch": 2.165644171779141, - "grad_norm": 3.4759480953216553, - "learning_rate": 4.446397687907601e-06, - "loss": 0.5664, - "step": 353 - }, - { - "epoch": 2.1717791411042944, - "grad_norm": 2.949445962905884, - "learning_rate": 4.4433701889568935e-06, - "loss": 0.2128, - "step": 354 - }, - { - "epoch": 2.1779141104294477, - "grad_norm": 3.2884252071380615, - "learning_rate": 4.440335470958035e-06, - "loss": 0.3138, - "step": 355 - }, - { - "epoch": 2.184049079754601, - "grad_norm": 3.1605632305145264, - "learning_rate": 4.437293545184111e-06, - "loss": 0.349, - "step": 356 - }, - { - "epoch": 2.190184049079755, - "grad_norm": 2.9996821880340576, - "learning_rate": 4.434244422934976e-06, - "loss": 0.343, - "step": 357 - }, - { - "epoch": 2.196319018404908, - "grad_norm": 3.6373324394226074, - "learning_rate": 4.431188115537226e-06, - "loss": 0.5656, - "step": 358 - }, - { - "epoch": 2.2024539877300615, - "grad_norm": 4.667621612548828, - "learning_rate": 4.428124634344141e-06, - "loss": 0.2335, - "step": 359 - }, - { - "epoch": 2.208588957055215, - "grad_norm": 3.815484046936035, - "learning_rate": 4.425053990735653e-06, - "loss": 0.2173, - "step": 360 - }, - { - "epoch": 2.214723926380368, - "grad_norm": 4.689478874206543, - "learning_rate": 4.421976196118297e-06, - "loss": 0.5071, - "step": 361 - }, - { - "epoch": 2.2208588957055215, - "grad_norm": 4.016942024230957, - "learning_rate": 4.4188912619251765e-06, - "loss": 0.384, - "step": 362 - }, - { - "epoch": 2.226993865030675, - "grad_norm": 3.5336828231811523, - "learning_rate": 4.415799199615912e-06, - "loss": 0.3133, - "step": 363 - }, - { - "epoch": 2.233128834355828, - "grad_norm": 2.9195592403411865, - "learning_rate": 4.4127000206766055e-06, - "loss": 0.3847, - "step": 364 - }, - { - "epoch": 2.2392638036809815, - "grad_norm": 2.6843531131744385, - "learning_rate": 4.409593736619795e-06, - "loss": 0.3539, - "step": 365 - }, - { - "epoch": 2.245398773006135, - "grad_norm": 2.8692703247070312, - "learning_rate": 4.40648035898441e-06, - "loss": 0.3664, - "step": 366 - }, - { - "epoch": 2.2515337423312882, - "grad_norm": 2.820422649383545, - "learning_rate": 4.403359899335732e-06, - "loss": 0.4606, - "step": 367 - }, - { - "epoch": 2.2576687116564416, - "grad_norm": 3.8641669750213623, - "learning_rate": 4.400232369265351e-06, - "loss": 0.2931, - "step": 368 - }, - { - "epoch": 2.263803680981595, - "grad_norm": 2.75347638130188, - "learning_rate": 4.39709778039112e-06, - "loss": 0.3393, - "step": 369 - }, - { - "epoch": 2.2699386503067487, - "grad_norm": 15.150428771972656, - "learning_rate": 4.393956144357113e-06, - "loss": 0.65, - "step": 370 - }, - { - "epoch": 2.276073619631902, - "grad_norm": 2.4876065254211426, - "learning_rate": 4.390807472833585e-06, - "loss": 0.372, - "step": 371 - }, - { - "epoch": 2.2822085889570554, - "grad_norm": 2.7328054904937744, - "learning_rate": 4.3876517775169216e-06, - "loss": 0.2802, - "step": 372 - }, - { - "epoch": 2.2883435582822087, - "grad_norm": 2.903221368789673, - "learning_rate": 4.384489070129604e-06, - "loss": 0.1964, - "step": 373 - }, - { - "epoch": 2.294478527607362, - "grad_norm": 3.9368724822998047, - "learning_rate": 4.381319362420158e-06, - "loss": 0.4272, - "step": 374 - }, - { - "epoch": 2.3006134969325154, - "grad_norm": 5.431981086730957, - "learning_rate": 4.378142666163114e-06, - "loss": 0.4513, - "step": 375 - }, - { - "epoch": 2.3067484662576687, - "grad_norm": 3.661733627319336, - "learning_rate": 4.374958993158965e-06, - "loss": 0.6087, - "step": 376 - }, - { - "epoch": 2.312883435582822, - "grad_norm": 3.004450559616089, - "learning_rate": 4.371768355234116e-06, - "loss": 0.2206, - "step": 377 - }, - { - "epoch": 2.3190184049079754, - "grad_norm": 4.3785576820373535, - "learning_rate": 4.368570764240852e-06, - "loss": 0.6055, - "step": 378 - }, - { - "epoch": 2.3251533742331287, - "grad_norm": 3.4699394702911377, - "learning_rate": 4.365366232057279e-06, - "loss": 0.6286, - "step": 379 - }, - { - "epoch": 2.331288343558282, - "grad_norm": 2.6862998008728027, - "learning_rate": 4.3621547705872915e-06, - "loss": 0.2622, - "step": 380 - }, - { - "epoch": 2.3374233128834354, - "grad_norm": 3.056382179260254, - "learning_rate": 4.358936391760524e-06, - "loss": 0.3439, - "step": 381 - }, - { - "epoch": 2.3435582822085887, - "grad_norm": 2.6211307048797607, - "learning_rate": 4.355711107532305e-06, - "loss": 0.3677, - "step": 382 - }, - { - "epoch": 2.3496932515337425, - "grad_norm": 2.682060956954956, - "learning_rate": 4.3524789298836175e-06, - "loss": 0.3068, - "step": 383 - }, - { - "epoch": 2.355828220858896, - "grad_norm": 3.482539415359497, - "learning_rate": 4.349239870821049e-06, - "loss": 0.3737, - "step": 384 - }, - { - "epoch": 2.361963190184049, - "grad_norm": 2.8645472526550293, - "learning_rate": 4.345993942376752e-06, - "loss": 0.2837, - "step": 385 - }, - { - "epoch": 2.3680981595092025, - "grad_norm": 3.6142354011535645, - "learning_rate": 4.342741156608392e-06, - "loss": 0.6162, - "step": 386 - }, - { - "epoch": 2.374233128834356, - "grad_norm": 3.0748162269592285, - "learning_rate": 4.3394815255991135e-06, - "loss": 0.2986, - "step": 387 - }, - { - "epoch": 2.3803680981595092, - "grad_norm": 5.090906620025635, - "learning_rate": 4.336215061457485e-06, - "loss": 0.5383, - "step": 388 - }, - { - "epoch": 2.3865030674846626, - "grad_norm": 3.9235823154449463, - "learning_rate": 4.332941776317458e-06, - "loss": 0.4179, - "step": 389 - }, - { - "epoch": 2.392638036809816, - "grad_norm": 3.482926368713379, - "learning_rate": 4.329661682338325e-06, - "loss": 0.3938, - "step": 390 - }, - { - "epoch": 2.3987730061349692, - "grad_norm": 4.274583339691162, - "learning_rate": 4.32637479170467e-06, - "loss": 0.3349, - "step": 391 - }, - { - "epoch": 2.4049079754601226, - "grad_norm": 3.326012372970581, - "learning_rate": 4.323081116626322e-06, - "loss": 0.3336, - "step": 392 - }, - { - "epoch": 2.411042944785276, - "grad_norm": 3.174591541290283, - "learning_rate": 4.319780669338316e-06, - "loss": 0.2983, - "step": 393 - }, - { - "epoch": 2.4171779141104293, - "grad_norm": 3.9073634147644043, - "learning_rate": 4.31647346210084e-06, - "loss": 0.8401, - "step": 394 - }, - { - "epoch": 2.4233128834355826, - "grad_norm": 3.4787721633911133, - "learning_rate": 4.313159507199197e-06, - "loss": 0.2583, - "step": 395 - }, - { - "epoch": 2.4294478527607364, - "grad_norm": 3.19903564453125, - "learning_rate": 4.309838816943755e-06, - "loss": 0.2861, - "step": 396 - }, - { - "epoch": 2.4355828220858897, - "grad_norm": 3.184246778488159, - "learning_rate": 4.306511403669897e-06, - "loss": 0.2956, - "step": 397 - }, - { - "epoch": 2.441717791411043, - "grad_norm": 3.8991878032684326, - "learning_rate": 4.303177279737988e-06, - "loss": 0.5378, - "step": 398 - }, - { - "epoch": 2.4478527607361964, - "grad_norm": 3.411949872970581, - "learning_rate": 4.299836457533313e-06, - "loss": 0.3423, - "step": 399 - }, - { - "epoch": 2.4539877300613497, - "grad_norm": 3.445502996444702, - "learning_rate": 4.296488949466046e-06, - "loss": 0.5608, - "step": 400 - }, - { - "epoch": 2.460122699386503, - "grad_norm": 3.066798210144043, - "learning_rate": 4.293134767971193e-06, - "loss": 0.3214, - "step": 401 - }, - { - "epoch": 2.4662576687116564, - "grad_norm": 3.0581583976745605, - "learning_rate": 4.28977392550855e-06, - "loss": 0.5117, - "step": 402 - }, - { - "epoch": 2.4723926380368098, - "grad_norm": 4.207413673400879, - "learning_rate": 4.286406434562659e-06, - "loss": 0.2666, - "step": 403 - }, - { - "epoch": 2.478527607361963, - "grad_norm": 2.9934990406036377, - "learning_rate": 4.283032307642756e-06, - "loss": 0.2878, - "step": 404 - }, - { - "epoch": 2.4846625766871164, - "grad_norm": 3.800593614578247, - "learning_rate": 4.2796515572827305e-06, - "loss": 0.2619, - "step": 405 - }, - { - "epoch": 2.4907975460122698, - "grad_norm": 3.2029523849487305, - "learning_rate": 4.276264196041074e-06, - "loss": 0.1735, - "step": 406 - }, - { - "epoch": 2.4969325153374236, - "grad_norm": 3.515634059906006, - "learning_rate": 4.2728702365008356e-06, - "loss": 0.4741, - "step": 407 - }, - { - "epoch": 2.5030674846625764, - "grad_norm": 3.8354873657226562, - "learning_rate": 4.269469691269577e-06, - "loss": 0.3713, - "step": 408 - }, - { - "epoch": 2.5092024539877302, - "grad_norm": 3.902904510498047, - "learning_rate": 4.266062572979323e-06, - "loss": 0.5189, - "step": 409 - }, - { - "epoch": 2.5153374233128836, - "grad_norm": 3.3276097774505615, - "learning_rate": 4.262648894286515e-06, - "loss": 0.2461, - "step": 410 - }, - { - "epoch": 2.521472392638037, - "grad_norm": 2.9457011222839355, - "learning_rate": 4.259228667871963e-06, - "loss": 0.3013, - "step": 411 - }, - { - "epoch": 2.5276073619631902, - "grad_norm": 2.8941617012023926, - "learning_rate": 4.255801906440803e-06, - "loss": 0.2784, - "step": 412 - }, - { - "epoch": 2.5337423312883436, - "grad_norm": 2.949399471282959, - "learning_rate": 4.252368622722443e-06, - "loss": 0.457, - "step": 413 - }, - { - "epoch": 2.539877300613497, - "grad_norm": 3.342108726501465, - "learning_rate": 4.248928829470522e-06, - "loss": 0.487, - "step": 414 - }, - { - "epoch": 2.5460122699386503, - "grad_norm": 3.9556386470794678, - "learning_rate": 4.245482539462861e-06, - "loss": 0.6118, - "step": 415 - }, - { - "epoch": 2.5521472392638036, - "grad_norm": 3.6936280727386475, - "learning_rate": 4.242029765501411e-06, - "loss": 0.6131, - "step": 416 - }, - { - "epoch": 2.558282208588957, - "grad_norm": 2.79897403717041, - "learning_rate": 4.2385705204122104e-06, - "loss": 0.4209, - "step": 417 - }, - { - "epoch": 2.5644171779141103, - "grad_norm": 4.093318462371826, - "learning_rate": 4.235104817045338e-06, - "loss": 0.5375, - "step": 418 - }, - { - "epoch": 2.5705521472392636, - "grad_norm": 3.138263463973999, - "learning_rate": 4.231632668274861e-06, - "loss": 0.4682, - "step": 419 - }, - { - "epoch": 2.5766871165644174, - "grad_norm": 3.1465651988983154, - "learning_rate": 4.22815408699879e-06, - "loss": 0.2522, - "step": 420 - }, - { - "epoch": 2.5828220858895703, - "grad_norm": 3.5166101455688477, - "learning_rate": 4.22466908613903e-06, - "loss": 0.4776, - "step": 421 - }, - { - "epoch": 2.588957055214724, - "grad_norm": 2.8498189449310303, - "learning_rate": 4.221177678641333e-06, - "loss": 0.3067, - "step": 422 - }, - { - "epoch": 2.5950920245398774, - "grad_norm": 2.8046035766601562, - "learning_rate": 4.217679877475251e-06, - "loss": 0.2402, - "step": 423 - }, - { - "epoch": 2.6012269938650308, - "grad_norm": 4.204788684844971, - "learning_rate": 4.214175695634084e-06, - "loss": 0.2608, - "step": 424 - }, - { - "epoch": 2.607361963190184, - "grad_norm": 2.5569400787353516, - "learning_rate": 4.210665146134838e-06, - "loss": 0.2801, - "step": 425 - }, - { - "epoch": 2.6134969325153374, - "grad_norm": 3.5359091758728027, - "learning_rate": 4.20714824201817e-06, - "loss": 0.2027, - "step": 426 - }, - { - "epoch": 2.6196319018404908, - "grad_norm": 3.5132668018341064, - "learning_rate": 4.203624996348343e-06, - "loss": 0.4253, - "step": 427 - }, - { - "epoch": 2.625766871165644, - "grad_norm": 3.5076472759246826, - "learning_rate": 4.200095422213177e-06, - "loss": 0.3014, - "step": 428 - }, - { - "epoch": 2.6319018404907975, - "grad_norm": 3.6501238346099854, - "learning_rate": 4.196559532724004e-06, - "loss": 0.6526, - "step": 429 - }, - { - "epoch": 2.638036809815951, - "grad_norm": 2.849924325942993, - "learning_rate": 4.193017341015608e-06, - "loss": 0.4487, - "step": 430 - }, - { - "epoch": 2.644171779141104, - "grad_norm": 3.2228448390960693, - "learning_rate": 4.189468860246192e-06, - "loss": 0.5386, - "step": 431 - }, - { - "epoch": 2.6503067484662575, - "grad_norm": 2.532102108001709, - "learning_rate": 4.185914103597316e-06, - "loss": 0.3034, - "step": 432 - }, - { - "epoch": 2.6564417177914113, - "grad_norm": 2.862720251083374, - "learning_rate": 4.182353084273855e-06, - "loss": 0.5862, - "step": 433 - }, - { - "epoch": 2.662576687116564, - "grad_norm": 3.4617464542388916, - "learning_rate": 4.178785815503946e-06, - "loss": 0.3954, - "step": 434 - }, - { - "epoch": 2.668711656441718, - "grad_norm": 2.627758741378784, - "learning_rate": 4.1752123105389444e-06, - "loss": 0.4367, - "step": 435 - }, - { - "epoch": 2.6748466257668713, - "grad_norm": 3.2868380546569824, - "learning_rate": 4.171632582653368e-06, - "loss": 0.2997, - "step": 436 - }, - { - "epoch": 2.6809815950920246, - "grad_norm": 3.4260897636413574, - "learning_rate": 4.168046645144851e-06, - "loss": 0.3354, - "step": 437 - }, - { - "epoch": 2.687116564417178, - "grad_norm": 3.1415748596191406, - "learning_rate": 4.164454511334098e-06, - "loss": 0.5538, - "step": 438 - }, - { - "epoch": 2.6932515337423313, - "grad_norm": 3.3700919151306152, - "learning_rate": 4.160856194564828e-06, - "loss": 0.5731, - "step": 439 - }, - { - "epoch": 2.6993865030674846, - "grad_norm": 3.146968364715576, - "learning_rate": 4.157251708203728e-06, - "loss": 0.4429, - "step": 440 - }, - { - "epoch": 2.705521472392638, - "grad_norm": 3.7495830059051514, - "learning_rate": 4.153641065640402e-06, - "loss": 0.6361, - "step": 441 - }, - { - "epoch": 2.7116564417177913, - "grad_norm": 3.426499128341675, - "learning_rate": 4.150024280287327e-06, - "loss": 0.2418, - "step": 442 - }, - { - "epoch": 2.7177914110429446, - "grad_norm": 3.213719606399536, - "learning_rate": 4.146401365579795e-06, - "loss": 0.2549, - "step": 443 - }, - { - "epoch": 2.7239263803680984, - "grad_norm": 3.457742929458618, - "learning_rate": 4.142772334975868e-06, - "loss": 0.3822, - "step": 444 - }, - { - "epoch": 2.7300613496932513, - "grad_norm": 3.130410671234131, - "learning_rate": 4.139137201956324e-06, - "loss": 0.3107, - "step": 445 - }, - { - "epoch": 2.736196319018405, - "grad_norm": 2.7337112426757812, - "learning_rate": 4.1354959800246155e-06, - "loss": 0.2829, - "step": 446 - }, - { - "epoch": 2.7423312883435584, - "grad_norm": 3.427006483078003, - "learning_rate": 4.131848682706807e-06, - "loss": 0.3045, - "step": 447 - }, - { - "epoch": 2.7484662576687118, - "grad_norm": 3.3742318153381348, - "learning_rate": 4.128195323551536e-06, - "loss": 0.316, - "step": 448 - }, - { - "epoch": 2.754601226993865, - "grad_norm": 3.086738109588623, - "learning_rate": 4.1245359161299555e-06, - "loss": 0.5278, - "step": 449 - }, - { - "epoch": 2.7607361963190185, - "grad_norm": 3.4609954357147217, - "learning_rate": 4.120870474035687e-06, - "loss": 0.447, - "step": 450 - }, - { - "epoch": 2.766871165644172, - "grad_norm": 3.552663803100586, - "learning_rate": 4.1171990108847705e-06, - "loss": 0.6127, - "step": 451 - }, - { - "epoch": 2.773006134969325, - "grad_norm": 4.413427352905273, - "learning_rate": 4.113521540315609e-06, - "loss": 0.3304, - "step": 452 - }, - { - "epoch": 2.7791411042944785, - "grad_norm": 3.3408143520355225, - "learning_rate": 4.109838075988922e-06, - "loss": 0.5871, - "step": 453 - }, - { - "epoch": 2.785276073619632, - "grad_norm": 3.0659773349761963, - "learning_rate": 4.106148631587697e-06, - "loss": 0.3578, - "step": 454 - }, - { - "epoch": 2.791411042944785, - "grad_norm": 3.2854816913604736, - "learning_rate": 4.102453220817134e-06, - "loss": 0.4685, - "step": 455 - }, - { - "epoch": 2.7975460122699385, - "grad_norm": 3.4940855503082275, - "learning_rate": 4.098751857404595e-06, - "loss": 0.2818, - "step": 456 - }, - { - "epoch": 2.8036809815950923, - "grad_norm": 2.4630730152130127, - "learning_rate": 4.0950445550995566e-06, - "loss": 0.3497, - "step": 457 - }, - { - "epoch": 2.809815950920245, - "grad_norm": 3.3870959281921387, - "learning_rate": 4.091331327673554e-06, - "loss": 0.4954, - "step": 458 - }, - { - "epoch": 2.815950920245399, - "grad_norm": 2.3676836490631104, - "learning_rate": 4.087612188920135e-06, - "loss": 0.3884, - "step": 459 - }, - { - "epoch": 2.8220858895705523, - "grad_norm": 3.2477807998657227, - "learning_rate": 4.083887152654804e-06, - "loss": 0.375, - "step": 460 - }, - { - "epoch": 2.8282208588957056, - "grad_norm": 3.295673131942749, - "learning_rate": 4.080156232714976e-06, - "loss": 0.3272, - "step": 461 - }, - { - "epoch": 2.834355828220859, - "grad_norm": 2.800847291946411, - "learning_rate": 4.07641944295992e-06, - "loss": 0.2936, - "step": 462 - }, - { - "epoch": 2.8404907975460123, - "grad_norm": 3.443336009979248, - "learning_rate": 4.072676797270708e-06, - "loss": 0.2363, - "step": 463 - }, - { - "epoch": 2.8466257668711656, - "grad_norm": 3.1334242820739746, - "learning_rate": 4.0689283095501684e-06, - "loss": 0.4827, - "step": 464 - }, - { - "epoch": 2.852760736196319, - "grad_norm": 3.950672149658203, - "learning_rate": 4.06517399372283e-06, - "loss": 0.3163, - "step": 465 - }, - { - "epoch": 2.8588957055214723, - "grad_norm": 4.243579387664795, - "learning_rate": 4.061413863734869e-06, - "loss": 0.2827, - "step": 466 - }, - { - "epoch": 2.8650306748466257, - "grad_norm": 4.076017379760742, - "learning_rate": 4.057647933554063e-06, - "loss": 0.3466, - "step": 467 - }, - { - "epoch": 2.871165644171779, - "grad_norm": 2.846989631652832, - "learning_rate": 4.053876217169734e-06, - "loss": 0.4632, - "step": 468 - }, - { - "epoch": 2.8773006134969323, - "grad_norm": 2.74981689453125, - "learning_rate": 4.050098728592698e-06, - "loss": 0.2001, - "step": 469 - }, - { - "epoch": 2.883435582822086, - "grad_norm": 3.062068462371826, - "learning_rate": 4.046315481855211e-06, - "loss": 0.5425, - "step": 470 - }, - { - "epoch": 2.889570552147239, - "grad_norm": 2.8630964756011963, - "learning_rate": 4.0425264910109245e-06, - "loss": 0.424, - "step": 471 - }, - { - "epoch": 2.895705521472393, - "grad_norm": 3.537442922592163, - "learning_rate": 4.03873177013482e-06, - "loss": 0.2443, - "step": 472 - }, - { - "epoch": 2.901840490797546, - "grad_norm": 3.128535270690918, - "learning_rate": 4.034931333323173e-06, - "loss": 0.3734, - "step": 473 - }, - { - "epoch": 2.9079754601226995, - "grad_norm": 3.021897792816162, - "learning_rate": 4.031125194693484e-06, - "loss": 0.3762, - "step": 474 - }, - { - "epoch": 2.914110429447853, - "grad_norm": 3.0943546295166016, - "learning_rate": 4.0273133683844375e-06, - "loss": 0.3721, - "step": 475 - }, - { - "epoch": 2.920245398773006, - "grad_norm": 3.443448305130005, - "learning_rate": 4.023495868555848e-06, - "loss": 0.2868, - "step": 476 - }, - { - "epoch": 2.9263803680981595, - "grad_norm": 2.865227222442627, - "learning_rate": 4.0196727093886024e-06, - "loss": 0.5086, - "step": 477 - }, - { - "epoch": 2.932515337423313, - "grad_norm": 3.1272058486938477, - "learning_rate": 4.015843905084612e-06, - "loss": 0.4616, - "step": 478 - }, - { - "epoch": 2.938650306748466, - "grad_norm": 3.0584447383880615, - "learning_rate": 4.012009469866756e-06, - "loss": 0.403, - "step": 479 - }, - { - "epoch": 2.9447852760736195, - "grad_norm": 4.42616081237793, - "learning_rate": 4.008169417978836e-06, - "loss": 0.5801, - "step": 480 - }, - { - "epoch": 2.950920245398773, - "grad_norm": 2.8444535732269287, - "learning_rate": 4.004323763685511e-06, - "loss": 0.5808, - "step": 481 - }, - { - "epoch": 2.957055214723926, - "grad_norm": 2.591719627380371, - "learning_rate": 4.0004725212722565e-06, - "loss": 0.2584, - "step": 482 - }, - { - "epoch": 2.96319018404908, - "grad_norm": 2.5496113300323486, - "learning_rate": 3.996615705045302e-06, - "loss": 0.462, - "step": 483 - }, - { - "epoch": 2.969325153374233, - "grad_norm": 2.9932925701141357, - "learning_rate": 3.992753329331588e-06, - "loss": 0.3502, - "step": 484 - }, - { - "epoch": 2.9754601226993866, - "grad_norm": 3.136871337890625, - "learning_rate": 3.9888854084786995e-06, - "loss": 0.5989, - "step": 485 - }, - { - "epoch": 2.98159509202454, - "grad_norm": 3.6654274463653564, - "learning_rate": 3.985011956854826e-06, - "loss": 0.6772, - "step": 486 - }, - { - "epoch": 2.9877300613496933, - "grad_norm": 2.5398948192596436, - "learning_rate": 3.9811329888487004e-06, - "loss": 0.4192, - "step": 487 - }, - { - "epoch": 2.9938650306748467, - "grad_norm": 4.89943790435791, - "learning_rate": 3.977248518869545e-06, - "loss": 0.4031, - "step": 488 - }, - { - "epoch": 3.0, - "grad_norm": 3.4729995727539062, - "learning_rate": 3.973358561347024e-06, - "loss": 0.7764, - "step": 489 - }, - { - "epoch": 3.0061349693251533, - "grad_norm": 5.331607818603516, - "learning_rate": 3.969463130731183e-06, - "loss": 0.3267, - "step": 490 - }, - { - "epoch": 3.0122699386503067, - "grad_norm": 3.453650712966919, - "learning_rate": 3.965562241492401e-06, - "loss": 0.2719, - "step": 491 - }, - { - "epoch": 3.01840490797546, - "grad_norm": 3.232313632965088, - "learning_rate": 3.9616559081213335e-06, - "loss": 0.1825, - "step": 492 - }, - { - "epoch": 3.0245398773006134, - "grad_norm": 3.4860260486602783, - "learning_rate": 3.957744145128858e-06, - "loss": 0.1854, - "step": 493 - }, - { - "epoch": 3.0306748466257667, - "grad_norm": 3.4357805252075195, - "learning_rate": 3.953826967046021e-06, - "loss": 0.2224, - "step": 494 - }, - { - "epoch": 3.03680981595092, - "grad_norm": 4.557503700256348, - "learning_rate": 3.9499043884239894e-06, - "loss": 0.349, - "step": 495 - }, - { - "epoch": 3.042944785276074, - "grad_norm": 4.685214042663574, - "learning_rate": 3.945976423833987e-06, - "loss": 0.175, - "step": 496 - }, - { - "epoch": 3.049079754601227, - "grad_norm": 3.7430171966552734, - "learning_rate": 3.942043087867244e-06, - "loss": 0.2773, - "step": 497 - }, - { - "epoch": 3.0552147239263805, - "grad_norm": 3.756450653076172, - "learning_rate": 3.938104395134947e-06, - "loss": 0.4445, - "step": 498 - }, - { - "epoch": 3.061349693251534, - "grad_norm": 4.049175262451172, - "learning_rate": 3.9341603602681805e-06, - "loss": 0.3046, - "step": 499 - }, - { - "epoch": 3.067484662576687, - "grad_norm": 3.7689461708068848, - "learning_rate": 3.930210997917871e-06, - "loss": 0.2544, - "step": 500 - }, - { - "epoch": 3.0736196319018405, - "grad_norm": 4.027602195739746, - "learning_rate": 3.92625632275474e-06, - "loss": 0.3154, - "step": 501 - }, - { - "epoch": 3.079754601226994, - "grad_norm": 2.8449292182922363, - "learning_rate": 3.922296349469239e-06, - "loss": 0.2804, - "step": 502 - }, - { - "epoch": 3.085889570552147, - "grad_norm": 2.9555234909057617, - "learning_rate": 3.918331092771505e-06, - "loss": 0.2393, - "step": 503 - }, - { - "epoch": 3.0920245398773005, - "grad_norm": 2.621042013168335, - "learning_rate": 3.914360567391296e-06, - "loss": 0.1403, - "step": 504 - }, - { - "epoch": 3.098159509202454, - "grad_norm": 3.2348620891571045, - "learning_rate": 3.910384788077949e-06, - "loss": 0.1537, - "step": 505 - }, - { - "epoch": 3.104294478527607, - "grad_norm": 3.030179977416992, - "learning_rate": 3.906403769600311e-06, - "loss": 0.2921, - "step": 506 - }, - { - "epoch": 3.1104294478527605, - "grad_norm": 3.146428346633911, - "learning_rate": 3.902417526746694e-06, - "loss": 0.2036, - "step": 507 - }, - { - "epoch": 3.116564417177914, - "grad_norm": 3.6201512813568115, - "learning_rate": 3.898426074324818e-06, - "loss": 0.2655, - "step": 508 - }, - { - "epoch": 3.1226993865030677, - "grad_norm": 3.7674012184143066, - "learning_rate": 3.8944294271617524e-06, - "loss": 0.3938, - "step": 509 - }, - { - "epoch": 3.128834355828221, - "grad_norm": 4.54722785949707, - "learning_rate": 3.890427600103865e-06, - "loss": 0.3051, - "step": 510 - }, - { - "epoch": 3.1349693251533743, - "grad_norm": 4.228236675262451, - "learning_rate": 3.886420608016767e-06, - "loss": 0.3719, - "step": 511 - }, - { - "epoch": 3.1411042944785277, - "grad_norm": 4.355110168457031, - "learning_rate": 3.882408465785252e-06, - "loss": 0.1863, - "step": 512 - }, - { - "epoch": 3.147239263803681, - "grad_norm": 3.451460838317871, - "learning_rate": 3.878391188313249e-06, - "loss": 0.1479, - "step": 513 - }, - { - "epoch": 3.1533742331288344, - "grad_norm": 4.395524501800537, - "learning_rate": 3.87436879052376e-06, - "loss": 0.238, - "step": 514 - }, - { - "epoch": 3.1595092024539877, - "grad_norm": 2.940717935562134, - "learning_rate": 3.870341287358809e-06, - "loss": 0.2069, - "step": 515 - }, - { - "epoch": 3.165644171779141, - "grad_norm": 2.5817320346832275, - "learning_rate": 3.8663086937793845e-06, - "loss": 0.1189, - "step": 516 - }, - { - "epoch": 3.1717791411042944, - "grad_norm": 3.9863343238830566, - "learning_rate": 3.862271024765385e-06, - "loss": 0.3434, - "step": 517 - }, - { - "epoch": 3.1779141104294477, - "grad_norm": 3.609004259109497, - "learning_rate": 3.8582282953155626e-06, - "loss": 0.1602, - "step": 518 - }, - { - "epoch": 3.184049079754601, - "grad_norm": 3.207533121109009, - "learning_rate": 3.854180520447465e-06, - "loss": 0.3452, - "step": 519 - }, - { - "epoch": 3.190184049079755, - "grad_norm": 3.593388795852661, - "learning_rate": 3.850127715197387e-06, - "loss": 0.2832, - "step": 520 - }, - { - "epoch": 3.196319018404908, - "grad_norm": 3.409064531326294, - "learning_rate": 3.846069894620306e-06, - "loss": 0.1481, - "step": 521 - }, - { - "epoch": 3.2024539877300615, - "grad_norm": 3.461498737335205, - "learning_rate": 3.84200707378983e-06, - "loss": 0.1283, - "step": 522 - }, - { - "epoch": 3.208588957055215, - "grad_norm": 3.708467483520508, - "learning_rate": 3.8379392677981434e-06, - "loss": 0.2468, - "step": 523 - }, - { - "epoch": 3.214723926380368, - "grad_norm": 2.802381753921509, - "learning_rate": 3.833866491755947e-06, - "loss": 0.2685, - "step": 524 - }, - { - "epoch": 3.2208588957055215, - "grad_norm": 3.0787744522094727, - "learning_rate": 3.8297887607924044e-06, - "loss": 0.2595, - "step": 525 - }, - { - "epoch": 3.226993865030675, - "grad_norm": 3.3952548503875732, - "learning_rate": 3.825706090055088e-06, - "loss": 0.4099, - "step": 526 - }, - { - "epoch": 3.233128834355828, - "grad_norm": 3.3497085571289062, - "learning_rate": 3.821618494709916e-06, - "loss": 0.287, - "step": 527 - }, - { - "epoch": 3.2392638036809815, - "grad_norm": 4.050611972808838, - "learning_rate": 3.817525989941102e-06, - "loss": 0.2369, - "step": 528 - }, - { - "epoch": 3.245398773006135, - "grad_norm": 2.87642240524292, - "learning_rate": 3.8134285909510972e-06, - "loss": 0.2751, - "step": 529 - }, - { - "epoch": 3.2515337423312882, - "grad_norm": 3.821941614151001, - "learning_rate": 3.8093263129605305e-06, - "loss": 0.2363, - "step": 530 - }, - { - "epoch": 3.2576687116564416, - "grad_norm": 2.8066117763519287, - "learning_rate": 3.80521917120816e-06, - "loss": 0.094, - "step": 531 - }, - { - "epoch": 3.263803680981595, - "grad_norm": 3.849768877029419, - "learning_rate": 3.801107180950806e-06, - "loss": 0.4117, - "step": 532 - }, - { - "epoch": 3.2699386503067487, - "grad_norm": 2.4161250591278076, - "learning_rate": 3.7969903574633028e-06, - "loss": 0.1183, - "step": 533 - }, - { - "epoch": 3.276073619631902, - "grad_norm": 3.6743111610412598, - "learning_rate": 3.792868716038437e-06, - "loss": 0.2296, - "step": 534 - }, - { - "epoch": 3.2822085889570554, - "grad_norm": 4.378123760223389, - "learning_rate": 3.7887422719868937e-06, - "loss": 0.2678, - "step": 535 - }, - { - "epoch": 3.2883435582822087, - "grad_norm": 4.816481590270996, - "learning_rate": 3.784611040637198e-06, - "loss": 0.4887, - "step": 536 - }, - { - "epoch": 3.294478527607362, - "grad_norm": 3.5712430477142334, - "learning_rate": 3.7804750373356576e-06, - "loss": 0.3827, - "step": 537 - }, - { - "epoch": 3.3006134969325154, - "grad_norm": 3.6877355575561523, - "learning_rate": 3.776334277446307e-06, - "loss": 0.3233, - "step": 538 - }, - { - "epoch": 3.3067484662576687, - "grad_norm": 3.442706346511841, - "learning_rate": 3.7721887763508512e-06, - "loss": 0.1256, - "step": 539 - }, - { - "epoch": 3.312883435582822, - "grad_norm": 3.9265615940093994, - "learning_rate": 3.7680385494486053e-06, - "loss": 0.3845, - "step": 540 - }, - { - "epoch": 3.3190184049079754, - "grad_norm": 3.5030126571655273, - "learning_rate": 3.7638836121564414e-06, - "loss": 0.2905, - "step": 541 - }, - { - "epoch": 3.3251533742331287, - "grad_norm": 3.6685378551483154, - "learning_rate": 3.7597239799087283e-06, - "loss": 0.3561, - "step": 542 - }, - { - "epoch": 3.331288343558282, - "grad_norm": 3.8484046459198, - "learning_rate": 3.7555596681572736e-06, - "loss": 0.1157, - "step": 543 - }, - { - "epoch": 3.3374233128834354, - "grad_norm": 3.7977402210235596, - "learning_rate": 3.751390692371272e-06, - "loss": 0.3049, - "step": 544 - }, - { - "epoch": 3.3435582822085887, - "grad_norm": 3.4409852027893066, - "learning_rate": 3.7472170680372398e-06, - "loss": 0.1626, - "step": 545 - }, - { - "epoch": 3.3496932515337425, - "grad_norm": 3.801541328430176, - "learning_rate": 3.7430388106589632e-06, - "loss": 0.2414, - "step": 546 - }, - { - "epoch": 3.355828220858896, - "grad_norm": 4.025203704833984, - "learning_rate": 3.738855935757438e-06, - "loss": 0.3441, - "step": 547 - }, - { - "epoch": 3.361963190184049, - "grad_norm": 4.242798805236816, - "learning_rate": 3.7346684588708135e-06, - "loss": 0.5244, - "step": 548 - }, - { - "epoch": 3.3680981595092025, - "grad_norm": 3.0516819953918457, - "learning_rate": 3.7304763955543332e-06, - "loss": 0.1984, - "step": 549 - }, - { - "epoch": 3.374233128834356, - "grad_norm": 3.894667625427246, - "learning_rate": 3.726279761380279e-06, - "loss": 0.2715, - "step": 550 - }, - { - "epoch": 3.3803680981595092, - "grad_norm": 3.171208143234253, - "learning_rate": 3.72207857193791e-06, - "loss": 0.1537, - "step": 551 - }, - { - "epoch": 3.3865030674846626, - "grad_norm": 4.344860553741455, - "learning_rate": 3.7178728428334092e-06, - "loss": 0.2388, - "step": 552 - }, - { - "epoch": 3.392638036809816, - "grad_norm": 2.766317367553711, - "learning_rate": 3.7136625896898226e-06, - "loss": 0.1726, - "step": 553 - }, - { - "epoch": 3.3987730061349692, - "grad_norm": 3.550662040710449, - "learning_rate": 3.7094478281470003e-06, - "loss": 0.2942, - "step": 554 - }, - { - "epoch": 3.4049079754601226, - "grad_norm": 3.4576945304870605, - "learning_rate": 3.7052285738615412e-06, - "loss": 0.1665, - "step": 555 - }, - { - "epoch": 3.411042944785276, - "grad_norm": 4.026793003082275, - "learning_rate": 3.7010048425067317e-06, - "loss": 0.3954, - "step": 556 - }, - { - "epoch": 3.4171779141104293, - "grad_norm": 4.600133419036865, - "learning_rate": 3.696776649772492e-06, - "loss": 0.3207, - "step": 557 - }, - { - "epoch": 3.4233128834355826, - "grad_norm": 4.747331142425537, - "learning_rate": 3.692544011365312e-06, - "loss": 0.1325, - "step": 558 - }, - { - "epoch": 3.4294478527607364, - "grad_norm": 3.781464099884033, - "learning_rate": 3.6883069430081986e-06, - "loss": 0.1644, - "step": 559 - }, - { - "epoch": 3.4355828220858897, - "grad_norm": 2.905986785888672, - "learning_rate": 3.6840654604406135e-06, - "loss": 0.2469, - "step": 560 - }, - { - "epoch": 3.441717791411043, - "grad_norm": 2.3747711181640625, - "learning_rate": 3.679819579418414e-06, - "loss": 0.1146, - "step": 561 - }, - { - "epoch": 3.4478527607361964, - "grad_norm": 3.2683632373809814, - "learning_rate": 3.6755693157137995e-06, - "loss": 0.3236, - "step": 562 - }, - { - "epoch": 3.4539877300613497, - "grad_norm": 3.7750496864318848, - "learning_rate": 3.6713146851152487e-06, - "loss": 0.399, - "step": 563 - }, - { - "epoch": 3.460122699386503, - "grad_norm": 3.3912384510040283, - "learning_rate": 3.667055703427461e-06, - "loss": 0.1259, - "step": 564 - }, - { - "epoch": 3.4662576687116564, - "grad_norm": 3.0224430561065674, - "learning_rate": 3.6627923864713e-06, - "loss": 0.1835, - "step": 565 - }, - { - "epoch": 3.4723926380368098, - "grad_norm": 3.642258405685425, - "learning_rate": 3.658524750083733e-06, - "loss": 0.2763, - "step": 566 - }, - { - "epoch": 3.478527607361963, - "grad_norm": 3.409890651702881, - "learning_rate": 3.654252810117773e-06, - "loss": 0.2496, - "step": 567 - }, - { - "epoch": 3.4846625766871164, - "grad_norm": 3.0416476726531982, - "learning_rate": 3.6499765824424195e-06, - "loss": 0.1287, - "step": 568 - }, - { - "epoch": 3.4907975460122698, - "grad_norm": 3.1963987350463867, - "learning_rate": 3.6456960829425987e-06, - "loss": 0.1747, - "step": 569 - }, - { - "epoch": 3.4969325153374236, - "grad_norm": 3.198448657989502, - "learning_rate": 3.641411327519107e-06, - "loss": 0.1913, - "step": 570 - }, - { - "epoch": 3.5030674846625764, - "grad_norm": 3.7023441791534424, - "learning_rate": 3.6371223320885492e-06, - "loss": 0.3224, - "step": 571 - }, - { - "epoch": 3.5092024539877302, - "grad_norm": 4.54288387298584, - "learning_rate": 3.6328291125832803e-06, - "loss": 0.2364, - "step": 572 - }, - { - "epoch": 3.5153374233128836, - "grad_norm": 3.5064890384674072, - "learning_rate": 3.628531684951347e-06, - "loss": 0.2552, - "step": 573 - }, - { - "epoch": 3.521472392638037, - "grad_norm": 3.987583875656128, - "learning_rate": 3.6242300651564276e-06, - "loss": 0.3232, - "step": 574 - }, - { - "epoch": 3.5276073619631902, - "grad_norm": 3.179642915725708, - "learning_rate": 3.6199242691777745e-06, - "loss": 0.32, - "step": 575 - }, - { - "epoch": 3.5337423312883436, - "grad_norm": 3.3078157901763916, - "learning_rate": 3.6156143130101516e-06, - "loss": 0.2922, - "step": 576 - }, - { - "epoch": 3.539877300613497, - "grad_norm": 3.1628613471984863, - "learning_rate": 3.6113002126637765e-06, - "loss": 0.2005, - "step": 577 - }, - { - "epoch": 3.5460122699386503, - "grad_norm": 3.4515540599823, - "learning_rate": 3.606981984164263e-06, - "loss": 0.2138, - "step": 578 - }, - { - "epoch": 3.5521472392638036, - "grad_norm": 5.132473945617676, - "learning_rate": 3.6026596435525578e-06, - "loss": 0.4382, - "step": 579 - }, - { - "epoch": 3.558282208588957, - "grad_norm": 3.397614002227783, - "learning_rate": 3.5983332068848855e-06, - "loss": 0.3326, - "step": 580 - }, - { - "epoch": 3.5644171779141103, - "grad_norm": 4.79497766494751, - "learning_rate": 3.5940026902326825e-06, - "loss": 0.4748, - "step": 581 - }, - { - "epoch": 3.5705521472392636, - "grad_norm": 3.7675018310546875, - "learning_rate": 3.5896681096825446e-06, - "loss": 0.2692, - "step": 582 - }, - { - "epoch": 3.5766871165644174, - "grad_norm": 3.0637521743774414, - "learning_rate": 3.5853294813361614e-06, - "loss": 0.3658, - "step": 583 - }, - { - "epoch": 3.5828220858895703, - "grad_norm": 2.8949790000915527, - "learning_rate": 3.5809868213102623e-06, - "loss": 0.1661, - "step": 584 - }, - { - "epoch": 3.588957055214724, - "grad_norm": 3.163419246673584, - "learning_rate": 3.5766401457365485e-06, - "loss": 0.1233, - "step": 585 - }, - { - "epoch": 3.5950920245398774, - "grad_norm": 3.1787965297698975, - "learning_rate": 3.5722894707616417e-06, - "loss": 0.278, - "step": 586 - }, - { - "epoch": 3.6012269938650308, - "grad_norm": 2.9397857189178467, - "learning_rate": 3.5679348125470175e-06, - "loss": 0.1541, - "step": 587 - }, - { - "epoch": 3.607361963190184, - "grad_norm": 3.2690396308898926, - "learning_rate": 3.56357618726895e-06, - "loss": 0.1575, - "step": 588 - }, - { - "epoch": 3.6134969325153374, - "grad_norm": 5.444014072418213, - "learning_rate": 3.5592136111184483e-06, - "loss": 0.8079, - "step": 589 - }, - { - "epoch": 3.6196319018404908, - "grad_norm": 3.1688313484191895, - "learning_rate": 3.554847100301199e-06, - "loss": 0.341, - "step": 590 - }, - { - "epoch": 3.625766871165644, - "grad_norm": 2.469212532043457, - "learning_rate": 3.550476671037505e-06, - "loss": 0.1625, - "step": 591 - }, - { - "epoch": 3.6319018404907975, - "grad_norm": 3.3956527709960938, - "learning_rate": 3.546102339562223e-06, - "loss": 0.199, - "step": 592 - }, - { - "epoch": 3.638036809815951, - "grad_norm": 2.7287702560424805, - "learning_rate": 3.5417241221247078e-06, - "loss": 0.1493, - "step": 593 - }, - { - "epoch": 3.644171779141104, - "grad_norm": 3.5046865940093994, - "learning_rate": 3.5373420349887477e-06, - "loss": 0.2765, - "step": 594 - }, - { - "epoch": 3.6503067484662575, - "grad_norm": 3.121476650238037, - "learning_rate": 3.5329560944325065e-06, - "loss": 0.2833, - "step": 595 - }, - { - "epoch": 3.6564417177914113, - "grad_norm": 3.276463270187378, - "learning_rate": 3.528566316748462e-06, - "loss": 0.1237, - "step": 596 - }, - { - "epoch": 3.662576687116564, - "grad_norm": 3.382840633392334, - "learning_rate": 3.524172718243347e-06, - "loss": 0.1599, - "step": 597 - }, - { - "epoch": 3.668711656441718, - "grad_norm": 4.801311492919922, - "learning_rate": 3.5197753152380854e-06, - "loss": 0.2997, - "step": 598 - }, - { - "epoch": 3.6748466257668713, - "grad_norm": 4.117336273193359, - "learning_rate": 3.515374124067736e-06, - "loss": 0.2021, - "step": 599 - }, - { - "epoch": 3.6809815950920246, - "grad_norm": 3.611438035964966, - "learning_rate": 3.5109691610814263e-06, - "loss": 0.1726, - "step": 600 - }, - { - "epoch": 3.687116564417178, - "grad_norm": 4.5179972648620605, - "learning_rate": 3.5065604426422995e-06, - "loss": 0.1377, - "step": 601 - }, - { - "epoch": 3.6932515337423313, - "grad_norm": 3.561061382293701, - "learning_rate": 3.502147985127445e-06, - "loss": 0.1497, - "step": 602 - }, - { - "epoch": 3.6993865030674846, - "grad_norm": 3.3497917652130127, - "learning_rate": 3.4977318049278443e-06, - "loss": 0.1589, - "step": 603 - }, - { - "epoch": 3.705521472392638, - "grad_norm": 3.2725470066070557, - "learning_rate": 3.4933119184483065e-06, - "loss": 0.1364, - "step": 604 - }, - { - "epoch": 3.7116564417177913, - "grad_norm": 3.228956460952759, - "learning_rate": 3.4888883421074076e-06, - "loss": 0.177, - "step": 605 - }, - { - "epoch": 3.7177914110429446, - "grad_norm": 3.7648911476135254, - "learning_rate": 3.484461092337434e-06, - "loss": 0.122, - "step": 606 - }, - { - "epoch": 3.7239263803680984, - "grad_norm": 3.5322585105895996, - "learning_rate": 3.4800301855843137e-06, - "loss": 0.2664, - "step": 607 - }, - { - "epoch": 3.7300613496932513, - "grad_norm": 2.951073169708252, - "learning_rate": 3.4755956383075613e-06, - "loss": 0.12, - "step": 608 - }, - { - "epoch": 3.736196319018405, - "grad_norm": 3.0577664375305176, - "learning_rate": 3.471157466980214e-06, - "loss": 0.3926, - "step": 609 - }, - { - "epoch": 3.7423312883435584, - "grad_norm": 4.089846134185791, - "learning_rate": 3.466715688088772e-06, - "loss": 0.6233, - "step": 610 - }, - { - "epoch": 3.7484662576687118, - "grad_norm": 3.081340789794922, - "learning_rate": 3.462270318133136e-06, - "loss": 0.2456, - "step": 611 - }, - { - "epoch": 3.754601226993865, - "grad_norm": 3.034712553024292, - "learning_rate": 3.4578213736265474e-06, - "loss": 0.2683, - "step": 612 - }, - { - "epoch": 3.7607361963190185, - "grad_norm": 3.459815740585327, - "learning_rate": 3.4533688710955255e-06, - "loss": 0.3796, - "step": 613 - }, - { - "epoch": 3.766871165644172, - "grad_norm": 3.523737907409668, - "learning_rate": 3.448912827079805e-06, - "loss": 0.3326, - "step": 614 - }, - { - "epoch": 3.773006134969325, - "grad_norm": 3.333219289779663, - "learning_rate": 3.4444532581322793e-06, - "loss": 0.206, - "step": 615 - }, - { - "epoch": 3.7791411042944785, - "grad_norm": 3.582387685775757, - "learning_rate": 3.4399901808189327e-06, - "loss": 0.244, - "step": 616 - }, - { - "epoch": 3.785276073619632, - "grad_norm": 3.4887266159057617, - "learning_rate": 3.435523611718785e-06, - "loss": 0.1796, - "step": 617 - }, - { - "epoch": 3.791411042944785, - "grad_norm": 4.89408016204834, - "learning_rate": 3.4310535674238242e-06, - "loss": 0.188, - "step": 618 - }, - { - "epoch": 3.7975460122699385, - "grad_norm": 4.338910102844238, - "learning_rate": 3.42658006453895e-06, - "loss": 0.3039, - "step": 619 - }, - { - "epoch": 3.8036809815950923, - "grad_norm": 4.107708930969238, - "learning_rate": 3.4221031196819083e-06, - "loss": 0.3383, - "step": 620 - }, - { - "epoch": 3.809815950920245, - "grad_norm": 3.698777675628662, - "learning_rate": 3.4176227494832305e-06, - "loss": 0.1721, - "step": 621 - }, - { - "epoch": 3.815950920245399, - "grad_norm": 2.6659226417541504, - "learning_rate": 3.413138970586174e-06, - "loss": 0.2211, - "step": 622 - }, - { - "epoch": 3.8220858895705523, - "grad_norm": 3.2398436069488525, - "learning_rate": 3.4086517996466574e-06, - "loss": 0.1871, - "step": 623 - }, - { - "epoch": 3.8282208588957056, - "grad_norm": 4.9128804206848145, - "learning_rate": 3.404161253333199e-06, - "loss": 0.3874, - "step": 624 - }, - { - "epoch": 3.834355828220859, - "grad_norm": 3.508789300918579, - "learning_rate": 3.3996673483268573e-06, - "loss": 0.1739, - "step": 625 - }, - { - "epoch": 3.8404907975460123, - "grad_norm": 3.3016927242279053, - "learning_rate": 3.3951701013211665e-06, - "loss": 0.274, - "step": 626 - }, - { - "epoch": 3.8466257668711656, - "grad_norm": 3.8941333293914795, - "learning_rate": 3.3906695290220736e-06, - "loss": 0.3568, - "step": 627 - }, - { - "epoch": 3.852760736196319, - "grad_norm": 3.512354850769043, - "learning_rate": 3.3861656481478816e-06, - "loss": 0.157, - "step": 628 - }, - { - "epoch": 3.8588957055214723, - "grad_norm": 3.482649326324463, - "learning_rate": 3.3816584754291814e-06, - "loss": 0.1218, - "step": 629 - }, - { - "epoch": 3.8650306748466257, - "grad_norm": 3.1490275859832764, - "learning_rate": 3.377148027608793e-06, - "loss": 0.2234, - "step": 630 - }, - { - "epoch": 3.871165644171779, - "grad_norm": 3.2172653675079346, - "learning_rate": 3.3726343214417023e-06, - "loss": 0.3329, - "step": 631 - }, - { - "epoch": 3.8773006134969323, - "grad_norm": 4.167707443237305, - "learning_rate": 3.3681173736949984e-06, - "loss": 0.1384, - "step": 632 - }, - { - "epoch": 3.883435582822086, - "grad_norm": 3.4743919372558594, - "learning_rate": 3.3635972011478134e-06, - "loss": 0.3807, - "step": 633 - }, - { - "epoch": 3.889570552147239, - "grad_norm": 3.6892173290252686, - "learning_rate": 3.3590738205912566e-06, - "loss": 0.194, - "step": 634 - }, - { - "epoch": 3.895705521472393, - "grad_norm": 3.262967824935913, - "learning_rate": 3.354547248828356e-06, - "loss": 0.202, - "step": 635 - }, - { - "epoch": 3.901840490797546, - "grad_norm": 3.8871562480926514, - "learning_rate": 3.3500175026739916e-06, - "loss": 0.2471, - "step": 636 - }, - { - "epoch": 3.9079754601226995, - "grad_norm": 3.5097084045410156, - "learning_rate": 3.3454845989548385e-06, - "loss": 0.1112, - "step": 637 - }, - { - "epoch": 3.914110429447853, - "grad_norm": 4.163944721221924, - "learning_rate": 3.3409485545092995e-06, - "loss": 0.3368, - "step": 638 - }, - { - "epoch": 3.920245398773006, - "grad_norm": 3.6405045986175537, - "learning_rate": 3.336409386187444e-06, - "loss": 0.1863, - "step": 639 - }, - { - "epoch": 3.9263803680981595, - "grad_norm": 3.2477526664733887, - "learning_rate": 3.331867110850946e-06, - "loss": 0.1491, - "step": 640 - }, - { - "epoch": 3.932515337423313, - "grad_norm": 3.933753490447998, - "learning_rate": 3.327321745373021e-06, - "loss": 0.2484, - "step": 641 - }, - { - "epoch": 3.938650306748466, - "grad_norm": 3.2475059032440186, - "learning_rate": 3.322773306638364e-06, - "loss": 0.2126, - "step": 642 - }, - { - "epoch": 3.9447852760736195, - "grad_norm": 2.628467321395874, - "learning_rate": 3.318221811543086e-06, - "loss": 0.1649, - "step": 643 - }, - { - "epoch": 3.950920245398773, - "grad_norm": 3.2612411975860596, - "learning_rate": 3.313667276994651e-06, - "loss": 0.1442, - "step": 644 - }, - { - "epoch": 3.957055214723926, - "grad_norm": 3.8058395385742188, - "learning_rate": 3.309109719911814e-06, - "loss": 0.359, - "step": 645 - }, - { - "epoch": 3.96319018404908, - "grad_norm": 3.3450071811676025, - "learning_rate": 3.304549157224558e-06, - "loss": 0.4042, - "step": 646 - }, - { - "epoch": 3.969325153374233, - "grad_norm": 3.079601287841797, - "learning_rate": 3.299985605874031e-06, - "loss": 0.1699, - "step": 647 - }, - { - "epoch": 3.9754601226993866, - "grad_norm": 3.8963980674743652, - "learning_rate": 3.295419082812483e-06, - "loss": 0.1888, - "step": 648 - }, - { - "epoch": 3.98159509202454, - "grad_norm": 3.307405948638916, - "learning_rate": 3.2908496050032024e-06, - "loss": 0.2824, - "step": 649 - }, - { - "epoch": 3.9877300613496933, - "grad_norm": 3.227478265762329, - "learning_rate": 3.2862771894204544e-06, - "loss": 0.3038, - "step": 650 - }, - { - "epoch": 3.9938650306748467, - "grad_norm": 4.046506881713867, - "learning_rate": 3.2817018530494164e-06, - "loss": 0.3266, - "step": 651 - }, - { - "epoch": 4.0, - "grad_norm": 7.775874614715576, - "learning_rate": 3.277123612886116e-06, - "loss": 0.2998, - "step": 652 - }, - { - "epoch": 4.006134969325154, - "grad_norm": 3.146462917327881, - "learning_rate": 3.272542485937369e-06, - "loss": 0.2764, - "step": 653 - }, - { - "epoch": 4.012269938650307, - "grad_norm": 3.0539863109588623, - "learning_rate": 3.2679584892207118e-06, - "loss": 0.1157, - "step": 654 - }, - { - "epoch": 4.0184049079754605, - "grad_norm": 3.634021520614624, - "learning_rate": 3.263371639764343e-06, - "loss": 0.0707, - "step": 655 - }, - { - "epoch": 4.024539877300613, - "grad_norm": 3.3474650382995605, - "learning_rate": 3.2587819546070596e-06, - "loss": 0.1067, - "step": 656 - }, - { - "epoch": 4.030674846625767, - "grad_norm": 4.409244537353516, - "learning_rate": 3.254189450798189e-06, - "loss": 0.0564, - "step": 657 - }, - { - "epoch": 4.03680981595092, - "grad_norm": 3.0446252822875977, - "learning_rate": 3.2495941453975312e-06, - "loss": 0.0535, - "step": 658 - }, - { - "epoch": 4.042944785276074, - "grad_norm": 4.014753818511963, - "learning_rate": 3.2449960554752935e-06, - "loss": 0.1245, - "step": 659 - }, - { - "epoch": 4.049079754601227, - "grad_norm": 3.188062906265259, - "learning_rate": 3.240395198112026e-06, - "loss": 0.0626, - "step": 660 - }, - { - "epoch": 4.0552147239263805, - "grad_norm": 3.006086826324463, - "learning_rate": 3.2357915903985605e-06, - "loss": 0.1198, - "step": 661 - }, - { - "epoch": 4.061349693251533, - "grad_norm": 2.8865551948547363, - "learning_rate": 3.2311852494359423e-06, - "loss": 0.0454, - "step": 662 - }, - { - "epoch": 4.067484662576687, - "grad_norm": 4.2888007164001465, - "learning_rate": 3.226576192335373e-06, - "loss": 0.2064, - "step": 663 - }, - { - "epoch": 4.07361963190184, - "grad_norm": 3.1414525508880615, - "learning_rate": 3.2219644362181436e-06, - "loss": 0.2183, - "step": 664 - }, - { - "epoch": 4.079754601226994, - "grad_norm": 2.556277275085449, - "learning_rate": 3.21734999821557e-06, - "loss": 0.0516, - "step": 665 - }, - { - "epoch": 4.085889570552148, - "grad_norm": 2.698118209838867, - "learning_rate": 3.2127328954689307e-06, - "loss": 0.0613, - "step": 666 - }, - { - "epoch": 4.0920245398773005, - "grad_norm": 2.869919538497925, - "learning_rate": 3.2081131451294025e-06, - "loss": 0.0583, - "step": 667 - }, - { - "epoch": 4.098159509202454, - "grad_norm": 3.8786919116973877, - "learning_rate": 3.2034907643579988e-06, - "loss": 0.0766, - "step": 668 - }, - { - "epoch": 4.104294478527607, - "grad_norm": 4.224637031555176, - "learning_rate": 3.1988657703255043e-06, - "loss": 0.1099, - "step": 669 - }, - { - "epoch": 4.110429447852761, - "grad_norm": 4.671669006347656, - "learning_rate": 3.194238180212409e-06, - "loss": 0.1663, - "step": 670 - }, - { - "epoch": 4.116564417177914, - "grad_norm": 3.2484257221221924, - "learning_rate": 3.1896080112088477e-06, - "loss": 0.0587, - "step": 671 - }, - { - "epoch": 4.122699386503068, - "grad_norm": 2.4808075428009033, - "learning_rate": 3.184975280514536e-06, - "loss": 0.0579, - "step": 672 - }, - { - "epoch": 4.128834355828221, - "grad_norm": 3.7106919288635254, - "learning_rate": 3.1803400053387044e-06, - "loss": 0.1083, - "step": 673 - }, - { - "epoch": 4.134969325153374, - "grad_norm": 3.008970260620117, - "learning_rate": 3.175702202900036e-06, - "loss": 0.1355, - "step": 674 - }, - { - "epoch": 4.141104294478527, - "grad_norm": 3.2640793323516846, - "learning_rate": 3.1710618904266006e-06, - "loss": 0.092, - "step": 675 - }, - { - "epoch": 4.147239263803681, - "grad_norm": 3.08042049407959, - "learning_rate": 3.166419085155793e-06, - "loss": 0.0563, - "step": 676 - }, - { - "epoch": 4.153374233128835, - "grad_norm": 2.993530511856079, - "learning_rate": 3.1617738043342695e-06, - "loss": 0.1773, - "step": 677 - }, - { - "epoch": 4.159509202453988, - "grad_norm": 2.6218204498291016, - "learning_rate": 3.157126065217879e-06, - "loss": 0.0489, - "step": 678 - }, - { - "epoch": 4.1656441717791415, - "grad_norm": 4.3173723220825195, - "learning_rate": 3.152475885071606e-06, - "loss": 0.1333, - "step": 679 - }, - { - "epoch": 4.171779141104294, - "grad_norm": 3.659149408340454, - "learning_rate": 3.147823281169498e-06, - "loss": 0.1501, - "step": 680 - }, - { - "epoch": 4.177914110429448, - "grad_norm": 3.0953338146209717, - "learning_rate": 3.143168270794612e-06, - "loss": 0.1067, - "step": 681 - }, - { - "epoch": 4.184049079754601, - "grad_norm": 3.5693907737731934, - "learning_rate": 3.1385108712389394e-06, - "loss": 0.2499, - "step": 682 - }, - { - "epoch": 4.190184049079755, - "grad_norm": 3.3022868633270264, - "learning_rate": 3.1338510998033484e-06, - "loss": 0.1748, - "step": 683 - }, - { - "epoch": 4.196319018404908, - "grad_norm": 3.7468113899230957, - "learning_rate": 3.129188973797519e-06, - "loss": 0.201, - "step": 684 - }, - { - "epoch": 4.2024539877300615, - "grad_norm": 2.8381078243255615, - "learning_rate": 3.124524510539875e-06, - "loss": 0.0735, - "step": 685 - }, - { - "epoch": 4.208588957055214, - "grad_norm": 2.84706974029541, - "learning_rate": 3.119857727357527e-06, - "loss": 0.1806, - "step": 686 - }, - { - "epoch": 4.214723926380368, - "grad_norm": 3.8130292892456055, - "learning_rate": 3.1151886415861993e-06, - "loss": 0.1811, - "step": 687 - }, - { - "epoch": 4.220858895705521, - "grad_norm": 3.528895378112793, - "learning_rate": 3.1105172705701708e-06, - "loss": 0.1634, - "step": 688 - }, - { - "epoch": 4.226993865030675, - "grad_norm": 5.028727054595947, - "learning_rate": 3.1058436316622103e-06, - "loss": 0.1625, - "step": 689 - }, - { - "epoch": 4.233128834355828, - "grad_norm": 4.606889247894287, - "learning_rate": 3.1011677422235093e-06, - "loss": 0.1791, - "step": 690 - }, - { - "epoch": 4.2392638036809815, - "grad_norm": 3.3620636463165283, - "learning_rate": 3.0964896196236217e-06, - "loss": 0.2233, - "step": 691 - }, - { - "epoch": 4.245398773006135, - "grad_norm": 3.7845852375030518, - "learning_rate": 3.0918092812403954e-06, - "loss": 0.1142, - "step": 692 - }, - { - "epoch": 4.251533742331288, - "grad_norm": 3.1204118728637695, - "learning_rate": 3.0871267444599098e-06, - "loss": 0.096, - "step": 693 - }, - { - "epoch": 4.257668711656442, - "grad_norm": 3.686067819595337, - "learning_rate": 3.0824420266764093e-06, - "loss": 0.2749, - "step": 694 - }, - { - "epoch": 4.263803680981595, - "grad_norm": 3.1680829524993896, - "learning_rate": 3.077755145292243e-06, - "loss": 0.2504, - "step": 695 - }, - { - "epoch": 4.269938650306749, - "grad_norm": 3.3179469108581543, - "learning_rate": 3.0730661177177957e-06, - "loss": 0.1324, - "step": 696 - }, - { - "epoch": 4.276073619631902, - "grad_norm": 3.1186370849609375, - "learning_rate": 3.0683749613714238e-06, - "loss": 0.0691, - "step": 697 - }, - { - "epoch": 4.282208588957055, - "grad_norm": 3.086834192276001, - "learning_rate": 3.063681693679391e-06, - "loss": 0.1026, - "step": 698 - }, - { - "epoch": 4.288343558282208, - "grad_norm": 4.629584312438965, - "learning_rate": 3.0589863320758063e-06, - "loss": 0.2646, - "step": 699 - }, - { - "epoch": 4.294478527607362, - "grad_norm": 3.9641213417053223, - "learning_rate": 3.0542888940025562e-06, - "loss": 0.1711, - "step": 700 - }, - { - "epoch": 4.300613496932515, - "grad_norm": 3.75014328956604, - "learning_rate": 3.0495893969092395e-06, - "loss": 0.0589, - "step": 701 - }, - { - "epoch": 4.306748466257669, - "grad_norm": 3.603290319442749, - "learning_rate": 3.044887858253105e-06, - "loss": 0.2244, - "step": 702 - }, - { - "epoch": 4.3128834355828225, - "grad_norm": 3.79404616355896, - "learning_rate": 3.040184295498984e-06, - "loss": 0.1506, - "step": 703 - }, - { - "epoch": 4.319018404907975, - "grad_norm": 3.0890021324157715, - "learning_rate": 3.035478726119228e-06, - "loss": 0.2343, - "step": 704 - }, - { - "epoch": 4.325153374233129, - "grad_norm": 3.6688191890716553, - "learning_rate": 3.0307711675936426e-06, - "loss": 0.0518, - "step": 705 - }, - { - "epoch": 4.331288343558282, - "grad_norm": 5.1836700439453125, - "learning_rate": 3.0260616374094208e-06, - "loss": 0.2363, - "step": 706 - }, - { - "epoch": 4.337423312883436, - "grad_norm": 2.7123284339904785, - "learning_rate": 3.0213501530610807e-06, - "loss": 0.0848, - "step": 707 - }, - { - "epoch": 4.343558282208589, - "grad_norm": 3.5661890506744385, - "learning_rate": 3.0166367320504005e-06, - "loss": 0.149, - "step": 708 - }, - { - "epoch": 4.3496932515337425, - "grad_norm": 3.6454737186431885, - "learning_rate": 3.0119213918863515e-06, - "loss": 0.1133, - "step": 709 - }, - { - "epoch": 4.355828220858895, - "grad_norm": 3.7534968852996826, - "learning_rate": 3.0072041500850343e-06, - "loss": 0.1358, - "step": 710 - }, - { - "epoch": 4.361963190184049, - "grad_norm": 3.40387225151062, - "learning_rate": 3.0024850241696128e-06, - "loss": 0.0706, - "step": 711 - }, - { - "epoch": 4.368098159509202, - "grad_norm": 3.250471591949463, - "learning_rate": 2.9977640316702512e-06, - "loss": 0.1977, - "step": 712 - }, - { - "epoch": 4.374233128834356, - "grad_norm": 3.417781352996826, - "learning_rate": 2.993041190124047e-06, - "loss": 0.2622, - "step": 713 - }, - { - "epoch": 4.38036809815951, - "grad_norm": 2.628434181213379, - "learning_rate": 2.9883165170749657e-06, - "loss": 0.1487, - "step": 714 - }, - { - "epoch": 4.386503067484663, - "grad_norm": 3.240264892578125, - "learning_rate": 2.9835900300737763e-06, - "loss": 0.0822, - "step": 715 - }, - { - "epoch": 4.392638036809816, - "grad_norm": 6.575517177581787, - "learning_rate": 2.9788617466779884e-06, - "loss": 0.3668, - "step": 716 - }, - { - "epoch": 4.398773006134969, - "grad_norm": 4.699089050292969, - "learning_rate": 2.974131684451781e-06, - "loss": 0.2432, - "step": 717 - }, - { - "epoch": 4.404907975460123, - "grad_norm": 2.9815752506256104, - "learning_rate": 2.9693998609659443e-06, - "loss": 0.0689, - "step": 718 - }, - { - "epoch": 4.411042944785276, - "grad_norm": 4.192755222320557, - "learning_rate": 2.9646662937978082e-06, - "loss": 0.1897, - "step": 719 - }, - { - "epoch": 4.41717791411043, - "grad_norm": 2.9729068279266357, - "learning_rate": 2.9599310005311824e-06, - "loss": 0.0457, - "step": 720 - }, - { - "epoch": 4.423312883435583, - "grad_norm": 4.234438896179199, - "learning_rate": 2.9551939987562866e-06, - "loss": 0.2307, - "step": 721 - }, - { - "epoch": 4.429447852760736, - "grad_norm": 3.3982434272766113, - "learning_rate": 2.950455306069688e-06, - "loss": 0.0637, - "step": 722 - }, - { - "epoch": 4.435582822085889, - "grad_norm": 4.539764404296875, - "learning_rate": 2.9457149400742357e-06, - "loss": 0.1924, - "step": 723 - }, - { - "epoch": 4.441717791411043, - "grad_norm": 4.039684772491455, - "learning_rate": 2.940972918378993e-06, - "loss": 0.1275, - "step": 724 - }, - { - "epoch": 4.447852760736196, - "grad_norm": 4.340360641479492, - "learning_rate": 2.936229258599174e-06, - "loss": 0.123, - "step": 725 - }, - { - "epoch": 4.45398773006135, - "grad_norm": 2.8720109462738037, - "learning_rate": 2.93148397835608e-06, - "loss": 0.0555, - "step": 726 - }, - { - "epoch": 4.460122699386503, - "grad_norm": 4.227811336517334, - "learning_rate": 2.926737095277029e-06, - "loss": 0.0991, - "step": 727 - }, - { - "epoch": 4.466257668711656, - "grad_norm": 2.8079142570495605, - "learning_rate": 2.921988626995295e-06, - "loss": 0.0628, - "step": 728 - }, - { - "epoch": 4.47239263803681, - "grad_norm": 4.195122241973877, - "learning_rate": 2.9172385911500385e-06, - "loss": 0.2333, - "step": 729 - }, - { - "epoch": 4.478527607361963, - "grad_norm": 3.223794460296631, - "learning_rate": 2.9124870053862447e-06, - "loss": 0.1317, - "step": 730 - }, - { - "epoch": 4.484662576687117, - "grad_norm": 3.5533759593963623, - "learning_rate": 2.907733887354657e-06, - "loss": 0.2285, - "step": 731 - }, - { - "epoch": 4.49079754601227, - "grad_norm": 3.535673141479492, - "learning_rate": 2.9029792547117088e-06, - "loss": 0.096, - "step": 732 - }, - { - "epoch": 4.4969325153374236, - "grad_norm": 4.031703948974609, - "learning_rate": 2.898223125119461e-06, - "loss": 0.1505, - "step": 733 - }, - { - "epoch": 4.5030674846625764, - "grad_norm": 2.823413610458374, - "learning_rate": 2.893465516245534e-06, - "loss": 0.0327, - "step": 734 - }, - { - "epoch": 4.50920245398773, - "grad_norm": 3.516738176345825, - "learning_rate": 2.8887064457630453e-06, - "loss": 0.0743, - "step": 735 - }, - { - "epoch": 4.515337423312883, - "grad_norm": 3.5523500442504883, - "learning_rate": 2.8839459313505407e-06, - "loss": 0.1768, - "step": 736 - }, - { - "epoch": 4.521472392638037, - "grad_norm": 3.2433223724365234, - "learning_rate": 2.879183990691929e-06, - "loss": 0.1598, - "step": 737 - }, - { - "epoch": 4.52760736196319, - "grad_norm": 3.0156848430633545, - "learning_rate": 2.8744206414764185e-06, - "loss": 0.0829, - "step": 738 - }, - { - "epoch": 4.533742331288344, - "grad_norm": 4.359529495239258, - "learning_rate": 2.8696559013984488e-06, - "loss": 0.1169, - "step": 739 - }, - { - "epoch": 4.539877300613497, - "grad_norm": 2.3862433433532715, - "learning_rate": 2.8648897881576274e-06, - "loss": 0.0962, - "step": 740 - }, - { - "epoch": 4.54601226993865, - "grad_norm": 2.7100136280059814, - "learning_rate": 2.8601223194586613e-06, - "loss": 0.1204, - "step": 741 - }, - { - "epoch": 4.552147239263804, - "grad_norm": 3.8116140365600586, - "learning_rate": 2.8553535130112935e-06, - "loss": 0.0685, - "step": 742 - }, - { - "epoch": 4.558282208588957, - "grad_norm": 2.9640142917633057, - "learning_rate": 2.850583386530235e-06, - "loss": 0.0692, - "step": 743 - }, - { - "epoch": 4.564417177914111, - "grad_norm": 3.264592170715332, - "learning_rate": 2.8458119577351035e-06, - "loss": 0.2128, - "step": 744 - }, - { - "epoch": 4.570552147239264, - "grad_norm": 3.230497360229492, - "learning_rate": 2.841039244350351e-06, - "loss": 0.2409, - "step": 745 - }, - { - "epoch": 4.576687116564417, - "grad_norm": 4.41513204574585, - "learning_rate": 2.8362652641052024e-06, - "loss": 0.1878, - "step": 746 - }, - { - "epoch": 4.58282208588957, - "grad_norm": 3.047248601913452, - "learning_rate": 2.83149003473359e-06, - "loss": 0.1303, - "step": 747 - }, - { - "epoch": 4.588957055214724, - "grad_norm": 2.399754047393799, - "learning_rate": 2.8267135739740836e-06, - "loss": 0.0577, - "step": 748 - }, - { - "epoch": 4.595092024539877, - "grad_norm": 4.608038425445557, - "learning_rate": 2.8219358995698307e-06, - "loss": 0.2329, - "step": 749 - }, - { - "epoch": 4.601226993865031, - "grad_norm": 3.537644147872925, - "learning_rate": 2.8171570292684846e-06, - "loss": 0.1329, - "step": 750 - }, - { - "epoch": 4.6073619631901845, - "grad_norm": 2.8099827766418457, - "learning_rate": 2.8123769808221407e-06, - "loss": 0.1512, - "step": 751 - }, - { - "epoch": 4.613496932515337, - "grad_norm": 3.3169758319854736, - "learning_rate": 2.8075957719872724e-06, - "loss": 0.1267, - "step": 752 - }, - { - "epoch": 4.61963190184049, - "grad_norm": 3.578435182571411, - "learning_rate": 2.8028134205246633e-06, - "loss": 0.147, - "step": 753 - }, - { - "epoch": 4.625766871165644, - "grad_norm": 3.544437885284424, - "learning_rate": 2.7980299441993415e-06, - "loss": 0.0947, - "step": 754 - }, - { - "epoch": 4.631901840490798, - "grad_norm": 3.798776388168335, - "learning_rate": 2.793245360780512e-06, - "loss": 0.1498, - "step": 755 - }, - { - "epoch": 4.638036809815951, - "grad_norm": 3.634991407394409, - "learning_rate": 2.788459688041495e-06, - "loss": 0.2504, - "step": 756 - }, - { - "epoch": 4.644171779141105, - "grad_norm": 20.123680114746094, - "learning_rate": 2.783672943759655e-06, - "loss": 0.2091, - "step": 757 - }, - { - "epoch": 4.6503067484662575, - "grad_norm": 3.9357221126556396, - "learning_rate": 2.778885145716339e-06, - "loss": 0.2045, - "step": 758 - }, - { - "epoch": 4.656441717791411, - "grad_norm": 3.3035309314727783, - "learning_rate": 2.7740963116968063e-06, - "loss": 0.1416, - "step": 759 - }, - { - "epoch": 4.662576687116564, - "grad_norm": 3.096985101699829, - "learning_rate": 2.7693064594901646e-06, - "loss": 0.0455, - "step": 760 - }, - { - "epoch": 4.668711656441718, - "grad_norm": 2.9855458736419678, - "learning_rate": 2.7645156068893075e-06, - "loss": 0.1496, - "step": 761 - }, - { - "epoch": 4.674846625766871, - "grad_norm": 3.9140093326568604, - "learning_rate": 2.759723771690839e-06, - "loss": 0.2061, - "step": 762 - }, - { - "epoch": 4.680981595092025, - "grad_norm": 3.590569496154785, - "learning_rate": 2.754930971695019e-06, - "loss": 0.1017, - "step": 763 - }, - { - "epoch": 4.6871165644171775, - "grad_norm": 3.527254581451416, - "learning_rate": 2.750137224705687e-06, - "loss": 0.1979, - "step": 764 - }, - { - "epoch": 4.693251533742331, - "grad_norm": 4.198459148406982, - "learning_rate": 2.745342548530202e-06, - "loss": 0.1667, - "step": 765 - }, - { - "epoch": 4.699386503067485, - "grad_norm": 2.0246167182922363, - "learning_rate": 2.7405469609793746e-06, - "loss": 0.0346, - "step": 766 - }, - { - "epoch": 4.705521472392638, - "grad_norm": 3.2045300006866455, - "learning_rate": 2.7357504798674004e-06, - "loss": 0.0596, - "step": 767 - }, - { - "epoch": 4.711656441717792, - "grad_norm": 2.736985921859741, - "learning_rate": 2.730953123011796e-06, - "loss": 0.0384, - "step": 768 - }, - { - "epoch": 4.717791411042945, - "grad_norm": 3.0621395111083984, - "learning_rate": 2.726154908233328e-06, - "loss": 0.0558, - "step": 769 - }, - { - "epoch": 4.723926380368098, - "grad_norm": 3.2280497550964355, - "learning_rate": 2.721355853355953e-06, - "loss": 0.2272, - "step": 770 - }, - { - "epoch": 4.730061349693251, - "grad_norm": 3.342226028442383, - "learning_rate": 2.716555976206748e-06, - "loss": 0.074, - "step": 771 - }, - { - "epoch": 4.736196319018405, - "grad_norm": 4.328624248504639, - "learning_rate": 2.7117552946158415e-06, - "loss": 0.1034, - "step": 772 - }, - { - "epoch": 4.742331288343558, - "grad_norm": 2.980215311050415, - "learning_rate": 2.706953826416353e-06, - "loss": 0.1199, - "step": 773 - }, - { - "epoch": 4.748466257668712, - "grad_norm": 2.622478485107422, - "learning_rate": 2.702151589444324e-06, - "loss": 0.0467, - "step": 774 - }, - { - "epoch": 4.754601226993865, - "grad_norm": 2.9958693981170654, - "learning_rate": 2.6973486015386507e-06, - "loss": 0.143, - "step": 775 - }, - { - "epoch": 4.7607361963190185, - "grad_norm": 4.548511505126953, - "learning_rate": 2.6925448805410197e-06, - "loss": 0.3594, - "step": 776 - }, - { - "epoch": 4.766871165644172, - "grad_norm": 3.3429481983184814, - "learning_rate": 2.6877404442958393e-06, - "loss": 0.1397, - "step": 777 - }, - { - "epoch": 4.773006134969325, - "grad_norm": 2.5820136070251465, - "learning_rate": 2.682935310650177e-06, - "loss": 0.054, - "step": 778 - }, - { - "epoch": 4.779141104294479, - "grad_norm": 4.047626495361328, - "learning_rate": 2.6781294974536886e-06, - "loss": 0.1284, - "step": 779 - }, - { - "epoch": 4.785276073619632, - "grad_norm": 3.0227510929107666, - "learning_rate": 2.673323022558557e-06, - "loss": 0.1441, - "step": 780 - }, - { - "epoch": 4.791411042944786, - "grad_norm": 4.731313705444336, - "learning_rate": 2.6685159038194202e-06, - "loss": 0.2859, - "step": 781 - }, - { - "epoch": 4.7975460122699385, - "grad_norm": 3.880655288696289, - "learning_rate": 2.6637081590933096e-06, - "loss": 0.1524, - "step": 782 - }, - { - "epoch": 4.803680981595092, - "grad_norm": 2.375474452972412, - "learning_rate": 2.6588998062395803e-06, - "loss": 0.0338, - "step": 783 - }, - { - "epoch": 4.809815950920245, - "grad_norm": 3.3587446212768555, - "learning_rate": 2.6540908631198498e-06, - "loss": 0.0755, - "step": 784 - }, - { - "epoch": 4.815950920245399, - "grad_norm": 2.767686367034912, - "learning_rate": 2.6492813475979243e-06, - "loss": 0.0631, - "step": 785 - }, - { - "epoch": 4.822085889570552, - "grad_norm": 3.88670015335083, - "learning_rate": 2.6444712775397397e-06, - "loss": 0.0853, - "step": 786 - }, - { - "epoch": 4.828220858895706, - "grad_norm": 3.543276309967041, - "learning_rate": 2.639660670813288e-06, - "loss": 0.1895, - "step": 787 - }, - { - "epoch": 4.8343558282208585, - "grad_norm": 3.659323215484619, - "learning_rate": 2.6348495452885598e-06, - "loss": 0.1745, - "step": 788 - }, - { - "epoch": 4.840490797546012, - "grad_norm": 3.0955021381378174, - "learning_rate": 2.630037918837468e-06, - "loss": 0.0846, - "step": 789 - }, - { - "epoch": 4.846625766871165, - "grad_norm": 3.4473249912261963, - "learning_rate": 2.6252258093337892e-06, - "loss": 0.0808, - "step": 790 - }, - { - "epoch": 4.852760736196319, - "grad_norm": 3.937120199203491, - "learning_rate": 2.6204132346530936e-06, - "loss": 0.2054, - "step": 791 - }, - { - "epoch": 4.858895705521473, - "grad_norm": 4.052806854248047, - "learning_rate": 2.6156002126726788e-06, - "loss": 0.1679, - "step": 792 - }, - { - "epoch": 4.865030674846626, - "grad_norm": 2.6694889068603516, - "learning_rate": 2.6107867612715043e-06, - "loss": 0.0534, - "step": 793 - }, - { - "epoch": 4.871165644171779, - "grad_norm": 3.594649076461792, - "learning_rate": 2.6059728983301267e-06, - "loss": 0.0899, - "step": 794 - }, - { - "epoch": 4.877300613496932, - "grad_norm": 2.7796030044555664, - "learning_rate": 2.601158641730629e-06, - "loss": 0.0596, - "step": 795 - }, - { - "epoch": 4.883435582822086, - "grad_norm": 4.618961334228516, - "learning_rate": 2.5963440093565567e-06, - "loss": 0.3858, - "step": 796 - }, - { - "epoch": 4.889570552147239, - "grad_norm": 3.0783939361572266, - "learning_rate": 2.5915290190928518e-06, - "loss": 0.12, - "step": 797 - }, - { - "epoch": 4.895705521472393, - "grad_norm": 4.078456878662109, - "learning_rate": 2.586713688825786e-06, - "loss": 0.1278, - "step": 798 - }, - { - "epoch": 4.901840490797546, - "grad_norm": 2.9439120292663574, - "learning_rate": 2.5818980364428935e-06, - "loss": 0.0847, - "step": 799 - }, - { - "epoch": 4.9079754601226995, - "grad_norm": 5.140681743621826, - "learning_rate": 2.5770820798329055e-06, - "loss": 0.1718, - "step": 800 - }, - { - "epoch": 4.914110429447852, - "grad_norm": 3.450190305709839, - "learning_rate": 2.572265836885682e-06, - "loss": 0.0895, - "step": 801 - }, - { - "epoch": 4.920245398773006, - "grad_norm": 3.1145224571228027, - "learning_rate": 2.567449325492149e-06, - "loss": 0.0652, - "step": 802 - }, - { - "epoch": 4.92638036809816, - "grad_norm": 2.851768732070923, - "learning_rate": 2.5626325635442283e-06, - "loss": 0.0877, - "step": 803 - }, - { - "epoch": 4.932515337423313, - "grad_norm": 3.3392980098724365, - "learning_rate": 2.5578155689347716e-06, - "loss": 0.2028, - "step": 804 - }, - { - "epoch": 4.938650306748467, - "grad_norm": 3.012439250946045, - "learning_rate": 2.5529983595574964e-06, - "loss": 0.031, - "step": 805 - }, - { - "epoch": 4.9447852760736195, - "grad_norm": 2.7732717990875244, - "learning_rate": 2.548180953306918e-06, - "loss": 0.0415, - "step": 806 - }, - { - "epoch": 4.950920245398773, - "grad_norm": 3.0423903465270996, - "learning_rate": 2.5433633680782817e-06, - "loss": 0.1188, - "step": 807 - }, - { - "epoch": 4.957055214723926, - "grad_norm": 5.056387901306152, - "learning_rate": 2.538545621767498e-06, - "loss": 0.1703, - "step": 808 - }, - { - "epoch": 4.96319018404908, - "grad_norm": 4.052585124969482, - "learning_rate": 2.533727732271077e-06, - "loss": 0.1455, - "step": 809 - }, - { - "epoch": 4.969325153374233, - "grad_norm": 3.4507904052734375, - "learning_rate": 2.5289097174860593e-06, - "loss": 0.0617, - "step": 810 - }, - { - "epoch": 4.975460122699387, - "grad_norm": 2.908266305923462, - "learning_rate": 2.524091595309952e-06, - "loss": 0.1173, - "step": 811 - }, - { - "epoch": 4.9815950920245395, - "grad_norm": 2.5857458114624023, - "learning_rate": 2.519273383640661e-06, - "loss": 0.0538, - "step": 812 - }, - { - "epoch": 4.987730061349693, - "grad_norm": 3.3518428802490234, - "learning_rate": 2.5144551003764227e-06, - "loss": 0.211, - "step": 813 - }, - { - "epoch": 4.993865030674847, - "grad_norm": 3.137981653213501, - "learning_rate": 2.509636763415742e-06, - "loss": 0.0944, - "step": 814 - }, - { - "epoch": 5.0, - "grad_norm": 2.8854241371154785, - "learning_rate": 2.5048183906573227e-06, - "loss": 0.098, - "step": 815 - }, - { - "epoch": 5.006134969325154, - "grad_norm": 3.508527994155884, - "learning_rate": 2.5e-06, - "loss": 0.1102, - "step": 816 - }, - { - "epoch": 5.012269938650307, - "grad_norm": 2.448152542114258, - "learning_rate": 2.495181609342678e-06, - "loss": 0.0712, - "step": 817 - }, - { - "epoch": 5.0184049079754605, - "grad_norm": 3.105818748474121, - "learning_rate": 2.4903632365842587e-06, - "loss": 0.0414, - "step": 818 - }, - { - "epoch": 5.024539877300613, - "grad_norm": 3.8048601150512695, - "learning_rate": 2.4855448996235777e-06, - "loss": 0.0894, - "step": 819 - }, - { - "epoch": 5.030674846625767, - "grad_norm": 3.259834051132202, - "learning_rate": 2.48072661635934e-06, - "loss": 0.0796, - "step": 820 - }, - { - "epoch": 5.03680981595092, - "grad_norm": 2.822364568710327, - "learning_rate": 2.475908404690049e-06, - "loss": 0.0349, - "step": 821 - }, - { - "epoch": 5.042944785276074, - "grad_norm": 4.78808069229126, - "learning_rate": 2.4710902825139415e-06, - "loss": 0.2529, - "step": 822 - }, - { - "epoch": 5.049079754601227, - "grad_norm": 3.5420572757720947, - "learning_rate": 2.466272267728924e-06, - "loss": 0.1405, - "step": 823 - }, - { - "epoch": 5.0552147239263805, - "grad_norm": 2.500713348388672, - "learning_rate": 2.461454378232503e-06, - "loss": 0.0408, - "step": 824 - }, - { - "epoch": 5.061349693251533, - "grad_norm": 3.266291618347168, - "learning_rate": 2.4566366319217196e-06, - "loss": 0.0338, - "step": 825 - }, - { - "epoch": 5.067484662576687, - "grad_norm": 4.071012020111084, - "learning_rate": 2.4518190466930837e-06, - "loss": 0.06, - "step": 826 - }, - { - "epoch": 5.07361963190184, - "grad_norm": 4.3747172355651855, - "learning_rate": 2.4470016404425045e-06, - "loss": 0.1184, - "step": 827 - }, - { - "epoch": 5.079754601226994, - "grad_norm": 3.92030668258667, - "learning_rate": 2.4421844310652296e-06, - "loss": 0.1369, - "step": 828 - }, - { - "epoch": 5.085889570552148, - "grad_norm": 3.3482303619384766, - "learning_rate": 2.437367436455773e-06, - "loss": 0.1166, - "step": 829 - }, - { - "epoch": 5.0920245398773005, - "grad_norm": 3.429368019104004, - "learning_rate": 2.4325506745078524e-06, - "loss": 0.1214, - "step": 830 - }, - { - "epoch": 5.098159509202454, - "grad_norm": 3.4915647506713867, - "learning_rate": 2.427734163114319e-06, - "loss": 0.0454, - "step": 831 - }, - { - "epoch": 5.104294478527607, - "grad_norm": 3.1721251010894775, - "learning_rate": 2.4229179201670954e-06, - "loss": 0.0431, - "step": 832 - }, - { - "epoch": 5.110429447852761, - "grad_norm": 2.552578926086426, - "learning_rate": 2.418101963557107e-06, - "loss": 0.0347, - "step": 833 - }, - { - "epoch": 5.116564417177914, - "grad_norm": 3.518169403076172, - "learning_rate": 2.413286311174214e-06, - "loss": 0.1555, - "step": 834 - }, - { - "epoch": 5.122699386503068, - "grad_norm": 2.4452908039093018, - "learning_rate": 2.4084709809071487e-06, - "loss": 0.035, - "step": 835 - }, - { - "epoch": 5.128834355828221, - "grad_norm": 3.5366528034210205, - "learning_rate": 2.403655990643444e-06, - "loss": 0.0798, - "step": 836 - }, - { - "epoch": 5.134969325153374, - "grad_norm": 2.300065040588379, - "learning_rate": 2.398841358269371e-06, - "loss": 0.0178, - "step": 837 - }, - { - "epoch": 5.141104294478527, - "grad_norm": 2.851393699645996, - "learning_rate": 2.3940271016698733e-06, - "loss": 0.0447, - "step": 838 - }, - { - "epoch": 5.147239263803681, - "grad_norm": 4.085958957672119, - "learning_rate": 2.3892132387284956e-06, - "loss": 0.1626, - "step": 839 - }, - { - "epoch": 5.153374233128835, - "grad_norm": 3.4240522384643555, - "learning_rate": 2.384399787327322e-06, - "loss": 0.0914, - "step": 840 - }, - { - "epoch": 5.159509202453988, - "grad_norm": 4.111586570739746, - "learning_rate": 2.3795867653469072e-06, - "loss": 0.0784, - "step": 841 - }, - { - "epoch": 5.1656441717791415, - "grad_norm": 2.3306312561035156, - "learning_rate": 2.374774190666211e-06, - "loss": 0.0216, - "step": 842 - }, - { - "epoch": 5.171779141104294, - "grad_norm": 2.5006275177001953, - "learning_rate": 2.3699620811625327e-06, - "loss": 0.0516, - "step": 843 - }, - { - "epoch": 5.177914110429448, - "grad_norm": 3.1680967807769775, - "learning_rate": 2.365150454711441e-06, - "loss": 0.0517, - "step": 844 - }, - { - "epoch": 5.184049079754601, - "grad_norm": 1.817044734954834, - "learning_rate": 2.3603393291867122e-06, - "loss": 0.0264, - "step": 845 - }, - { - "epoch": 5.190184049079755, - "grad_norm": 4.445211887359619, - "learning_rate": 2.355528722460261e-06, - "loss": 0.1079, - "step": 846 - }, - { - "epoch": 5.196319018404908, - "grad_norm": 2.918304681777954, - "learning_rate": 2.350718652402076e-06, - "loss": 0.0633, - "step": 847 - }, - { - "epoch": 5.2024539877300615, - "grad_norm": 3.6307432651519775, - "learning_rate": 2.345909136880151e-06, - "loss": 0.1013, - "step": 848 - }, - { - "epoch": 5.208588957055214, - "grad_norm": 3.5696842670440674, - "learning_rate": 2.34110019376042e-06, - "loss": 0.0199, - "step": 849 - }, - { - "epoch": 5.214723926380368, - "grad_norm": 2.2214856147766113, - "learning_rate": 2.336291840906691e-06, - "loss": 0.0288, - "step": 850 - }, - { - "epoch": 5.220858895705521, - "grad_norm": 2.5375778675079346, - "learning_rate": 2.3314840961805806e-06, - "loss": 0.0142, - "step": 851 - }, - { - "epoch": 5.226993865030675, - "grad_norm": 3.0093517303466797, - "learning_rate": 2.326676977441444e-06, - "loss": 0.0911, - "step": 852 - }, - { - "epoch": 5.233128834355828, - "grad_norm": 2.7067151069641113, - "learning_rate": 2.3218705025463118e-06, - "loss": 0.0315, - "step": 853 - }, - { - "epoch": 5.2392638036809815, - "grad_norm": 3.1892940998077393, - "learning_rate": 2.3170646893498237e-06, - "loss": 0.1344, - "step": 854 - }, - { - "epoch": 5.245398773006135, - "grad_norm": 2.8909313678741455, - "learning_rate": 2.312259555704161e-06, - "loss": 0.034, - "step": 855 - }, - { - "epoch": 5.251533742331288, - "grad_norm": 5.097650051116943, - "learning_rate": 2.3074551194589816e-06, - "loss": 0.1889, - "step": 856 - }, - { - "epoch": 5.257668711656442, - "grad_norm": 3.8511006832122803, - "learning_rate": 2.3026513984613506e-06, - "loss": 0.0794, - "step": 857 - }, - { - "epoch": 5.263803680981595, - "grad_norm": 2.2874133586883545, - "learning_rate": 2.297848410555677e-06, - "loss": 0.0238, - "step": 858 - }, - { - "epoch": 5.269938650306749, - "grad_norm": 3.504723310470581, - "learning_rate": 2.293046173583648e-06, - "loss": 0.0369, - "step": 859 - }, - { - "epoch": 5.276073619631902, - "grad_norm": 3.2108154296875, - "learning_rate": 2.28824470538416e-06, - "loss": 0.0677, - "step": 860 - }, - { - "epoch": 5.282208588957055, - "grad_norm": 2.2249386310577393, - "learning_rate": 2.2834440237932537e-06, - "loss": 0.0244, - "step": 861 - }, - { - "epoch": 5.288343558282208, - "grad_norm": 3.141784191131592, - "learning_rate": 2.2786441466440474e-06, - "loss": 0.0628, - "step": 862 - }, - { - "epoch": 5.294478527607362, - "grad_norm": 3.5597352981567383, - "learning_rate": 2.2738450917666727e-06, - "loss": 0.0914, - "step": 863 - }, - { - "epoch": 5.300613496932515, - "grad_norm": 2.991966962814331, - "learning_rate": 2.269046876988204e-06, - "loss": 0.0546, - "step": 864 - }, - { - "epoch": 5.306748466257669, - "grad_norm": 3.100776195526123, - "learning_rate": 2.2642495201325995e-06, - "loss": 0.0473, - "step": 865 - }, - { - "epoch": 5.3128834355828225, - "grad_norm": 2.541754722595215, - "learning_rate": 2.259453039020626e-06, - "loss": 0.0613, - "step": 866 - }, - { - "epoch": 5.319018404907975, - "grad_norm": 2.8117194175720215, - "learning_rate": 2.2546574514697985e-06, - "loss": 0.0533, - "step": 867 - }, - { - "epoch": 5.325153374233129, - "grad_norm": 2.5676379203796387, - "learning_rate": 2.249862775294313e-06, - "loss": 0.018, - "step": 868 - }, - { - "epoch": 5.331288343558282, - "grad_norm": 2.5297701358795166, - "learning_rate": 2.245069028304981e-06, - "loss": 0.0246, - "step": 869 - }, - { - "epoch": 5.337423312883436, - "grad_norm": 2.199498176574707, - "learning_rate": 2.240276228309161e-06, - "loss": 0.0551, - "step": 870 - }, - { - "epoch": 5.343558282208589, - "grad_norm": 2.5793557167053223, - "learning_rate": 2.2354843931106933e-06, - "loss": 0.0258, - "step": 871 - }, - { - "epoch": 5.3496932515337425, - "grad_norm": 3.352058172225952, - "learning_rate": 2.230693540509836e-06, - "loss": 0.0228, - "step": 872 - }, - { - "epoch": 5.355828220858895, - "grad_norm": 2.900599956512451, - "learning_rate": 2.225903688303195e-06, - "loss": 0.0586, - "step": 873 - }, - { - "epoch": 5.361963190184049, - "grad_norm": 3.3317267894744873, - "learning_rate": 2.221114854283662e-06, - "loss": 0.0733, - "step": 874 - }, - { - "epoch": 5.368098159509202, - "grad_norm": 2.79304575920105, - "learning_rate": 2.2163270562403453e-06, - "loss": 0.0251, - "step": 875 - }, - { - "epoch": 5.374233128834356, - "grad_norm": 3.8596227169036865, - "learning_rate": 2.211540311958506e-06, - "loss": 0.0957, - "step": 876 - }, - { - "epoch": 5.38036809815951, - "grad_norm": 2.7464358806610107, - "learning_rate": 2.2067546392194888e-06, - "loss": 0.0457, - "step": 877 - }, - { - "epoch": 5.386503067484663, - "grad_norm": 2.3359906673431396, - "learning_rate": 2.2019700558006598e-06, - "loss": 0.0218, - "step": 878 - }, - { - "epoch": 5.392638036809816, - "grad_norm": 3.2412452697753906, - "learning_rate": 2.197186579475337e-06, - "loss": 0.0494, - "step": 879 - }, - { - "epoch": 5.398773006134969, - "grad_norm": 3.930197238922119, - "learning_rate": 2.1924042280127284e-06, - "loss": 0.0803, - "step": 880 - }, - { - "epoch": 5.404907975460123, - "grad_norm": 2.5752930641174316, - "learning_rate": 2.1876230191778598e-06, - "loss": 0.0356, - "step": 881 - }, - { - "epoch": 5.411042944785276, - "grad_norm": 5.507393836975098, - "learning_rate": 2.182842970731516e-06, - "loss": 0.1245, - "step": 882 - }, - { - "epoch": 5.41717791411043, - "grad_norm": 2.416719436645508, - "learning_rate": 2.17806410043017e-06, - "loss": 0.0224, - "step": 883 - }, - { - "epoch": 5.423312883435583, - "grad_norm": 2.500429630279541, - "learning_rate": 2.173286426025917e-06, - "loss": 0.0499, - "step": 884 - }, - { - "epoch": 5.429447852760736, - "grad_norm": 2.8843860626220703, - "learning_rate": 2.168509965266411e-06, - "loss": 0.075, - "step": 885 - }, - { - "epoch": 5.435582822085889, - "grad_norm": 2.3187198638916016, - "learning_rate": 2.1637347358947984e-06, - "loss": 0.065, - "step": 886 - }, - { - "epoch": 5.441717791411043, - "grad_norm": 2.7135889530181885, - "learning_rate": 2.15896075564965e-06, - "loss": 0.0848, - "step": 887 - }, - { - "epoch": 5.447852760736196, - "grad_norm": 1.751846194267273, - "learning_rate": 2.1541880422648978e-06, - "loss": 0.0112, - "step": 888 - }, - { - "epoch": 5.45398773006135, - "grad_norm": 3.113271713256836, - "learning_rate": 2.1494166134697655e-06, - "loss": 0.077, - "step": 889 - }, - { - "epoch": 5.460122699386503, - "grad_norm": 2.711318016052246, - "learning_rate": 2.1446464869887077e-06, - "loss": 0.03, - "step": 890 - }, - { - "epoch": 5.466257668711656, - "grad_norm": 1.8012003898620605, - "learning_rate": 2.13987768054134e-06, - "loss": 0.0141, - "step": 891 - }, - { - "epoch": 5.47239263803681, - "grad_norm": 2.0968120098114014, - "learning_rate": 2.135110211842374e-06, - "loss": 0.0147, - "step": 892 - }, - { - "epoch": 5.478527607361963, - "grad_norm": 3.1689956188201904, - "learning_rate": 2.1303440986015525e-06, - "loss": 0.1123, - "step": 893 - }, - { - "epoch": 5.484662576687117, - "grad_norm": 4.512697219848633, - "learning_rate": 2.1255793585235827e-06, - "loss": 0.0359, - "step": 894 - }, - { - "epoch": 5.49079754601227, - "grad_norm": 3.5739688873291016, - "learning_rate": 2.120816009308071e-06, - "loss": 0.0635, - "step": 895 - }, - { - "epoch": 5.4969325153374236, - "grad_norm": 4.556554317474365, - "learning_rate": 2.1160540686494597e-06, - "loss": 0.1104, - "step": 896 - }, - { - "epoch": 5.5030674846625764, - "grad_norm": 2.2047064304351807, - "learning_rate": 2.1112935542369546e-06, - "loss": 0.0187, - "step": 897 - }, - { - "epoch": 5.50920245398773, - "grad_norm": 3.0289857387542725, - "learning_rate": 2.106534483754466e-06, - "loss": 0.0874, - "step": 898 - }, - { - "epoch": 5.515337423312883, - "grad_norm": 2.7090444564819336, - "learning_rate": 2.1017768748805396e-06, - "loss": 0.0301, - "step": 899 - }, - { - "epoch": 5.521472392638037, - "grad_norm": 3.0662643909454346, - "learning_rate": 2.0970207452882917e-06, - "loss": 0.1192, - "step": 900 - }, - { - "epoch": 5.52760736196319, - "grad_norm": 2.869401454925537, - "learning_rate": 2.0922661126453436e-06, - "loss": 0.0803, - "step": 901 - }, - { - "epoch": 5.533742331288344, - "grad_norm": 2.229947328567505, - "learning_rate": 2.0875129946137557e-06, - "loss": 0.0186, - "step": 902 - }, - { - "epoch": 5.539877300613497, - "grad_norm": 3.3460421562194824, - "learning_rate": 2.0827614088499624e-06, - "loss": 0.0499, - "step": 903 - }, - { - "epoch": 5.54601226993865, - "grad_norm": 1.9324007034301758, - "learning_rate": 2.0780113730047056e-06, - "loss": 0.0322, - "step": 904 - }, - { - "epoch": 5.552147239263804, - "grad_norm": 2.761482000350952, - "learning_rate": 2.0732629047229712e-06, - "loss": 0.0265, - "step": 905 - }, - { - "epoch": 5.558282208588957, - "grad_norm": 2.4173266887664795, - "learning_rate": 2.0685160216439205e-06, - "loss": 0.0229, - "step": 906 - }, - { - "epoch": 5.564417177914111, - "grad_norm": 2.503661632537842, - "learning_rate": 2.0637707414008267e-06, - "loss": 0.0266, - "step": 907 - }, - { - "epoch": 5.570552147239264, - "grad_norm": 2.312236785888672, - "learning_rate": 2.0590270816210077e-06, - "loss": 0.018, - "step": 908 - }, - { - "epoch": 5.576687116564417, - "grad_norm": 2.569575548171997, - "learning_rate": 2.0542850599257647e-06, - "loss": 0.0377, - "step": 909 - }, - { - "epoch": 5.58282208588957, - "grad_norm": 3.520341157913208, - "learning_rate": 2.0495446939303122e-06, - "loss": 0.1224, - "step": 910 - }, - { - "epoch": 5.588957055214724, - "grad_norm": 3.231363296508789, - "learning_rate": 2.044806001243714e-06, - "loss": 0.1457, - "step": 911 - }, - { - "epoch": 5.595092024539877, - "grad_norm": 3.3211300373077393, - "learning_rate": 2.040068999468818e-06, - "loss": 0.0429, - "step": 912 - }, - { - "epoch": 5.601226993865031, - "grad_norm": 3.3712961673736572, - "learning_rate": 2.035333706202192e-06, - "loss": 0.0634, - "step": 913 - }, - { - "epoch": 5.6073619631901845, - "grad_norm": 2.480177402496338, - "learning_rate": 2.0306001390340565e-06, - "loss": 0.0178, - "step": 914 - }, - { - "epoch": 5.613496932515337, - "grad_norm": 2.9777421951293945, - "learning_rate": 2.02586831554822e-06, - "loss": 0.037, - "step": 915 - }, - { - "epoch": 5.61963190184049, - "grad_norm": 2.9129085540771484, - "learning_rate": 2.021138253322012e-06, - "loss": 0.125, - "step": 916 - }, - { - "epoch": 5.625766871165644, - "grad_norm": 4.041767597198486, - "learning_rate": 2.016409969926224e-06, - "loss": 0.1897, - "step": 917 - }, - { - "epoch": 5.631901840490798, - "grad_norm": 4.088902950286865, - "learning_rate": 2.0116834829250355e-06, - "loss": 0.0546, - "step": 918 - }, - { - "epoch": 5.638036809815951, - "grad_norm": 3.8629167079925537, - "learning_rate": 2.0069588098759545e-06, - "loss": 0.0911, - "step": 919 - }, - { - "epoch": 5.644171779141105, - "grad_norm": 2.616830825805664, - "learning_rate": 2.00223596832975e-06, - "loss": 0.0527, - "step": 920 - }, - { - "epoch": 5.6503067484662575, - "grad_norm": 1.9370782375335693, - "learning_rate": 1.9975149758303885e-06, - "loss": 0.0384, - "step": 921 - }, - { - "epoch": 5.656441717791411, - "grad_norm": 3.7839455604553223, - "learning_rate": 1.992795849914967e-06, - "loss": 0.1033, - "step": 922 - }, - { - "epoch": 5.662576687116564, - "grad_norm": 3.870729923248291, - "learning_rate": 1.9880786081136498e-06, - "loss": 0.08, - "step": 923 - }, - { - "epoch": 5.668711656441718, - "grad_norm": 3.4394288063049316, - "learning_rate": 1.9833632679496008e-06, - "loss": 0.0819, - "step": 924 - }, - { - "epoch": 5.674846625766871, - "grad_norm": 3.1659159660339355, - "learning_rate": 1.97864984693892e-06, - "loss": 0.117, - "step": 925 - }, - { - "epoch": 5.680981595092025, - "grad_norm": 2.2375190258026123, - "learning_rate": 1.97393836259058e-06, - "loss": 0.0215, - "step": 926 - }, - { - "epoch": 5.6871165644171775, - "grad_norm": 3.9375314712524414, - "learning_rate": 1.969228832406358e-06, - "loss": 0.1422, - "step": 927 - }, - { - "epoch": 5.693251533742331, - "grad_norm": 3.1969058513641357, - "learning_rate": 1.964521273880772e-06, - "loss": 0.0538, - "step": 928 - }, - { - "epoch": 5.699386503067485, - "grad_norm": 3.5990066528320312, - "learning_rate": 1.9598157045010162e-06, - "loss": 0.114, - "step": 929 - }, - { - "epoch": 5.705521472392638, - "grad_norm": 3.1764235496520996, - "learning_rate": 1.9551121417468955e-06, - "loss": 0.053, - "step": 930 - }, - { - "epoch": 5.711656441717792, - "grad_norm": 4.1162309646606445, - "learning_rate": 1.9504106030907605e-06, - "loss": 0.0866, - "step": 931 - }, - { - "epoch": 5.717791411042945, - "grad_norm": 3.543071985244751, - "learning_rate": 1.945711105997444e-06, - "loss": 0.0908, - "step": 932 - }, - { - "epoch": 5.723926380368098, - "grad_norm": 4.136870384216309, - "learning_rate": 1.941013667924194e-06, - "loss": 0.0612, - "step": 933 - }, - { - "epoch": 5.730061349693251, - "grad_norm": 1.7658357620239258, - "learning_rate": 1.9363183063206097e-06, - "loss": 0.0283, - "step": 934 - }, - { - "epoch": 5.736196319018405, - "grad_norm": 3.9701411724090576, - "learning_rate": 1.931625038628577e-06, - "loss": 0.0948, - "step": 935 - }, - { - "epoch": 5.742331288343558, - "grad_norm": 3.0636157989501953, - "learning_rate": 1.9269338822822047e-06, - "loss": 0.0769, - "step": 936 - }, - { - "epoch": 5.748466257668712, - "grad_norm": 3.3671388626098633, - "learning_rate": 1.9222448547077573e-06, - "loss": 0.098, - "step": 937 - }, - { - "epoch": 5.754601226993865, - "grad_norm": 3.0725975036621094, - "learning_rate": 1.917557973323591e-06, - "loss": 0.0363, - "step": 938 - }, - { - "epoch": 5.7607361963190185, - "grad_norm": 2.5592041015625, - "learning_rate": 1.9128732555400915e-06, - "loss": 0.0205, - "step": 939 - }, - { - "epoch": 5.766871165644172, - "grad_norm": 2.835740804672241, - "learning_rate": 1.9081907187596054e-06, - "loss": 0.0548, - "step": 940 - }, - { - "epoch": 5.773006134969325, - "grad_norm": 3.3596746921539307, - "learning_rate": 1.9035103803763793e-06, - "loss": 0.0454, - "step": 941 - }, - { - "epoch": 5.779141104294479, - "grad_norm": 3.226579427719116, - "learning_rate": 1.8988322577764918e-06, - "loss": 0.0514, - "step": 942 - }, - { - "epoch": 5.785276073619632, - "grad_norm": 3.2044687271118164, - "learning_rate": 1.8941563683377905e-06, - "loss": 0.1361, - "step": 943 - }, - { - "epoch": 5.791411042944786, - "grad_norm": 1.8300527334213257, - "learning_rate": 1.8894827294298296e-06, - "loss": 0.0139, - "step": 944 - }, - { - "epoch": 5.7975460122699385, - "grad_norm": 2.503735303878784, - "learning_rate": 1.884811358413801e-06, - "loss": 0.0311, - "step": 945 - }, - { - "epoch": 5.803680981595092, - "grad_norm": 2.171309471130371, - "learning_rate": 1.8801422726424735e-06, - "loss": 0.0227, - "step": 946 - }, - { - "epoch": 5.809815950920245, - "grad_norm": 1.8116636276245117, - "learning_rate": 1.8754754894601252e-06, - "loss": 0.0157, - "step": 947 - }, - { - "epoch": 5.815950920245399, - "grad_norm": 3.1412570476531982, - "learning_rate": 1.870811026202482e-06, - "loss": 0.1093, - "step": 948 - }, - { - "epoch": 5.822085889570552, - "grad_norm": 2.3962290287017822, - "learning_rate": 1.8661489001966526e-06, - "loss": 0.021, - "step": 949 - }, - { - "epoch": 5.828220858895706, - "grad_norm": 4.169166564941406, - "learning_rate": 1.8614891287610621e-06, - "loss": 0.0663, - "step": 950 - }, - { - "epoch": 5.8343558282208585, - "grad_norm": 3.1181528568267822, - "learning_rate": 1.8568317292053894e-06, - "loss": 0.1008, - "step": 951 - }, - { - "epoch": 5.840490797546012, - "grad_norm": 3.5155029296875, - "learning_rate": 1.8521767188305023e-06, - "loss": 0.0451, - "step": 952 - }, - { - "epoch": 5.846625766871165, - "grad_norm": 2.975693702697754, - "learning_rate": 1.8475241149283957e-06, - "loss": 0.0561, - "step": 953 - }, - { - "epoch": 5.852760736196319, - "grad_norm": 2.1581289768218994, - "learning_rate": 1.842873934782122e-06, - "loss": 0.0265, - "step": 954 - }, - { - "epoch": 5.858895705521473, - "grad_norm": 2.6281228065490723, - "learning_rate": 1.8382261956657318e-06, - "loss": 0.1196, - "step": 955 - }, - { - "epoch": 5.865030674846626, - "grad_norm": 2.9569528102874756, - "learning_rate": 1.8335809148442074e-06, - "loss": 0.1356, - "step": 956 - }, - { - "epoch": 5.871165644171779, - "grad_norm": 2.450949192047119, - "learning_rate": 1.8289381095734005e-06, - "loss": 0.0444, - "step": 957 - }, - { - "epoch": 5.877300613496932, - "grad_norm": 2.1737027168273926, - "learning_rate": 1.8242977970999643e-06, - "loss": 0.0622, - "step": 958 - }, - { - "epoch": 5.883435582822086, - "grad_norm": 3.350647211074829, - "learning_rate": 1.8196599946612956e-06, - "loss": 0.0762, - "step": 959 - }, - { - "epoch": 5.889570552147239, - "grad_norm": 2.5031936168670654, - "learning_rate": 1.8150247194854642e-06, - "loss": 0.0207, - "step": 960 - }, - { - "epoch": 5.895705521472393, - "grad_norm": 3.7103707790374756, - "learning_rate": 1.8103919887911525e-06, - "loss": 0.1122, - "step": 961 - }, - { - "epoch": 5.901840490797546, - "grad_norm": 2.485322952270508, - "learning_rate": 1.8057618197875914e-06, - "loss": 0.0284, - "step": 962 - }, - { - "epoch": 5.9079754601226995, - "grad_norm": 1.903212547302246, - "learning_rate": 1.8011342296744961e-06, - "loss": 0.0239, - "step": 963 - }, - { - "epoch": 5.914110429447852, - "grad_norm": 3.015552520751953, - "learning_rate": 1.796509235642001e-06, - "loss": 0.0425, - "step": 964 - }, - { - "epoch": 5.920245398773006, - "grad_norm": 4.806198596954346, - "learning_rate": 1.7918868548705982e-06, - "loss": 0.2094, - "step": 965 - }, - { - "epoch": 5.92638036809816, - "grad_norm": 2.949596643447876, - "learning_rate": 1.7872671045310703e-06, - "loss": 0.0632, - "step": 966 - }, - { - "epoch": 5.932515337423313, - "grad_norm": 4.153099536895752, - "learning_rate": 1.782650001784431e-06, - "loss": 0.1411, - "step": 967 - }, - { - "epoch": 5.938650306748467, - "grad_norm": 3.4117565155029297, - "learning_rate": 1.7780355637818568e-06, - "loss": 0.0965, - "step": 968 - }, - { - "epoch": 5.9447852760736195, - "grad_norm": 2.533405303955078, - "learning_rate": 1.7734238076646277e-06, - "loss": 0.0568, - "step": 969 - }, - { - "epoch": 5.950920245398773, - "grad_norm": 2.3604726791381836, - "learning_rate": 1.7688147505640581e-06, - "loss": 0.0182, - "step": 970 - }, - { - "epoch": 5.957055214723926, - "grad_norm": 3.807424306869507, - "learning_rate": 1.7642084096014405e-06, - "loss": 0.0547, - "step": 971 - }, - { - "epoch": 5.96319018404908, - "grad_norm": 2.5735342502593994, - "learning_rate": 1.759604801887974e-06, - "loss": 0.0775, - "step": 972 - }, - { - "epoch": 5.969325153374233, - "grad_norm": 2.9217734336853027, - "learning_rate": 1.7550039445247069e-06, - "loss": 0.0541, - "step": 973 - }, - { - "epoch": 5.975460122699387, - "grad_norm": 2.793104410171509, - "learning_rate": 1.7504058546024694e-06, - "loss": 0.0257, - "step": 974 - }, - { - "epoch": 5.9815950920245395, - "grad_norm": 3.5610134601593018, - "learning_rate": 1.7458105492018114e-06, - "loss": 0.0767, - "step": 975 - }, - { - "epoch": 5.987730061349693, - "grad_norm": 2.0738015174865723, - "learning_rate": 1.7412180453929412e-06, - "loss": 0.025, - "step": 976 - }, - { - "epoch": 5.993865030674847, - "grad_norm": 2.1248421669006348, - "learning_rate": 1.736628360235657e-06, - "loss": 0.0183, - "step": 977 - }, - { - "epoch": 6.0, - "grad_norm": 2.901273727416992, - "learning_rate": 1.7320415107792893e-06, - "loss": 0.1369, - "step": 978 - }, - { - "epoch": 6.006134969325154, - "grad_norm": 3.815110683441162, - "learning_rate": 1.7274575140626318e-06, - "loss": 0.1011, - "step": 979 - }, - { - "epoch": 6.012269938650307, - "grad_norm": 2.421208381652832, - "learning_rate": 1.7228763871138845e-06, - "loss": 0.0105, - "step": 980 - }, - { - "epoch": 6.0184049079754605, - "grad_norm": 2.7103846073150635, - "learning_rate": 1.718298146950585e-06, - "loss": 0.0373, - "step": 981 - }, - { - "epoch": 6.024539877300613, - "grad_norm": 1.3751411437988281, - "learning_rate": 1.7137228105795473e-06, - "loss": 0.0072, - "step": 982 - }, - { - "epoch": 6.030674846625767, - "grad_norm": 1.5235071182250977, - "learning_rate": 1.7091503949967987e-06, - "loss": 0.0126, - "step": 983 - }, - { - "epoch": 6.03680981595092, - "grad_norm": 2.0652546882629395, - "learning_rate": 1.7045809171875183e-06, - "loss": 0.0198, - "step": 984 - }, - { - "epoch": 6.042944785276074, - "grad_norm": 2.010207176208496, - "learning_rate": 1.70001439412597e-06, - "loss": 0.0186, - "step": 985 - }, - { - "epoch": 6.049079754601227, - "grad_norm": 2.0444021224975586, - "learning_rate": 1.6954508427754435e-06, - "loss": 0.0197, - "step": 986 - }, - { - "epoch": 6.0552147239263805, - "grad_norm": 2.6540091037750244, - "learning_rate": 1.690890280088187e-06, - "loss": 0.0192, - "step": 987 - }, - { - "epoch": 6.061349693251533, - "grad_norm": 1.6479653120040894, - "learning_rate": 1.6863327230053506e-06, - "loss": 0.0105, - "step": 988 - }, - { - "epoch": 6.067484662576687, - "grad_norm": 2.4434754848480225, - "learning_rate": 1.6817781884569146e-06, - "loss": 0.0275, - "step": 989 - }, - { - "epoch": 6.07361963190184, - "grad_norm": 1.7472137212753296, - "learning_rate": 1.677226693361636e-06, - "loss": 0.0095, - "step": 990 - }, - { - "epoch": 6.079754601226994, - "grad_norm": 2.952821969985962, - "learning_rate": 1.6726782546269793e-06, - "loss": 0.0483, - "step": 991 - }, - { - "epoch": 6.085889570552148, - "grad_norm": 3.123959541320801, - "learning_rate": 1.6681328891490544e-06, - "loss": 0.0815, - "step": 992 - }, - { - "epoch": 6.0920245398773005, - "grad_norm": 2.9924800395965576, - "learning_rate": 1.663590613812556e-06, - "loss": 0.0216, - "step": 993 - }, - { - "epoch": 6.098159509202454, - "grad_norm": 2.417778730392456, - "learning_rate": 1.6590514454907007e-06, - "loss": 0.0243, - "step": 994 - }, - { - "epoch": 6.104294478527607, - "grad_norm": 2.0682942867279053, - "learning_rate": 1.6545154010451613e-06, - "loss": 0.0669, - "step": 995 - }, - { - "epoch": 6.110429447852761, - "grad_norm": 2.9801135063171387, - "learning_rate": 1.6499824973260086e-06, - "loss": 0.0309, - "step": 996 - }, - { - "epoch": 6.116564417177914, - "grad_norm": 1.5753487348556519, - "learning_rate": 1.645452751171645e-06, - "loss": 0.026, - "step": 997 - }, - { - "epoch": 6.122699386503068, - "grad_norm": 2.461124897003174, - "learning_rate": 1.6409261794087438e-06, - "loss": 0.0191, - "step": 998 - }, - { - "epoch": 6.128834355828221, - "grad_norm": 3.839308261871338, - "learning_rate": 1.6364027988521875e-06, - "loss": 0.045, - "step": 999 - }, - { - "epoch": 6.134969325153374, - "grad_norm": 2.9653189182281494, - "learning_rate": 1.6318826263050022e-06, - "loss": 0.0197, - "step": 1000 - }, - { - "epoch": 6.141104294478527, - "grad_norm": 1.1804074048995972, - "learning_rate": 1.6273656785582986e-06, - "loss": 0.0092, - "step": 1001 - }, - { - "epoch": 6.147239263803681, - "grad_norm": 1.9027175903320312, - "learning_rate": 1.6228519723912073e-06, - "loss": 0.0141, - "step": 1002 - }, - { - "epoch": 6.153374233128835, - "grad_norm": 1.831039309501648, - "learning_rate": 1.618341524570819e-06, - "loss": 0.0131, - "step": 1003 - }, - { - "epoch": 6.159509202453988, - "grad_norm": 2.547327756881714, - "learning_rate": 1.613834351852119e-06, - "loss": 0.0686, - "step": 1004 - }, - { - "epoch": 6.1656441717791415, - "grad_norm": 2.746947765350342, - "learning_rate": 1.6093304709779273e-06, - "loss": 0.036, - "step": 1005 - }, - { - "epoch": 6.171779141104294, - "grad_norm": 2.0104732513427734, - "learning_rate": 1.6048298986788345e-06, - "loss": 0.0216, - "step": 1006 - }, - { - "epoch": 6.177914110429448, - "grad_norm": 2.655977725982666, - "learning_rate": 1.6003326516731431e-06, - "loss": 0.024, - "step": 1007 - }, - { - "epoch": 6.184049079754601, - "grad_norm": 2.0733132362365723, - "learning_rate": 1.5958387466668015e-06, - "loss": 0.0133, - "step": 1008 - }, - { - "epoch": 6.190184049079755, - "grad_norm": 2.5398054122924805, - "learning_rate": 1.5913482003533437e-06, - "loss": 0.0331, - "step": 1009 - }, - { - "epoch": 6.196319018404908, - "grad_norm": 1.7983721494674683, - "learning_rate": 1.5868610294138264e-06, - "loss": 0.0111, - "step": 1010 - }, - { - "epoch": 6.2024539877300615, - "grad_norm": 1.7259647846221924, - "learning_rate": 1.58237725051677e-06, - "loss": 0.0112, - "step": 1011 - }, - { - "epoch": 6.208588957055214, - "grad_norm": 1.7722725868225098, - "learning_rate": 1.577896880318093e-06, - "loss": 0.0181, - "step": 1012 - }, - { - "epoch": 6.214723926380368, - "grad_norm": 3.633545398712158, - "learning_rate": 1.5734199354610513e-06, - "loss": 0.0135, - "step": 1013 - }, - { - "epoch": 6.220858895705521, - "grad_norm": 1.8951494693756104, - "learning_rate": 1.5689464325761764e-06, - "loss": 0.0163, - "step": 1014 - }, - { - "epoch": 6.226993865030675, - "grad_norm": 1.637170433998108, - "learning_rate": 1.564476388281216e-06, - "loss": 0.0068, - "step": 1015 - }, - { - "epoch": 6.233128834355828, - "grad_norm": 2.2963850498199463, - "learning_rate": 1.5600098191810682e-06, - "loss": 0.021, - "step": 1016 - }, - { - "epoch": 6.2392638036809815, - "grad_norm": 2.777996063232422, - "learning_rate": 1.555546741867722e-06, - "loss": 0.0349, - "step": 1017 - }, - { - "epoch": 6.245398773006135, - "grad_norm": 2.1580724716186523, - "learning_rate": 1.5510871729201953e-06, - "loss": 0.0626, - "step": 1018 - }, - { - "epoch": 6.251533742331288, - "grad_norm": 1.4158363342285156, - "learning_rate": 1.5466311289044755e-06, - "loss": 0.0082, - "step": 1019 - }, - { - "epoch": 6.257668711656442, - "grad_norm": 3.287564516067505, - "learning_rate": 1.5421786263734524e-06, - "loss": 0.0212, - "step": 1020 - }, - { - "epoch": 6.263803680981595, - "grad_norm": 2.4552016258239746, - "learning_rate": 1.5377296818668638e-06, - "loss": 0.0963, - "step": 1021 - }, - { - "epoch": 6.269938650306749, - "grad_norm": 1.877556562423706, - "learning_rate": 1.5332843119112285e-06, - "loss": 0.011, - "step": 1022 - }, - { - "epoch": 6.276073619631902, - "grad_norm": 3.720372438430786, - "learning_rate": 1.5288425330197864e-06, - "loss": 0.018, - "step": 1023 - }, - { - "epoch": 6.282208588957055, - "grad_norm": 1.9751925468444824, - "learning_rate": 1.5244043616924389e-06, - "loss": 0.0162, - "step": 1024 - }, - { - "epoch": 6.288343558282208, - "grad_norm": 2.5137453079223633, - "learning_rate": 1.5199698144156865e-06, - "loss": 0.0468, - "step": 1025 - }, - { - "epoch": 6.294478527607362, - "grad_norm": 2.111983299255371, - "learning_rate": 1.5155389076625663e-06, - "loss": 0.0064, - "step": 1026 - }, - { - "epoch": 6.300613496932515, - "grad_norm": 2.572223663330078, - "learning_rate": 1.5111116578925924e-06, - "loss": 0.035, - "step": 1027 - }, - { - "epoch": 6.306748466257669, - "grad_norm": 2.7881019115448, - "learning_rate": 1.5066880815516943e-06, - "loss": 0.0197, - "step": 1028 - }, - { - "epoch": 6.3128834355828225, - "grad_norm": 1.2287017107009888, - "learning_rate": 1.5022681950721565e-06, - "loss": 0.0059, - "step": 1029 - }, - { - "epoch": 6.319018404907975, - "grad_norm": 1.764028549194336, - "learning_rate": 1.4978520148725558e-06, - "loss": 0.006, - "step": 1030 - }, - { - "epoch": 6.325153374233129, - "grad_norm": 2.399787664413452, - "learning_rate": 1.4934395573577016e-06, - "loss": 0.0126, - "step": 1031 - }, - { - "epoch": 6.331288343558282, - "grad_norm": 1.9056172370910645, - "learning_rate": 1.4890308389185743e-06, - "loss": 0.0131, - "step": 1032 - }, - { - "epoch": 6.337423312883436, - "grad_norm": 1.7394744157791138, - "learning_rate": 1.484625875932265e-06, - "loss": 0.016, - "step": 1033 - }, - { - "epoch": 6.343558282208589, - "grad_norm": 4.352719306945801, - "learning_rate": 1.480224684761915e-06, - "loss": 0.1059, - "step": 1034 - }, - { - "epoch": 6.3496932515337425, - "grad_norm": 2.148385524749756, - "learning_rate": 1.4758272817566538e-06, - "loss": 0.0312, - "step": 1035 - }, - { - "epoch": 6.355828220858895, - "grad_norm": 2.483872175216675, - "learning_rate": 1.4714336832515386e-06, - "loss": 0.0215, - "step": 1036 - }, - { - "epoch": 6.361963190184049, - "grad_norm": 2.6151270866394043, - "learning_rate": 1.467043905567494e-06, - "loss": 0.0718, - "step": 1037 - }, - { - "epoch": 6.368098159509202, - "grad_norm": 2.554600954055786, - "learning_rate": 1.4626579650112533e-06, - "loss": 0.0166, - "step": 1038 - }, - { - "epoch": 6.374233128834356, - "grad_norm": 3.013974905014038, - "learning_rate": 1.4582758778752926e-06, - "loss": 0.0448, - "step": 1039 - }, - { - "epoch": 6.38036809815951, - "grad_norm": 2.1542789936065674, - "learning_rate": 1.4538976604377781e-06, - "loss": 0.0297, - "step": 1040 - }, - { - "epoch": 6.386503067484663, - "grad_norm": 3.4402377605438232, - "learning_rate": 1.449523328962496e-06, - "loss": 0.0409, - "step": 1041 - }, - { - "epoch": 6.392638036809816, - "grad_norm": 1.6200538873672485, - "learning_rate": 1.4451528996988018e-06, - "loss": 0.0127, - "step": 1042 - }, - { - "epoch": 6.398773006134969, - "grad_norm": 3.081733465194702, - "learning_rate": 1.4407863888815527e-06, - "loss": 0.0788, - "step": 1043 - }, - { - "epoch": 6.404907975460123, - "grad_norm": 1.9813143014907837, - "learning_rate": 1.436423812731051e-06, - "loss": 0.0082, - "step": 1044 - }, - { - "epoch": 6.411042944785276, - "grad_norm": 1.7354048490524292, - "learning_rate": 1.432065187452984e-06, - "loss": 0.0086, - "step": 1045 - }, - { - "epoch": 6.41717791411043, - "grad_norm": 1.8812576532363892, - "learning_rate": 1.4277105292383594e-06, - "loss": 0.04, - "step": 1046 - }, - { - "epoch": 6.423312883435583, - "grad_norm": 1.117837905883789, - "learning_rate": 1.4233598542634519e-06, - "loss": 0.0054, - "step": 1047 - }, - { - "epoch": 6.429447852760736, - "grad_norm": 1.9587867259979248, - "learning_rate": 1.4190131786897388e-06, - "loss": 0.0263, - "step": 1048 - }, - { - "epoch": 6.435582822085889, - "grad_norm": 1.2712376117706299, - "learning_rate": 1.4146705186638388e-06, - "loss": 0.0098, - "step": 1049 - }, - { - "epoch": 6.441717791411043, - "grad_norm": 2.6563849449157715, - "learning_rate": 1.410331890317457e-06, - "loss": 0.0322, - "step": 1050 - }, - { - "epoch": 6.447852760736196, - "grad_norm": 3.136518955230713, - "learning_rate": 1.4059973097673187e-06, - "loss": 0.0729, - "step": 1051 - }, - { - "epoch": 6.45398773006135, - "grad_norm": 1.3937572240829468, - "learning_rate": 1.4016667931151156e-06, - "loss": 0.0094, - "step": 1052 - }, - { - "epoch": 6.460122699386503, - "grad_norm": 1.7218928337097168, - "learning_rate": 1.3973403564474422e-06, - "loss": 0.0078, - "step": 1053 - }, - { - "epoch": 6.466257668711656, - "grad_norm": 2.35612416267395, - "learning_rate": 1.393018015835737e-06, - "loss": 0.0231, - "step": 1054 - }, - { - "epoch": 6.47239263803681, - "grad_norm": 1.96125066280365, - "learning_rate": 1.388699787336224e-06, - "loss": 0.0153, - "step": 1055 - }, - { - "epoch": 6.478527607361963, - "grad_norm": 2.1789233684539795, - "learning_rate": 1.3843856869898486e-06, - "loss": 0.0136, - "step": 1056 - }, - { - "epoch": 6.484662576687117, - "grad_norm": 3.1261701583862305, - "learning_rate": 1.3800757308222263e-06, - "loss": 0.0819, - "step": 1057 - }, - { - "epoch": 6.49079754601227, - "grad_norm": 2.93422794342041, - "learning_rate": 1.3757699348435726e-06, - "loss": 0.0658, - "step": 1058 - }, - { - "epoch": 6.4969325153374236, - "grad_norm": 2.1311776638031006, - "learning_rate": 1.3714683150486534e-06, - "loss": 0.0106, - "step": 1059 - }, - { - "epoch": 6.5030674846625764, - "grad_norm": 1.699877381324768, - "learning_rate": 1.3671708874167211e-06, - "loss": 0.0151, - "step": 1060 - }, - { - "epoch": 6.50920245398773, - "grad_norm": 1.7288825511932373, - "learning_rate": 1.3628776679114516e-06, - "loss": 0.0114, - "step": 1061 - }, - { - "epoch": 6.515337423312883, - "grad_norm": 1.8437966108322144, - "learning_rate": 1.3585886724808934e-06, - "loss": 0.0117, - "step": 1062 - }, - { - "epoch": 6.521472392638037, - "grad_norm": 3.073568344116211, - "learning_rate": 1.3543039170574022e-06, - "loss": 0.0381, - "step": 1063 - }, - { - "epoch": 6.52760736196319, - "grad_norm": 1.6069157123565674, - "learning_rate": 1.350023417557581e-06, - "loss": 0.0072, - "step": 1064 - }, - { - "epoch": 6.533742331288344, - "grad_norm": 2.48502779006958, - "learning_rate": 1.345747189882228e-06, - "loss": 0.0302, - "step": 1065 - }, - { - "epoch": 6.539877300613497, - "grad_norm": 1.6879143714904785, - "learning_rate": 1.3414752499162676e-06, - "loss": 0.0095, - "step": 1066 - }, - { - "epoch": 6.54601226993865, - "grad_norm": 2.2126848697662354, - "learning_rate": 1.3372076135287005e-06, - "loss": 0.067, - "step": 1067 - }, - { - "epoch": 6.552147239263804, - "grad_norm": 2.157269239425659, - "learning_rate": 1.33294429657254e-06, - "loss": 0.0203, - "step": 1068 - }, - { - "epoch": 6.558282208588957, - "grad_norm": 2.725158452987671, - "learning_rate": 1.3286853148847523e-06, - "loss": 0.0217, - "step": 1069 - }, - { - "epoch": 6.564417177914111, - "grad_norm": 2.478426456451416, - "learning_rate": 1.3244306842862007e-06, - "loss": 0.0223, - "step": 1070 - }, - { - "epoch": 6.570552147239264, - "grad_norm": 2.349463939666748, - "learning_rate": 1.3201804205815872e-06, - "loss": 0.027, - "step": 1071 - }, - { - "epoch": 6.576687116564417, - "grad_norm": 2.049593210220337, - "learning_rate": 1.3159345395593876e-06, - "loss": 0.0212, - "step": 1072 - }, - { - "epoch": 6.58282208588957, - "grad_norm": 2.3445141315460205, - "learning_rate": 1.3116930569918024e-06, - "loss": 0.0182, - "step": 1073 - }, - { - "epoch": 6.588957055214724, - "grad_norm": 3.756135940551758, - "learning_rate": 1.3074559886346886e-06, - "loss": 0.1187, - "step": 1074 - }, - { - "epoch": 6.595092024539877, - "grad_norm": 2.4747114181518555, - "learning_rate": 1.3032233502275089e-06, - "loss": 0.0103, - "step": 1075 - }, - { - "epoch": 6.601226993865031, - "grad_norm": 2.0029311180114746, - "learning_rate": 1.2989951574932693e-06, - "loss": 0.0115, - "step": 1076 - }, - { - "epoch": 6.6073619631901845, - "grad_norm": 2.007141351699829, - "learning_rate": 1.2947714261384602e-06, - "loss": 0.0155, - "step": 1077 - }, - { - "epoch": 6.613496932515337, - "grad_norm": 1.5075048208236694, - "learning_rate": 1.2905521718530012e-06, - "loss": 0.0125, - "step": 1078 - }, - { - "epoch": 6.61963190184049, - "grad_norm": 1.9235132932662964, - "learning_rate": 1.2863374103101784e-06, - "loss": 0.0181, - "step": 1079 - }, - { - "epoch": 6.625766871165644, - "grad_norm": 1.7235040664672852, - "learning_rate": 1.2821271571665912e-06, - "loss": 0.0102, - "step": 1080 - }, - { - "epoch": 6.631901840490798, - "grad_norm": 3.503974676132202, - "learning_rate": 1.277921428062091e-06, - "loss": 0.0969, - "step": 1081 - }, - { - "epoch": 6.638036809815951, - "grad_norm": 2.4633288383483887, - "learning_rate": 1.2737202386197222e-06, - "loss": 0.0383, - "step": 1082 - }, - { - "epoch": 6.644171779141105, - "grad_norm": 2.332341432571411, - "learning_rate": 1.2695236044456672e-06, - "loss": 0.0184, - "step": 1083 - }, - { - "epoch": 6.6503067484662575, - "grad_norm": 2.8279805183410645, - "learning_rate": 1.2653315411291867e-06, - "loss": 0.0327, - "step": 1084 - }, - { - "epoch": 6.656441717791411, - "grad_norm": 2.444810628890991, - "learning_rate": 1.2611440642425617e-06, - "loss": 0.0399, - "step": 1085 - }, - { - "epoch": 6.662576687116564, - "grad_norm": 2.9304957389831543, - "learning_rate": 1.2569611893410374e-06, - "loss": 0.0385, - "step": 1086 - }, - { - "epoch": 6.668711656441718, - "grad_norm": 2.1244678497314453, - "learning_rate": 1.2527829319627604e-06, - "loss": 0.0123, - "step": 1087 - }, - { - "epoch": 6.674846625766871, - "grad_norm": 2.129033327102661, - "learning_rate": 1.248609307628729e-06, - "loss": 0.0302, - "step": 1088 - }, - { - "epoch": 6.680981595092025, - "grad_norm": 5.788925647735596, - "learning_rate": 1.2444403318427268e-06, - "loss": 0.0296, - "step": 1089 - }, - { - "epoch": 6.6871165644171775, - "grad_norm": 5.127935886383057, - "learning_rate": 1.2402760200912725e-06, - "loss": 0.1532, - "step": 1090 - }, - { - "epoch": 6.693251533742331, - "grad_norm": 2.2610318660736084, - "learning_rate": 1.2361163878435594e-06, - "loss": 0.0126, - "step": 1091 - }, - { - "epoch": 6.699386503067485, - "grad_norm": 1.7913328409194946, - "learning_rate": 1.2319614505513953e-06, - "loss": 0.0086, - "step": 1092 - }, - { - "epoch": 6.705521472392638, - "grad_norm": 1.5961267948150635, - "learning_rate": 1.227811223649149e-06, - "loss": 0.0041, - "step": 1093 - }, - { - "epoch": 6.711656441717792, - "grad_norm": 1.441754937171936, - "learning_rate": 1.2236657225536938e-06, - "loss": 0.0103, - "step": 1094 - }, - { - "epoch": 6.717791411042945, - "grad_norm": 1.4393174648284912, - "learning_rate": 1.2195249626643432e-06, - "loss": 0.0063, - "step": 1095 - }, - { - "epoch": 6.723926380368098, - "grad_norm": 3.199451208114624, - "learning_rate": 1.2153889593628032e-06, - "loss": 0.0571, - "step": 1096 - }, - { - "epoch": 6.730061349693251, - "grad_norm": 2.1796770095825195, - "learning_rate": 1.211257728013107e-06, - "loss": 0.0269, - "step": 1097 - }, - { - "epoch": 6.736196319018405, - "grad_norm": 3.1798806190490723, - "learning_rate": 1.2071312839615634e-06, - "loss": 0.0396, - "step": 1098 - }, - { - "epoch": 6.742331288343558, - "grad_norm": 3.063633680343628, - "learning_rate": 1.2030096425366985e-06, - "loss": 0.0261, - "step": 1099 - }, - { - "epoch": 6.748466257668712, - "grad_norm": 1.860409140586853, - "learning_rate": 1.1988928190491948e-06, - "loss": 0.013, - "step": 1100 - }, - { - "epoch": 6.754601226993865, - "grad_norm": 1.9303224086761475, - "learning_rate": 1.1947808287918406e-06, - "loss": 0.0113, - "step": 1101 - }, - { - "epoch": 6.7607361963190185, - "grad_norm": 2.1432337760925293, - "learning_rate": 1.19067368703947e-06, - "loss": 0.0195, - "step": 1102 - }, - { - "epoch": 6.766871165644172, - "grad_norm": 1.8998470306396484, - "learning_rate": 1.1865714090489038e-06, - "loss": 0.0105, - "step": 1103 - }, - { - "epoch": 6.773006134969325, - "grad_norm": 2.3260247707366943, - "learning_rate": 1.1824740100588991e-06, - "loss": 0.0554, - "step": 1104 - }, - { - "epoch": 6.779141104294479, - "grad_norm": 1.9272006750106812, - "learning_rate": 1.1783815052900848e-06, - "loss": 0.0118, - "step": 1105 - }, - { - "epoch": 6.785276073619632, - "grad_norm": 3.1646785736083984, - "learning_rate": 1.1742939099449126e-06, - "loss": 0.0901, - "step": 1106 - }, - { - "epoch": 6.791411042944786, - "grad_norm": 3.357422351837158, - "learning_rate": 1.1702112392075966e-06, - "loss": 0.0833, - "step": 1107 - }, - { - "epoch": 6.7975460122699385, - "grad_norm": 1.4302526712417603, - "learning_rate": 1.1661335082440545e-06, - "loss": 0.0078, - "step": 1108 - }, - { - "epoch": 6.803680981595092, - "grad_norm": 1.3046417236328125, - "learning_rate": 1.1620607322018587e-06, - "loss": 0.0092, - "step": 1109 - }, - { - "epoch": 6.809815950920245, - "grad_norm": 2.084237813949585, - "learning_rate": 1.1579929262101712e-06, - "loss": 0.0283, - "step": 1110 - }, - { - "epoch": 6.815950920245399, - "grad_norm": 1.9403250217437744, - "learning_rate": 1.153930105379695e-06, - "loss": 0.0066, - "step": 1111 - }, - { - "epoch": 6.822085889570552, - "grad_norm": 2.282449722290039, - "learning_rate": 1.1498722848026142e-06, - "loss": 0.0402, - "step": 1112 - }, - { - "epoch": 6.828220858895706, - "grad_norm": 1.9357627630233765, - "learning_rate": 1.1458194795525354e-06, - "loss": 0.0101, - "step": 1113 - }, - { - "epoch": 6.8343558282208585, - "grad_norm": 2.0236339569091797, - "learning_rate": 1.1417717046844385e-06, - "loss": 0.0109, - "step": 1114 - }, - { - "epoch": 6.840490797546012, - "grad_norm": 2.386857032775879, - "learning_rate": 1.137728975234615e-06, - "loss": 0.0297, - "step": 1115 - }, - { - "epoch": 6.846625766871165, - "grad_norm": 2.2477970123291016, - "learning_rate": 1.1336913062206157e-06, - "loss": 0.0393, - "step": 1116 - }, - { - "epoch": 6.852760736196319, - "grad_norm": 2.7217776775360107, - "learning_rate": 1.129658712641192e-06, - "loss": 0.0269, - "step": 1117 - }, - { - "epoch": 6.858895705521473, - "grad_norm": 2.6717259883880615, - "learning_rate": 1.125631209476241e-06, - "loss": 0.0708, - "step": 1118 - }, - { - "epoch": 6.865030674846626, - "grad_norm": 2.951939344406128, - "learning_rate": 1.1216088116867524e-06, - "loss": 0.0835, - "step": 1119 - }, - { - "epoch": 6.871165644171779, - "grad_norm": 1.9705166816711426, - "learning_rate": 1.1175915342147486e-06, - "loss": 0.0107, - "step": 1120 - }, - { - "epoch": 6.877300613496932, - "grad_norm": 2.4005937576293945, - "learning_rate": 1.1135793919832336e-06, - "loss": 0.0139, - "step": 1121 - }, - { - "epoch": 6.883435582822086, - "grad_norm": 2.277463674545288, - "learning_rate": 1.1095723998961353e-06, - "loss": 0.0154, - "step": 1122 - }, - { - "epoch": 6.889570552147239, - "grad_norm": 1.5026034116744995, - "learning_rate": 1.1055705728382482e-06, - "loss": 0.0072, - "step": 1123 - }, - { - "epoch": 6.895705521472393, - "grad_norm": 1.9540379047393799, - "learning_rate": 1.1015739256751826e-06, - "loss": 0.0202, - "step": 1124 - }, - { - "epoch": 6.901840490797546, - "grad_norm": 2.3090603351593018, - "learning_rate": 1.0975824732533066e-06, - "loss": 0.0559, - "step": 1125 - }, - { - "epoch": 6.9079754601226995, - "grad_norm": 2.100283622741699, - "learning_rate": 1.09359623039969e-06, - "loss": 0.0385, - "step": 1126 - }, - { - "epoch": 6.914110429447852, - "grad_norm": 2.4120566844940186, - "learning_rate": 1.0896152119220525e-06, - "loss": 0.0535, - "step": 1127 - }, - { - "epoch": 6.920245398773006, - "grad_norm": 2.003495454788208, - "learning_rate": 1.0856394326087045e-06, - "loss": 0.0104, - "step": 1128 - }, - { - "epoch": 6.92638036809816, - "grad_norm": 1.6565535068511963, - "learning_rate": 1.0816689072284962e-06, - "loss": 0.0121, - "step": 1129 - }, - { - "epoch": 6.932515337423313, - "grad_norm": 1.6503472328186035, - "learning_rate": 1.0777036505307616e-06, - "loss": 0.0056, - "step": 1130 - }, - { - "epoch": 6.938650306748467, - "grad_norm": 2.600112199783325, - "learning_rate": 1.0737436772452602e-06, - "loss": 0.0198, - "step": 1131 - }, - { - "epoch": 6.9447852760736195, - "grad_norm": 1.6668883562088013, - "learning_rate": 1.0697890020821292e-06, - "loss": 0.0077, - "step": 1132 - }, - { - "epoch": 6.950920245398773, - "grad_norm": 2.729172706604004, - "learning_rate": 1.0658396397318203e-06, - "loss": 0.0329, - "step": 1133 - }, - { - "epoch": 6.957055214723926, - "grad_norm": 1.5219136476516724, - "learning_rate": 1.061895604865053e-06, - "loss": 0.0113, - "step": 1134 - }, - { - "epoch": 6.96319018404908, - "grad_norm": 3.8395588397979736, - "learning_rate": 1.057956912132757e-06, - "loss": 0.0376, - "step": 1135 - }, - { - "epoch": 6.969325153374233, - "grad_norm": 2.4347221851348877, - "learning_rate": 1.054023576166014e-06, - "loss": 0.0517, - "step": 1136 - }, - { - "epoch": 6.975460122699387, - "grad_norm": 3.079165458679199, - "learning_rate": 1.0500956115760105e-06, - "loss": 0.0373, - "step": 1137 - }, - { - "epoch": 6.9815950920245395, - "grad_norm": 1.9391908645629883, - "learning_rate": 1.0461730329539794e-06, - "loss": 0.019, - "step": 1138 - }, - { - "epoch": 6.987730061349693, - "grad_norm": 1.8693119287490845, - "learning_rate": 1.0422558548711434e-06, - "loss": 0.0073, - "step": 1139 - }, - { - "epoch": 6.993865030674847, - "grad_norm": 3.0920307636260986, - "learning_rate": 1.0383440918786684e-06, - "loss": 0.0099, - "step": 1140 - }, - { - "epoch": 7.0, - "grad_norm": 3.184906244277954, - "learning_rate": 1.0344377585076e-06, - "loss": 0.0218, - "step": 1141 - } - ], - "logging_steps": 1, - "max_steps": 1630, - "num_input_tokens_seen": 0, - "num_train_epochs": 10, - "save_steps": 206, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 2.8263106664726528e+17, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1304/chat_template.jinja b/metallama3_8b/limo_filtered_correct/checkpoint-1304/chat_template.jinja deleted file mode 100644 index 39bd0c9f7fe30aea14eda194fee17703da4a4dbf..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1304/chat_template.jinja +++ /dev/null @@ -1,5 +0,0 @@ -{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> - -'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> - -' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1304/config.json b/metallama3_8b/limo_filtered_correct/checkpoint-1304/config.json deleted file mode 100644 index ec5612543540085e09eed37e81b17ae51d1a6973..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1304/config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 128000, - "eos_token_id": 128009, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 8192, - "mlp_bias": false, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 500000.0, - "tie_word_embeddings": false, - "torch_dtype": "float32", - "transformers_version": "4.55.0", - "use_cache": false, - "vocab_size": 128256 -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1304/generation_config.json b/metallama3_8b/limo_filtered_correct/checkpoint-1304/generation_config.json deleted file mode 100644 index f53ccb516e57388491adda6b9950bcfa872e93ae..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1304/generation_config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 128000, - "eos_token_id": 128009, - "transformers_version": "4.55.0", - "use_cache": false -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1304/model-00001-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-1304/model-00001-of-00007.safetensors deleted file mode 100644 index 6da023acab8f2d076504db22c21242a3cbe73929..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1304/model-00001-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7bde6fd72ba42b7eb5abe4a28f2f782f302ea10743558533250804f152dc2e10 -size 4886466168 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1304/model-00002-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-1304/model-00002-of-00007.safetensors deleted file mode 100644 index cb921f5082b4e87acb7f87067fc62581c465004e..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1304/model-00002-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4318b5819e601280708ae4e3c050f4ff3ee42972762e0c80f211d6b8e173a850 -size 4832007448 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1304/model-00003-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-1304/model-00003-of-00007.safetensors deleted file mode 100644 index 35455115411849df2e3155e7bd84ae67cd76c3f8..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1304/model-00003-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dbfbd4368751a91dd5d4216faeb3a2fc54f8c58a6aa023e94916a0cfd0c6e559 -size 4999813112 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1304/model-00004-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-1304/model-00004-of-00007.safetensors deleted file mode 100644 index 8e55bae06ce8c28796c4e880c303ff9df92a4f89..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1304/model-00004-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b6184e6897874473891abb5f04733f13ec327b78b7d4113ddc02457fcc67c206 -size 4999813128 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1304/model-00005-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-1304/model-00005-of-00007.safetensors deleted file mode 100644 index b82e1c4c2387a1586c89ac5f397007a472e003f3..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1304/model-00005-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:262790bfd4fe83d404ee23e97a8a0835a65e011dcc4b39105fffaa55b477f262 -size 4832007496 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1304/model-00006-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-1304/model-00006-of-00007.safetensors deleted file mode 100644 index 3df2af1f569ce2da003dccce1642f0ff8ed18ef3..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1304/model-00006-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:62c00892f84b980bf5ed93a47f1d4be1a3346b93645c6ea2c14a82ec9a9e0fc6 -size 4999813120 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1304/model-00007-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-1304/model-00007-of-00007.safetensors deleted file mode 100644 index f7ba083baacf0fde9b775df49d5a6685f70175ba..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1304/model-00007-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e238296af89eccf17063216f2415694bfbf8346cc537c7483ffa6acdc88547fc -size 2571158184 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1304/model.safetensors.index.json b/metallama3_8b/limo_filtered_correct/checkpoint-1304/model.safetensors.index.json deleted file mode 100644 index 30d31d54f352f0c71ad48745af612a088822fa48..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1304/model.safetensors.index.json +++ /dev/null @@ -1,299 +0,0 @@ -{ - "metadata": { - "total_parameters": 2007565312, - "total_size": 32121044992 - }, - "weight_map": { - "lm_head.weight": "model-00007-of-00007.safetensors", - "model.embed_tokens.weight": "model-00001-of-00007.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.norm.weight": "model-00007-of-00007.safetensors" - } -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1304/rng_state_0.pth b/metallama3_8b/limo_filtered_correct/checkpoint-1304/rng_state_0.pth deleted file mode 100644 index b4f7aff8787e77abdd3de7299719c4c21fc26258..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1304/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ee97cd82dba4d425fdd8dfdb88d4a43d0d4b1979b5c81ab4a24914fb00d4f332 -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1304/rng_state_1.pth b/metallama3_8b/limo_filtered_correct/checkpoint-1304/rng_state_1.pth deleted file mode 100644 index 60e171edb0868d2d1932468dd935beea673dfb02..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1304/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:91dad95440fb85dc4a31745642117165c1a72173b2e389679ea8c0b2b6fcd7e2 -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1304/rng_state_2.pth b/metallama3_8b/limo_filtered_correct/checkpoint-1304/rng_state_2.pth deleted file mode 100644 index 719d1d591f4eba9f3f0ae8eb275150361dde6d12..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1304/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:98698326b023c2af02c94f18726ce52c7f7a6fe290734dd7edbe99bc807fcfa0 -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1304/rng_state_3.pth b/metallama3_8b/limo_filtered_correct/checkpoint-1304/rng_state_3.pth deleted file mode 100644 index 45dc07c6b18b85ced4b0a4155cac795581cc18a5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1304/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:708e7c6b5bf8a327e688779ebc08830ce249928bcb1ff5c82b1b1d0bf6d2660b -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1304/scheduler.pt b/metallama3_8b/limo_filtered_correct/checkpoint-1304/scheduler.pt deleted file mode 100644 index 7aade6b6e89e179ea3402ac30a9f5d5f0bddb6f2..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1304/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:31af0d134354b9c017a4bd7a1a61a8cd1d3334dc0dcc2668bcc19dc4f57c0658 -size 1064 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1304/special_tokens_map.json b/metallama3_8b/limo_filtered_correct/checkpoint-1304/special_tokens_map.json deleted file mode 100644 index 14daf4588e61b4e4983af0fccaba4d5500c0977c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1304/special_tokens_map.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "additional_special_tokens": [ - { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } - ], - "bos_token": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": "<|eot_id|>" -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1304/tokenizer.json b/metallama3_8b/limo_filtered_correct/checkpoint-1304/tokenizer.json deleted file mode 100644 index 172311123ab62378f1f6d90f3068a676b7d939ed..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1304/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c1dcab308e7cf5970ea38815e0a62887d705c5b436f869ca27a5dcdd40c36a6 -size 17210148 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1304/tokenizer_config.json b/metallama3_8b/limo_filtered_correct/checkpoint-1304/tokenizer_config.json deleted file mode 100644 index 6739fcd129e717b71b64001dcb25a03c143d66f5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1304/tokenizer_config.json +++ /dev/null @@ -1,2076 +0,0 @@ -{ - "added_tokens_decoder": { - "128000": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128001": { - "content": "<|end_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128002": { - "content": "<|reserved_special_token_0|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128003": { - "content": "<|reserved_special_token_1|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128004": { - "content": "<|reserved_special_token_2|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128005": { - "content": "<|reserved_special_token_3|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128006": { - "content": "<|start_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128007": { - "content": "<|end_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128008": { - "content": "<|reserved_special_token_4|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128009": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128010": { - "content": "<|reserved_special_token_5|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128011": { - "content": "<|reserved_special_token_6|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128012": { - "content": "<|reserved_special_token_7|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128013": { - "content": "<|reserved_special_token_8|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128014": { - "content": "<|reserved_special_token_9|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128015": { - "content": "<|reserved_special_token_10|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128016": { - "content": "<|reserved_special_token_11|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128017": { - "content": "<|reserved_special_token_12|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128018": { - "content": "<|reserved_special_token_13|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128019": { - "content": "<|reserved_special_token_14|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128020": { - "content": "<|reserved_special_token_15|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128021": { - "content": "<|reserved_special_token_16|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128022": { - "content": "<|reserved_special_token_17|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128023": { - "content": "<|reserved_special_token_18|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128024": { - "content": "<|reserved_special_token_19|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128025": { - "content": "<|reserved_special_token_20|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128026": { - "content": "<|reserved_special_token_21|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128027": { - "content": "<|reserved_special_token_22|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128028": { - "content": "<|reserved_special_token_23|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128029": { - "content": "<|reserved_special_token_24|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128030": { - "content": "<|reserved_special_token_25|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128031": { - "content": "<|reserved_special_token_26|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128032": { - "content": "<|reserved_special_token_27|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128033": { - "content": "<|reserved_special_token_28|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128034": { - "content": "<|reserved_special_token_29|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128035": { - "content": "<|reserved_special_token_30|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128036": { - "content": "<|reserved_special_token_31|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128037": { - "content": "<|reserved_special_token_32|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128038": { - "content": "<|reserved_special_token_33|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128039": { - "content": "<|reserved_special_token_34|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128040": { - "content": "<|reserved_special_token_35|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128041": { - "content": "<|reserved_special_token_36|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128042": { - "content": "<|reserved_special_token_37|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128043": { - "content": "<|reserved_special_token_38|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128044": { - "content": "<|reserved_special_token_39|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128045": { - "content": "<|reserved_special_token_40|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128046": { - "content": "<|reserved_special_token_41|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128047": { - "content": "<|reserved_special_token_42|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128048": { - "content": "<|reserved_special_token_43|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128049": { - "content": "<|reserved_special_token_44|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128050": { - "content": "<|reserved_special_token_45|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128051": { - "content": "<|reserved_special_token_46|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128052": { - "content": "<|reserved_special_token_47|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128053": { - "content": "<|reserved_special_token_48|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128054": { - "content": "<|reserved_special_token_49|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128055": { - "content": "<|reserved_special_token_50|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128056": { - "content": "<|reserved_special_token_51|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128057": { - "content": "<|reserved_special_token_52|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128058": { - "content": "<|reserved_special_token_53|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128059": { - "content": "<|reserved_special_token_54|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128060": { - "content": "<|reserved_special_token_55|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128061": { - "content": "<|reserved_special_token_56|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128062": { - "content": "<|reserved_special_token_57|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128063": { - "content": "<|reserved_special_token_58|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128064": { - "content": "<|reserved_special_token_59|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128065": { - "content": "<|reserved_special_token_60|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128066": { - "content": "<|reserved_special_token_61|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128067": { - "content": "<|reserved_special_token_62|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128068": { - "content": "<|reserved_special_token_63|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128069": { - "content": "<|reserved_special_token_64|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128070": { - "content": "<|reserved_special_token_65|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128071": { - "content": "<|reserved_special_token_66|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128072": { - "content": "<|reserved_special_token_67|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128073": { - "content": "<|reserved_special_token_68|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128074": { - "content": "<|reserved_special_token_69|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128075": { - "content": "<|reserved_special_token_70|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128076": { - "content": "<|reserved_special_token_71|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128077": { - "content": "<|reserved_special_token_72|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128078": { - "content": "<|reserved_special_token_73|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128079": { - "content": "<|reserved_special_token_74|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128080": { - "content": "<|reserved_special_token_75|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128081": { - "content": "<|reserved_special_token_76|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128082": { - "content": "<|reserved_special_token_77|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128083": { - "content": "<|reserved_special_token_78|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128084": { - "content": "<|reserved_special_token_79|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128085": { - "content": "<|reserved_special_token_80|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128086": { - "content": "<|reserved_special_token_81|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128087": { - "content": "<|reserved_special_token_82|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128088": { - "content": "<|reserved_special_token_83|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128089": { - "content": "<|reserved_special_token_84|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128090": { - "content": "<|reserved_special_token_85|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128091": { - "content": "<|reserved_special_token_86|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128092": { - "content": "<|reserved_special_token_87|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128093": { - "content": "<|reserved_special_token_88|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128094": { - "content": "<|reserved_special_token_89|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128095": { - "content": "<|reserved_special_token_90|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128096": { - "content": "<|reserved_special_token_91|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128097": { - "content": "<|reserved_special_token_92|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128098": { - "content": "<|reserved_special_token_93|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128099": { - "content": "<|reserved_special_token_94|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128100": { - "content": "<|reserved_special_token_95|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128101": { - "content": "<|reserved_special_token_96|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128102": { - "content": "<|reserved_special_token_97|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128103": { - "content": "<|reserved_special_token_98|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128104": { - "content": "<|reserved_special_token_99|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128105": { - "content": "<|reserved_special_token_100|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128106": { - "content": "<|reserved_special_token_101|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128107": { - "content": "<|reserved_special_token_102|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128108": { - "content": "<|reserved_special_token_103|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128109": { - "content": "<|reserved_special_token_104|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128110": { - "content": "<|reserved_special_token_105|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128111": { - "content": "<|reserved_special_token_106|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128112": { - "content": "<|reserved_special_token_107|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128113": { - "content": "<|reserved_special_token_108|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128114": { - "content": "<|reserved_special_token_109|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128115": { - "content": "<|reserved_special_token_110|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128116": { - "content": "<|reserved_special_token_111|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128117": { - "content": "<|reserved_special_token_112|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128118": { - "content": "<|reserved_special_token_113|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128119": { - "content": "<|reserved_special_token_114|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128120": { - "content": "<|reserved_special_token_115|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128121": { - "content": "<|reserved_special_token_116|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128122": { - "content": "<|reserved_special_token_117|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128123": { - "content": "<|reserved_special_token_118|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128124": { - "content": "<|reserved_special_token_119|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128125": { - "content": "<|reserved_special_token_120|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128126": { - "content": "<|reserved_special_token_121|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128127": { - "content": "<|reserved_special_token_122|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128128": { - "content": "<|reserved_special_token_123|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128129": { - "content": "<|reserved_special_token_124|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128130": { - "content": "<|reserved_special_token_125|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128131": { - "content": "<|reserved_special_token_126|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128132": { - "content": "<|reserved_special_token_127|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128133": { - "content": "<|reserved_special_token_128|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128134": { - "content": "<|reserved_special_token_129|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128135": { - "content": "<|reserved_special_token_130|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128136": { - "content": "<|reserved_special_token_131|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128137": { - "content": "<|reserved_special_token_132|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128138": { - "content": "<|reserved_special_token_133|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128139": { - "content": "<|reserved_special_token_134|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128140": { - "content": "<|reserved_special_token_135|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128141": { - "content": "<|reserved_special_token_136|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128142": { - "content": "<|reserved_special_token_137|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128143": { - "content": "<|reserved_special_token_138|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128144": { - "content": "<|reserved_special_token_139|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128145": { - "content": "<|reserved_special_token_140|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128146": { - "content": "<|reserved_special_token_141|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128147": { - "content": "<|reserved_special_token_142|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128148": { - "content": "<|reserved_special_token_143|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128149": { - "content": "<|reserved_special_token_144|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128150": { - "content": "<|reserved_special_token_145|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128151": { - "content": "<|reserved_special_token_146|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128152": { - "content": "<|reserved_special_token_147|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128153": { - "content": "<|reserved_special_token_148|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128154": { - "content": "<|reserved_special_token_149|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128155": { - "content": "<|reserved_special_token_150|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128156": { - "content": "<|reserved_special_token_151|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128157": { - "content": "<|reserved_special_token_152|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128158": { - "content": "<|reserved_special_token_153|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128159": { - "content": "<|reserved_special_token_154|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128160": { - "content": "<|reserved_special_token_155|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128161": { - "content": "<|reserved_special_token_156|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128162": { - "content": "<|reserved_special_token_157|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128163": { - "content": "<|reserved_special_token_158|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128164": { - "content": "<|reserved_special_token_159|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128165": { - "content": "<|reserved_special_token_160|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128166": { - "content": "<|reserved_special_token_161|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128167": { - "content": "<|reserved_special_token_162|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128168": { - "content": "<|reserved_special_token_163|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128169": { - "content": "<|reserved_special_token_164|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128170": { - "content": "<|reserved_special_token_165|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128171": { - "content": "<|reserved_special_token_166|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128172": { - "content": "<|reserved_special_token_167|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128173": { - "content": "<|reserved_special_token_168|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128174": { - "content": "<|reserved_special_token_169|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128175": { - "content": "<|reserved_special_token_170|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128176": { - "content": "<|reserved_special_token_171|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128177": { - "content": "<|reserved_special_token_172|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128178": { - "content": "<|reserved_special_token_173|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128179": { - "content": "<|reserved_special_token_174|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128180": { - "content": "<|reserved_special_token_175|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128181": { - "content": "<|reserved_special_token_176|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128182": { - "content": "<|reserved_special_token_177|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128183": { - "content": "<|reserved_special_token_178|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128184": { - "content": "<|reserved_special_token_179|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128185": { - "content": "<|reserved_special_token_180|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128186": { - "content": "<|reserved_special_token_181|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128187": { - "content": "<|reserved_special_token_182|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128188": { - "content": "<|reserved_special_token_183|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128189": { - "content": "<|reserved_special_token_184|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128190": { - "content": "<|reserved_special_token_185|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128191": { - "content": "<|reserved_special_token_186|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128192": { - "content": "<|reserved_special_token_187|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128193": { - "content": "<|reserved_special_token_188|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128194": { - "content": "<|reserved_special_token_189|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128195": { - "content": "<|reserved_special_token_190|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128196": { - "content": "<|reserved_special_token_191|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128197": { - "content": "<|reserved_special_token_192|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128198": { - "content": "<|reserved_special_token_193|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128199": { - "content": "<|reserved_special_token_194|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128200": { - "content": "<|reserved_special_token_195|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128201": { - "content": "<|reserved_special_token_196|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128202": { - "content": "<|reserved_special_token_197|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128203": { - "content": "<|reserved_special_token_198|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128204": { - "content": "<|reserved_special_token_199|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128205": { - "content": "<|reserved_special_token_200|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128206": { - "content": "<|reserved_special_token_201|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128207": { - "content": "<|reserved_special_token_202|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128208": { - "content": "<|reserved_special_token_203|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128209": { - "content": "<|reserved_special_token_204|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128210": { - "content": "<|reserved_special_token_205|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128211": { - "content": "<|reserved_special_token_206|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128212": { - "content": "<|reserved_special_token_207|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128213": { - "content": "<|reserved_special_token_208|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128214": { - "content": "<|reserved_special_token_209|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128215": { - "content": "<|reserved_special_token_210|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128216": { - "content": "<|reserved_special_token_211|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128217": { - "content": "<|reserved_special_token_212|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128218": { - "content": "<|reserved_special_token_213|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128219": { - "content": "<|reserved_special_token_214|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128220": { - "content": "<|reserved_special_token_215|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128221": { - "content": "<|reserved_special_token_216|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128222": { - "content": "<|reserved_special_token_217|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128223": { - "content": "<|reserved_special_token_218|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128224": { - "content": "<|reserved_special_token_219|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128225": { - "content": "<|reserved_special_token_220|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128226": { - "content": "<|reserved_special_token_221|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128227": { - "content": "<|reserved_special_token_222|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128228": { - "content": "<|reserved_special_token_223|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128229": { - "content": "<|reserved_special_token_224|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128230": { - "content": "<|reserved_special_token_225|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128231": { - "content": "<|reserved_special_token_226|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128232": { - "content": "<|reserved_special_token_227|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128233": { - "content": "<|reserved_special_token_228|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128234": { - "content": "<|reserved_special_token_229|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128235": { - "content": "<|reserved_special_token_230|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128236": { - "content": "<|reserved_special_token_231|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128237": { - "content": "<|reserved_special_token_232|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128238": { - "content": "<|reserved_special_token_233|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128239": { - "content": "<|reserved_special_token_234|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128240": { - "content": "<|reserved_special_token_235|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128241": { - "content": "<|reserved_special_token_236|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128242": { - "content": "<|reserved_special_token_237|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128243": { - "content": "<|reserved_special_token_238|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128244": { - "content": "<|reserved_special_token_239|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128245": { - "content": "<|reserved_special_token_240|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128246": { - "content": "<|reserved_special_token_241|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128247": { - "content": "<|reserved_special_token_242|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128248": { - "content": "<|reserved_special_token_243|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128249": { - "content": "<|reserved_special_token_244|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128250": { - "content": "<|reserved_special_token_245|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128251": { - "content": "<|reserved_special_token_246|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128252": { - "content": "<|reserved_special_token_247|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128253": { - "content": "<|reserved_special_token_248|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128254": { - "content": "<|reserved_special_token_249|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128255": { - "content": "<|reserved_special_token_250|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128256": { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - } - }, - "additional_special_tokens": [ - "<|eom_id|>" - ], - "bos_token": "<|begin_of_text|>", - "clean_up_tokenization_spaces": true, - "eos_token": "<|eot_id|>", - "extra_special_tokens": {}, - "model_input_names": [ - "input_ids", - "attention_mask" - ], - "model_max_length": 1000000000000000019884624838656, - "pad_token": "<|eot_id|>", - "padding_side": "right", - "split_special_tokens": false, - "tokenizer_class": "PreTrainedTokenizerFast" -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1304/trainer_state.json b/metallama3_8b/limo_filtered_correct/checkpoint-1304/trainer_state.json deleted file mode 100644 index 8f0af7eb6b1ad35c3ab5bb85796536db016098c3..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1304/trainer_state.json +++ /dev/null @@ -1,9162 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 8.0, - "eval_steps": 500, - "global_step": 1304, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.006134969325153374, - "grad_norm": 5.908512115478516, - "learning_rate": 5e-06, - "loss": 0.9606, - "step": 1 - }, - { - "epoch": 0.012269938650306749, - "grad_norm": 4.304474353790283, - "learning_rate": 4.999995356617983e-06, - "loss": 0.8609, - "step": 2 - }, - { - "epoch": 0.018404907975460124, - "grad_norm": 5.63697624206543, - "learning_rate": 4.999981426489179e-06, - "loss": 1.3543, - "step": 3 - }, - { - "epoch": 0.024539877300613498, - "grad_norm": 3.6674246788024902, - "learning_rate": 4.999958209665336e-06, - "loss": 0.787, - "step": 4 - }, - { - "epoch": 0.03067484662576687, - "grad_norm": 48.14854431152344, - "learning_rate": 4.999925706232695e-06, - "loss": 1.7786, - "step": 5 - }, - { - "epoch": 0.03680981595092025, - "grad_norm": 7.8689866065979, - "learning_rate": 4.999883916312e-06, - "loss": 1.2175, - "step": 6 - }, - { - "epoch": 0.04294478527607362, - "grad_norm": 5.119968891143799, - "learning_rate": 4.9998328400584864e-06, - "loss": 0.8998, - "step": 7 - }, - { - "epoch": 0.049079754601226995, - "grad_norm": 3.730757713317871, - "learning_rate": 4.999772477661888e-06, - "loss": 0.8419, - "step": 8 - }, - { - "epoch": 0.05521472392638037, - "grad_norm": 27.314565658569336, - "learning_rate": 4.999702829346432e-06, - "loss": 1.7948, - "step": 9 - }, - { - "epoch": 0.06134969325153374, - "grad_norm": 3.822697162628174, - "learning_rate": 4.999623895370843e-06, - "loss": 1.0461, - "step": 10 - }, - { - "epoch": 0.06748466257668712, - "grad_norm": 4.71220588684082, - "learning_rate": 4.999535676028338e-06, - "loss": 1.0, - "step": 11 - }, - { - "epoch": 0.0736196319018405, - "grad_norm": 3.2378087043762207, - "learning_rate": 4.999438171646624e-06, - "loss": 0.9475, - "step": 12 - }, - { - "epoch": 0.07975460122699386, - "grad_norm": 3.475543737411499, - "learning_rate": 4.999331382587901e-06, - "loss": 0.8654, - "step": 13 - }, - { - "epoch": 0.08588957055214724, - "grad_norm": 10.06365966796875, - "learning_rate": 4.999215309248861e-06, - "loss": 1.2042, - "step": 14 - }, - { - "epoch": 0.09202453987730061, - "grad_norm": 3.785153865814209, - "learning_rate": 4.999089952060681e-06, - "loss": 0.8846, - "step": 15 - }, - { - "epoch": 0.09815950920245399, - "grad_norm": 2.944488048553467, - "learning_rate": 4.998955311489025e-06, - "loss": 0.8805, - "step": 16 - }, - { - "epoch": 0.10429447852760736, - "grad_norm": 39.89304733276367, - "learning_rate": 4.998811388034046e-06, - "loss": 1.5882, - "step": 17 - }, - { - "epoch": 0.11042944785276074, - "grad_norm": 3.5883963108062744, - "learning_rate": 4.9986581822303746e-06, - "loss": 0.9222, - "step": 18 - }, - { - "epoch": 0.1165644171779141, - "grad_norm": 6.972247123718262, - "learning_rate": 4.998495694647127e-06, - "loss": 1.4088, - "step": 19 - }, - { - "epoch": 0.12269938650306748, - "grad_norm": 3.948991298675537, - "learning_rate": 4.998323925887895e-06, - "loss": 1.454, - "step": 20 - }, - { - "epoch": 0.12883435582822086, - "grad_norm": 3.8690035343170166, - "learning_rate": 4.998142876590749e-06, - "loss": 0.6335, - "step": 21 - }, - { - "epoch": 0.13496932515337423, - "grad_norm": 5.243765830993652, - "learning_rate": 4.997952547428236e-06, - "loss": 0.6725, - "step": 22 - }, - { - "epoch": 0.1411042944785276, - "grad_norm": 3.5994043350219727, - "learning_rate": 4.997752939107372e-06, - "loss": 0.7814, - "step": 23 - }, - { - "epoch": 0.147239263803681, - "grad_norm": 4.06965970993042, - "learning_rate": 4.997544052369642e-06, - "loss": 0.9683, - "step": 24 - }, - { - "epoch": 0.15337423312883436, - "grad_norm": 3.3247246742248535, - "learning_rate": 4.997325887990999e-06, - "loss": 0.9414, - "step": 25 - }, - { - "epoch": 0.15950920245398773, - "grad_norm": 5.811742782592773, - "learning_rate": 4.997098446781861e-06, - "loss": 0.8894, - "step": 26 - }, - { - "epoch": 0.1656441717791411, - "grad_norm": 2.661334753036499, - "learning_rate": 4.996861729587103e-06, - "loss": 0.7708, - "step": 27 - }, - { - "epoch": 0.17177914110429449, - "grad_norm": 2.863943576812744, - "learning_rate": 4.996615737286061e-06, - "loss": 0.6995, - "step": 28 - }, - { - "epoch": 0.17791411042944785, - "grad_norm": 20.376733779907227, - "learning_rate": 4.996360470792524e-06, - "loss": 1.2563, - "step": 29 - }, - { - "epoch": 0.18404907975460122, - "grad_norm": 3.62265682220459, - "learning_rate": 4.996095931054731e-06, - "loss": 0.7266, - "step": 30 - }, - { - "epoch": 0.1901840490797546, - "grad_norm": 3.915076732635498, - "learning_rate": 4.9958221190553705e-06, - "loss": 0.9227, - "step": 31 - }, - { - "epoch": 0.19631901840490798, - "grad_norm": 3.129855155944824, - "learning_rate": 4.995539035811572e-06, - "loss": 0.701, - "step": 32 - }, - { - "epoch": 0.20245398773006135, - "grad_norm": 2.7532224655151367, - "learning_rate": 4.9952466823749076e-06, - "loss": 0.6491, - "step": 33 - }, - { - "epoch": 0.2085889570552147, - "grad_norm": 2.8444128036499023, - "learning_rate": 4.9949450598313835e-06, - "loss": 0.8029, - "step": 34 - }, - { - "epoch": 0.2147239263803681, - "grad_norm": 2.57743239402771, - "learning_rate": 4.994634169301439e-06, - "loss": 0.8785, - "step": 35 - }, - { - "epoch": 0.22085889570552147, - "grad_norm": 3.280055284500122, - "learning_rate": 4.994314011939941e-06, - "loss": 1.034, - "step": 36 - }, - { - "epoch": 0.22699386503067484, - "grad_norm": 2.455838680267334, - "learning_rate": 4.99398458893618e-06, - "loss": 0.8557, - "step": 37 - }, - { - "epoch": 0.2331288343558282, - "grad_norm": 4.72681188583374, - "learning_rate": 4.993645901513865e-06, - "loss": 1.1904, - "step": 38 - }, - { - "epoch": 0.2392638036809816, - "grad_norm": 3.0585641860961914, - "learning_rate": 4.993297950931121e-06, - "loss": 0.7668, - "step": 39 - }, - { - "epoch": 0.24539877300613497, - "grad_norm": 2.4603540897369385, - "learning_rate": 4.9929407384804806e-06, - "loss": 0.8812, - "step": 40 - }, - { - "epoch": 0.25153374233128833, - "grad_norm": 2.9702436923980713, - "learning_rate": 4.992574265488883e-06, - "loss": 0.8878, - "step": 41 - }, - { - "epoch": 0.25766871165644173, - "grad_norm": 2.6973602771759033, - "learning_rate": 4.9921985333176694e-06, - "loss": 0.7251, - "step": 42 - }, - { - "epoch": 0.26380368098159507, - "grad_norm": 2.5542335510253906, - "learning_rate": 4.991813543362572e-06, - "loss": 0.6638, - "step": 43 - }, - { - "epoch": 0.26993865030674846, - "grad_norm": 3.7530782222747803, - "learning_rate": 4.991419297053716e-06, - "loss": 1.0725, - "step": 44 - }, - { - "epoch": 0.27607361963190186, - "grad_norm": 2.6483025550842285, - "learning_rate": 4.991015795855611e-06, - "loss": 0.7238, - "step": 45 - }, - { - "epoch": 0.2822085889570552, - "grad_norm": 3.434422492980957, - "learning_rate": 4.990603041267144e-06, - "loss": 0.9188, - "step": 46 - }, - { - "epoch": 0.2883435582822086, - "grad_norm": 2.914340019226074, - "learning_rate": 4.990181034821578e-06, - "loss": 0.6158, - "step": 47 - }, - { - "epoch": 0.294478527607362, - "grad_norm": 2.7211625576019287, - "learning_rate": 4.98974977808654e-06, - "loss": 0.7165, - "step": 48 - }, - { - "epoch": 0.3006134969325153, - "grad_norm": 2.8414249420166016, - "learning_rate": 4.989309272664026e-06, - "loss": 0.7277, - "step": 49 - }, - { - "epoch": 0.3067484662576687, - "grad_norm": 3.683204412460327, - "learning_rate": 4.988859520190381e-06, - "loss": 0.9793, - "step": 50 - }, - { - "epoch": 0.3128834355828221, - "grad_norm": 3.1732583045959473, - "learning_rate": 4.988400522336304e-06, - "loss": 0.8966, - "step": 51 - }, - { - "epoch": 0.31901840490797545, - "grad_norm": 2.7789194583892822, - "learning_rate": 4.9879322808068365e-06, - "loss": 0.8191, - "step": 52 - }, - { - "epoch": 0.32515337423312884, - "grad_norm": 2.754816770553589, - "learning_rate": 4.987454797341358e-06, - "loss": 0.6308, - "step": 53 - }, - { - "epoch": 0.3312883435582822, - "grad_norm": 2.730104684829712, - "learning_rate": 4.98696807371358e-06, - "loss": 0.8226, - "step": 54 - }, - { - "epoch": 0.3374233128834356, - "grad_norm": 3.2225449085235596, - "learning_rate": 4.986472111731536e-06, - "loss": 0.9184, - "step": 55 - }, - { - "epoch": 0.34355828220858897, - "grad_norm": 3.2684760093688965, - "learning_rate": 4.985966913237581e-06, - "loss": 0.6593, - "step": 56 - }, - { - "epoch": 0.3496932515337423, - "grad_norm": 2.43105411529541, - "learning_rate": 4.985452480108376e-06, - "loss": 0.6994, - "step": 57 - }, - { - "epoch": 0.3558282208588957, - "grad_norm": 7.366360664367676, - "learning_rate": 4.984928814254889e-06, - "loss": 1.1374, - "step": 58 - }, - { - "epoch": 0.3619631901840491, - "grad_norm": 2.81864333152771, - "learning_rate": 4.984395917622387e-06, - "loss": 0.8097, - "step": 59 - }, - { - "epoch": 0.36809815950920244, - "grad_norm": 3.1107730865478516, - "learning_rate": 4.9838537921904206e-06, - "loss": 0.8511, - "step": 60 - }, - { - "epoch": 0.37423312883435583, - "grad_norm": 2.460545301437378, - "learning_rate": 4.9833024399728295e-06, - "loss": 0.898, - "step": 61 - }, - { - "epoch": 0.3803680981595092, - "grad_norm": 2.921992778778076, - "learning_rate": 4.982741863017722e-06, - "loss": 0.6671, - "step": 62 - }, - { - "epoch": 0.38650306748466257, - "grad_norm": 3.3006443977355957, - "learning_rate": 4.982172063407479e-06, - "loss": 1.0559, - "step": 63 - }, - { - "epoch": 0.39263803680981596, - "grad_norm": 2.642587661743164, - "learning_rate": 4.9815930432587365e-06, - "loss": 0.6663, - "step": 64 - }, - { - "epoch": 0.3987730061349693, - "grad_norm": 2.905898094177246, - "learning_rate": 4.981004804722384e-06, - "loss": 0.6895, - "step": 65 - }, - { - "epoch": 0.4049079754601227, - "grad_norm": 2.9174182415008545, - "learning_rate": 4.980407349983556e-06, - "loss": 0.7982, - "step": 66 - }, - { - "epoch": 0.4110429447852761, - "grad_norm": 2.214322805404663, - "learning_rate": 4.979800681261619e-06, - "loss": 0.6808, - "step": 67 - }, - { - "epoch": 0.4171779141104294, - "grad_norm": 2.7152462005615234, - "learning_rate": 4.9791848008101705e-06, - "loss": 0.567, - "step": 68 - }, - { - "epoch": 0.4233128834355828, - "grad_norm": 2.5657734870910645, - "learning_rate": 4.978559710917024e-06, - "loss": 0.7745, - "step": 69 - }, - { - "epoch": 0.4294478527607362, - "grad_norm": 3.9103832244873047, - "learning_rate": 4.977925413904205e-06, - "loss": 0.9815, - "step": 70 - }, - { - "epoch": 0.43558282208588955, - "grad_norm": 4.610236644744873, - "learning_rate": 4.9772819121279395e-06, - "loss": 1.164, - "step": 71 - }, - { - "epoch": 0.44171779141104295, - "grad_norm": 3.01170015335083, - "learning_rate": 4.976629207978648e-06, - "loss": 0.7587, - "step": 72 - }, - { - "epoch": 0.44785276073619634, - "grad_norm": 3.175889253616333, - "learning_rate": 4.975967303880933e-06, - "loss": 0.58, - "step": 73 - }, - { - "epoch": 0.4539877300613497, - "grad_norm": 2.503741502761841, - "learning_rate": 4.975296202293575e-06, - "loss": 0.7253, - "step": 74 - }, - { - "epoch": 0.4601226993865031, - "grad_norm": 2.6778078079223633, - "learning_rate": 4.974615905709518e-06, - "loss": 0.7352, - "step": 75 - }, - { - "epoch": 0.4662576687116564, - "grad_norm": 5.950812816619873, - "learning_rate": 4.973926416655863e-06, - "loss": 1.0643, - "step": 76 - }, - { - "epoch": 0.4723926380368098, - "grad_norm": 3.0165305137634277, - "learning_rate": 4.973227737693858e-06, - "loss": 0.6699, - "step": 77 - }, - { - "epoch": 0.4785276073619632, - "grad_norm": 4.793259620666504, - "learning_rate": 4.972519871418894e-06, - "loss": 1.0315, - "step": 78 - }, - { - "epoch": 0.48466257668711654, - "grad_norm": 3.632815361022949, - "learning_rate": 4.971802820460481e-06, - "loss": 0.7003, - "step": 79 - }, - { - "epoch": 0.49079754601226994, - "grad_norm": 3.077507734298706, - "learning_rate": 4.971076587482254e-06, - "loss": 0.6776, - "step": 80 - }, - { - "epoch": 0.49693251533742333, - "grad_norm": 3.3886241912841797, - "learning_rate": 4.970341175181957e-06, - "loss": 0.7422, - "step": 81 - }, - { - "epoch": 0.5030674846625767, - "grad_norm": 2.71288800239563, - "learning_rate": 4.969596586291425e-06, - "loss": 0.7471, - "step": 82 - }, - { - "epoch": 0.50920245398773, - "grad_norm": 2.777920961380005, - "learning_rate": 4.968842823576592e-06, - "loss": 0.8111, - "step": 83 - }, - { - "epoch": 0.5153374233128835, - "grad_norm": 6.496985912322998, - "learning_rate": 4.968079889837461e-06, - "loss": 0.9965, - "step": 84 - }, - { - "epoch": 0.5214723926380368, - "grad_norm": 2.6163430213928223, - "learning_rate": 4.967307787908108e-06, - "loss": 0.6833, - "step": 85 - }, - { - "epoch": 0.5276073619631901, - "grad_norm": 3.244098663330078, - "learning_rate": 4.966526520656663e-06, - "loss": 0.8373, - "step": 86 - }, - { - "epoch": 0.5337423312883436, - "grad_norm": 2.9027860164642334, - "learning_rate": 4.965736090985305e-06, - "loss": 0.8529, - "step": 87 - }, - { - "epoch": 0.5398773006134969, - "grad_norm": 2.3786230087280273, - "learning_rate": 4.964936501830246e-06, - "loss": 0.6577, - "step": 88 - }, - { - "epoch": 0.5460122699386503, - "grad_norm": 7.3099045753479, - "learning_rate": 4.964127756161727e-06, - "loss": 1.1184, - "step": 89 - }, - { - "epoch": 0.5521472392638037, - "grad_norm": 3.068873167037964, - "learning_rate": 4.963309856983998e-06, - "loss": 0.7906, - "step": 90 - }, - { - "epoch": 0.558282208588957, - "grad_norm": 3.082547426223755, - "learning_rate": 4.9624828073353144e-06, - "loss": 0.8107, - "step": 91 - }, - { - "epoch": 0.5644171779141104, - "grad_norm": 2.4586973190307617, - "learning_rate": 4.961646610287922e-06, - "loss": 0.7421, - "step": 92 - }, - { - "epoch": 0.5705521472392638, - "grad_norm": 2.779277801513672, - "learning_rate": 4.960801268948047e-06, - "loss": 0.7134, - "step": 93 - }, - { - "epoch": 0.5766871165644172, - "grad_norm": 3.2255213260650635, - "learning_rate": 4.959946786455882e-06, - "loss": 0.5875, - "step": 94 - }, - { - "epoch": 0.5828220858895705, - "grad_norm": 2.783395528793335, - "learning_rate": 4.959083165985581e-06, - "loss": 0.6595, - "step": 95 - }, - { - "epoch": 0.588957055214724, - "grad_norm": 2.240114212036133, - "learning_rate": 4.958210410745237e-06, - "loss": 0.793, - "step": 96 - }, - { - "epoch": 0.5950920245398773, - "grad_norm": 2.9399421215057373, - "learning_rate": 4.957328523976879e-06, - "loss": 0.5896, - "step": 97 - }, - { - "epoch": 0.6012269938650306, - "grad_norm": 3.4449355602264404, - "learning_rate": 4.956437508956458e-06, - "loss": 0.8658, - "step": 98 - }, - { - "epoch": 0.6073619631901841, - "grad_norm": 4.273710250854492, - "learning_rate": 4.9555373689938325e-06, - "loss": 0.8316, - "step": 99 - }, - { - "epoch": 0.6134969325153374, - "grad_norm": 3.4222047328948975, - "learning_rate": 4.954628107432757e-06, - "loss": 1.0613, - "step": 100 - }, - { - "epoch": 0.6196319018404908, - "grad_norm": 2.5318963527679443, - "learning_rate": 4.95370972765087e-06, - "loss": 0.7194, - "step": 101 - }, - { - "epoch": 0.6257668711656442, - "grad_norm": 2.7852585315704346, - "learning_rate": 4.952782233059683e-06, - "loss": 0.5927, - "step": 102 - }, - { - "epoch": 0.6319018404907976, - "grad_norm": 2.6532323360443115, - "learning_rate": 4.951845627104565e-06, - "loss": 0.8505, - "step": 103 - }, - { - "epoch": 0.6380368098159509, - "grad_norm": 2.3213467597961426, - "learning_rate": 4.95089991326473e-06, - "loss": 0.8682, - "step": 104 - }, - { - "epoch": 0.6441717791411042, - "grad_norm": 2.607992649078369, - "learning_rate": 4.9499450950532305e-06, - "loss": 0.8735, - "step": 105 - }, - { - "epoch": 0.6503067484662577, - "grad_norm": 3.9820072650909424, - "learning_rate": 4.94898117601693e-06, - "loss": 1.0571, - "step": 106 - }, - { - "epoch": 0.656441717791411, - "grad_norm": 3.3878824710845947, - "learning_rate": 4.948008159736507e-06, - "loss": 0.7831, - "step": 107 - }, - { - "epoch": 0.6625766871165644, - "grad_norm": 2.6935670375823975, - "learning_rate": 4.94702604982643e-06, - "loss": 0.5968, - "step": 108 - }, - { - "epoch": 0.6687116564417178, - "grad_norm": 2.78190016746521, - "learning_rate": 4.9460348499349485e-06, - "loss": 0.7504, - "step": 109 - }, - { - "epoch": 0.6748466257668712, - "grad_norm": 2.973083972930908, - "learning_rate": 4.945034563744077e-06, - "loss": 0.6728, - "step": 110 - }, - { - "epoch": 0.6809815950920245, - "grad_norm": 2.631803512573242, - "learning_rate": 4.944025194969586e-06, - "loss": 0.609, - "step": 111 - }, - { - "epoch": 0.6871165644171779, - "grad_norm": 2.7443883419036865, - "learning_rate": 4.9430067473609825e-06, - "loss": 0.8713, - "step": 112 - }, - { - "epoch": 0.6932515337423313, - "grad_norm": 2.543769121170044, - "learning_rate": 4.941979224701499e-06, - "loss": 0.8035, - "step": 113 - }, - { - "epoch": 0.6993865030674846, - "grad_norm": 3.7799901962280273, - "learning_rate": 4.94094263080808e-06, - "loss": 0.9341, - "step": 114 - }, - { - "epoch": 0.7055214723926381, - "grad_norm": 3.1234734058380127, - "learning_rate": 4.939896969531367e-06, - "loss": 1.1066, - "step": 115 - }, - { - "epoch": 0.7116564417177914, - "grad_norm": 2.356036424636841, - "learning_rate": 4.938842244755683e-06, - "loss": 0.853, - "step": 116 - }, - { - "epoch": 0.7177914110429447, - "grad_norm": 3.6231274604797363, - "learning_rate": 4.937778460399022e-06, - "loss": 0.9116, - "step": 117 - }, - { - "epoch": 0.7239263803680982, - "grad_norm": 3.1277005672454834, - "learning_rate": 4.936705620413028e-06, - "loss": 0.5888, - "step": 118 - }, - { - "epoch": 0.7300613496932515, - "grad_norm": 2.7338361740112305, - "learning_rate": 4.935623728782986e-06, - "loss": 0.592, - "step": 119 - }, - { - "epoch": 0.7361963190184049, - "grad_norm": 2.748363733291626, - "learning_rate": 4.934532789527805e-06, - "loss": 0.8713, - "step": 120 - }, - { - "epoch": 0.7423312883435583, - "grad_norm": 4.460031986236572, - "learning_rate": 4.933432806700004e-06, - "loss": 0.6791, - "step": 121 - }, - { - "epoch": 0.7484662576687117, - "grad_norm": 2.392911911010742, - "learning_rate": 4.932323784385693e-06, - "loss": 0.7531, - "step": 122 - }, - { - "epoch": 0.754601226993865, - "grad_norm": 2.7804384231567383, - "learning_rate": 4.931205726704566e-06, - "loss": 0.7547, - "step": 123 - }, - { - "epoch": 0.7607361963190185, - "grad_norm": 2.7664780616760254, - "learning_rate": 4.930078637809878e-06, - "loss": 0.7849, - "step": 124 - }, - { - "epoch": 0.7668711656441718, - "grad_norm": 2.592808723449707, - "learning_rate": 4.928942521888431e-06, - "loss": 0.7015, - "step": 125 - }, - { - "epoch": 0.7730061349693251, - "grad_norm": 2.7080585956573486, - "learning_rate": 4.927797383160561e-06, - "loss": 1.0028, - "step": 126 - }, - { - "epoch": 0.7791411042944786, - "grad_norm": 2.7941503524780273, - "learning_rate": 4.926643225880123e-06, - "loss": 0.602, - "step": 127 - }, - { - "epoch": 0.7852760736196319, - "grad_norm": 3.2796623706817627, - "learning_rate": 4.925480054334471e-06, - "loss": 0.7473, - "step": 128 - }, - { - "epoch": 0.7914110429447853, - "grad_norm": 2.7623610496520996, - "learning_rate": 4.924307872844444e-06, - "loss": 1.0573, - "step": 129 - }, - { - "epoch": 0.7975460122699386, - "grad_norm": 2.6224453449249268, - "learning_rate": 4.923126685764351e-06, - "loss": 0.7399, - "step": 130 - }, - { - "epoch": 0.803680981595092, - "grad_norm": 17.736326217651367, - "learning_rate": 4.921936497481956e-06, - "loss": 0.9548, - "step": 131 - }, - { - "epoch": 0.8098159509202454, - "grad_norm": 2.504213333129883, - "learning_rate": 4.920737312418456e-06, - "loss": 0.6748, - "step": 132 - }, - { - "epoch": 0.8159509202453987, - "grad_norm": 3.617077350616455, - "learning_rate": 4.919529135028473e-06, - "loss": 0.8431, - "step": 133 - }, - { - "epoch": 0.8220858895705522, - "grad_norm": 2.6559832096099854, - "learning_rate": 4.918311969800027e-06, - "loss": 0.7243, - "step": 134 - }, - { - "epoch": 0.8282208588957055, - "grad_norm": 2.7539305686950684, - "learning_rate": 4.917085821254532e-06, - "loss": 0.7845, - "step": 135 - }, - { - "epoch": 0.8343558282208589, - "grad_norm": 3.3587615489959717, - "learning_rate": 4.915850693946766e-06, - "loss": 0.4891, - "step": 136 - }, - { - "epoch": 0.8404907975460123, - "grad_norm": 3.064354181289673, - "learning_rate": 4.914606592464865e-06, - "loss": 0.7917, - "step": 137 - }, - { - "epoch": 0.8466257668711656, - "grad_norm": 3.2505199909210205, - "learning_rate": 4.9133535214303e-06, - "loss": 0.9681, - "step": 138 - }, - { - "epoch": 0.852760736196319, - "grad_norm": 3.8027830123901367, - "learning_rate": 4.91209148549786e-06, - "loss": 0.9275, - "step": 139 - }, - { - "epoch": 0.8588957055214724, - "grad_norm": 2.4154372215270996, - "learning_rate": 4.910820489355637e-06, - "loss": 0.7259, - "step": 140 - }, - { - "epoch": 0.8650306748466258, - "grad_norm": 2.892462968826294, - "learning_rate": 4.909540537725007e-06, - "loss": 0.6061, - "step": 141 - }, - { - "epoch": 0.8711656441717791, - "grad_norm": 3.3398196697235107, - "learning_rate": 4.908251635360616e-06, - "loss": 1.0559, - "step": 142 - }, - { - "epoch": 0.8773006134969326, - "grad_norm": 3.022512197494507, - "learning_rate": 4.906953787050354e-06, - "loss": 0.7372, - "step": 143 - }, - { - "epoch": 0.8834355828220859, - "grad_norm": 2.658661365509033, - "learning_rate": 4.905646997615347e-06, - "loss": 0.6234, - "step": 144 - }, - { - "epoch": 0.8895705521472392, - "grad_norm": 3.454400062561035, - "learning_rate": 4.904331271909932e-06, - "loss": 0.8066, - "step": 145 - }, - { - "epoch": 0.8957055214723927, - "grad_norm": 3.1300277709960938, - "learning_rate": 4.903006614821645e-06, - "loss": 0.6861, - "step": 146 - }, - { - "epoch": 0.901840490797546, - "grad_norm": 2.362537145614624, - "learning_rate": 4.901673031271194e-06, - "loss": 0.6112, - "step": 147 - }, - { - "epoch": 0.9079754601226994, - "grad_norm": 3.375577688217163, - "learning_rate": 4.900330526212451e-06, - "loss": 0.6314, - "step": 148 - }, - { - "epoch": 0.9141104294478528, - "grad_norm": 2.955656051635742, - "learning_rate": 4.898979104632427e-06, - "loss": 0.889, - "step": 149 - }, - { - "epoch": 0.9202453987730062, - "grad_norm": 2.9285926818847656, - "learning_rate": 4.897618771551255e-06, - "loss": 0.6406, - "step": 150 - }, - { - "epoch": 0.9263803680981595, - "grad_norm": 2.131819725036621, - "learning_rate": 4.8962495320221714e-06, - "loss": 0.6368, - "step": 151 - }, - { - "epoch": 0.9325153374233128, - "grad_norm": 2.780649185180664, - "learning_rate": 4.8948713911315e-06, - "loss": 0.8642, - "step": 152 - }, - { - "epoch": 0.9386503067484663, - "grad_norm": 2.941500186920166, - "learning_rate": 4.8934843539986266e-06, - "loss": 0.714, - "step": 153 - }, - { - "epoch": 0.9447852760736196, - "grad_norm": 2.7729203701019287, - "learning_rate": 4.892088425775986e-06, - "loss": 0.8365, - "step": 154 - }, - { - "epoch": 0.950920245398773, - "grad_norm": 2.6887171268463135, - "learning_rate": 4.890683611649041e-06, - "loss": 0.7937, - "step": 155 - }, - { - "epoch": 0.9570552147239264, - "grad_norm": 3.7638463973999023, - "learning_rate": 4.8892699168362626e-06, - "loss": 0.7485, - "step": 156 - }, - { - "epoch": 0.9631901840490797, - "grad_norm": 2.8132755756378174, - "learning_rate": 4.887847346589111e-06, - "loss": 0.6467, - "step": 157 - }, - { - "epoch": 0.9693251533742331, - "grad_norm": 2.652247190475464, - "learning_rate": 4.886415906192015e-06, - "loss": 0.4651, - "step": 158 - }, - { - "epoch": 0.9754601226993865, - "grad_norm": 2.5854647159576416, - "learning_rate": 4.884975600962355e-06, - "loss": 0.8756, - "step": 159 - }, - { - "epoch": 0.9815950920245399, - "grad_norm": 3.1630544662475586, - "learning_rate": 4.883526436250441e-06, - "loss": 0.7339, - "step": 160 - }, - { - "epoch": 0.9877300613496932, - "grad_norm": 2.84452748298645, - "learning_rate": 4.8820684174394935e-06, - "loss": 0.7808, - "step": 161 - }, - { - "epoch": 0.9938650306748467, - "grad_norm": 3.604048490524292, - "learning_rate": 4.880601549945622e-06, - "loss": 0.96, - "step": 162 - }, - { - "epoch": 1.0, - "grad_norm": 2.302924871444702, - "learning_rate": 4.879125839217808e-06, - "loss": 0.8122, - "step": 163 - }, - { - "epoch": 1.0061349693251533, - "grad_norm": 3.1254405975341797, - "learning_rate": 4.8776412907378845e-06, - "loss": 0.7307, - "step": 164 - }, - { - "epoch": 1.0122699386503067, - "grad_norm": 2.745603322982788, - "learning_rate": 4.8761479100205085e-06, - "loss": 0.7554, - "step": 165 - }, - { - "epoch": 1.01840490797546, - "grad_norm": 2.494840145111084, - "learning_rate": 4.874645702613152e-06, - "loss": 0.4372, - "step": 166 - }, - { - "epoch": 1.0245398773006136, - "grad_norm": 2.3526735305786133, - "learning_rate": 4.873134674096072e-06, - "loss": 0.3597, - "step": 167 - }, - { - "epoch": 1.030674846625767, - "grad_norm": 2.945887804031372, - "learning_rate": 4.871614830082297e-06, - "loss": 0.5854, - "step": 168 - }, - { - "epoch": 1.0368098159509203, - "grad_norm": 3.5723934173583984, - "learning_rate": 4.870086176217597e-06, - "loss": 0.7978, - "step": 169 - }, - { - "epoch": 1.0429447852760736, - "grad_norm": 3.2997145652770996, - "learning_rate": 4.868548718180473e-06, - "loss": 0.5593, - "step": 170 - }, - { - "epoch": 1.049079754601227, - "grad_norm": 3.4120635986328125, - "learning_rate": 4.867002461682129e-06, - "loss": 0.4083, - "step": 171 - }, - { - "epoch": 1.0552147239263803, - "grad_norm": 2.697617292404175, - "learning_rate": 4.8654474124664505e-06, - "loss": 0.4752, - "step": 172 - }, - { - "epoch": 1.0613496932515338, - "grad_norm": 5.082247734069824, - "learning_rate": 4.863883576309991e-06, - "loss": 0.7435, - "step": 173 - }, - { - "epoch": 1.0674846625766872, - "grad_norm": 2.773864984512329, - "learning_rate": 4.8623109590219395e-06, - "loss": 0.4612, - "step": 174 - }, - { - "epoch": 1.0736196319018405, - "grad_norm": 3.429703712463379, - "learning_rate": 4.860729566444106e-06, - "loss": 0.4644, - "step": 175 - }, - { - "epoch": 1.0797546012269938, - "grad_norm": 2.997938394546509, - "learning_rate": 4.8591394044508985e-06, - "loss": 0.4852, - "step": 176 - }, - { - "epoch": 1.0858895705521472, - "grad_norm": 2.549513339996338, - "learning_rate": 4.857540478949302e-06, - "loss": 0.4574, - "step": 177 - }, - { - "epoch": 1.0920245398773005, - "grad_norm": 3.459400177001953, - "learning_rate": 4.855932795878852e-06, - "loss": 0.8095, - "step": 178 - }, - { - "epoch": 1.098159509202454, - "grad_norm": 2.8103644847869873, - "learning_rate": 4.854316361211619e-06, - "loss": 0.4578, - "step": 179 - }, - { - "epoch": 1.1042944785276074, - "grad_norm": 2.631221055984497, - "learning_rate": 4.852691180952183e-06, - "loss": 0.5473, - "step": 180 - }, - { - "epoch": 1.1104294478527608, - "grad_norm": 3.189946174621582, - "learning_rate": 4.851057261137608e-06, - "loss": 0.4313, - "step": 181 - }, - { - "epoch": 1.116564417177914, - "grad_norm": 2.891418933868408, - "learning_rate": 4.8494146078374274e-06, - "loss": 0.4197, - "step": 182 - }, - { - "epoch": 1.1226993865030674, - "grad_norm": 3.239637613296509, - "learning_rate": 4.847763227153612e-06, - "loss": 0.5865, - "step": 183 - }, - { - "epoch": 1.1288343558282208, - "grad_norm": 2.484644651412964, - "learning_rate": 4.846103125220557e-06, - "loss": 0.3866, - "step": 184 - }, - { - "epoch": 1.1349693251533743, - "grad_norm": 3.1045992374420166, - "learning_rate": 4.844434308205052e-06, - "loss": 0.5357, - "step": 185 - }, - { - "epoch": 1.1411042944785277, - "grad_norm": 2.648472309112549, - "learning_rate": 4.842756782306261e-06, - "loss": 0.4783, - "step": 186 - }, - { - "epoch": 1.147239263803681, - "grad_norm": 2.5685644149780273, - "learning_rate": 4.841070553755697e-06, - "loss": 0.3733, - "step": 187 - }, - { - "epoch": 1.1533742331288344, - "grad_norm": 3.7727200984954834, - "learning_rate": 4.839375628817205e-06, - "loss": 0.6039, - "step": 188 - }, - { - "epoch": 1.1595092024539877, - "grad_norm": 2.8237369060516357, - "learning_rate": 4.837672013786931e-06, - "loss": 0.5372, - "step": 189 - }, - { - "epoch": 1.165644171779141, - "grad_norm": 3.0312252044677734, - "learning_rate": 4.835959714993305e-06, - "loss": 0.5162, - "step": 190 - }, - { - "epoch": 1.1717791411042944, - "grad_norm": 2.821498394012451, - "learning_rate": 4.8342387387970105e-06, - "loss": 0.4537, - "step": 191 - }, - { - "epoch": 1.177914110429448, - "grad_norm": 2.7834129333496094, - "learning_rate": 4.832509091590968e-06, - "loss": 0.6165, - "step": 192 - }, - { - "epoch": 1.1840490797546013, - "grad_norm": 2.9274091720581055, - "learning_rate": 4.830770779800309e-06, - "loss": 0.7475, - "step": 193 - }, - { - "epoch": 1.1901840490797546, - "grad_norm": 2.813945770263672, - "learning_rate": 4.829023809882349e-06, - "loss": 0.4629, - "step": 194 - }, - { - "epoch": 1.196319018404908, - "grad_norm": 2.27876877784729, - "learning_rate": 4.827268188326567e-06, - "loss": 0.5208, - "step": 195 - }, - { - "epoch": 1.2024539877300613, - "grad_norm": 2.8444204330444336, - "learning_rate": 4.825503921654582e-06, - "loss": 0.6521, - "step": 196 - }, - { - "epoch": 1.2085889570552146, - "grad_norm": 3.3730578422546387, - "learning_rate": 4.823731016420122e-06, - "loss": 0.7491, - "step": 197 - }, - { - "epoch": 1.2147239263803682, - "grad_norm": 2.9717822074890137, - "learning_rate": 4.821949479209011e-06, - "loss": 0.3866, - "step": 198 - }, - { - "epoch": 1.2208588957055215, - "grad_norm": 2.6570653915405273, - "learning_rate": 4.820159316639133e-06, - "loss": 0.499, - "step": 199 - }, - { - "epoch": 1.2269938650306749, - "grad_norm": 2.819960117340088, - "learning_rate": 4.818360535360418e-06, - "loss": 0.556, - "step": 200 - }, - { - "epoch": 1.2331288343558282, - "grad_norm": 2.7912111282348633, - "learning_rate": 4.816553142054806e-06, - "loss": 0.3433, - "step": 201 - }, - { - "epoch": 1.2392638036809815, - "grad_norm": 2.6427981853485107, - "learning_rate": 4.814737143436232e-06, - "loss": 0.8808, - "step": 202 - }, - { - "epoch": 1.2453987730061349, - "grad_norm": 2.5917580127716064, - "learning_rate": 4.812912546250595e-06, - "loss": 0.5718, - "step": 203 - }, - { - "epoch": 1.2515337423312882, - "grad_norm": 3.770759344100952, - "learning_rate": 4.81107935727574e-06, - "loss": 0.9743, - "step": 204 - }, - { - "epoch": 1.2576687116564418, - "grad_norm": 2.558248996734619, - "learning_rate": 4.809237583321421e-06, - "loss": 0.2821, - "step": 205 - }, - { - "epoch": 1.2638036809815951, - "grad_norm": 2.692087173461914, - "learning_rate": 4.807387231229287e-06, - "loss": 0.7524, - "step": 206 - }, - { - "epoch": 1.2699386503067485, - "grad_norm": 2.661738157272339, - "learning_rate": 4.8055283078728525e-06, - "loss": 0.4304, - "step": 207 - }, - { - "epoch": 1.2760736196319018, - "grad_norm": 2.9232122898101807, - "learning_rate": 4.803660820157468e-06, - "loss": 0.6986, - "step": 208 - }, - { - "epoch": 1.2822085889570551, - "grad_norm": 2.665097951889038, - "learning_rate": 4.801784775020303e-06, - "loss": 0.7112, - "step": 209 - }, - { - "epoch": 1.2883435582822087, - "grad_norm": 2.4504497051239014, - "learning_rate": 4.799900179430312e-06, - "loss": 0.4125, - "step": 210 - }, - { - "epoch": 1.294478527607362, - "grad_norm": 3.076204538345337, - "learning_rate": 4.798007040388212e-06, - "loss": 0.7057, - "step": 211 - }, - { - "epoch": 1.3006134969325154, - "grad_norm": 2.406977653503418, - "learning_rate": 4.7961053649264585e-06, - "loss": 0.708, - "step": 212 - }, - { - "epoch": 1.3067484662576687, - "grad_norm": 2.6545324325561523, - "learning_rate": 4.794195160109215e-06, - "loss": 0.7608, - "step": 213 - }, - { - "epoch": 1.312883435582822, - "grad_norm": 4.3817033767700195, - "learning_rate": 4.7922764330323315e-06, - "loss": 0.4779, - "step": 214 - }, - { - "epoch": 1.3190184049079754, - "grad_norm": 3.534566879272461, - "learning_rate": 4.790349190823313e-06, - "loss": 0.5464, - "step": 215 - }, - { - "epoch": 1.3251533742331287, - "grad_norm": 3.0323140621185303, - "learning_rate": 4.788413440641297e-06, - "loss": 0.6198, - "step": 216 - }, - { - "epoch": 1.331288343558282, - "grad_norm": 2.612746238708496, - "learning_rate": 4.786469189677026e-06, - "loss": 0.6695, - "step": 217 - }, - { - "epoch": 1.3374233128834356, - "grad_norm": 3.0299434661865234, - "learning_rate": 4.784516445152821e-06, - "loss": 0.4902, - "step": 218 - }, - { - "epoch": 1.343558282208589, - "grad_norm": 3.4521942138671875, - "learning_rate": 4.78255521432255e-06, - "loss": 0.7411, - "step": 219 - }, - { - "epoch": 1.3496932515337423, - "grad_norm": 2.6712653636932373, - "learning_rate": 4.780585504471612e-06, - "loss": 0.8767, - "step": 220 - }, - { - "epoch": 1.3558282208588956, - "grad_norm": 2.5099475383758545, - "learning_rate": 4.778607322916896e-06, - "loss": 0.4266, - "step": 221 - }, - { - "epoch": 1.3619631901840492, - "grad_norm": 2.641799211502075, - "learning_rate": 4.776620677006766e-06, - "loss": 0.4982, - "step": 222 - }, - { - "epoch": 1.3680981595092025, - "grad_norm": 3.1119771003723145, - "learning_rate": 4.7746255741210256e-06, - "loss": 0.6012, - "step": 223 - }, - { - "epoch": 1.3742331288343559, - "grad_norm": 3.9957170486450195, - "learning_rate": 4.772622021670897e-06, - "loss": 0.7585, - "step": 224 - }, - { - "epoch": 1.3803680981595092, - "grad_norm": 3.1070823669433594, - "learning_rate": 4.770610027098983e-06, - "loss": 0.5266, - "step": 225 - }, - { - "epoch": 1.3865030674846626, - "grad_norm": 2.7630460262298584, - "learning_rate": 4.7685895978792564e-06, - "loss": 0.6261, - "step": 226 - }, - { - "epoch": 1.392638036809816, - "grad_norm": 2.6509556770324707, - "learning_rate": 4.766560741517014e-06, - "loss": 0.7081, - "step": 227 - }, - { - "epoch": 1.3987730061349692, - "grad_norm": 3.0212976932525635, - "learning_rate": 4.76452346554886e-06, - "loss": 0.5041, - "step": 228 - }, - { - "epoch": 1.4049079754601226, - "grad_norm": 3.0454728603363037, - "learning_rate": 4.762477777542676e-06, - "loss": 0.49, - "step": 229 - }, - { - "epoch": 1.4110429447852761, - "grad_norm": 3.4296791553497314, - "learning_rate": 4.7604236850975905e-06, - "loss": 0.7056, - "step": 230 - }, - { - "epoch": 1.4171779141104295, - "grad_norm": 4.1885600090026855, - "learning_rate": 4.7583611958439514e-06, - "loss": 0.7762, - "step": 231 - }, - { - "epoch": 1.4233128834355828, - "grad_norm": 3.065854072570801, - "learning_rate": 4.7562903174433e-06, - "loss": 0.5347, - "step": 232 - }, - { - "epoch": 1.4294478527607362, - "grad_norm": 2.793851852416992, - "learning_rate": 4.75421105758834e-06, - "loss": 0.503, - "step": 233 - }, - { - "epoch": 1.4355828220858895, - "grad_norm": 3.123730421066284, - "learning_rate": 4.752123424002908e-06, - "loss": 0.5081, - "step": 234 - }, - { - "epoch": 1.441717791411043, - "grad_norm": 3.230161666870117, - "learning_rate": 4.750027424441949e-06, - "loss": 0.7523, - "step": 235 - }, - { - "epoch": 1.4478527607361964, - "grad_norm": 2.4970247745513916, - "learning_rate": 4.747923066691487e-06, - "loss": 0.5575, - "step": 236 - }, - { - "epoch": 1.4539877300613497, - "grad_norm": 2.9880685806274414, - "learning_rate": 4.745810358568588e-06, - "loss": 0.7264, - "step": 237 - }, - { - "epoch": 1.460122699386503, - "grad_norm": 2.555328369140625, - "learning_rate": 4.743689307921342e-06, - "loss": 0.4545, - "step": 238 - }, - { - "epoch": 1.4662576687116564, - "grad_norm": 3.144932746887207, - "learning_rate": 4.741559922628828e-06, - "loss": 0.5429, - "step": 239 - }, - { - "epoch": 1.4723926380368098, - "grad_norm": 3.059807062149048, - "learning_rate": 4.739422210601085e-06, - "loss": 0.5086, - "step": 240 - }, - { - "epoch": 1.478527607361963, - "grad_norm": 3.374303102493286, - "learning_rate": 4.7372761797790836e-06, - "loss": 0.6109, - "step": 241 - }, - { - "epoch": 1.4846625766871164, - "grad_norm": 2.4506947994232178, - "learning_rate": 4.735121838134697e-06, - "loss": 0.4317, - "step": 242 - }, - { - "epoch": 1.49079754601227, - "grad_norm": 2.9039974212646484, - "learning_rate": 4.732959193670672e-06, - "loss": 0.6414, - "step": 243 - }, - { - "epoch": 1.4969325153374233, - "grad_norm": 2.9412453174591064, - "learning_rate": 4.730788254420593e-06, - "loss": 0.5166, - "step": 244 - }, - { - "epoch": 1.5030674846625767, - "grad_norm": 2.500716209411621, - "learning_rate": 4.728609028448862e-06, - "loss": 0.4982, - "step": 245 - }, - { - "epoch": 1.50920245398773, - "grad_norm": 2.4233803749084473, - "learning_rate": 4.726421523850662e-06, - "loss": 0.7552, - "step": 246 - }, - { - "epoch": 1.5153374233128836, - "grad_norm": 2.357003688812256, - "learning_rate": 4.7242257487519275e-06, - "loss": 0.4365, - "step": 247 - }, - { - "epoch": 1.521472392638037, - "grad_norm": 2.6406495571136475, - "learning_rate": 4.722021711309317e-06, - "loss": 0.6002, - "step": 248 - }, - { - "epoch": 1.5276073619631902, - "grad_norm": 2.736884832382202, - "learning_rate": 4.7198094197101826e-06, - "loss": 0.4993, - "step": 249 - }, - { - "epoch": 1.5337423312883436, - "grad_norm": 3.5238845348358154, - "learning_rate": 4.7175888821725335e-06, - "loss": 0.4637, - "step": 250 - }, - { - "epoch": 1.539877300613497, - "grad_norm": 3.3783695697784424, - "learning_rate": 4.715360106945015e-06, - "loss": 0.9711, - "step": 251 - }, - { - "epoch": 1.5460122699386503, - "grad_norm": 2.9685862064361572, - "learning_rate": 4.713123102306869e-06, - "loss": 0.5452, - "step": 252 - }, - { - "epoch": 1.5521472392638036, - "grad_norm": 3.143733263015747, - "learning_rate": 4.710877876567912e-06, - "loss": 0.5034, - "step": 253 - }, - { - "epoch": 1.558282208588957, - "grad_norm": 2.8005623817443848, - "learning_rate": 4.708624438068494e-06, - "loss": 0.4236, - "step": 254 - }, - { - "epoch": 1.5644171779141103, - "grad_norm": 2.66581130027771, - "learning_rate": 4.706362795179476e-06, - "loss": 0.6095, - "step": 255 - }, - { - "epoch": 1.5705521472392638, - "grad_norm": 4.598043441772461, - "learning_rate": 4.7040929563021975e-06, - "loss": 0.738, - "step": 256 - }, - { - "epoch": 1.5766871165644172, - "grad_norm": 3.5643506050109863, - "learning_rate": 4.70181492986844e-06, - "loss": 0.6726, - "step": 257 - }, - { - "epoch": 1.5828220858895705, - "grad_norm": 2.865339994430542, - "learning_rate": 4.699528724340401e-06, - "loss": 0.4862, - "step": 258 - }, - { - "epoch": 1.588957055214724, - "grad_norm": 2.95529842376709, - "learning_rate": 4.6972343482106615e-06, - "loss": 0.5003, - "step": 259 - }, - { - "epoch": 1.5950920245398774, - "grad_norm": 2.45206356048584, - "learning_rate": 4.6949318100021546e-06, - "loss": 0.6734, - "step": 260 - }, - { - "epoch": 1.6012269938650308, - "grad_norm": 2.6789939403533936, - "learning_rate": 4.6926211182681295e-06, - "loss": 0.5639, - "step": 261 - }, - { - "epoch": 1.607361963190184, - "grad_norm": 3.307732582092285, - "learning_rate": 4.690302281592128e-06, - "loss": 0.7032, - "step": 262 - }, - { - "epoch": 1.6134969325153374, - "grad_norm": 2.8950445652008057, - "learning_rate": 4.687975308587944e-06, - "loss": 0.4937, - "step": 263 - }, - { - "epoch": 1.6196319018404908, - "grad_norm": 2.969377040863037, - "learning_rate": 4.685640207899598e-06, - "loss": 0.5829, - "step": 264 - }, - { - "epoch": 1.6257668711656441, - "grad_norm": 3.106433391571045, - "learning_rate": 4.683296988201301e-06, - "loss": 0.3805, - "step": 265 - }, - { - "epoch": 1.6319018404907975, - "grad_norm": 3.5599050521850586, - "learning_rate": 4.680945658197425e-06, - "loss": 0.7939, - "step": 266 - }, - { - "epoch": 1.6380368098159508, - "grad_norm": 5.008603096008301, - "learning_rate": 4.6785862266224695e-06, - "loss": 0.7511, - "step": 267 - }, - { - "epoch": 1.6441717791411041, - "grad_norm": 3.1393773555755615, - "learning_rate": 4.676218702241026e-06, - "loss": 0.8984, - "step": 268 - }, - { - "epoch": 1.6503067484662577, - "grad_norm": 3.0241408348083496, - "learning_rate": 4.673843093847753e-06, - "loss": 0.5473, - "step": 269 - }, - { - "epoch": 1.656441717791411, - "grad_norm": 2.9029417037963867, - "learning_rate": 4.6714594102673355e-06, - "loss": 0.6626, - "step": 270 - }, - { - "epoch": 1.6625766871165644, - "grad_norm": 3.4709246158599854, - "learning_rate": 4.669067660354456e-06, - "loss": 0.5015, - "step": 271 - }, - { - "epoch": 1.668711656441718, - "grad_norm": 2.988635778427124, - "learning_rate": 4.666667852993761e-06, - "loss": 0.5384, - "step": 272 - }, - { - "epoch": 1.6748466257668713, - "grad_norm": 3.418140411376953, - "learning_rate": 4.664259997099829e-06, - "loss": 0.7491, - "step": 273 - }, - { - "epoch": 1.6809815950920246, - "grad_norm": 2.592416763305664, - "learning_rate": 4.661844101617135e-06, - "loss": 0.6451, - "step": 274 - }, - { - "epoch": 1.687116564417178, - "grad_norm": 3.1174306869506836, - "learning_rate": 4.6594201755200205e-06, - "loss": 0.6299, - "step": 275 - }, - { - "epoch": 1.6932515337423313, - "grad_norm": 2.6569998264312744, - "learning_rate": 4.656988227812658e-06, - "loss": 0.4477, - "step": 276 - }, - { - "epoch": 1.6993865030674846, - "grad_norm": 3.5733959674835205, - "learning_rate": 4.654548267529015e-06, - "loss": 0.5473, - "step": 277 - }, - { - "epoch": 1.705521472392638, - "grad_norm": 2.7240824699401855, - "learning_rate": 4.652100303732827e-06, - "loss": 0.496, - "step": 278 - }, - { - "epoch": 1.7116564417177913, - "grad_norm": 4.1965460777282715, - "learning_rate": 4.64964434551756e-06, - "loss": 0.932, - "step": 279 - }, - { - "epoch": 1.7177914110429446, - "grad_norm": 2.3237173557281494, - "learning_rate": 4.647180402006372e-06, - "loss": 0.4648, - "step": 280 - }, - { - "epoch": 1.7239263803680982, - "grad_norm": 3.395045042037964, - "learning_rate": 4.644708482352093e-06, - "loss": 0.7237, - "step": 281 - }, - { - "epoch": 1.7300613496932515, - "grad_norm": 3.238593816757202, - "learning_rate": 4.6422285957371735e-06, - "loss": 0.5531, - "step": 282 - }, - { - "epoch": 1.7361963190184049, - "grad_norm": 3.9651403427124023, - "learning_rate": 4.639740751373663e-06, - "loss": 0.6706, - "step": 283 - }, - { - "epoch": 1.7423312883435584, - "grad_norm": 3.0042061805725098, - "learning_rate": 4.63724495850317e-06, - "loss": 0.56, - "step": 284 - }, - { - "epoch": 1.7484662576687118, - "grad_norm": 3.094310760498047, - "learning_rate": 4.634741226396832e-06, - "loss": 0.6138, - "step": 285 - }, - { - "epoch": 1.7546012269938651, - "grad_norm": 2.838168144226074, - "learning_rate": 4.632229564355275e-06, - "loss": 0.4908, - "step": 286 - }, - { - "epoch": 1.7607361963190185, - "grad_norm": 3.3452796936035156, - "learning_rate": 4.629709981708586e-06, - "loss": 0.8181, - "step": 287 - }, - { - "epoch": 1.7668711656441718, - "grad_norm": 2.6630783081054688, - "learning_rate": 4.6271824878162704e-06, - "loss": 0.5625, - "step": 288 - }, - { - "epoch": 1.7730061349693251, - "grad_norm": 2.583650588989258, - "learning_rate": 4.624647092067226e-06, - "loss": 0.3416, - "step": 289 - }, - { - "epoch": 1.7791411042944785, - "grad_norm": 2.73132586479187, - "learning_rate": 4.622103803879702e-06, - "loss": 0.3889, - "step": 290 - }, - { - "epoch": 1.7852760736196318, - "grad_norm": 4.1010260581970215, - "learning_rate": 4.619552632701263e-06, - "loss": 0.611, - "step": 291 - }, - { - "epoch": 1.7914110429447851, - "grad_norm": 4.53068208694458, - "learning_rate": 4.61699358800876e-06, - "loss": 0.7219, - "step": 292 - }, - { - "epoch": 1.7975460122699385, - "grad_norm": 3.4877254962921143, - "learning_rate": 4.614426679308291e-06, - "loss": 0.6402, - "step": 293 - }, - { - "epoch": 1.803680981595092, - "grad_norm": 2.9445226192474365, - "learning_rate": 4.611851916135166e-06, - "loss": 0.509, - "step": 294 - }, - { - "epoch": 1.8098159509202454, - "grad_norm": 2.6622228622436523, - "learning_rate": 4.609269308053872e-06, - "loss": 0.6167, - "step": 295 - }, - { - "epoch": 1.8159509202453987, - "grad_norm": 3.131530523300171, - "learning_rate": 4.606678864658039e-06, - "loss": 0.8039, - "step": 296 - }, - { - "epoch": 1.8220858895705523, - "grad_norm": 3.212188482284546, - "learning_rate": 4.604080595570399e-06, - "loss": 0.5754, - "step": 297 - }, - { - "epoch": 1.8282208588957056, - "grad_norm": 3.522850275039673, - "learning_rate": 4.601474510442759e-06, - "loss": 0.4432, - "step": 298 - }, - { - "epoch": 1.834355828220859, - "grad_norm": 2.5877151489257812, - "learning_rate": 4.598860618955957e-06, - "loss": 0.6541, - "step": 299 - }, - { - "epoch": 1.8404907975460123, - "grad_norm": 2.803833484649658, - "learning_rate": 4.596238930819832e-06, - "loss": 0.5824, - "step": 300 - }, - { - "epoch": 1.8466257668711656, - "grad_norm": 2.7125494480133057, - "learning_rate": 4.5936094557731815e-06, - "loss": 0.6976, - "step": 301 - }, - { - "epoch": 1.852760736196319, - "grad_norm": 3.6549370288848877, - "learning_rate": 4.590972203583732e-06, - "loss": 0.7105, - "step": 302 - }, - { - "epoch": 1.8588957055214723, - "grad_norm": 3.3241944313049316, - "learning_rate": 4.588327184048099e-06, - "loss": 0.7446, - "step": 303 - }, - { - "epoch": 1.8650306748466257, - "grad_norm": 2.8388822078704834, - "learning_rate": 4.585674406991752e-06, - "loss": 0.4926, - "step": 304 - }, - { - "epoch": 1.871165644171779, - "grad_norm": 2.9760420322418213, - "learning_rate": 4.5830138822689755e-06, - "loss": 0.7368, - "step": 305 - }, - { - "epoch": 1.8773006134969326, - "grad_norm": 2.5437633991241455, - "learning_rate": 4.5803456197628374e-06, - "loss": 0.4678, - "step": 306 - }, - { - "epoch": 1.883435582822086, - "grad_norm": 3.0044775009155273, - "learning_rate": 4.577669629385145e-06, - "loss": 0.4241, - "step": 307 - }, - { - "epoch": 1.8895705521472392, - "grad_norm": 2.6150901317596436, - "learning_rate": 4.574985921076418e-06, - "loss": 0.5327, - "step": 308 - }, - { - "epoch": 1.8957055214723928, - "grad_norm": 2.4425182342529297, - "learning_rate": 4.572294504805841e-06, - "loss": 0.7504, - "step": 309 - }, - { - "epoch": 1.9018404907975461, - "grad_norm": 2.9920194149017334, - "learning_rate": 4.569595390571232e-06, - "loss": 0.5194, - "step": 310 - }, - { - "epoch": 1.9079754601226995, - "grad_norm": 2.701087713241577, - "learning_rate": 4.566888588399007e-06, - "loss": 0.6862, - "step": 311 - }, - { - "epoch": 1.9141104294478528, - "grad_norm": 7.628893852233887, - "learning_rate": 4.564174108344139e-06, - "loss": 0.6867, - "step": 312 - }, - { - "epoch": 1.9202453987730062, - "grad_norm": 2.712947130203247, - "learning_rate": 4.561451960490123e-06, - "loss": 0.6942, - "step": 313 - }, - { - "epoch": 1.9263803680981595, - "grad_norm": 3.0063202381134033, - "learning_rate": 4.558722154948937e-06, - "loss": 0.6346, - "step": 314 - }, - { - "epoch": 1.9325153374233128, - "grad_norm": 2.957218647003174, - "learning_rate": 4.5559847018610034e-06, - "loss": 0.464, - "step": 315 - }, - { - "epoch": 1.9386503067484662, - "grad_norm": 3.322282552719116, - "learning_rate": 4.553239611395156e-06, - "loss": 0.6334, - "step": 316 - }, - { - "epoch": 1.9447852760736195, - "grad_norm": 3.0638647079467773, - "learning_rate": 4.550486893748596e-06, - "loss": 0.4227, - "step": 317 - }, - { - "epoch": 1.9509202453987728, - "grad_norm": 3.079087257385254, - "learning_rate": 4.547726559146862e-06, - "loss": 0.3719, - "step": 318 - }, - { - "epoch": 1.9570552147239264, - "grad_norm": 2.409914255142212, - "learning_rate": 4.544958617843782e-06, - "loss": 0.3331, - "step": 319 - }, - { - "epoch": 1.9631901840490797, - "grad_norm": 3.3441262245178223, - "learning_rate": 4.542183080121444e-06, - "loss": 0.6931, - "step": 320 - }, - { - "epoch": 1.969325153374233, - "grad_norm": 2.6624436378479004, - "learning_rate": 4.539399956290152e-06, - "loss": 0.6578, - "step": 321 - }, - { - "epoch": 1.9754601226993866, - "grad_norm": 3.463789224624634, - "learning_rate": 4.536609256688396e-06, - "loss": 0.5748, - "step": 322 - }, - { - "epoch": 1.98159509202454, - "grad_norm": 3.6827807426452637, - "learning_rate": 4.533810991682799e-06, - "loss": 0.5249, - "step": 323 - }, - { - "epoch": 1.9877300613496933, - "grad_norm": 4.125547409057617, - "learning_rate": 4.531005171668093e-06, - "loss": 0.3065, - "step": 324 - }, - { - "epoch": 1.9938650306748467, - "grad_norm": 2.935978412628174, - "learning_rate": 4.528191807067074e-06, - "loss": 0.5523, - "step": 325 - }, - { - "epoch": 2.0, - "grad_norm": 2.654388427734375, - "learning_rate": 4.525370908330564e-06, - "loss": 0.4157, - "step": 326 - }, - { - "epoch": 2.0061349693251533, - "grad_norm": 3.213925838470459, - "learning_rate": 4.522542485937369e-06, - "loss": 0.4243, - "step": 327 - }, - { - "epoch": 2.0122699386503067, - "grad_norm": 3.5483286380767822, - "learning_rate": 4.519706550394248e-06, - "loss": 0.4137, - "step": 328 - }, - { - "epoch": 2.01840490797546, - "grad_norm": 3.32084059715271, - "learning_rate": 4.516863112235864e-06, - "loss": 0.5389, - "step": 329 - }, - { - "epoch": 2.0245398773006134, - "grad_norm": 3.427666425704956, - "learning_rate": 4.514012182024756e-06, - "loss": 0.285, - "step": 330 - }, - { - "epoch": 2.0306748466257667, - "grad_norm": 3.3269975185394287, - "learning_rate": 4.511153770351288e-06, - "loss": 0.4877, - "step": 331 - }, - { - "epoch": 2.03680981595092, - "grad_norm": 5.258850574493408, - "learning_rate": 4.508287887833619e-06, - "loss": 0.5168, - "step": 332 - }, - { - "epoch": 2.042944785276074, - "grad_norm": 4.316092491149902, - "learning_rate": 4.505414545117658e-06, - "loss": 0.4791, - "step": 333 - }, - { - "epoch": 2.049079754601227, - "grad_norm": 3.952056884765625, - "learning_rate": 4.502533752877028e-06, - "loss": 0.3014, - "step": 334 - }, - { - "epoch": 2.0552147239263805, - "grad_norm": 4.0617194175720215, - "learning_rate": 4.499645521813024e-06, - "loss": 0.4313, - "step": 335 - }, - { - "epoch": 2.061349693251534, - "grad_norm": 3.7869274616241455, - "learning_rate": 4.496749862654574e-06, - "loss": 0.4807, - "step": 336 - }, - { - "epoch": 2.067484662576687, - "grad_norm": 3.8181991577148438, - "learning_rate": 4.4938467861582e-06, - "loss": 0.4002, - "step": 337 - }, - { - "epoch": 2.0736196319018405, - "grad_norm": 3.8289854526519775, - "learning_rate": 4.490936303107975e-06, - "loss": 0.618, - "step": 338 - }, - { - "epoch": 2.079754601226994, - "grad_norm": 3.121443271636963, - "learning_rate": 4.488018424315488e-06, - "loss": 0.4203, - "step": 339 - }, - { - "epoch": 2.085889570552147, - "grad_norm": 3.141782283782959, - "learning_rate": 4.4850931606198e-06, - "loss": 0.3618, - "step": 340 - }, - { - "epoch": 2.0920245398773005, - "grad_norm": 3.1279287338256836, - "learning_rate": 4.482160522887404e-06, - "loss": 0.4571, - "step": 341 - }, - { - "epoch": 2.098159509202454, - "grad_norm": 3.2418482303619385, - "learning_rate": 4.479220522012185e-06, - "loss": 0.2674, - "step": 342 - }, - { - "epoch": 2.104294478527607, - "grad_norm": 10.230683326721191, - "learning_rate": 4.476273168915382e-06, - "loss": 0.5479, - "step": 343 - }, - { - "epoch": 2.1104294478527605, - "grad_norm": 3.588361978530884, - "learning_rate": 4.473318474545544e-06, - "loss": 0.3654, - "step": 344 - }, - { - "epoch": 2.116564417177914, - "grad_norm": 3.0913164615631104, - "learning_rate": 4.470356449878489e-06, - "loss": 0.2704, - "step": 345 - }, - { - "epoch": 2.1226993865030677, - "grad_norm": 3.972447633743286, - "learning_rate": 4.467387105917269e-06, - "loss": 0.3029, - "step": 346 - }, - { - "epoch": 2.128834355828221, - "grad_norm": 3.7174713611602783, - "learning_rate": 4.464410453692122e-06, - "loss": 0.6536, - "step": 347 - }, - { - "epoch": 2.1349693251533743, - "grad_norm": 3.9333994388580322, - "learning_rate": 4.461426504260434e-06, - "loss": 0.3806, - "step": 348 - }, - { - "epoch": 2.1411042944785277, - "grad_norm": 4.752816200256348, - "learning_rate": 4.458435268706699e-06, - "loss": 0.4019, - "step": 349 - }, - { - "epoch": 2.147239263803681, - "grad_norm": 2.505603790283203, - "learning_rate": 4.455436758142477e-06, - "loss": 0.2348, - "step": 350 - }, - { - "epoch": 2.1533742331288344, - "grad_norm": 3.3050570487976074, - "learning_rate": 4.452430983706351e-06, - "loss": 0.505, - "step": 351 - }, - { - "epoch": 2.1595092024539877, - "grad_norm": 5.387442588806152, - "learning_rate": 4.44941795656389e-06, - "loss": 0.399, - "step": 352 - }, - { - "epoch": 2.165644171779141, - "grad_norm": 3.4759480953216553, - "learning_rate": 4.446397687907601e-06, - "loss": 0.5664, - "step": 353 - }, - { - "epoch": 2.1717791411042944, - "grad_norm": 2.949445962905884, - "learning_rate": 4.4433701889568935e-06, - "loss": 0.2128, - "step": 354 - }, - { - "epoch": 2.1779141104294477, - "grad_norm": 3.2884252071380615, - "learning_rate": 4.440335470958035e-06, - "loss": 0.3138, - "step": 355 - }, - { - "epoch": 2.184049079754601, - "grad_norm": 3.1605632305145264, - "learning_rate": 4.437293545184111e-06, - "loss": 0.349, - "step": 356 - }, - { - "epoch": 2.190184049079755, - "grad_norm": 2.9996821880340576, - "learning_rate": 4.434244422934976e-06, - "loss": 0.343, - "step": 357 - }, - { - "epoch": 2.196319018404908, - "grad_norm": 3.6373324394226074, - "learning_rate": 4.431188115537226e-06, - "loss": 0.5656, - "step": 358 - }, - { - "epoch": 2.2024539877300615, - "grad_norm": 4.667621612548828, - "learning_rate": 4.428124634344141e-06, - "loss": 0.2335, - "step": 359 - }, - { - "epoch": 2.208588957055215, - "grad_norm": 3.815484046936035, - "learning_rate": 4.425053990735653e-06, - "loss": 0.2173, - "step": 360 - }, - { - "epoch": 2.214723926380368, - "grad_norm": 4.689478874206543, - "learning_rate": 4.421976196118297e-06, - "loss": 0.5071, - "step": 361 - }, - { - "epoch": 2.2208588957055215, - "grad_norm": 4.016942024230957, - "learning_rate": 4.4188912619251765e-06, - "loss": 0.384, - "step": 362 - }, - { - "epoch": 2.226993865030675, - "grad_norm": 3.5336828231811523, - "learning_rate": 4.415799199615912e-06, - "loss": 0.3133, - "step": 363 - }, - { - "epoch": 2.233128834355828, - "grad_norm": 2.9195592403411865, - "learning_rate": 4.4127000206766055e-06, - "loss": 0.3847, - "step": 364 - }, - { - "epoch": 2.2392638036809815, - "grad_norm": 2.6843531131744385, - "learning_rate": 4.409593736619795e-06, - "loss": 0.3539, - "step": 365 - }, - { - "epoch": 2.245398773006135, - "grad_norm": 2.8692703247070312, - "learning_rate": 4.40648035898441e-06, - "loss": 0.3664, - "step": 366 - }, - { - "epoch": 2.2515337423312882, - "grad_norm": 2.820422649383545, - "learning_rate": 4.403359899335732e-06, - "loss": 0.4606, - "step": 367 - }, - { - "epoch": 2.2576687116564416, - "grad_norm": 3.8641669750213623, - "learning_rate": 4.400232369265351e-06, - "loss": 0.2931, - "step": 368 - }, - { - "epoch": 2.263803680981595, - "grad_norm": 2.75347638130188, - "learning_rate": 4.39709778039112e-06, - "loss": 0.3393, - "step": 369 - }, - { - "epoch": 2.2699386503067487, - "grad_norm": 15.150428771972656, - "learning_rate": 4.393956144357113e-06, - "loss": 0.65, - "step": 370 - }, - { - "epoch": 2.276073619631902, - "grad_norm": 2.4876065254211426, - "learning_rate": 4.390807472833585e-06, - "loss": 0.372, - "step": 371 - }, - { - "epoch": 2.2822085889570554, - "grad_norm": 2.7328054904937744, - "learning_rate": 4.3876517775169216e-06, - "loss": 0.2802, - "step": 372 - }, - { - "epoch": 2.2883435582822087, - "grad_norm": 2.903221368789673, - "learning_rate": 4.384489070129604e-06, - "loss": 0.1964, - "step": 373 - }, - { - "epoch": 2.294478527607362, - "grad_norm": 3.9368724822998047, - "learning_rate": 4.381319362420158e-06, - "loss": 0.4272, - "step": 374 - }, - { - "epoch": 2.3006134969325154, - "grad_norm": 5.431981086730957, - "learning_rate": 4.378142666163114e-06, - "loss": 0.4513, - "step": 375 - }, - { - "epoch": 2.3067484662576687, - "grad_norm": 3.661733627319336, - "learning_rate": 4.374958993158965e-06, - "loss": 0.6087, - "step": 376 - }, - { - "epoch": 2.312883435582822, - "grad_norm": 3.004450559616089, - "learning_rate": 4.371768355234116e-06, - "loss": 0.2206, - "step": 377 - }, - { - "epoch": 2.3190184049079754, - "grad_norm": 4.3785576820373535, - "learning_rate": 4.368570764240852e-06, - "loss": 0.6055, - "step": 378 - }, - { - "epoch": 2.3251533742331287, - "grad_norm": 3.4699394702911377, - "learning_rate": 4.365366232057279e-06, - "loss": 0.6286, - "step": 379 - }, - { - "epoch": 2.331288343558282, - "grad_norm": 2.6862998008728027, - "learning_rate": 4.3621547705872915e-06, - "loss": 0.2622, - "step": 380 - }, - { - "epoch": 2.3374233128834354, - "grad_norm": 3.056382179260254, - "learning_rate": 4.358936391760524e-06, - "loss": 0.3439, - "step": 381 - }, - { - "epoch": 2.3435582822085887, - "grad_norm": 2.6211307048797607, - "learning_rate": 4.355711107532305e-06, - "loss": 0.3677, - "step": 382 - }, - { - "epoch": 2.3496932515337425, - "grad_norm": 2.682060956954956, - "learning_rate": 4.3524789298836175e-06, - "loss": 0.3068, - "step": 383 - }, - { - "epoch": 2.355828220858896, - "grad_norm": 3.482539415359497, - "learning_rate": 4.349239870821049e-06, - "loss": 0.3737, - "step": 384 - }, - { - "epoch": 2.361963190184049, - "grad_norm": 2.8645472526550293, - "learning_rate": 4.345993942376752e-06, - "loss": 0.2837, - "step": 385 - }, - { - "epoch": 2.3680981595092025, - "grad_norm": 3.6142354011535645, - "learning_rate": 4.342741156608392e-06, - "loss": 0.6162, - "step": 386 - }, - { - "epoch": 2.374233128834356, - "grad_norm": 3.0748162269592285, - "learning_rate": 4.3394815255991135e-06, - "loss": 0.2986, - "step": 387 - }, - { - "epoch": 2.3803680981595092, - "grad_norm": 5.090906620025635, - "learning_rate": 4.336215061457485e-06, - "loss": 0.5383, - "step": 388 - }, - { - "epoch": 2.3865030674846626, - "grad_norm": 3.9235823154449463, - "learning_rate": 4.332941776317458e-06, - "loss": 0.4179, - "step": 389 - }, - { - "epoch": 2.392638036809816, - "grad_norm": 3.482926368713379, - "learning_rate": 4.329661682338325e-06, - "loss": 0.3938, - "step": 390 - }, - { - "epoch": 2.3987730061349692, - "grad_norm": 4.274583339691162, - "learning_rate": 4.32637479170467e-06, - "loss": 0.3349, - "step": 391 - }, - { - "epoch": 2.4049079754601226, - "grad_norm": 3.326012372970581, - "learning_rate": 4.323081116626322e-06, - "loss": 0.3336, - "step": 392 - }, - { - "epoch": 2.411042944785276, - "grad_norm": 3.174591541290283, - "learning_rate": 4.319780669338316e-06, - "loss": 0.2983, - "step": 393 - }, - { - "epoch": 2.4171779141104293, - "grad_norm": 3.9073634147644043, - "learning_rate": 4.31647346210084e-06, - "loss": 0.8401, - "step": 394 - }, - { - "epoch": 2.4233128834355826, - "grad_norm": 3.4787721633911133, - "learning_rate": 4.313159507199197e-06, - "loss": 0.2583, - "step": 395 - }, - { - "epoch": 2.4294478527607364, - "grad_norm": 3.19903564453125, - "learning_rate": 4.309838816943755e-06, - "loss": 0.2861, - "step": 396 - }, - { - "epoch": 2.4355828220858897, - "grad_norm": 3.184246778488159, - "learning_rate": 4.306511403669897e-06, - "loss": 0.2956, - "step": 397 - }, - { - "epoch": 2.441717791411043, - "grad_norm": 3.8991878032684326, - "learning_rate": 4.303177279737988e-06, - "loss": 0.5378, - "step": 398 - }, - { - "epoch": 2.4478527607361964, - "grad_norm": 3.411949872970581, - "learning_rate": 4.299836457533313e-06, - "loss": 0.3423, - "step": 399 - }, - { - "epoch": 2.4539877300613497, - "grad_norm": 3.445502996444702, - "learning_rate": 4.296488949466046e-06, - "loss": 0.5608, - "step": 400 - }, - { - "epoch": 2.460122699386503, - "grad_norm": 3.066798210144043, - "learning_rate": 4.293134767971193e-06, - "loss": 0.3214, - "step": 401 - }, - { - "epoch": 2.4662576687116564, - "grad_norm": 3.0581583976745605, - "learning_rate": 4.28977392550855e-06, - "loss": 0.5117, - "step": 402 - }, - { - "epoch": 2.4723926380368098, - "grad_norm": 4.207413673400879, - "learning_rate": 4.286406434562659e-06, - "loss": 0.2666, - "step": 403 - }, - { - "epoch": 2.478527607361963, - "grad_norm": 2.9934990406036377, - "learning_rate": 4.283032307642756e-06, - "loss": 0.2878, - "step": 404 - }, - { - "epoch": 2.4846625766871164, - "grad_norm": 3.800593614578247, - "learning_rate": 4.2796515572827305e-06, - "loss": 0.2619, - "step": 405 - }, - { - "epoch": 2.4907975460122698, - "grad_norm": 3.2029523849487305, - "learning_rate": 4.276264196041074e-06, - "loss": 0.1735, - "step": 406 - }, - { - "epoch": 2.4969325153374236, - "grad_norm": 3.515634059906006, - "learning_rate": 4.2728702365008356e-06, - "loss": 0.4741, - "step": 407 - }, - { - "epoch": 2.5030674846625764, - "grad_norm": 3.8354873657226562, - "learning_rate": 4.269469691269577e-06, - "loss": 0.3713, - "step": 408 - }, - { - "epoch": 2.5092024539877302, - "grad_norm": 3.902904510498047, - "learning_rate": 4.266062572979323e-06, - "loss": 0.5189, - "step": 409 - }, - { - "epoch": 2.5153374233128836, - "grad_norm": 3.3276097774505615, - "learning_rate": 4.262648894286515e-06, - "loss": 0.2461, - "step": 410 - }, - { - "epoch": 2.521472392638037, - "grad_norm": 2.9457011222839355, - "learning_rate": 4.259228667871963e-06, - "loss": 0.3013, - "step": 411 - }, - { - "epoch": 2.5276073619631902, - "grad_norm": 2.8941617012023926, - "learning_rate": 4.255801906440803e-06, - "loss": 0.2784, - "step": 412 - }, - { - "epoch": 2.5337423312883436, - "grad_norm": 2.949399471282959, - "learning_rate": 4.252368622722443e-06, - "loss": 0.457, - "step": 413 - }, - { - "epoch": 2.539877300613497, - "grad_norm": 3.342108726501465, - "learning_rate": 4.248928829470522e-06, - "loss": 0.487, - "step": 414 - }, - { - "epoch": 2.5460122699386503, - "grad_norm": 3.9556386470794678, - "learning_rate": 4.245482539462861e-06, - "loss": 0.6118, - "step": 415 - }, - { - "epoch": 2.5521472392638036, - "grad_norm": 3.6936280727386475, - "learning_rate": 4.242029765501411e-06, - "loss": 0.6131, - "step": 416 - }, - { - "epoch": 2.558282208588957, - "grad_norm": 2.79897403717041, - "learning_rate": 4.2385705204122104e-06, - "loss": 0.4209, - "step": 417 - }, - { - "epoch": 2.5644171779141103, - "grad_norm": 4.093318462371826, - "learning_rate": 4.235104817045338e-06, - "loss": 0.5375, - "step": 418 - }, - { - "epoch": 2.5705521472392636, - "grad_norm": 3.138263463973999, - "learning_rate": 4.231632668274861e-06, - "loss": 0.4682, - "step": 419 - }, - { - "epoch": 2.5766871165644174, - "grad_norm": 3.1465651988983154, - "learning_rate": 4.22815408699879e-06, - "loss": 0.2522, - "step": 420 - }, - { - "epoch": 2.5828220858895703, - "grad_norm": 3.5166101455688477, - "learning_rate": 4.22466908613903e-06, - "loss": 0.4776, - "step": 421 - }, - { - "epoch": 2.588957055214724, - "grad_norm": 2.8498189449310303, - "learning_rate": 4.221177678641333e-06, - "loss": 0.3067, - "step": 422 - }, - { - "epoch": 2.5950920245398774, - "grad_norm": 2.8046035766601562, - "learning_rate": 4.217679877475251e-06, - "loss": 0.2402, - "step": 423 - }, - { - "epoch": 2.6012269938650308, - "grad_norm": 4.204788684844971, - "learning_rate": 4.214175695634084e-06, - "loss": 0.2608, - "step": 424 - }, - { - "epoch": 2.607361963190184, - "grad_norm": 2.5569400787353516, - "learning_rate": 4.210665146134838e-06, - "loss": 0.2801, - "step": 425 - }, - { - "epoch": 2.6134969325153374, - "grad_norm": 3.5359091758728027, - "learning_rate": 4.20714824201817e-06, - "loss": 0.2027, - "step": 426 - }, - { - "epoch": 2.6196319018404908, - "grad_norm": 3.5132668018341064, - "learning_rate": 4.203624996348343e-06, - "loss": 0.4253, - "step": 427 - }, - { - "epoch": 2.625766871165644, - "grad_norm": 3.5076472759246826, - "learning_rate": 4.200095422213177e-06, - "loss": 0.3014, - "step": 428 - }, - { - "epoch": 2.6319018404907975, - "grad_norm": 3.6501238346099854, - "learning_rate": 4.196559532724004e-06, - "loss": 0.6526, - "step": 429 - }, - { - "epoch": 2.638036809815951, - "grad_norm": 2.849924325942993, - "learning_rate": 4.193017341015608e-06, - "loss": 0.4487, - "step": 430 - }, - { - "epoch": 2.644171779141104, - "grad_norm": 3.2228448390960693, - "learning_rate": 4.189468860246192e-06, - "loss": 0.5386, - "step": 431 - }, - { - "epoch": 2.6503067484662575, - "grad_norm": 2.532102108001709, - "learning_rate": 4.185914103597316e-06, - "loss": 0.3034, - "step": 432 - }, - { - "epoch": 2.6564417177914113, - "grad_norm": 2.862720251083374, - "learning_rate": 4.182353084273855e-06, - "loss": 0.5862, - "step": 433 - }, - { - "epoch": 2.662576687116564, - "grad_norm": 3.4617464542388916, - "learning_rate": 4.178785815503946e-06, - "loss": 0.3954, - "step": 434 - }, - { - "epoch": 2.668711656441718, - "grad_norm": 2.627758741378784, - "learning_rate": 4.1752123105389444e-06, - "loss": 0.4367, - "step": 435 - }, - { - "epoch": 2.6748466257668713, - "grad_norm": 3.2868380546569824, - "learning_rate": 4.171632582653368e-06, - "loss": 0.2997, - "step": 436 - }, - { - "epoch": 2.6809815950920246, - "grad_norm": 3.4260897636413574, - "learning_rate": 4.168046645144851e-06, - "loss": 0.3354, - "step": 437 - }, - { - "epoch": 2.687116564417178, - "grad_norm": 3.1415748596191406, - "learning_rate": 4.164454511334098e-06, - "loss": 0.5538, - "step": 438 - }, - { - "epoch": 2.6932515337423313, - "grad_norm": 3.3700919151306152, - "learning_rate": 4.160856194564828e-06, - "loss": 0.5731, - "step": 439 - }, - { - "epoch": 2.6993865030674846, - "grad_norm": 3.146968364715576, - "learning_rate": 4.157251708203728e-06, - "loss": 0.4429, - "step": 440 - }, - { - "epoch": 2.705521472392638, - "grad_norm": 3.7495830059051514, - "learning_rate": 4.153641065640402e-06, - "loss": 0.6361, - "step": 441 - }, - { - "epoch": 2.7116564417177913, - "grad_norm": 3.426499128341675, - "learning_rate": 4.150024280287327e-06, - "loss": 0.2418, - "step": 442 - }, - { - "epoch": 2.7177914110429446, - "grad_norm": 3.213719606399536, - "learning_rate": 4.146401365579795e-06, - "loss": 0.2549, - "step": 443 - }, - { - "epoch": 2.7239263803680984, - "grad_norm": 3.457742929458618, - "learning_rate": 4.142772334975868e-06, - "loss": 0.3822, - "step": 444 - }, - { - "epoch": 2.7300613496932513, - "grad_norm": 3.130410671234131, - "learning_rate": 4.139137201956324e-06, - "loss": 0.3107, - "step": 445 - }, - { - "epoch": 2.736196319018405, - "grad_norm": 2.7337112426757812, - "learning_rate": 4.1354959800246155e-06, - "loss": 0.2829, - "step": 446 - }, - { - "epoch": 2.7423312883435584, - "grad_norm": 3.427006483078003, - "learning_rate": 4.131848682706807e-06, - "loss": 0.3045, - "step": 447 - }, - { - "epoch": 2.7484662576687118, - "grad_norm": 3.3742318153381348, - "learning_rate": 4.128195323551536e-06, - "loss": 0.316, - "step": 448 - }, - { - "epoch": 2.754601226993865, - "grad_norm": 3.086738109588623, - "learning_rate": 4.1245359161299555e-06, - "loss": 0.5278, - "step": 449 - }, - { - "epoch": 2.7607361963190185, - "grad_norm": 3.4609954357147217, - "learning_rate": 4.120870474035687e-06, - "loss": 0.447, - "step": 450 - }, - { - "epoch": 2.766871165644172, - "grad_norm": 3.552663803100586, - "learning_rate": 4.1171990108847705e-06, - "loss": 0.6127, - "step": 451 - }, - { - "epoch": 2.773006134969325, - "grad_norm": 4.413427352905273, - "learning_rate": 4.113521540315609e-06, - "loss": 0.3304, - "step": 452 - }, - { - "epoch": 2.7791411042944785, - "grad_norm": 3.3408143520355225, - "learning_rate": 4.109838075988922e-06, - "loss": 0.5871, - "step": 453 - }, - { - "epoch": 2.785276073619632, - "grad_norm": 3.0659773349761963, - "learning_rate": 4.106148631587697e-06, - "loss": 0.3578, - "step": 454 - }, - { - "epoch": 2.791411042944785, - "grad_norm": 3.2854816913604736, - "learning_rate": 4.102453220817134e-06, - "loss": 0.4685, - "step": 455 - }, - { - "epoch": 2.7975460122699385, - "grad_norm": 3.4940855503082275, - "learning_rate": 4.098751857404595e-06, - "loss": 0.2818, - "step": 456 - }, - { - "epoch": 2.8036809815950923, - "grad_norm": 2.4630730152130127, - "learning_rate": 4.0950445550995566e-06, - "loss": 0.3497, - "step": 457 - }, - { - "epoch": 2.809815950920245, - "grad_norm": 3.3870959281921387, - "learning_rate": 4.091331327673554e-06, - "loss": 0.4954, - "step": 458 - }, - { - "epoch": 2.815950920245399, - "grad_norm": 2.3676836490631104, - "learning_rate": 4.087612188920135e-06, - "loss": 0.3884, - "step": 459 - }, - { - "epoch": 2.8220858895705523, - "grad_norm": 3.2477807998657227, - "learning_rate": 4.083887152654804e-06, - "loss": 0.375, - "step": 460 - }, - { - "epoch": 2.8282208588957056, - "grad_norm": 3.295673131942749, - "learning_rate": 4.080156232714976e-06, - "loss": 0.3272, - "step": 461 - }, - { - "epoch": 2.834355828220859, - "grad_norm": 2.800847291946411, - "learning_rate": 4.07641944295992e-06, - "loss": 0.2936, - "step": 462 - }, - { - "epoch": 2.8404907975460123, - "grad_norm": 3.443336009979248, - "learning_rate": 4.072676797270708e-06, - "loss": 0.2363, - "step": 463 - }, - { - "epoch": 2.8466257668711656, - "grad_norm": 3.1334242820739746, - "learning_rate": 4.0689283095501684e-06, - "loss": 0.4827, - "step": 464 - }, - { - "epoch": 2.852760736196319, - "grad_norm": 3.950672149658203, - "learning_rate": 4.06517399372283e-06, - "loss": 0.3163, - "step": 465 - }, - { - "epoch": 2.8588957055214723, - "grad_norm": 4.243579387664795, - "learning_rate": 4.061413863734869e-06, - "loss": 0.2827, - "step": 466 - }, - { - "epoch": 2.8650306748466257, - "grad_norm": 4.076017379760742, - "learning_rate": 4.057647933554063e-06, - "loss": 0.3466, - "step": 467 - }, - { - "epoch": 2.871165644171779, - "grad_norm": 2.846989631652832, - "learning_rate": 4.053876217169734e-06, - "loss": 0.4632, - "step": 468 - }, - { - "epoch": 2.8773006134969323, - "grad_norm": 2.74981689453125, - "learning_rate": 4.050098728592698e-06, - "loss": 0.2001, - "step": 469 - }, - { - "epoch": 2.883435582822086, - "grad_norm": 3.062068462371826, - "learning_rate": 4.046315481855211e-06, - "loss": 0.5425, - "step": 470 - }, - { - "epoch": 2.889570552147239, - "grad_norm": 2.8630964756011963, - "learning_rate": 4.0425264910109245e-06, - "loss": 0.424, - "step": 471 - }, - { - "epoch": 2.895705521472393, - "grad_norm": 3.537442922592163, - "learning_rate": 4.03873177013482e-06, - "loss": 0.2443, - "step": 472 - }, - { - "epoch": 2.901840490797546, - "grad_norm": 3.128535270690918, - "learning_rate": 4.034931333323173e-06, - "loss": 0.3734, - "step": 473 - }, - { - "epoch": 2.9079754601226995, - "grad_norm": 3.021897792816162, - "learning_rate": 4.031125194693484e-06, - "loss": 0.3762, - "step": 474 - }, - { - "epoch": 2.914110429447853, - "grad_norm": 3.0943546295166016, - "learning_rate": 4.0273133683844375e-06, - "loss": 0.3721, - "step": 475 - }, - { - "epoch": 2.920245398773006, - "grad_norm": 3.443448305130005, - "learning_rate": 4.023495868555848e-06, - "loss": 0.2868, - "step": 476 - }, - { - "epoch": 2.9263803680981595, - "grad_norm": 2.865227222442627, - "learning_rate": 4.0196727093886024e-06, - "loss": 0.5086, - "step": 477 - }, - { - "epoch": 2.932515337423313, - "grad_norm": 3.1272058486938477, - "learning_rate": 4.015843905084612e-06, - "loss": 0.4616, - "step": 478 - }, - { - "epoch": 2.938650306748466, - "grad_norm": 3.0584447383880615, - "learning_rate": 4.012009469866756e-06, - "loss": 0.403, - "step": 479 - }, - { - "epoch": 2.9447852760736195, - "grad_norm": 4.42616081237793, - "learning_rate": 4.008169417978836e-06, - "loss": 0.5801, - "step": 480 - }, - { - "epoch": 2.950920245398773, - "grad_norm": 2.8444535732269287, - "learning_rate": 4.004323763685511e-06, - "loss": 0.5808, - "step": 481 - }, - { - "epoch": 2.957055214723926, - "grad_norm": 2.591719627380371, - "learning_rate": 4.0004725212722565e-06, - "loss": 0.2584, - "step": 482 - }, - { - "epoch": 2.96319018404908, - "grad_norm": 2.5496113300323486, - "learning_rate": 3.996615705045302e-06, - "loss": 0.462, - "step": 483 - }, - { - "epoch": 2.969325153374233, - "grad_norm": 2.9932925701141357, - "learning_rate": 3.992753329331588e-06, - "loss": 0.3502, - "step": 484 - }, - { - "epoch": 2.9754601226993866, - "grad_norm": 3.136871337890625, - "learning_rate": 3.9888854084786995e-06, - "loss": 0.5989, - "step": 485 - }, - { - "epoch": 2.98159509202454, - "grad_norm": 3.6654274463653564, - "learning_rate": 3.985011956854826e-06, - "loss": 0.6772, - "step": 486 - }, - { - "epoch": 2.9877300613496933, - "grad_norm": 2.5398948192596436, - "learning_rate": 3.9811329888487004e-06, - "loss": 0.4192, - "step": 487 - }, - { - "epoch": 2.9938650306748467, - "grad_norm": 4.89943790435791, - "learning_rate": 3.977248518869545e-06, - "loss": 0.4031, - "step": 488 - }, - { - "epoch": 3.0, - "grad_norm": 3.4729995727539062, - "learning_rate": 3.973358561347024e-06, - "loss": 0.7764, - "step": 489 - }, - { - "epoch": 3.0061349693251533, - "grad_norm": 5.331607818603516, - "learning_rate": 3.969463130731183e-06, - "loss": 0.3267, - "step": 490 - }, - { - "epoch": 3.0122699386503067, - "grad_norm": 3.453650712966919, - "learning_rate": 3.965562241492401e-06, - "loss": 0.2719, - "step": 491 - }, - { - "epoch": 3.01840490797546, - "grad_norm": 3.232313632965088, - "learning_rate": 3.9616559081213335e-06, - "loss": 0.1825, - "step": 492 - }, - { - "epoch": 3.0245398773006134, - "grad_norm": 3.4860260486602783, - "learning_rate": 3.957744145128858e-06, - "loss": 0.1854, - "step": 493 - }, - { - "epoch": 3.0306748466257667, - "grad_norm": 3.4357805252075195, - "learning_rate": 3.953826967046021e-06, - "loss": 0.2224, - "step": 494 - }, - { - "epoch": 3.03680981595092, - "grad_norm": 4.557503700256348, - "learning_rate": 3.9499043884239894e-06, - "loss": 0.349, - "step": 495 - }, - { - "epoch": 3.042944785276074, - "grad_norm": 4.685214042663574, - "learning_rate": 3.945976423833987e-06, - "loss": 0.175, - "step": 496 - }, - { - "epoch": 3.049079754601227, - "grad_norm": 3.7430171966552734, - "learning_rate": 3.942043087867244e-06, - "loss": 0.2773, - "step": 497 - }, - { - "epoch": 3.0552147239263805, - "grad_norm": 3.756450653076172, - "learning_rate": 3.938104395134947e-06, - "loss": 0.4445, - "step": 498 - }, - { - "epoch": 3.061349693251534, - "grad_norm": 4.049175262451172, - "learning_rate": 3.9341603602681805e-06, - "loss": 0.3046, - "step": 499 - }, - { - "epoch": 3.067484662576687, - "grad_norm": 3.7689461708068848, - "learning_rate": 3.930210997917871e-06, - "loss": 0.2544, - "step": 500 - }, - { - "epoch": 3.0736196319018405, - "grad_norm": 4.027602195739746, - "learning_rate": 3.92625632275474e-06, - "loss": 0.3154, - "step": 501 - }, - { - "epoch": 3.079754601226994, - "grad_norm": 2.8449292182922363, - "learning_rate": 3.922296349469239e-06, - "loss": 0.2804, - "step": 502 - }, - { - "epoch": 3.085889570552147, - "grad_norm": 2.9555234909057617, - "learning_rate": 3.918331092771505e-06, - "loss": 0.2393, - "step": 503 - }, - { - "epoch": 3.0920245398773005, - "grad_norm": 2.621042013168335, - "learning_rate": 3.914360567391296e-06, - "loss": 0.1403, - "step": 504 - }, - { - "epoch": 3.098159509202454, - "grad_norm": 3.2348620891571045, - "learning_rate": 3.910384788077949e-06, - "loss": 0.1537, - "step": 505 - }, - { - "epoch": 3.104294478527607, - "grad_norm": 3.030179977416992, - "learning_rate": 3.906403769600311e-06, - "loss": 0.2921, - "step": 506 - }, - { - "epoch": 3.1104294478527605, - "grad_norm": 3.146428346633911, - "learning_rate": 3.902417526746694e-06, - "loss": 0.2036, - "step": 507 - }, - { - "epoch": 3.116564417177914, - "grad_norm": 3.6201512813568115, - "learning_rate": 3.898426074324818e-06, - "loss": 0.2655, - "step": 508 - }, - { - "epoch": 3.1226993865030677, - "grad_norm": 3.7674012184143066, - "learning_rate": 3.8944294271617524e-06, - "loss": 0.3938, - "step": 509 - }, - { - "epoch": 3.128834355828221, - "grad_norm": 4.54722785949707, - "learning_rate": 3.890427600103865e-06, - "loss": 0.3051, - "step": 510 - }, - { - "epoch": 3.1349693251533743, - "grad_norm": 4.228236675262451, - "learning_rate": 3.886420608016767e-06, - "loss": 0.3719, - "step": 511 - }, - { - "epoch": 3.1411042944785277, - "grad_norm": 4.355110168457031, - "learning_rate": 3.882408465785252e-06, - "loss": 0.1863, - "step": 512 - }, - { - "epoch": 3.147239263803681, - "grad_norm": 3.451460838317871, - "learning_rate": 3.878391188313249e-06, - "loss": 0.1479, - "step": 513 - }, - { - "epoch": 3.1533742331288344, - "grad_norm": 4.395524501800537, - "learning_rate": 3.87436879052376e-06, - "loss": 0.238, - "step": 514 - }, - { - "epoch": 3.1595092024539877, - "grad_norm": 2.940717935562134, - "learning_rate": 3.870341287358809e-06, - "loss": 0.2069, - "step": 515 - }, - { - "epoch": 3.165644171779141, - "grad_norm": 2.5817320346832275, - "learning_rate": 3.8663086937793845e-06, - "loss": 0.1189, - "step": 516 - }, - { - "epoch": 3.1717791411042944, - "grad_norm": 3.9863343238830566, - "learning_rate": 3.862271024765385e-06, - "loss": 0.3434, - "step": 517 - }, - { - "epoch": 3.1779141104294477, - "grad_norm": 3.609004259109497, - "learning_rate": 3.8582282953155626e-06, - "loss": 0.1602, - "step": 518 - }, - { - "epoch": 3.184049079754601, - "grad_norm": 3.207533121109009, - "learning_rate": 3.854180520447465e-06, - "loss": 0.3452, - "step": 519 - }, - { - "epoch": 3.190184049079755, - "grad_norm": 3.593388795852661, - "learning_rate": 3.850127715197387e-06, - "loss": 0.2832, - "step": 520 - }, - { - "epoch": 3.196319018404908, - "grad_norm": 3.409064531326294, - "learning_rate": 3.846069894620306e-06, - "loss": 0.1481, - "step": 521 - }, - { - "epoch": 3.2024539877300615, - "grad_norm": 3.461498737335205, - "learning_rate": 3.84200707378983e-06, - "loss": 0.1283, - "step": 522 - }, - { - "epoch": 3.208588957055215, - "grad_norm": 3.708467483520508, - "learning_rate": 3.8379392677981434e-06, - "loss": 0.2468, - "step": 523 - }, - { - "epoch": 3.214723926380368, - "grad_norm": 2.802381753921509, - "learning_rate": 3.833866491755947e-06, - "loss": 0.2685, - "step": 524 - }, - { - "epoch": 3.2208588957055215, - "grad_norm": 3.0787744522094727, - "learning_rate": 3.8297887607924044e-06, - "loss": 0.2595, - "step": 525 - }, - { - "epoch": 3.226993865030675, - "grad_norm": 3.3952548503875732, - "learning_rate": 3.825706090055088e-06, - "loss": 0.4099, - "step": 526 - }, - { - "epoch": 3.233128834355828, - "grad_norm": 3.3497085571289062, - "learning_rate": 3.821618494709916e-06, - "loss": 0.287, - "step": 527 - }, - { - "epoch": 3.2392638036809815, - "grad_norm": 4.050611972808838, - "learning_rate": 3.817525989941102e-06, - "loss": 0.2369, - "step": 528 - }, - { - "epoch": 3.245398773006135, - "grad_norm": 2.87642240524292, - "learning_rate": 3.8134285909510972e-06, - "loss": 0.2751, - "step": 529 - }, - { - "epoch": 3.2515337423312882, - "grad_norm": 3.821941614151001, - "learning_rate": 3.8093263129605305e-06, - "loss": 0.2363, - "step": 530 - }, - { - "epoch": 3.2576687116564416, - "grad_norm": 2.8066117763519287, - "learning_rate": 3.80521917120816e-06, - "loss": 0.094, - "step": 531 - }, - { - "epoch": 3.263803680981595, - "grad_norm": 3.849768877029419, - "learning_rate": 3.801107180950806e-06, - "loss": 0.4117, - "step": 532 - }, - { - "epoch": 3.2699386503067487, - "grad_norm": 2.4161250591278076, - "learning_rate": 3.7969903574633028e-06, - "loss": 0.1183, - "step": 533 - }, - { - "epoch": 3.276073619631902, - "grad_norm": 3.6743111610412598, - "learning_rate": 3.792868716038437e-06, - "loss": 0.2296, - "step": 534 - }, - { - "epoch": 3.2822085889570554, - "grad_norm": 4.378123760223389, - "learning_rate": 3.7887422719868937e-06, - "loss": 0.2678, - "step": 535 - }, - { - "epoch": 3.2883435582822087, - "grad_norm": 4.816481590270996, - "learning_rate": 3.784611040637198e-06, - "loss": 0.4887, - "step": 536 - }, - { - "epoch": 3.294478527607362, - "grad_norm": 3.5712430477142334, - "learning_rate": 3.7804750373356576e-06, - "loss": 0.3827, - "step": 537 - }, - { - "epoch": 3.3006134969325154, - "grad_norm": 3.6877355575561523, - "learning_rate": 3.776334277446307e-06, - "loss": 0.3233, - "step": 538 - }, - { - "epoch": 3.3067484662576687, - "grad_norm": 3.442706346511841, - "learning_rate": 3.7721887763508512e-06, - "loss": 0.1256, - "step": 539 - }, - { - "epoch": 3.312883435582822, - "grad_norm": 3.9265615940093994, - "learning_rate": 3.7680385494486053e-06, - "loss": 0.3845, - "step": 540 - }, - { - "epoch": 3.3190184049079754, - "grad_norm": 3.5030126571655273, - "learning_rate": 3.7638836121564414e-06, - "loss": 0.2905, - "step": 541 - }, - { - "epoch": 3.3251533742331287, - "grad_norm": 3.6685378551483154, - "learning_rate": 3.7597239799087283e-06, - "loss": 0.3561, - "step": 542 - }, - { - "epoch": 3.331288343558282, - "grad_norm": 3.8484046459198, - "learning_rate": 3.7555596681572736e-06, - "loss": 0.1157, - "step": 543 - }, - { - "epoch": 3.3374233128834354, - "grad_norm": 3.7977402210235596, - "learning_rate": 3.751390692371272e-06, - "loss": 0.3049, - "step": 544 - }, - { - "epoch": 3.3435582822085887, - "grad_norm": 3.4409852027893066, - "learning_rate": 3.7472170680372398e-06, - "loss": 0.1626, - "step": 545 - }, - { - "epoch": 3.3496932515337425, - "grad_norm": 3.801541328430176, - "learning_rate": 3.7430388106589632e-06, - "loss": 0.2414, - "step": 546 - }, - { - "epoch": 3.355828220858896, - "grad_norm": 4.025203704833984, - "learning_rate": 3.738855935757438e-06, - "loss": 0.3441, - "step": 547 - }, - { - "epoch": 3.361963190184049, - "grad_norm": 4.242798805236816, - "learning_rate": 3.7346684588708135e-06, - "loss": 0.5244, - "step": 548 - }, - { - "epoch": 3.3680981595092025, - "grad_norm": 3.0516819953918457, - "learning_rate": 3.7304763955543332e-06, - "loss": 0.1984, - "step": 549 - }, - { - "epoch": 3.374233128834356, - "grad_norm": 3.894667625427246, - "learning_rate": 3.726279761380279e-06, - "loss": 0.2715, - "step": 550 - }, - { - "epoch": 3.3803680981595092, - "grad_norm": 3.171208143234253, - "learning_rate": 3.72207857193791e-06, - "loss": 0.1537, - "step": 551 - }, - { - "epoch": 3.3865030674846626, - "grad_norm": 4.344860553741455, - "learning_rate": 3.7178728428334092e-06, - "loss": 0.2388, - "step": 552 - }, - { - "epoch": 3.392638036809816, - "grad_norm": 2.766317367553711, - "learning_rate": 3.7136625896898226e-06, - "loss": 0.1726, - "step": 553 - }, - { - "epoch": 3.3987730061349692, - "grad_norm": 3.550662040710449, - "learning_rate": 3.7094478281470003e-06, - "loss": 0.2942, - "step": 554 - }, - { - "epoch": 3.4049079754601226, - "grad_norm": 3.4576945304870605, - "learning_rate": 3.7052285738615412e-06, - "loss": 0.1665, - "step": 555 - }, - { - "epoch": 3.411042944785276, - "grad_norm": 4.026793003082275, - "learning_rate": 3.7010048425067317e-06, - "loss": 0.3954, - "step": 556 - }, - { - "epoch": 3.4171779141104293, - "grad_norm": 4.600133419036865, - "learning_rate": 3.696776649772492e-06, - "loss": 0.3207, - "step": 557 - }, - { - "epoch": 3.4233128834355826, - "grad_norm": 4.747331142425537, - "learning_rate": 3.692544011365312e-06, - "loss": 0.1325, - "step": 558 - }, - { - "epoch": 3.4294478527607364, - "grad_norm": 3.781464099884033, - "learning_rate": 3.6883069430081986e-06, - "loss": 0.1644, - "step": 559 - }, - { - "epoch": 3.4355828220858897, - "grad_norm": 2.905986785888672, - "learning_rate": 3.6840654604406135e-06, - "loss": 0.2469, - "step": 560 - }, - { - "epoch": 3.441717791411043, - "grad_norm": 2.3747711181640625, - "learning_rate": 3.679819579418414e-06, - "loss": 0.1146, - "step": 561 - }, - { - "epoch": 3.4478527607361964, - "grad_norm": 3.2683632373809814, - "learning_rate": 3.6755693157137995e-06, - "loss": 0.3236, - "step": 562 - }, - { - "epoch": 3.4539877300613497, - "grad_norm": 3.7750496864318848, - "learning_rate": 3.6713146851152487e-06, - "loss": 0.399, - "step": 563 - }, - { - "epoch": 3.460122699386503, - "grad_norm": 3.3912384510040283, - "learning_rate": 3.667055703427461e-06, - "loss": 0.1259, - "step": 564 - }, - { - "epoch": 3.4662576687116564, - "grad_norm": 3.0224430561065674, - "learning_rate": 3.6627923864713e-06, - "loss": 0.1835, - "step": 565 - }, - { - "epoch": 3.4723926380368098, - "grad_norm": 3.642258405685425, - "learning_rate": 3.658524750083733e-06, - "loss": 0.2763, - "step": 566 - }, - { - "epoch": 3.478527607361963, - "grad_norm": 3.409890651702881, - "learning_rate": 3.654252810117773e-06, - "loss": 0.2496, - "step": 567 - }, - { - "epoch": 3.4846625766871164, - "grad_norm": 3.0416476726531982, - "learning_rate": 3.6499765824424195e-06, - "loss": 0.1287, - "step": 568 - }, - { - "epoch": 3.4907975460122698, - "grad_norm": 3.1963987350463867, - "learning_rate": 3.6456960829425987e-06, - "loss": 0.1747, - "step": 569 - }, - { - "epoch": 3.4969325153374236, - "grad_norm": 3.198448657989502, - "learning_rate": 3.641411327519107e-06, - "loss": 0.1913, - "step": 570 - }, - { - "epoch": 3.5030674846625764, - "grad_norm": 3.7023441791534424, - "learning_rate": 3.6371223320885492e-06, - "loss": 0.3224, - "step": 571 - }, - { - "epoch": 3.5092024539877302, - "grad_norm": 4.54288387298584, - "learning_rate": 3.6328291125832803e-06, - "loss": 0.2364, - "step": 572 - }, - { - "epoch": 3.5153374233128836, - "grad_norm": 3.5064890384674072, - "learning_rate": 3.628531684951347e-06, - "loss": 0.2552, - "step": 573 - }, - { - "epoch": 3.521472392638037, - "grad_norm": 3.987583875656128, - "learning_rate": 3.6242300651564276e-06, - "loss": 0.3232, - "step": 574 - }, - { - "epoch": 3.5276073619631902, - "grad_norm": 3.179642915725708, - "learning_rate": 3.6199242691777745e-06, - "loss": 0.32, - "step": 575 - }, - { - "epoch": 3.5337423312883436, - "grad_norm": 3.3078157901763916, - "learning_rate": 3.6156143130101516e-06, - "loss": 0.2922, - "step": 576 - }, - { - "epoch": 3.539877300613497, - "grad_norm": 3.1628613471984863, - "learning_rate": 3.6113002126637765e-06, - "loss": 0.2005, - "step": 577 - }, - { - "epoch": 3.5460122699386503, - "grad_norm": 3.4515540599823, - "learning_rate": 3.606981984164263e-06, - "loss": 0.2138, - "step": 578 - }, - { - "epoch": 3.5521472392638036, - "grad_norm": 5.132473945617676, - "learning_rate": 3.6026596435525578e-06, - "loss": 0.4382, - "step": 579 - }, - { - "epoch": 3.558282208588957, - "grad_norm": 3.397614002227783, - "learning_rate": 3.5983332068848855e-06, - "loss": 0.3326, - "step": 580 - }, - { - "epoch": 3.5644171779141103, - "grad_norm": 4.79497766494751, - "learning_rate": 3.5940026902326825e-06, - "loss": 0.4748, - "step": 581 - }, - { - "epoch": 3.5705521472392636, - "grad_norm": 3.7675018310546875, - "learning_rate": 3.5896681096825446e-06, - "loss": 0.2692, - "step": 582 - }, - { - "epoch": 3.5766871165644174, - "grad_norm": 3.0637521743774414, - "learning_rate": 3.5853294813361614e-06, - "loss": 0.3658, - "step": 583 - }, - { - "epoch": 3.5828220858895703, - "grad_norm": 2.8949790000915527, - "learning_rate": 3.5809868213102623e-06, - "loss": 0.1661, - "step": 584 - }, - { - "epoch": 3.588957055214724, - "grad_norm": 3.163419246673584, - "learning_rate": 3.5766401457365485e-06, - "loss": 0.1233, - "step": 585 - }, - { - "epoch": 3.5950920245398774, - "grad_norm": 3.1787965297698975, - "learning_rate": 3.5722894707616417e-06, - "loss": 0.278, - "step": 586 - }, - { - "epoch": 3.6012269938650308, - "grad_norm": 2.9397857189178467, - "learning_rate": 3.5679348125470175e-06, - "loss": 0.1541, - "step": 587 - }, - { - "epoch": 3.607361963190184, - "grad_norm": 3.2690396308898926, - "learning_rate": 3.56357618726895e-06, - "loss": 0.1575, - "step": 588 - }, - { - "epoch": 3.6134969325153374, - "grad_norm": 5.444014072418213, - "learning_rate": 3.5592136111184483e-06, - "loss": 0.8079, - "step": 589 - }, - { - "epoch": 3.6196319018404908, - "grad_norm": 3.1688313484191895, - "learning_rate": 3.554847100301199e-06, - "loss": 0.341, - "step": 590 - }, - { - "epoch": 3.625766871165644, - "grad_norm": 2.469212532043457, - "learning_rate": 3.550476671037505e-06, - "loss": 0.1625, - "step": 591 - }, - { - "epoch": 3.6319018404907975, - "grad_norm": 3.3956527709960938, - "learning_rate": 3.546102339562223e-06, - "loss": 0.199, - "step": 592 - }, - { - "epoch": 3.638036809815951, - "grad_norm": 2.7287702560424805, - "learning_rate": 3.5417241221247078e-06, - "loss": 0.1493, - "step": 593 - }, - { - "epoch": 3.644171779141104, - "grad_norm": 3.5046865940093994, - "learning_rate": 3.5373420349887477e-06, - "loss": 0.2765, - "step": 594 - }, - { - "epoch": 3.6503067484662575, - "grad_norm": 3.121476650238037, - "learning_rate": 3.5329560944325065e-06, - "loss": 0.2833, - "step": 595 - }, - { - "epoch": 3.6564417177914113, - "grad_norm": 3.276463270187378, - "learning_rate": 3.528566316748462e-06, - "loss": 0.1237, - "step": 596 - }, - { - "epoch": 3.662576687116564, - "grad_norm": 3.382840633392334, - "learning_rate": 3.524172718243347e-06, - "loss": 0.1599, - "step": 597 - }, - { - "epoch": 3.668711656441718, - "grad_norm": 4.801311492919922, - "learning_rate": 3.5197753152380854e-06, - "loss": 0.2997, - "step": 598 - }, - { - "epoch": 3.6748466257668713, - "grad_norm": 4.117336273193359, - "learning_rate": 3.515374124067736e-06, - "loss": 0.2021, - "step": 599 - }, - { - "epoch": 3.6809815950920246, - "grad_norm": 3.611438035964966, - "learning_rate": 3.5109691610814263e-06, - "loss": 0.1726, - "step": 600 - }, - { - "epoch": 3.687116564417178, - "grad_norm": 4.5179972648620605, - "learning_rate": 3.5065604426422995e-06, - "loss": 0.1377, - "step": 601 - }, - { - "epoch": 3.6932515337423313, - "grad_norm": 3.561061382293701, - "learning_rate": 3.502147985127445e-06, - "loss": 0.1497, - "step": 602 - }, - { - "epoch": 3.6993865030674846, - "grad_norm": 3.3497917652130127, - "learning_rate": 3.4977318049278443e-06, - "loss": 0.1589, - "step": 603 - }, - { - "epoch": 3.705521472392638, - "grad_norm": 3.2725470066070557, - "learning_rate": 3.4933119184483065e-06, - "loss": 0.1364, - "step": 604 - }, - { - "epoch": 3.7116564417177913, - "grad_norm": 3.228956460952759, - "learning_rate": 3.4888883421074076e-06, - "loss": 0.177, - "step": 605 - }, - { - "epoch": 3.7177914110429446, - "grad_norm": 3.7648911476135254, - "learning_rate": 3.484461092337434e-06, - "loss": 0.122, - "step": 606 - }, - { - "epoch": 3.7239263803680984, - "grad_norm": 3.5322585105895996, - "learning_rate": 3.4800301855843137e-06, - "loss": 0.2664, - "step": 607 - }, - { - "epoch": 3.7300613496932513, - "grad_norm": 2.951073169708252, - "learning_rate": 3.4755956383075613e-06, - "loss": 0.12, - "step": 608 - }, - { - "epoch": 3.736196319018405, - "grad_norm": 3.0577664375305176, - "learning_rate": 3.471157466980214e-06, - "loss": 0.3926, - "step": 609 - }, - { - "epoch": 3.7423312883435584, - "grad_norm": 4.089846134185791, - "learning_rate": 3.466715688088772e-06, - "loss": 0.6233, - "step": 610 - }, - { - "epoch": 3.7484662576687118, - "grad_norm": 3.081340789794922, - "learning_rate": 3.462270318133136e-06, - "loss": 0.2456, - "step": 611 - }, - { - "epoch": 3.754601226993865, - "grad_norm": 3.034712553024292, - "learning_rate": 3.4578213736265474e-06, - "loss": 0.2683, - "step": 612 - }, - { - "epoch": 3.7607361963190185, - "grad_norm": 3.459815740585327, - "learning_rate": 3.4533688710955255e-06, - "loss": 0.3796, - "step": 613 - }, - { - "epoch": 3.766871165644172, - "grad_norm": 3.523737907409668, - "learning_rate": 3.448912827079805e-06, - "loss": 0.3326, - "step": 614 - }, - { - "epoch": 3.773006134969325, - "grad_norm": 3.333219289779663, - "learning_rate": 3.4444532581322793e-06, - "loss": 0.206, - "step": 615 - }, - { - "epoch": 3.7791411042944785, - "grad_norm": 3.582387685775757, - "learning_rate": 3.4399901808189327e-06, - "loss": 0.244, - "step": 616 - }, - { - "epoch": 3.785276073619632, - "grad_norm": 3.4887266159057617, - "learning_rate": 3.435523611718785e-06, - "loss": 0.1796, - "step": 617 - }, - { - "epoch": 3.791411042944785, - "grad_norm": 4.89408016204834, - "learning_rate": 3.4310535674238242e-06, - "loss": 0.188, - "step": 618 - }, - { - "epoch": 3.7975460122699385, - "grad_norm": 4.338910102844238, - "learning_rate": 3.42658006453895e-06, - "loss": 0.3039, - "step": 619 - }, - { - "epoch": 3.8036809815950923, - "grad_norm": 4.107708930969238, - "learning_rate": 3.4221031196819083e-06, - "loss": 0.3383, - "step": 620 - }, - { - "epoch": 3.809815950920245, - "grad_norm": 3.698777675628662, - "learning_rate": 3.4176227494832305e-06, - "loss": 0.1721, - "step": 621 - }, - { - "epoch": 3.815950920245399, - "grad_norm": 2.6659226417541504, - "learning_rate": 3.413138970586174e-06, - "loss": 0.2211, - "step": 622 - }, - { - "epoch": 3.8220858895705523, - "grad_norm": 3.2398436069488525, - "learning_rate": 3.4086517996466574e-06, - "loss": 0.1871, - "step": 623 - }, - { - "epoch": 3.8282208588957056, - "grad_norm": 4.9128804206848145, - "learning_rate": 3.404161253333199e-06, - "loss": 0.3874, - "step": 624 - }, - { - "epoch": 3.834355828220859, - "grad_norm": 3.508789300918579, - "learning_rate": 3.3996673483268573e-06, - "loss": 0.1739, - "step": 625 - }, - { - "epoch": 3.8404907975460123, - "grad_norm": 3.3016927242279053, - "learning_rate": 3.3951701013211665e-06, - "loss": 0.274, - "step": 626 - }, - { - "epoch": 3.8466257668711656, - "grad_norm": 3.8941333293914795, - "learning_rate": 3.3906695290220736e-06, - "loss": 0.3568, - "step": 627 - }, - { - "epoch": 3.852760736196319, - "grad_norm": 3.512354850769043, - "learning_rate": 3.3861656481478816e-06, - "loss": 0.157, - "step": 628 - }, - { - "epoch": 3.8588957055214723, - "grad_norm": 3.482649326324463, - "learning_rate": 3.3816584754291814e-06, - "loss": 0.1218, - "step": 629 - }, - { - "epoch": 3.8650306748466257, - "grad_norm": 3.1490275859832764, - "learning_rate": 3.377148027608793e-06, - "loss": 0.2234, - "step": 630 - }, - { - "epoch": 3.871165644171779, - "grad_norm": 3.2172653675079346, - "learning_rate": 3.3726343214417023e-06, - "loss": 0.3329, - "step": 631 - }, - { - "epoch": 3.8773006134969323, - "grad_norm": 4.167707443237305, - "learning_rate": 3.3681173736949984e-06, - "loss": 0.1384, - "step": 632 - }, - { - "epoch": 3.883435582822086, - "grad_norm": 3.4743919372558594, - "learning_rate": 3.3635972011478134e-06, - "loss": 0.3807, - "step": 633 - }, - { - "epoch": 3.889570552147239, - "grad_norm": 3.6892173290252686, - "learning_rate": 3.3590738205912566e-06, - "loss": 0.194, - "step": 634 - }, - { - "epoch": 3.895705521472393, - "grad_norm": 3.262967824935913, - "learning_rate": 3.354547248828356e-06, - "loss": 0.202, - "step": 635 - }, - { - "epoch": 3.901840490797546, - "grad_norm": 3.8871562480926514, - "learning_rate": 3.3500175026739916e-06, - "loss": 0.2471, - "step": 636 - }, - { - "epoch": 3.9079754601226995, - "grad_norm": 3.5097084045410156, - "learning_rate": 3.3454845989548385e-06, - "loss": 0.1112, - "step": 637 - }, - { - "epoch": 3.914110429447853, - "grad_norm": 4.163944721221924, - "learning_rate": 3.3409485545092995e-06, - "loss": 0.3368, - "step": 638 - }, - { - "epoch": 3.920245398773006, - "grad_norm": 3.6405045986175537, - "learning_rate": 3.336409386187444e-06, - "loss": 0.1863, - "step": 639 - }, - { - "epoch": 3.9263803680981595, - "grad_norm": 3.2477526664733887, - "learning_rate": 3.331867110850946e-06, - "loss": 0.1491, - "step": 640 - }, - { - "epoch": 3.932515337423313, - "grad_norm": 3.933753490447998, - "learning_rate": 3.327321745373021e-06, - "loss": 0.2484, - "step": 641 - }, - { - "epoch": 3.938650306748466, - "grad_norm": 3.2475059032440186, - "learning_rate": 3.322773306638364e-06, - "loss": 0.2126, - "step": 642 - }, - { - "epoch": 3.9447852760736195, - "grad_norm": 2.628467321395874, - "learning_rate": 3.318221811543086e-06, - "loss": 0.1649, - "step": 643 - }, - { - "epoch": 3.950920245398773, - "grad_norm": 3.2612411975860596, - "learning_rate": 3.313667276994651e-06, - "loss": 0.1442, - "step": 644 - }, - { - "epoch": 3.957055214723926, - "grad_norm": 3.8058395385742188, - "learning_rate": 3.309109719911814e-06, - "loss": 0.359, - "step": 645 - }, - { - "epoch": 3.96319018404908, - "grad_norm": 3.3450071811676025, - "learning_rate": 3.304549157224558e-06, - "loss": 0.4042, - "step": 646 - }, - { - "epoch": 3.969325153374233, - "grad_norm": 3.079601287841797, - "learning_rate": 3.299985605874031e-06, - "loss": 0.1699, - "step": 647 - }, - { - "epoch": 3.9754601226993866, - "grad_norm": 3.8963980674743652, - "learning_rate": 3.295419082812483e-06, - "loss": 0.1888, - "step": 648 - }, - { - "epoch": 3.98159509202454, - "grad_norm": 3.307405948638916, - "learning_rate": 3.2908496050032024e-06, - "loss": 0.2824, - "step": 649 - }, - { - "epoch": 3.9877300613496933, - "grad_norm": 3.227478265762329, - "learning_rate": 3.2862771894204544e-06, - "loss": 0.3038, - "step": 650 - }, - { - "epoch": 3.9938650306748467, - "grad_norm": 4.046506881713867, - "learning_rate": 3.2817018530494164e-06, - "loss": 0.3266, - "step": 651 - }, - { - "epoch": 4.0, - "grad_norm": 7.775874614715576, - "learning_rate": 3.277123612886116e-06, - "loss": 0.2998, - "step": 652 - }, - { - "epoch": 4.006134969325154, - "grad_norm": 3.146462917327881, - "learning_rate": 3.272542485937369e-06, - "loss": 0.2764, - "step": 653 - }, - { - "epoch": 4.012269938650307, - "grad_norm": 3.0539863109588623, - "learning_rate": 3.2679584892207118e-06, - "loss": 0.1157, - "step": 654 - }, - { - "epoch": 4.0184049079754605, - "grad_norm": 3.634021520614624, - "learning_rate": 3.263371639764343e-06, - "loss": 0.0707, - "step": 655 - }, - { - "epoch": 4.024539877300613, - "grad_norm": 3.3474650382995605, - "learning_rate": 3.2587819546070596e-06, - "loss": 0.1067, - "step": 656 - }, - { - "epoch": 4.030674846625767, - "grad_norm": 4.409244537353516, - "learning_rate": 3.254189450798189e-06, - "loss": 0.0564, - "step": 657 - }, - { - "epoch": 4.03680981595092, - "grad_norm": 3.0446252822875977, - "learning_rate": 3.2495941453975312e-06, - "loss": 0.0535, - "step": 658 - }, - { - "epoch": 4.042944785276074, - "grad_norm": 4.014753818511963, - "learning_rate": 3.2449960554752935e-06, - "loss": 0.1245, - "step": 659 - }, - { - "epoch": 4.049079754601227, - "grad_norm": 3.188062906265259, - "learning_rate": 3.240395198112026e-06, - "loss": 0.0626, - "step": 660 - }, - { - "epoch": 4.0552147239263805, - "grad_norm": 3.006086826324463, - "learning_rate": 3.2357915903985605e-06, - "loss": 0.1198, - "step": 661 - }, - { - "epoch": 4.061349693251533, - "grad_norm": 2.8865551948547363, - "learning_rate": 3.2311852494359423e-06, - "loss": 0.0454, - "step": 662 - }, - { - "epoch": 4.067484662576687, - "grad_norm": 4.2888007164001465, - "learning_rate": 3.226576192335373e-06, - "loss": 0.2064, - "step": 663 - }, - { - "epoch": 4.07361963190184, - "grad_norm": 3.1414525508880615, - "learning_rate": 3.2219644362181436e-06, - "loss": 0.2183, - "step": 664 - }, - { - "epoch": 4.079754601226994, - "grad_norm": 2.556277275085449, - "learning_rate": 3.21734999821557e-06, - "loss": 0.0516, - "step": 665 - }, - { - "epoch": 4.085889570552148, - "grad_norm": 2.698118209838867, - "learning_rate": 3.2127328954689307e-06, - "loss": 0.0613, - "step": 666 - }, - { - "epoch": 4.0920245398773005, - "grad_norm": 2.869919538497925, - "learning_rate": 3.2081131451294025e-06, - "loss": 0.0583, - "step": 667 - }, - { - "epoch": 4.098159509202454, - "grad_norm": 3.8786919116973877, - "learning_rate": 3.2034907643579988e-06, - "loss": 0.0766, - "step": 668 - }, - { - "epoch": 4.104294478527607, - "grad_norm": 4.224637031555176, - "learning_rate": 3.1988657703255043e-06, - "loss": 0.1099, - "step": 669 - }, - { - "epoch": 4.110429447852761, - "grad_norm": 4.671669006347656, - "learning_rate": 3.194238180212409e-06, - "loss": 0.1663, - "step": 670 - }, - { - "epoch": 4.116564417177914, - "grad_norm": 3.2484257221221924, - "learning_rate": 3.1896080112088477e-06, - "loss": 0.0587, - "step": 671 - }, - { - "epoch": 4.122699386503068, - "grad_norm": 2.4808075428009033, - "learning_rate": 3.184975280514536e-06, - "loss": 0.0579, - "step": 672 - }, - { - "epoch": 4.128834355828221, - "grad_norm": 3.7106919288635254, - "learning_rate": 3.1803400053387044e-06, - "loss": 0.1083, - "step": 673 - }, - { - "epoch": 4.134969325153374, - "grad_norm": 3.008970260620117, - "learning_rate": 3.175702202900036e-06, - "loss": 0.1355, - "step": 674 - }, - { - "epoch": 4.141104294478527, - "grad_norm": 3.2640793323516846, - "learning_rate": 3.1710618904266006e-06, - "loss": 0.092, - "step": 675 - }, - { - "epoch": 4.147239263803681, - "grad_norm": 3.08042049407959, - "learning_rate": 3.166419085155793e-06, - "loss": 0.0563, - "step": 676 - }, - { - "epoch": 4.153374233128835, - "grad_norm": 2.993530511856079, - "learning_rate": 3.1617738043342695e-06, - "loss": 0.1773, - "step": 677 - }, - { - "epoch": 4.159509202453988, - "grad_norm": 2.6218204498291016, - "learning_rate": 3.157126065217879e-06, - "loss": 0.0489, - "step": 678 - }, - { - "epoch": 4.1656441717791415, - "grad_norm": 4.3173723220825195, - "learning_rate": 3.152475885071606e-06, - "loss": 0.1333, - "step": 679 - }, - { - "epoch": 4.171779141104294, - "grad_norm": 3.659149408340454, - "learning_rate": 3.147823281169498e-06, - "loss": 0.1501, - "step": 680 - }, - { - "epoch": 4.177914110429448, - "grad_norm": 3.0953338146209717, - "learning_rate": 3.143168270794612e-06, - "loss": 0.1067, - "step": 681 - }, - { - "epoch": 4.184049079754601, - "grad_norm": 3.5693907737731934, - "learning_rate": 3.1385108712389394e-06, - "loss": 0.2499, - "step": 682 - }, - { - "epoch": 4.190184049079755, - "grad_norm": 3.3022868633270264, - "learning_rate": 3.1338510998033484e-06, - "loss": 0.1748, - "step": 683 - }, - { - "epoch": 4.196319018404908, - "grad_norm": 3.7468113899230957, - "learning_rate": 3.129188973797519e-06, - "loss": 0.201, - "step": 684 - }, - { - "epoch": 4.2024539877300615, - "grad_norm": 2.8381078243255615, - "learning_rate": 3.124524510539875e-06, - "loss": 0.0735, - "step": 685 - }, - { - "epoch": 4.208588957055214, - "grad_norm": 2.84706974029541, - "learning_rate": 3.119857727357527e-06, - "loss": 0.1806, - "step": 686 - }, - { - "epoch": 4.214723926380368, - "grad_norm": 3.8130292892456055, - "learning_rate": 3.1151886415861993e-06, - "loss": 0.1811, - "step": 687 - }, - { - "epoch": 4.220858895705521, - "grad_norm": 3.528895378112793, - "learning_rate": 3.1105172705701708e-06, - "loss": 0.1634, - "step": 688 - }, - { - "epoch": 4.226993865030675, - "grad_norm": 5.028727054595947, - "learning_rate": 3.1058436316622103e-06, - "loss": 0.1625, - "step": 689 - }, - { - "epoch": 4.233128834355828, - "grad_norm": 4.606889247894287, - "learning_rate": 3.1011677422235093e-06, - "loss": 0.1791, - "step": 690 - }, - { - "epoch": 4.2392638036809815, - "grad_norm": 3.3620636463165283, - "learning_rate": 3.0964896196236217e-06, - "loss": 0.2233, - "step": 691 - }, - { - "epoch": 4.245398773006135, - "grad_norm": 3.7845852375030518, - "learning_rate": 3.0918092812403954e-06, - "loss": 0.1142, - "step": 692 - }, - { - "epoch": 4.251533742331288, - "grad_norm": 3.1204118728637695, - "learning_rate": 3.0871267444599098e-06, - "loss": 0.096, - "step": 693 - }, - { - "epoch": 4.257668711656442, - "grad_norm": 3.686067819595337, - "learning_rate": 3.0824420266764093e-06, - "loss": 0.2749, - "step": 694 - }, - { - "epoch": 4.263803680981595, - "grad_norm": 3.1680829524993896, - "learning_rate": 3.077755145292243e-06, - "loss": 0.2504, - "step": 695 - }, - { - "epoch": 4.269938650306749, - "grad_norm": 3.3179469108581543, - "learning_rate": 3.0730661177177957e-06, - "loss": 0.1324, - "step": 696 - }, - { - "epoch": 4.276073619631902, - "grad_norm": 3.1186370849609375, - "learning_rate": 3.0683749613714238e-06, - "loss": 0.0691, - "step": 697 - }, - { - "epoch": 4.282208588957055, - "grad_norm": 3.086834192276001, - "learning_rate": 3.063681693679391e-06, - "loss": 0.1026, - "step": 698 - }, - { - "epoch": 4.288343558282208, - "grad_norm": 4.629584312438965, - "learning_rate": 3.0589863320758063e-06, - "loss": 0.2646, - "step": 699 - }, - { - "epoch": 4.294478527607362, - "grad_norm": 3.9641213417053223, - "learning_rate": 3.0542888940025562e-06, - "loss": 0.1711, - "step": 700 - }, - { - "epoch": 4.300613496932515, - "grad_norm": 3.75014328956604, - "learning_rate": 3.0495893969092395e-06, - "loss": 0.0589, - "step": 701 - }, - { - "epoch": 4.306748466257669, - "grad_norm": 3.603290319442749, - "learning_rate": 3.044887858253105e-06, - "loss": 0.2244, - "step": 702 - }, - { - "epoch": 4.3128834355828225, - "grad_norm": 3.79404616355896, - "learning_rate": 3.040184295498984e-06, - "loss": 0.1506, - "step": 703 - }, - { - "epoch": 4.319018404907975, - "grad_norm": 3.0890021324157715, - "learning_rate": 3.035478726119228e-06, - "loss": 0.2343, - "step": 704 - }, - { - "epoch": 4.325153374233129, - "grad_norm": 3.6688191890716553, - "learning_rate": 3.0307711675936426e-06, - "loss": 0.0518, - "step": 705 - }, - { - "epoch": 4.331288343558282, - "grad_norm": 5.1836700439453125, - "learning_rate": 3.0260616374094208e-06, - "loss": 0.2363, - "step": 706 - }, - { - "epoch": 4.337423312883436, - "grad_norm": 2.7123284339904785, - "learning_rate": 3.0213501530610807e-06, - "loss": 0.0848, - "step": 707 - }, - { - "epoch": 4.343558282208589, - "grad_norm": 3.5661890506744385, - "learning_rate": 3.0166367320504005e-06, - "loss": 0.149, - "step": 708 - }, - { - "epoch": 4.3496932515337425, - "grad_norm": 3.6454737186431885, - "learning_rate": 3.0119213918863515e-06, - "loss": 0.1133, - "step": 709 - }, - { - "epoch": 4.355828220858895, - "grad_norm": 3.7534968852996826, - "learning_rate": 3.0072041500850343e-06, - "loss": 0.1358, - "step": 710 - }, - { - "epoch": 4.361963190184049, - "grad_norm": 3.40387225151062, - "learning_rate": 3.0024850241696128e-06, - "loss": 0.0706, - "step": 711 - }, - { - "epoch": 4.368098159509202, - "grad_norm": 3.250471591949463, - "learning_rate": 2.9977640316702512e-06, - "loss": 0.1977, - "step": 712 - }, - { - "epoch": 4.374233128834356, - "grad_norm": 3.417781352996826, - "learning_rate": 2.993041190124047e-06, - "loss": 0.2622, - "step": 713 - }, - { - "epoch": 4.38036809815951, - "grad_norm": 2.628434181213379, - "learning_rate": 2.9883165170749657e-06, - "loss": 0.1487, - "step": 714 - }, - { - "epoch": 4.386503067484663, - "grad_norm": 3.240264892578125, - "learning_rate": 2.9835900300737763e-06, - "loss": 0.0822, - "step": 715 - }, - { - "epoch": 4.392638036809816, - "grad_norm": 6.575517177581787, - "learning_rate": 2.9788617466779884e-06, - "loss": 0.3668, - "step": 716 - }, - { - "epoch": 4.398773006134969, - "grad_norm": 4.699089050292969, - "learning_rate": 2.974131684451781e-06, - "loss": 0.2432, - "step": 717 - }, - { - "epoch": 4.404907975460123, - "grad_norm": 2.9815752506256104, - "learning_rate": 2.9693998609659443e-06, - "loss": 0.0689, - "step": 718 - }, - { - "epoch": 4.411042944785276, - "grad_norm": 4.192755222320557, - "learning_rate": 2.9646662937978082e-06, - "loss": 0.1897, - "step": 719 - }, - { - "epoch": 4.41717791411043, - "grad_norm": 2.9729068279266357, - "learning_rate": 2.9599310005311824e-06, - "loss": 0.0457, - "step": 720 - }, - { - "epoch": 4.423312883435583, - "grad_norm": 4.234438896179199, - "learning_rate": 2.9551939987562866e-06, - "loss": 0.2307, - "step": 721 - }, - { - "epoch": 4.429447852760736, - "grad_norm": 3.3982434272766113, - "learning_rate": 2.950455306069688e-06, - "loss": 0.0637, - "step": 722 - }, - { - "epoch": 4.435582822085889, - "grad_norm": 4.539764404296875, - "learning_rate": 2.9457149400742357e-06, - "loss": 0.1924, - "step": 723 - }, - { - "epoch": 4.441717791411043, - "grad_norm": 4.039684772491455, - "learning_rate": 2.940972918378993e-06, - "loss": 0.1275, - "step": 724 - }, - { - "epoch": 4.447852760736196, - "grad_norm": 4.340360641479492, - "learning_rate": 2.936229258599174e-06, - "loss": 0.123, - "step": 725 - }, - { - "epoch": 4.45398773006135, - "grad_norm": 2.8720109462738037, - "learning_rate": 2.93148397835608e-06, - "loss": 0.0555, - "step": 726 - }, - { - "epoch": 4.460122699386503, - "grad_norm": 4.227811336517334, - "learning_rate": 2.926737095277029e-06, - "loss": 0.0991, - "step": 727 - }, - { - "epoch": 4.466257668711656, - "grad_norm": 2.8079142570495605, - "learning_rate": 2.921988626995295e-06, - "loss": 0.0628, - "step": 728 - }, - { - "epoch": 4.47239263803681, - "grad_norm": 4.195122241973877, - "learning_rate": 2.9172385911500385e-06, - "loss": 0.2333, - "step": 729 - }, - { - "epoch": 4.478527607361963, - "grad_norm": 3.223794460296631, - "learning_rate": 2.9124870053862447e-06, - "loss": 0.1317, - "step": 730 - }, - { - "epoch": 4.484662576687117, - "grad_norm": 3.5533759593963623, - "learning_rate": 2.907733887354657e-06, - "loss": 0.2285, - "step": 731 - }, - { - "epoch": 4.49079754601227, - "grad_norm": 3.535673141479492, - "learning_rate": 2.9029792547117088e-06, - "loss": 0.096, - "step": 732 - }, - { - "epoch": 4.4969325153374236, - "grad_norm": 4.031703948974609, - "learning_rate": 2.898223125119461e-06, - "loss": 0.1505, - "step": 733 - }, - { - "epoch": 4.5030674846625764, - "grad_norm": 2.823413610458374, - "learning_rate": 2.893465516245534e-06, - "loss": 0.0327, - "step": 734 - }, - { - "epoch": 4.50920245398773, - "grad_norm": 3.516738176345825, - "learning_rate": 2.8887064457630453e-06, - "loss": 0.0743, - "step": 735 - }, - { - "epoch": 4.515337423312883, - "grad_norm": 3.5523500442504883, - "learning_rate": 2.8839459313505407e-06, - "loss": 0.1768, - "step": 736 - }, - { - "epoch": 4.521472392638037, - "grad_norm": 3.2433223724365234, - "learning_rate": 2.879183990691929e-06, - "loss": 0.1598, - "step": 737 - }, - { - "epoch": 4.52760736196319, - "grad_norm": 3.0156848430633545, - "learning_rate": 2.8744206414764185e-06, - "loss": 0.0829, - "step": 738 - }, - { - "epoch": 4.533742331288344, - "grad_norm": 4.359529495239258, - "learning_rate": 2.8696559013984488e-06, - "loss": 0.1169, - "step": 739 - }, - { - "epoch": 4.539877300613497, - "grad_norm": 2.3862433433532715, - "learning_rate": 2.8648897881576274e-06, - "loss": 0.0962, - "step": 740 - }, - { - "epoch": 4.54601226993865, - "grad_norm": 2.7100136280059814, - "learning_rate": 2.8601223194586613e-06, - "loss": 0.1204, - "step": 741 - }, - { - "epoch": 4.552147239263804, - "grad_norm": 3.8116140365600586, - "learning_rate": 2.8553535130112935e-06, - "loss": 0.0685, - "step": 742 - }, - { - "epoch": 4.558282208588957, - "grad_norm": 2.9640142917633057, - "learning_rate": 2.850583386530235e-06, - "loss": 0.0692, - "step": 743 - }, - { - "epoch": 4.564417177914111, - "grad_norm": 3.264592170715332, - "learning_rate": 2.8458119577351035e-06, - "loss": 0.2128, - "step": 744 - }, - { - "epoch": 4.570552147239264, - "grad_norm": 3.230497360229492, - "learning_rate": 2.841039244350351e-06, - "loss": 0.2409, - "step": 745 - }, - { - "epoch": 4.576687116564417, - "grad_norm": 4.41513204574585, - "learning_rate": 2.8362652641052024e-06, - "loss": 0.1878, - "step": 746 - }, - { - "epoch": 4.58282208588957, - "grad_norm": 3.047248601913452, - "learning_rate": 2.83149003473359e-06, - "loss": 0.1303, - "step": 747 - }, - { - "epoch": 4.588957055214724, - "grad_norm": 2.399754047393799, - "learning_rate": 2.8267135739740836e-06, - "loss": 0.0577, - "step": 748 - }, - { - "epoch": 4.595092024539877, - "grad_norm": 4.608038425445557, - "learning_rate": 2.8219358995698307e-06, - "loss": 0.2329, - "step": 749 - }, - { - "epoch": 4.601226993865031, - "grad_norm": 3.537644147872925, - "learning_rate": 2.8171570292684846e-06, - "loss": 0.1329, - "step": 750 - }, - { - "epoch": 4.6073619631901845, - "grad_norm": 2.8099827766418457, - "learning_rate": 2.8123769808221407e-06, - "loss": 0.1512, - "step": 751 - }, - { - "epoch": 4.613496932515337, - "grad_norm": 3.3169758319854736, - "learning_rate": 2.8075957719872724e-06, - "loss": 0.1267, - "step": 752 - }, - { - "epoch": 4.61963190184049, - "grad_norm": 3.578435182571411, - "learning_rate": 2.8028134205246633e-06, - "loss": 0.147, - "step": 753 - }, - { - "epoch": 4.625766871165644, - "grad_norm": 3.544437885284424, - "learning_rate": 2.7980299441993415e-06, - "loss": 0.0947, - "step": 754 - }, - { - "epoch": 4.631901840490798, - "grad_norm": 3.798776388168335, - "learning_rate": 2.793245360780512e-06, - "loss": 0.1498, - "step": 755 - }, - { - "epoch": 4.638036809815951, - "grad_norm": 3.634991407394409, - "learning_rate": 2.788459688041495e-06, - "loss": 0.2504, - "step": 756 - }, - { - "epoch": 4.644171779141105, - "grad_norm": 20.123680114746094, - "learning_rate": 2.783672943759655e-06, - "loss": 0.2091, - "step": 757 - }, - { - "epoch": 4.6503067484662575, - "grad_norm": 3.9357221126556396, - "learning_rate": 2.778885145716339e-06, - "loss": 0.2045, - "step": 758 - }, - { - "epoch": 4.656441717791411, - "grad_norm": 3.3035309314727783, - "learning_rate": 2.7740963116968063e-06, - "loss": 0.1416, - "step": 759 - }, - { - "epoch": 4.662576687116564, - "grad_norm": 3.096985101699829, - "learning_rate": 2.7693064594901646e-06, - "loss": 0.0455, - "step": 760 - }, - { - "epoch": 4.668711656441718, - "grad_norm": 2.9855458736419678, - "learning_rate": 2.7645156068893075e-06, - "loss": 0.1496, - "step": 761 - }, - { - "epoch": 4.674846625766871, - "grad_norm": 3.9140093326568604, - "learning_rate": 2.759723771690839e-06, - "loss": 0.2061, - "step": 762 - }, - { - "epoch": 4.680981595092025, - "grad_norm": 3.590569496154785, - "learning_rate": 2.754930971695019e-06, - "loss": 0.1017, - "step": 763 - }, - { - "epoch": 4.6871165644171775, - "grad_norm": 3.527254581451416, - "learning_rate": 2.750137224705687e-06, - "loss": 0.1979, - "step": 764 - }, - { - "epoch": 4.693251533742331, - "grad_norm": 4.198459148406982, - "learning_rate": 2.745342548530202e-06, - "loss": 0.1667, - "step": 765 - }, - { - "epoch": 4.699386503067485, - "grad_norm": 2.0246167182922363, - "learning_rate": 2.7405469609793746e-06, - "loss": 0.0346, - "step": 766 - }, - { - "epoch": 4.705521472392638, - "grad_norm": 3.2045300006866455, - "learning_rate": 2.7357504798674004e-06, - "loss": 0.0596, - "step": 767 - }, - { - "epoch": 4.711656441717792, - "grad_norm": 2.736985921859741, - "learning_rate": 2.730953123011796e-06, - "loss": 0.0384, - "step": 768 - }, - { - "epoch": 4.717791411042945, - "grad_norm": 3.0621395111083984, - "learning_rate": 2.726154908233328e-06, - "loss": 0.0558, - "step": 769 - }, - { - "epoch": 4.723926380368098, - "grad_norm": 3.2280497550964355, - "learning_rate": 2.721355853355953e-06, - "loss": 0.2272, - "step": 770 - }, - { - "epoch": 4.730061349693251, - "grad_norm": 3.342226028442383, - "learning_rate": 2.716555976206748e-06, - "loss": 0.074, - "step": 771 - }, - { - "epoch": 4.736196319018405, - "grad_norm": 4.328624248504639, - "learning_rate": 2.7117552946158415e-06, - "loss": 0.1034, - "step": 772 - }, - { - "epoch": 4.742331288343558, - "grad_norm": 2.980215311050415, - "learning_rate": 2.706953826416353e-06, - "loss": 0.1199, - "step": 773 - }, - { - "epoch": 4.748466257668712, - "grad_norm": 2.622478485107422, - "learning_rate": 2.702151589444324e-06, - "loss": 0.0467, - "step": 774 - }, - { - "epoch": 4.754601226993865, - "grad_norm": 2.9958693981170654, - "learning_rate": 2.6973486015386507e-06, - "loss": 0.143, - "step": 775 - }, - { - "epoch": 4.7607361963190185, - "grad_norm": 4.548511505126953, - "learning_rate": 2.6925448805410197e-06, - "loss": 0.3594, - "step": 776 - }, - { - "epoch": 4.766871165644172, - "grad_norm": 3.3429481983184814, - "learning_rate": 2.6877404442958393e-06, - "loss": 0.1397, - "step": 777 - }, - { - "epoch": 4.773006134969325, - "grad_norm": 2.5820136070251465, - "learning_rate": 2.682935310650177e-06, - "loss": 0.054, - "step": 778 - }, - { - "epoch": 4.779141104294479, - "grad_norm": 4.047626495361328, - "learning_rate": 2.6781294974536886e-06, - "loss": 0.1284, - "step": 779 - }, - { - "epoch": 4.785276073619632, - "grad_norm": 3.0227510929107666, - "learning_rate": 2.673323022558557e-06, - "loss": 0.1441, - "step": 780 - }, - { - "epoch": 4.791411042944786, - "grad_norm": 4.731313705444336, - "learning_rate": 2.6685159038194202e-06, - "loss": 0.2859, - "step": 781 - }, - { - "epoch": 4.7975460122699385, - "grad_norm": 3.880655288696289, - "learning_rate": 2.6637081590933096e-06, - "loss": 0.1524, - "step": 782 - }, - { - "epoch": 4.803680981595092, - "grad_norm": 2.375474452972412, - "learning_rate": 2.6588998062395803e-06, - "loss": 0.0338, - "step": 783 - }, - { - "epoch": 4.809815950920245, - "grad_norm": 3.3587446212768555, - "learning_rate": 2.6540908631198498e-06, - "loss": 0.0755, - "step": 784 - }, - { - "epoch": 4.815950920245399, - "grad_norm": 2.767686367034912, - "learning_rate": 2.6492813475979243e-06, - "loss": 0.0631, - "step": 785 - }, - { - "epoch": 4.822085889570552, - "grad_norm": 3.88670015335083, - "learning_rate": 2.6444712775397397e-06, - "loss": 0.0853, - "step": 786 - }, - { - "epoch": 4.828220858895706, - "grad_norm": 3.543276309967041, - "learning_rate": 2.639660670813288e-06, - "loss": 0.1895, - "step": 787 - }, - { - "epoch": 4.8343558282208585, - "grad_norm": 3.659323215484619, - "learning_rate": 2.6348495452885598e-06, - "loss": 0.1745, - "step": 788 - }, - { - "epoch": 4.840490797546012, - "grad_norm": 3.0955021381378174, - "learning_rate": 2.630037918837468e-06, - "loss": 0.0846, - "step": 789 - }, - { - "epoch": 4.846625766871165, - "grad_norm": 3.4473249912261963, - "learning_rate": 2.6252258093337892e-06, - "loss": 0.0808, - "step": 790 - }, - { - "epoch": 4.852760736196319, - "grad_norm": 3.937120199203491, - "learning_rate": 2.6204132346530936e-06, - "loss": 0.2054, - "step": 791 - }, - { - "epoch": 4.858895705521473, - "grad_norm": 4.052806854248047, - "learning_rate": 2.6156002126726788e-06, - "loss": 0.1679, - "step": 792 - }, - { - "epoch": 4.865030674846626, - "grad_norm": 2.6694889068603516, - "learning_rate": 2.6107867612715043e-06, - "loss": 0.0534, - "step": 793 - }, - { - "epoch": 4.871165644171779, - "grad_norm": 3.594649076461792, - "learning_rate": 2.6059728983301267e-06, - "loss": 0.0899, - "step": 794 - }, - { - "epoch": 4.877300613496932, - "grad_norm": 2.7796030044555664, - "learning_rate": 2.601158641730629e-06, - "loss": 0.0596, - "step": 795 - }, - { - "epoch": 4.883435582822086, - "grad_norm": 4.618961334228516, - "learning_rate": 2.5963440093565567e-06, - "loss": 0.3858, - "step": 796 - }, - { - "epoch": 4.889570552147239, - "grad_norm": 3.0783939361572266, - "learning_rate": 2.5915290190928518e-06, - "loss": 0.12, - "step": 797 - }, - { - "epoch": 4.895705521472393, - "grad_norm": 4.078456878662109, - "learning_rate": 2.586713688825786e-06, - "loss": 0.1278, - "step": 798 - }, - { - "epoch": 4.901840490797546, - "grad_norm": 2.9439120292663574, - "learning_rate": 2.5818980364428935e-06, - "loss": 0.0847, - "step": 799 - }, - { - "epoch": 4.9079754601226995, - "grad_norm": 5.140681743621826, - "learning_rate": 2.5770820798329055e-06, - "loss": 0.1718, - "step": 800 - }, - { - "epoch": 4.914110429447852, - "grad_norm": 3.450190305709839, - "learning_rate": 2.572265836885682e-06, - "loss": 0.0895, - "step": 801 - }, - { - "epoch": 4.920245398773006, - "grad_norm": 3.1145224571228027, - "learning_rate": 2.567449325492149e-06, - "loss": 0.0652, - "step": 802 - }, - { - "epoch": 4.92638036809816, - "grad_norm": 2.851768732070923, - "learning_rate": 2.5626325635442283e-06, - "loss": 0.0877, - "step": 803 - }, - { - "epoch": 4.932515337423313, - "grad_norm": 3.3392980098724365, - "learning_rate": 2.5578155689347716e-06, - "loss": 0.2028, - "step": 804 - }, - { - "epoch": 4.938650306748467, - "grad_norm": 3.012439250946045, - "learning_rate": 2.5529983595574964e-06, - "loss": 0.031, - "step": 805 - }, - { - "epoch": 4.9447852760736195, - "grad_norm": 2.7732717990875244, - "learning_rate": 2.548180953306918e-06, - "loss": 0.0415, - "step": 806 - }, - { - "epoch": 4.950920245398773, - "grad_norm": 3.0423903465270996, - "learning_rate": 2.5433633680782817e-06, - "loss": 0.1188, - "step": 807 - }, - { - "epoch": 4.957055214723926, - "grad_norm": 5.056387901306152, - "learning_rate": 2.538545621767498e-06, - "loss": 0.1703, - "step": 808 - }, - { - "epoch": 4.96319018404908, - "grad_norm": 4.052585124969482, - "learning_rate": 2.533727732271077e-06, - "loss": 0.1455, - "step": 809 - }, - { - "epoch": 4.969325153374233, - "grad_norm": 3.4507904052734375, - "learning_rate": 2.5289097174860593e-06, - "loss": 0.0617, - "step": 810 - }, - { - "epoch": 4.975460122699387, - "grad_norm": 2.908266305923462, - "learning_rate": 2.524091595309952e-06, - "loss": 0.1173, - "step": 811 - }, - { - "epoch": 4.9815950920245395, - "grad_norm": 2.5857458114624023, - "learning_rate": 2.519273383640661e-06, - "loss": 0.0538, - "step": 812 - }, - { - "epoch": 4.987730061349693, - "grad_norm": 3.3518428802490234, - "learning_rate": 2.5144551003764227e-06, - "loss": 0.211, - "step": 813 - }, - { - "epoch": 4.993865030674847, - "grad_norm": 3.137981653213501, - "learning_rate": 2.509636763415742e-06, - "loss": 0.0944, - "step": 814 - }, - { - "epoch": 5.0, - "grad_norm": 2.8854241371154785, - "learning_rate": 2.5048183906573227e-06, - "loss": 0.098, - "step": 815 - }, - { - "epoch": 5.006134969325154, - "grad_norm": 3.508527994155884, - "learning_rate": 2.5e-06, - "loss": 0.1102, - "step": 816 - }, - { - "epoch": 5.012269938650307, - "grad_norm": 2.448152542114258, - "learning_rate": 2.495181609342678e-06, - "loss": 0.0712, - "step": 817 - }, - { - "epoch": 5.0184049079754605, - "grad_norm": 3.105818748474121, - "learning_rate": 2.4903632365842587e-06, - "loss": 0.0414, - "step": 818 - }, - { - "epoch": 5.024539877300613, - "grad_norm": 3.8048601150512695, - "learning_rate": 2.4855448996235777e-06, - "loss": 0.0894, - "step": 819 - }, - { - "epoch": 5.030674846625767, - "grad_norm": 3.259834051132202, - "learning_rate": 2.48072661635934e-06, - "loss": 0.0796, - "step": 820 - }, - { - "epoch": 5.03680981595092, - "grad_norm": 2.822364568710327, - "learning_rate": 2.475908404690049e-06, - "loss": 0.0349, - "step": 821 - }, - { - "epoch": 5.042944785276074, - "grad_norm": 4.78808069229126, - "learning_rate": 2.4710902825139415e-06, - "loss": 0.2529, - "step": 822 - }, - { - "epoch": 5.049079754601227, - "grad_norm": 3.5420572757720947, - "learning_rate": 2.466272267728924e-06, - "loss": 0.1405, - "step": 823 - }, - { - "epoch": 5.0552147239263805, - "grad_norm": 2.500713348388672, - "learning_rate": 2.461454378232503e-06, - "loss": 0.0408, - "step": 824 - }, - { - "epoch": 5.061349693251533, - "grad_norm": 3.266291618347168, - "learning_rate": 2.4566366319217196e-06, - "loss": 0.0338, - "step": 825 - }, - { - "epoch": 5.067484662576687, - "grad_norm": 4.071012020111084, - "learning_rate": 2.4518190466930837e-06, - "loss": 0.06, - "step": 826 - }, - { - "epoch": 5.07361963190184, - "grad_norm": 4.3747172355651855, - "learning_rate": 2.4470016404425045e-06, - "loss": 0.1184, - "step": 827 - }, - { - "epoch": 5.079754601226994, - "grad_norm": 3.92030668258667, - "learning_rate": 2.4421844310652296e-06, - "loss": 0.1369, - "step": 828 - }, - { - "epoch": 5.085889570552148, - "grad_norm": 3.3482303619384766, - "learning_rate": 2.437367436455773e-06, - "loss": 0.1166, - "step": 829 - }, - { - "epoch": 5.0920245398773005, - "grad_norm": 3.429368019104004, - "learning_rate": 2.4325506745078524e-06, - "loss": 0.1214, - "step": 830 - }, - { - "epoch": 5.098159509202454, - "grad_norm": 3.4915647506713867, - "learning_rate": 2.427734163114319e-06, - "loss": 0.0454, - "step": 831 - }, - { - "epoch": 5.104294478527607, - "grad_norm": 3.1721251010894775, - "learning_rate": 2.4229179201670954e-06, - "loss": 0.0431, - "step": 832 - }, - { - "epoch": 5.110429447852761, - "grad_norm": 2.552578926086426, - "learning_rate": 2.418101963557107e-06, - "loss": 0.0347, - "step": 833 - }, - { - "epoch": 5.116564417177914, - "grad_norm": 3.518169403076172, - "learning_rate": 2.413286311174214e-06, - "loss": 0.1555, - "step": 834 - }, - { - "epoch": 5.122699386503068, - "grad_norm": 2.4452908039093018, - "learning_rate": 2.4084709809071487e-06, - "loss": 0.035, - "step": 835 - }, - { - "epoch": 5.128834355828221, - "grad_norm": 3.5366528034210205, - "learning_rate": 2.403655990643444e-06, - "loss": 0.0798, - "step": 836 - }, - { - "epoch": 5.134969325153374, - "grad_norm": 2.300065040588379, - "learning_rate": 2.398841358269371e-06, - "loss": 0.0178, - "step": 837 - }, - { - "epoch": 5.141104294478527, - "grad_norm": 2.851393699645996, - "learning_rate": 2.3940271016698733e-06, - "loss": 0.0447, - "step": 838 - }, - { - "epoch": 5.147239263803681, - "grad_norm": 4.085958957672119, - "learning_rate": 2.3892132387284956e-06, - "loss": 0.1626, - "step": 839 - }, - { - "epoch": 5.153374233128835, - "grad_norm": 3.4240522384643555, - "learning_rate": 2.384399787327322e-06, - "loss": 0.0914, - "step": 840 - }, - { - "epoch": 5.159509202453988, - "grad_norm": 4.111586570739746, - "learning_rate": 2.3795867653469072e-06, - "loss": 0.0784, - "step": 841 - }, - { - "epoch": 5.1656441717791415, - "grad_norm": 2.3306312561035156, - "learning_rate": 2.374774190666211e-06, - "loss": 0.0216, - "step": 842 - }, - { - "epoch": 5.171779141104294, - "grad_norm": 2.5006275177001953, - "learning_rate": 2.3699620811625327e-06, - "loss": 0.0516, - "step": 843 - }, - { - "epoch": 5.177914110429448, - "grad_norm": 3.1680967807769775, - "learning_rate": 2.365150454711441e-06, - "loss": 0.0517, - "step": 844 - }, - { - "epoch": 5.184049079754601, - "grad_norm": 1.817044734954834, - "learning_rate": 2.3603393291867122e-06, - "loss": 0.0264, - "step": 845 - }, - { - "epoch": 5.190184049079755, - "grad_norm": 4.445211887359619, - "learning_rate": 2.355528722460261e-06, - "loss": 0.1079, - "step": 846 - }, - { - "epoch": 5.196319018404908, - "grad_norm": 2.918304681777954, - "learning_rate": 2.350718652402076e-06, - "loss": 0.0633, - "step": 847 - }, - { - "epoch": 5.2024539877300615, - "grad_norm": 3.6307432651519775, - "learning_rate": 2.345909136880151e-06, - "loss": 0.1013, - "step": 848 - }, - { - "epoch": 5.208588957055214, - "grad_norm": 3.5696842670440674, - "learning_rate": 2.34110019376042e-06, - "loss": 0.0199, - "step": 849 - }, - { - "epoch": 5.214723926380368, - "grad_norm": 2.2214856147766113, - "learning_rate": 2.336291840906691e-06, - "loss": 0.0288, - "step": 850 - }, - { - "epoch": 5.220858895705521, - "grad_norm": 2.5375778675079346, - "learning_rate": 2.3314840961805806e-06, - "loss": 0.0142, - "step": 851 - }, - { - "epoch": 5.226993865030675, - "grad_norm": 3.0093517303466797, - "learning_rate": 2.326676977441444e-06, - "loss": 0.0911, - "step": 852 - }, - { - "epoch": 5.233128834355828, - "grad_norm": 2.7067151069641113, - "learning_rate": 2.3218705025463118e-06, - "loss": 0.0315, - "step": 853 - }, - { - "epoch": 5.2392638036809815, - "grad_norm": 3.1892940998077393, - "learning_rate": 2.3170646893498237e-06, - "loss": 0.1344, - "step": 854 - }, - { - "epoch": 5.245398773006135, - "grad_norm": 2.8909313678741455, - "learning_rate": 2.312259555704161e-06, - "loss": 0.034, - "step": 855 - }, - { - "epoch": 5.251533742331288, - "grad_norm": 5.097650051116943, - "learning_rate": 2.3074551194589816e-06, - "loss": 0.1889, - "step": 856 - }, - { - "epoch": 5.257668711656442, - "grad_norm": 3.8511006832122803, - "learning_rate": 2.3026513984613506e-06, - "loss": 0.0794, - "step": 857 - }, - { - "epoch": 5.263803680981595, - "grad_norm": 2.2874133586883545, - "learning_rate": 2.297848410555677e-06, - "loss": 0.0238, - "step": 858 - }, - { - "epoch": 5.269938650306749, - "grad_norm": 3.504723310470581, - "learning_rate": 2.293046173583648e-06, - "loss": 0.0369, - "step": 859 - }, - { - "epoch": 5.276073619631902, - "grad_norm": 3.2108154296875, - "learning_rate": 2.28824470538416e-06, - "loss": 0.0677, - "step": 860 - }, - { - "epoch": 5.282208588957055, - "grad_norm": 2.2249386310577393, - "learning_rate": 2.2834440237932537e-06, - "loss": 0.0244, - "step": 861 - }, - { - "epoch": 5.288343558282208, - "grad_norm": 3.141784191131592, - "learning_rate": 2.2786441466440474e-06, - "loss": 0.0628, - "step": 862 - }, - { - "epoch": 5.294478527607362, - "grad_norm": 3.5597352981567383, - "learning_rate": 2.2738450917666727e-06, - "loss": 0.0914, - "step": 863 - }, - { - "epoch": 5.300613496932515, - "grad_norm": 2.991966962814331, - "learning_rate": 2.269046876988204e-06, - "loss": 0.0546, - "step": 864 - }, - { - "epoch": 5.306748466257669, - "grad_norm": 3.100776195526123, - "learning_rate": 2.2642495201325995e-06, - "loss": 0.0473, - "step": 865 - }, - { - "epoch": 5.3128834355828225, - "grad_norm": 2.541754722595215, - "learning_rate": 2.259453039020626e-06, - "loss": 0.0613, - "step": 866 - }, - { - "epoch": 5.319018404907975, - "grad_norm": 2.8117194175720215, - "learning_rate": 2.2546574514697985e-06, - "loss": 0.0533, - "step": 867 - }, - { - "epoch": 5.325153374233129, - "grad_norm": 2.5676379203796387, - "learning_rate": 2.249862775294313e-06, - "loss": 0.018, - "step": 868 - }, - { - "epoch": 5.331288343558282, - "grad_norm": 2.5297701358795166, - "learning_rate": 2.245069028304981e-06, - "loss": 0.0246, - "step": 869 - }, - { - "epoch": 5.337423312883436, - "grad_norm": 2.199498176574707, - "learning_rate": 2.240276228309161e-06, - "loss": 0.0551, - "step": 870 - }, - { - "epoch": 5.343558282208589, - "grad_norm": 2.5793557167053223, - "learning_rate": 2.2354843931106933e-06, - "loss": 0.0258, - "step": 871 - }, - { - "epoch": 5.3496932515337425, - "grad_norm": 3.352058172225952, - "learning_rate": 2.230693540509836e-06, - "loss": 0.0228, - "step": 872 - }, - { - "epoch": 5.355828220858895, - "grad_norm": 2.900599956512451, - "learning_rate": 2.225903688303195e-06, - "loss": 0.0586, - "step": 873 - }, - { - "epoch": 5.361963190184049, - "grad_norm": 3.3317267894744873, - "learning_rate": 2.221114854283662e-06, - "loss": 0.0733, - "step": 874 - }, - { - "epoch": 5.368098159509202, - "grad_norm": 2.79304575920105, - "learning_rate": 2.2163270562403453e-06, - "loss": 0.0251, - "step": 875 - }, - { - "epoch": 5.374233128834356, - "grad_norm": 3.8596227169036865, - "learning_rate": 2.211540311958506e-06, - "loss": 0.0957, - "step": 876 - }, - { - "epoch": 5.38036809815951, - "grad_norm": 2.7464358806610107, - "learning_rate": 2.2067546392194888e-06, - "loss": 0.0457, - "step": 877 - }, - { - "epoch": 5.386503067484663, - "grad_norm": 2.3359906673431396, - "learning_rate": 2.2019700558006598e-06, - "loss": 0.0218, - "step": 878 - }, - { - "epoch": 5.392638036809816, - "grad_norm": 3.2412452697753906, - "learning_rate": 2.197186579475337e-06, - "loss": 0.0494, - "step": 879 - }, - { - "epoch": 5.398773006134969, - "grad_norm": 3.930197238922119, - "learning_rate": 2.1924042280127284e-06, - "loss": 0.0803, - "step": 880 - }, - { - "epoch": 5.404907975460123, - "grad_norm": 2.5752930641174316, - "learning_rate": 2.1876230191778598e-06, - "loss": 0.0356, - "step": 881 - }, - { - "epoch": 5.411042944785276, - "grad_norm": 5.507393836975098, - "learning_rate": 2.182842970731516e-06, - "loss": 0.1245, - "step": 882 - }, - { - "epoch": 5.41717791411043, - "grad_norm": 2.416719436645508, - "learning_rate": 2.17806410043017e-06, - "loss": 0.0224, - "step": 883 - }, - { - "epoch": 5.423312883435583, - "grad_norm": 2.500429630279541, - "learning_rate": 2.173286426025917e-06, - "loss": 0.0499, - "step": 884 - }, - { - "epoch": 5.429447852760736, - "grad_norm": 2.8843860626220703, - "learning_rate": 2.168509965266411e-06, - "loss": 0.075, - "step": 885 - }, - { - "epoch": 5.435582822085889, - "grad_norm": 2.3187198638916016, - "learning_rate": 2.1637347358947984e-06, - "loss": 0.065, - "step": 886 - }, - { - "epoch": 5.441717791411043, - "grad_norm": 2.7135889530181885, - "learning_rate": 2.15896075564965e-06, - "loss": 0.0848, - "step": 887 - }, - { - "epoch": 5.447852760736196, - "grad_norm": 1.751846194267273, - "learning_rate": 2.1541880422648978e-06, - "loss": 0.0112, - "step": 888 - }, - { - "epoch": 5.45398773006135, - "grad_norm": 3.113271713256836, - "learning_rate": 2.1494166134697655e-06, - "loss": 0.077, - "step": 889 - }, - { - "epoch": 5.460122699386503, - "grad_norm": 2.711318016052246, - "learning_rate": 2.1446464869887077e-06, - "loss": 0.03, - "step": 890 - }, - { - "epoch": 5.466257668711656, - "grad_norm": 1.8012003898620605, - "learning_rate": 2.13987768054134e-06, - "loss": 0.0141, - "step": 891 - }, - { - "epoch": 5.47239263803681, - "grad_norm": 2.0968120098114014, - "learning_rate": 2.135110211842374e-06, - "loss": 0.0147, - "step": 892 - }, - { - "epoch": 5.478527607361963, - "grad_norm": 3.1689956188201904, - "learning_rate": 2.1303440986015525e-06, - "loss": 0.1123, - "step": 893 - }, - { - "epoch": 5.484662576687117, - "grad_norm": 4.512697219848633, - "learning_rate": 2.1255793585235827e-06, - "loss": 0.0359, - "step": 894 - }, - { - "epoch": 5.49079754601227, - "grad_norm": 3.5739688873291016, - "learning_rate": 2.120816009308071e-06, - "loss": 0.0635, - "step": 895 - }, - { - "epoch": 5.4969325153374236, - "grad_norm": 4.556554317474365, - "learning_rate": 2.1160540686494597e-06, - "loss": 0.1104, - "step": 896 - }, - { - "epoch": 5.5030674846625764, - "grad_norm": 2.2047064304351807, - "learning_rate": 2.1112935542369546e-06, - "loss": 0.0187, - "step": 897 - }, - { - "epoch": 5.50920245398773, - "grad_norm": 3.0289857387542725, - "learning_rate": 2.106534483754466e-06, - "loss": 0.0874, - "step": 898 - }, - { - "epoch": 5.515337423312883, - "grad_norm": 2.7090444564819336, - "learning_rate": 2.1017768748805396e-06, - "loss": 0.0301, - "step": 899 - }, - { - "epoch": 5.521472392638037, - "grad_norm": 3.0662643909454346, - "learning_rate": 2.0970207452882917e-06, - "loss": 0.1192, - "step": 900 - }, - { - "epoch": 5.52760736196319, - "grad_norm": 2.869401454925537, - "learning_rate": 2.0922661126453436e-06, - "loss": 0.0803, - "step": 901 - }, - { - "epoch": 5.533742331288344, - "grad_norm": 2.229947328567505, - "learning_rate": 2.0875129946137557e-06, - "loss": 0.0186, - "step": 902 - }, - { - "epoch": 5.539877300613497, - "grad_norm": 3.3460421562194824, - "learning_rate": 2.0827614088499624e-06, - "loss": 0.0499, - "step": 903 - }, - { - "epoch": 5.54601226993865, - "grad_norm": 1.9324007034301758, - "learning_rate": 2.0780113730047056e-06, - "loss": 0.0322, - "step": 904 - }, - { - "epoch": 5.552147239263804, - "grad_norm": 2.761482000350952, - "learning_rate": 2.0732629047229712e-06, - "loss": 0.0265, - "step": 905 - }, - { - "epoch": 5.558282208588957, - "grad_norm": 2.4173266887664795, - "learning_rate": 2.0685160216439205e-06, - "loss": 0.0229, - "step": 906 - }, - { - "epoch": 5.564417177914111, - "grad_norm": 2.503661632537842, - "learning_rate": 2.0637707414008267e-06, - "loss": 0.0266, - "step": 907 - }, - { - "epoch": 5.570552147239264, - "grad_norm": 2.312236785888672, - "learning_rate": 2.0590270816210077e-06, - "loss": 0.018, - "step": 908 - }, - { - "epoch": 5.576687116564417, - "grad_norm": 2.569575548171997, - "learning_rate": 2.0542850599257647e-06, - "loss": 0.0377, - "step": 909 - }, - { - "epoch": 5.58282208588957, - "grad_norm": 3.520341157913208, - "learning_rate": 2.0495446939303122e-06, - "loss": 0.1224, - "step": 910 - }, - { - "epoch": 5.588957055214724, - "grad_norm": 3.231363296508789, - "learning_rate": 2.044806001243714e-06, - "loss": 0.1457, - "step": 911 - }, - { - "epoch": 5.595092024539877, - "grad_norm": 3.3211300373077393, - "learning_rate": 2.040068999468818e-06, - "loss": 0.0429, - "step": 912 - }, - { - "epoch": 5.601226993865031, - "grad_norm": 3.3712961673736572, - "learning_rate": 2.035333706202192e-06, - "loss": 0.0634, - "step": 913 - }, - { - "epoch": 5.6073619631901845, - "grad_norm": 2.480177402496338, - "learning_rate": 2.0306001390340565e-06, - "loss": 0.0178, - "step": 914 - }, - { - "epoch": 5.613496932515337, - "grad_norm": 2.9777421951293945, - "learning_rate": 2.02586831554822e-06, - "loss": 0.037, - "step": 915 - }, - { - "epoch": 5.61963190184049, - "grad_norm": 2.9129085540771484, - "learning_rate": 2.021138253322012e-06, - "loss": 0.125, - "step": 916 - }, - { - "epoch": 5.625766871165644, - "grad_norm": 4.041767597198486, - "learning_rate": 2.016409969926224e-06, - "loss": 0.1897, - "step": 917 - }, - { - "epoch": 5.631901840490798, - "grad_norm": 4.088902950286865, - "learning_rate": 2.0116834829250355e-06, - "loss": 0.0546, - "step": 918 - }, - { - "epoch": 5.638036809815951, - "grad_norm": 3.8629167079925537, - "learning_rate": 2.0069588098759545e-06, - "loss": 0.0911, - "step": 919 - }, - { - "epoch": 5.644171779141105, - "grad_norm": 2.616830825805664, - "learning_rate": 2.00223596832975e-06, - "loss": 0.0527, - "step": 920 - }, - { - "epoch": 5.6503067484662575, - "grad_norm": 1.9370782375335693, - "learning_rate": 1.9975149758303885e-06, - "loss": 0.0384, - "step": 921 - }, - { - "epoch": 5.656441717791411, - "grad_norm": 3.7839455604553223, - "learning_rate": 1.992795849914967e-06, - "loss": 0.1033, - "step": 922 - }, - { - "epoch": 5.662576687116564, - "grad_norm": 3.870729923248291, - "learning_rate": 1.9880786081136498e-06, - "loss": 0.08, - "step": 923 - }, - { - "epoch": 5.668711656441718, - "grad_norm": 3.4394288063049316, - "learning_rate": 1.9833632679496008e-06, - "loss": 0.0819, - "step": 924 - }, - { - "epoch": 5.674846625766871, - "grad_norm": 3.1659159660339355, - "learning_rate": 1.97864984693892e-06, - "loss": 0.117, - "step": 925 - }, - { - "epoch": 5.680981595092025, - "grad_norm": 2.2375190258026123, - "learning_rate": 1.97393836259058e-06, - "loss": 0.0215, - "step": 926 - }, - { - "epoch": 5.6871165644171775, - "grad_norm": 3.9375314712524414, - "learning_rate": 1.969228832406358e-06, - "loss": 0.1422, - "step": 927 - }, - { - "epoch": 5.693251533742331, - "grad_norm": 3.1969058513641357, - "learning_rate": 1.964521273880772e-06, - "loss": 0.0538, - "step": 928 - }, - { - "epoch": 5.699386503067485, - "grad_norm": 3.5990066528320312, - "learning_rate": 1.9598157045010162e-06, - "loss": 0.114, - "step": 929 - }, - { - "epoch": 5.705521472392638, - "grad_norm": 3.1764235496520996, - "learning_rate": 1.9551121417468955e-06, - "loss": 0.053, - "step": 930 - }, - { - "epoch": 5.711656441717792, - "grad_norm": 4.1162309646606445, - "learning_rate": 1.9504106030907605e-06, - "loss": 0.0866, - "step": 931 - }, - { - "epoch": 5.717791411042945, - "grad_norm": 3.543071985244751, - "learning_rate": 1.945711105997444e-06, - "loss": 0.0908, - "step": 932 - }, - { - "epoch": 5.723926380368098, - "grad_norm": 4.136870384216309, - "learning_rate": 1.941013667924194e-06, - "loss": 0.0612, - "step": 933 - }, - { - "epoch": 5.730061349693251, - "grad_norm": 1.7658357620239258, - "learning_rate": 1.9363183063206097e-06, - "loss": 0.0283, - "step": 934 - }, - { - "epoch": 5.736196319018405, - "grad_norm": 3.9701411724090576, - "learning_rate": 1.931625038628577e-06, - "loss": 0.0948, - "step": 935 - }, - { - "epoch": 5.742331288343558, - "grad_norm": 3.0636157989501953, - "learning_rate": 1.9269338822822047e-06, - "loss": 0.0769, - "step": 936 - }, - { - "epoch": 5.748466257668712, - "grad_norm": 3.3671388626098633, - "learning_rate": 1.9222448547077573e-06, - "loss": 0.098, - "step": 937 - }, - { - "epoch": 5.754601226993865, - "grad_norm": 3.0725975036621094, - "learning_rate": 1.917557973323591e-06, - "loss": 0.0363, - "step": 938 - }, - { - "epoch": 5.7607361963190185, - "grad_norm": 2.5592041015625, - "learning_rate": 1.9128732555400915e-06, - "loss": 0.0205, - "step": 939 - }, - { - "epoch": 5.766871165644172, - "grad_norm": 2.835740804672241, - "learning_rate": 1.9081907187596054e-06, - "loss": 0.0548, - "step": 940 - }, - { - "epoch": 5.773006134969325, - "grad_norm": 3.3596746921539307, - "learning_rate": 1.9035103803763793e-06, - "loss": 0.0454, - "step": 941 - }, - { - "epoch": 5.779141104294479, - "grad_norm": 3.226579427719116, - "learning_rate": 1.8988322577764918e-06, - "loss": 0.0514, - "step": 942 - }, - { - "epoch": 5.785276073619632, - "grad_norm": 3.2044687271118164, - "learning_rate": 1.8941563683377905e-06, - "loss": 0.1361, - "step": 943 - }, - { - "epoch": 5.791411042944786, - "grad_norm": 1.8300527334213257, - "learning_rate": 1.8894827294298296e-06, - "loss": 0.0139, - "step": 944 - }, - { - "epoch": 5.7975460122699385, - "grad_norm": 2.503735303878784, - "learning_rate": 1.884811358413801e-06, - "loss": 0.0311, - "step": 945 - }, - { - "epoch": 5.803680981595092, - "grad_norm": 2.171309471130371, - "learning_rate": 1.8801422726424735e-06, - "loss": 0.0227, - "step": 946 - }, - { - "epoch": 5.809815950920245, - "grad_norm": 1.8116636276245117, - "learning_rate": 1.8754754894601252e-06, - "loss": 0.0157, - "step": 947 - }, - { - "epoch": 5.815950920245399, - "grad_norm": 3.1412570476531982, - "learning_rate": 1.870811026202482e-06, - "loss": 0.1093, - "step": 948 - }, - { - "epoch": 5.822085889570552, - "grad_norm": 2.3962290287017822, - "learning_rate": 1.8661489001966526e-06, - "loss": 0.021, - "step": 949 - }, - { - "epoch": 5.828220858895706, - "grad_norm": 4.169166564941406, - "learning_rate": 1.8614891287610621e-06, - "loss": 0.0663, - "step": 950 - }, - { - "epoch": 5.8343558282208585, - "grad_norm": 3.1181528568267822, - "learning_rate": 1.8568317292053894e-06, - "loss": 0.1008, - "step": 951 - }, - { - "epoch": 5.840490797546012, - "grad_norm": 3.5155029296875, - "learning_rate": 1.8521767188305023e-06, - "loss": 0.0451, - "step": 952 - }, - { - "epoch": 5.846625766871165, - "grad_norm": 2.975693702697754, - "learning_rate": 1.8475241149283957e-06, - "loss": 0.0561, - "step": 953 - }, - { - "epoch": 5.852760736196319, - "grad_norm": 2.1581289768218994, - "learning_rate": 1.842873934782122e-06, - "loss": 0.0265, - "step": 954 - }, - { - "epoch": 5.858895705521473, - "grad_norm": 2.6281228065490723, - "learning_rate": 1.8382261956657318e-06, - "loss": 0.1196, - "step": 955 - }, - { - "epoch": 5.865030674846626, - "grad_norm": 2.9569528102874756, - "learning_rate": 1.8335809148442074e-06, - "loss": 0.1356, - "step": 956 - }, - { - "epoch": 5.871165644171779, - "grad_norm": 2.450949192047119, - "learning_rate": 1.8289381095734005e-06, - "loss": 0.0444, - "step": 957 - }, - { - "epoch": 5.877300613496932, - "grad_norm": 2.1737027168273926, - "learning_rate": 1.8242977970999643e-06, - "loss": 0.0622, - "step": 958 - }, - { - "epoch": 5.883435582822086, - "grad_norm": 3.350647211074829, - "learning_rate": 1.8196599946612956e-06, - "loss": 0.0762, - "step": 959 - }, - { - "epoch": 5.889570552147239, - "grad_norm": 2.5031936168670654, - "learning_rate": 1.8150247194854642e-06, - "loss": 0.0207, - "step": 960 - }, - { - "epoch": 5.895705521472393, - "grad_norm": 3.7103707790374756, - "learning_rate": 1.8103919887911525e-06, - "loss": 0.1122, - "step": 961 - }, - { - "epoch": 5.901840490797546, - "grad_norm": 2.485322952270508, - "learning_rate": 1.8057618197875914e-06, - "loss": 0.0284, - "step": 962 - }, - { - "epoch": 5.9079754601226995, - "grad_norm": 1.903212547302246, - "learning_rate": 1.8011342296744961e-06, - "loss": 0.0239, - "step": 963 - }, - { - "epoch": 5.914110429447852, - "grad_norm": 3.015552520751953, - "learning_rate": 1.796509235642001e-06, - "loss": 0.0425, - "step": 964 - }, - { - "epoch": 5.920245398773006, - "grad_norm": 4.806198596954346, - "learning_rate": 1.7918868548705982e-06, - "loss": 0.2094, - "step": 965 - }, - { - "epoch": 5.92638036809816, - "grad_norm": 2.949596643447876, - "learning_rate": 1.7872671045310703e-06, - "loss": 0.0632, - "step": 966 - }, - { - "epoch": 5.932515337423313, - "grad_norm": 4.153099536895752, - "learning_rate": 1.782650001784431e-06, - "loss": 0.1411, - "step": 967 - }, - { - "epoch": 5.938650306748467, - "grad_norm": 3.4117565155029297, - "learning_rate": 1.7780355637818568e-06, - "loss": 0.0965, - "step": 968 - }, - { - "epoch": 5.9447852760736195, - "grad_norm": 2.533405303955078, - "learning_rate": 1.7734238076646277e-06, - "loss": 0.0568, - "step": 969 - }, - { - "epoch": 5.950920245398773, - "grad_norm": 2.3604726791381836, - "learning_rate": 1.7688147505640581e-06, - "loss": 0.0182, - "step": 970 - }, - { - "epoch": 5.957055214723926, - "grad_norm": 3.807424306869507, - "learning_rate": 1.7642084096014405e-06, - "loss": 0.0547, - "step": 971 - }, - { - "epoch": 5.96319018404908, - "grad_norm": 2.5735342502593994, - "learning_rate": 1.759604801887974e-06, - "loss": 0.0775, - "step": 972 - }, - { - "epoch": 5.969325153374233, - "grad_norm": 2.9217734336853027, - "learning_rate": 1.7550039445247069e-06, - "loss": 0.0541, - "step": 973 - }, - { - "epoch": 5.975460122699387, - "grad_norm": 2.793104410171509, - "learning_rate": 1.7504058546024694e-06, - "loss": 0.0257, - "step": 974 - }, - { - "epoch": 5.9815950920245395, - "grad_norm": 3.5610134601593018, - "learning_rate": 1.7458105492018114e-06, - "loss": 0.0767, - "step": 975 - }, - { - "epoch": 5.987730061349693, - "grad_norm": 2.0738015174865723, - "learning_rate": 1.7412180453929412e-06, - "loss": 0.025, - "step": 976 - }, - { - "epoch": 5.993865030674847, - "grad_norm": 2.1248421669006348, - "learning_rate": 1.736628360235657e-06, - "loss": 0.0183, - "step": 977 - }, - { - "epoch": 6.0, - "grad_norm": 2.901273727416992, - "learning_rate": 1.7320415107792893e-06, - "loss": 0.1369, - "step": 978 - }, - { - "epoch": 6.006134969325154, - "grad_norm": 3.815110683441162, - "learning_rate": 1.7274575140626318e-06, - "loss": 0.1011, - "step": 979 - }, - { - "epoch": 6.012269938650307, - "grad_norm": 2.421208381652832, - "learning_rate": 1.7228763871138845e-06, - "loss": 0.0105, - "step": 980 - }, - { - "epoch": 6.0184049079754605, - "grad_norm": 2.7103846073150635, - "learning_rate": 1.718298146950585e-06, - "loss": 0.0373, - "step": 981 - }, - { - "epoch": 6.024539877300613, - "grad_norm": 1.3751411437988281, - "learning_rate": 1.7137228105795473e-06, - "loss": 0.0072, - "step": 982 - }, - { - "epoch": 6.030674846625767, - "grad_norm": 1.5235071182250977, - "learning_rate": 1.7091503949967987e-06, - "loss": 0.0126, - "step": 983 - }, - { - "epoch": 6.03680981595092, - "grad_norm": 2.0652546882629395, - "learning_rate": 1.7045809171875183e-06, - "loss": 0.0198, - "step": 984 - }, - { - "epoch": 6.042944785276074, - "grad_norm": 2.010207176208496, - "learning_rate": 1.70001439412597e-06, - "loss": 0.0186, - "step": 985 - }, - { - "epoch": 6.049079754601227, - "grad_norm": 2.0444021224975586, - "learning_rate": 1.6954508427754435e-06, - "loss": 0.0197, - "step": 986 - }, - { - "epoch": 6.0552147239263805, - "grad_norm": 2.6540091037750244, - "learning_rate": 1.690890280088187e-06, - "loss": 0.0192, - "step": 987 - }, - { - "epoch": 6.061349693251533, - "grad_norm": 1.6479653120040894, - "learning_rate": 1.6863327230053506e-06, - "loss": 0.0105, - "step": 988 - }, - { - "epoch": 6.067484662576687, - "grad_norm": 2.4434754848480225, - "learning_rate": 1.6817781884569146e-06, - "loss": 0.0275, - "step": 989 - }, - { - "epoch": 6.07361963190184, - "grad_norm": 1.7472137212753296, - "learning_rate": 1.677226693361636e-06, - "loss": 0.0095, - "step": 990 - }, - { - "epoch": 6.079754601226994, - "grad_norm": 2.952821969985962, - "learning_rate": 1.6726782546269793e-06, - "loss": 0.0483, - "step": 991 - }, - { - "epoch": 6.085889570552148, - "grad_norm": 3.123959541320801, - "learning_rate": 1.6681328891490544e-06, - "loss": 0.0815, - "step": 992 - }, - { - "epoch": 6.0920245398773005, - "grad_norm": 2.9924800395965576, - "learning_rate": 1.663590613812556e-06, - "loss": 0.0216, - "step": 993 - }, - { - "epoch": 6.098159509202454, - "grad_norm": 2.417778730392456, - "learning_rate": 1.6590514454907007e-06, - "loss": 0.0243, - "step": 994 - }, - { - "epoch": 6.104294478527607, - "grad_norm": 2.0682942867279053, - "learning_rate": 1.6545154010451613e-06, - "loss": 0.0669, - "step": 995 - }, - { - "epoch": 6.110429447852761, - "grad_norm": 2.9801135063171387, - "learning_rate": 1.6499824973260086e-06, - "loss": 0.0309, - "step": 996 - }, - { - "epoch": 6.116564417177914, - "grad_norm": 1.5753487348556519, - "learning_rate": 1.645452751171645e-06, - "loss": 0.026, - "step": 997 - }, - { - "epoch": 6.122699386503068, - "grad_norm": 2.461124897003174, - "learning_rate": 1.6409261794087438e-06, - "loss": 0.0191, - "step": 998 - }, - { - "epoch": 6.128834355828221, - "grad_norm": 3.839308261871338, - "learning_rate": 1.6364027988521875e-06, - "loss": 0.045, - "step": 999 - }, - { - "epoch": 6.134969325153374, - "grad_norm": 2.9653189182281494, - "learning_rate": 1.6318826263050022e-06, - "loss": 0.0197, - "step": 1000 - }, - { - "epoch": 6.141104294478527, - "grad_norm": 1.1804074048995972, - "learning_rate": 1.6273656785582986e-06, - "loss": 0.0092, - "step": 1001 - }, - { - "epoch": 6.147239263803681, - "grad_norm": 1.9027175903320312, - "learning_rate": 1.6228519723912073e-06, - "loss": 0.0141, - "step": 1002 - }, - { - "epoch": 6.153374233128835, - "grad_norm": 1.831039309501648, - "learning_rate": 1.618341524570819e-06, - "loss": 0.0131, - "step": 1003 - }, - { - "epoch": 6.159509202453988, - "grad_norm": 2.547327756881714, - "learning_rate": 1.613834351852119e-06, - "loss": 0.0686, - "step": 1004 - }, - { - "epoch": 6.1656441717791415, - "grad_norm": 2.746947765350342, - "learning_rate": 1.6093304709779273e-06, - "loss": 0.036, - "step": 1005 - }, - { - "epoch": 6.171779141104294, - "grad_norm": 2.0104732513427734, - "learning_rate": 1.6048298986788345e-06, - "loss": 0.0216, - "step": 1006 - }, - { - "epoch": 6.177914110429448, - "grad_norm": 2.655977725982666, - "learning_rate": 1.6003326516731431e-06, - "loss": 0.024, - "step": 1007 - }, - { - "epoch": 6.184049079754601, - "grad_norm": 2.0733132362365723, - "learning_rate": 1.5958387466668015e-06, - "loss": 0.0133, - "step": 1008 - }, - { - "epoch": 6.190184049079755, - "grad_norm": 2.5398054122924805, - "learning_rate": 1.5913482003533437e-06, - "loss": 0.0331, - "step": 1009 - }, - { - "epoch": 6.196319018404908, - "grad_norm": 1.7983721494674683, - "learning_rate": 1.5868610294138264e-06, - "loss": 0.0111, - "step": 1010 - }, - { - "epoch": 6.2024539877300615, - "grad_norm": 1.7259647846221924, - "learning_rate": 1.58237725051677e-06, - "loss": 0.0112, - "step": 1011 - }, - { - "epoch": 6.208588957055214, - "grad_norm": 1.7722725868225098, - "learning_rate": 1.577896880318093e-06, - "loss": 0.0181, - "step": 1012 - }, - { - "epoch": 6.214723926380368, - "grad_norm": 3.633545398712158, - "learning_rate": 1.5734199354610513e-06, - "loss": 0.0135, - "step": 1013 - }, - { - "epoch": 6.220858895705521, - "grad_norm": 1.8951494693756104, - "learning_rate": 1.5689464325761764e-06, - "loss": 0.0163, - "step": 1014 - }, - { - "epoch": 6.226993865030675, - "grad_norm": 1.637170433998108, - "learning_rate": 1.564476388281216e-06, - "loss": 0.0068, - "step": 1015 - }, - { - "epoch": 6.233128834355828, - "grad_norm": 2.2963850498199463, - "learning_rate": 1.5600098191810682e-06, - "loss": 0.021, - "step": 1016 - }, - { - "epoch": 6.2392638036809815, - "grad_norm": 2.777996063232422, - "learning_rate": 1.555546741867722e-06, - "loss": 0.0349, - "step": 1017 - }, - { - "epoch": 6.245398773006135, - "grad_norm": 2.1580724716186523, - "learning_rate": 1.5510871729201953e-06, - "loss": 0.0626, - "step": 1018 - }, - { - "epoch": 6.251533742331288, - "grad_norm": 1.4158363342285156, - "learning_rate": 1.5466311289044755e-06, - "loss": 0.0082, - "step": 1019 - }, - { - "epoch": 6.257668711656442, - "grad_norm": 3.287564516067505, - "learning_rate": 1.5421786263734524e-06, - "loss": 0.0212, - "step": 1020 - }, - { - "epoch": 6.263803680981595, - "grad_norm": 2.4552016258239746, - "learning_rate": 1.5377296818668638e-06, - "loss": 0.0963, - "step": 1021 - }, - { - "epoch": 6.269938650306749, - "grad_norm": 1.877556562423706, - "learning_rate": 1.5332843119112285e-06, - "loss": 0.011, - "step": 1022 - }, - { - "epoch": 6.276073619631902, - "grad_norm": 3.720372438430786, - "learning_rate": 1.5288425330197864e-06, - "loss": 0.018, - "step": 1023 - }, - { - "epoch": 6.282208588957055, - "grad_norm": 1.9751925468444824, - "learning_rate": 1.5244043616924389e-06, - "loss": 0.0162, - "step": 1024 - }, - { - "epoch": 6.288343558282208, - "grad_norm": 2.5137453079223633, - "learning_rate": 1.5199698144156865e-06, - "loss": 0.0468, - "step": 1025 - }, - { - "epoch": 6.294478527607362, - "grad_norm": 2.111983299255371, - "learning_rate": 1.5155389076625663e-06, - "loss": 0.0064, - "step": 1026 - }, - { - "epoch": 6.300613496932515, - "grad_norm": 2.572223663330078, - "learning_rate": 1.5111116578925924e-06, - "loss": 0.035, - "step": 1027 - }, - { - "epoch": 6.306748466257669, - "grad_norm": 2.7881019115448, - "learning_rate": 1.5066880815516943e-06, - "loss": 0.0197, - "step": 1028 - }, - { - "epoch": 6.3128834355828225, - "grad_norm": 1.2287017107009888, - "learning_rate": 1.5022681950721565e-06, - "loss": 0.0059, - "step": 1029 - }, - { - "epoch": 6.319018404907975, - "grad_norm": 1.764028549194336, - "learning_rate": 1.4978520148725558e-06, - "loss": 0.006, - "step": 1030 - }, - { - "epoch": 6.325153374233129, - "grad_norm": 2.399787664413452, - "learning_rate": 1.4934395573577016e-06, - "loss": 0.0126, - "step": 1031 - }, - { - "epoch": 6.331288343558282, - "grad_norm": 1.9056172370910645, - "learning_rate": 1.4890308389185743e-06, - "loss": 0.0131, - "step": 1032 - }, - { - "epoch": 6.337423312883436, - "grad_norm": 1.7394744157791138, - "learning_rate": 1.484625875932265e-06, - "loss": 0.016, - "step": 1033 - }, - { - "epoch": 6.343558282208589, - "grad_norm": 4.352719306945801, - "learning_rate": 1.480224684761915e-06, - "loss": 0.1059, - "step": 1034 - }, - { - "epoch": 6.3496932515337425, - "grad_norm": 2.148385524749756, - "learning_rate": 1.4758272817566538e-06, - "loss": 0.0312, - "step": 1035 - }, - { - "epoch": 6.355828220858895, - "grad_norm": 2.483872175216675, - "learning_rate": 1.4714336832515386e-06, - "loss": 0.0215, - "step": 1036 - }, - { - "epoch": 6.361963190184049, - "grad_norm": 2.6151270866394043, - "learning_rate": 1.467043905567494e-06, - "loss": 0.0718, - "step": 1037 - }, - { - "epoch": 6.368098159509202, - "grad_norm": 2.554600954055786, - "learning_rate": 1.4626579650112533e-06, - "loss": 0.0166, - "step": 1038 - }, - { - "epoch": 6.374233128834356, - "grad_norm": 3.013974905014038, - "learning_rate": 1.4582758778752926e-06, - "loss": 0.0448, - "step": 1039 - }, - { - "epoch": 6.38036809815951, - "grad_norm": 2.1542789936065674, - "learning_rate": 1.4538976604377781e-06, - "loss": 0.0297, - "step": 1040 - }, - { - "epoch": 6.386503067484663, - "grad_norm": 3.4402377605438232, - "learning_rate": 1.449523328962496e-06, - "loss": 0.0409, - "step": 1041 - }, - { - "epoch": 6.392638036809816, - "grad_norm": 1.6200538873672485, - "learning_rate": 1.4451528996988018e-06, - "loss": 0.0127, - "step": 1042 - }, - { - "epoch": 6.398773006134969, - "grad_norm": 3.081733465194702, - "learning_rate": 1.4407863888815527e-06, - "loss": 0.0788, - "step": 1043 - }, - { - "epoch": 6.404907975460123, - "grad_norm": 1.9813143014907837, - "learning_rate": 1.436423812731051e-06, - "loss": 0.0082, - "step": 1044 - }, - { - "epoch": 6.411042944785276, - "grad_norm": 1.7354048490524292, - "learning_rate": 1.432065187452984e-06, - "loss": 0.0086, - "step": 1045 - }, - { - "epoch": 6.41717791411043, - "grad_norm": 1.8812576532363892, - "learning_rate": 1.4277105292383594e-06, - "loss": 0.04, - "step": 1046 - }, - { - "epoch": 6.423312883435583, - "grad_norm": 1.117837905883789, - "learning_rate": 1.4233598542634519e-06, - "loss": 0.0054, - "step": 1047 - }, - { - "epoch": 6.429447852760736, - "grad_norm": 1.9587867259979248, - "learning_rate": 1.4190131786897388e-06, - "loss": 0.0263, - "step": 1048 - }, - { - "epoch": 6.435582822085889, - "grad_norm": 1.2712376117706299, - "learning_rate": 1.4146705186638388e-06, - "loss": 0.0098, - "step": 1049 - }, - { - "epoch": 6.441717791411043, - "grad_norm": 2.6563849449157715, - "learning_rate": 1.410331890317457e-06, - "loss": 0.0322, - "step": 1050 - }, - { - "epoch": 6.447852760736196, - "grad_norm": 3.136518955230713, - "learning_rate": 1.4059973097673187e-06, - "loss": 0.0729, - "step": 1051 - }, - { - "epoch": 6.45398773006135, - "grad_norm": 1.3937572240829468, - "learning_rate": 1.4016667931151156e-06, - "loss": 0.0094, - "step": 1052 - }, - { - "epoch": 6.460122699386503, - "grad_norm": 1.7218928337097168, - "learning_rate": 1.3973403564474422e-06, - "loss": 0.0078, - "step": 1053 - }, - { - "epoch": 6.466257668711656, - "grad_norm": 2.35612416267395, - "learning_rate": 1.393018015835737e-06, - "loss": 0.0231, - "step": 1054 - }, - { - "epoch": 6.47239263803681, - "grad_norm": 1.96125066280365, - "learning_rate": 1.388699787336224e-06, - "loss": 0.0153, - "step": 1055 - }, - { - "epoch": 6.478527607361963, - "grad_norm": 2.1789233684539795, - "learning_rate": 1.3843856869898486e-06, - "loss": 0.0136, - "step": 1056 - }, - { - "epoch": 6.484662576687117, - "grad_norm": 3.1261701583862305, - "learning_rate": 1.3800757308222263e-06, - "loss": 0.0819, - "step": 1057 - }, - { - "epoch": 6.49079754601227, - "grad_norm": 2.93422794342041, - "learning_rate": 1.3757699348435726e-06, - "loss": 0.0658, - "step": 1058 - }, - { - "epoch": 6.4969325153374236, - "grad_norm": 2.1311776638031006, - "learning_rate": 1.3714683150486534e-06, - "loss": 0.0106, - "step": 1059 - }, - { - "epoch": 6.5030674846625764, - "grad_norm": 1.699877381324768, - "learning_rate": 1.3671708874167211e-06, - "loss": 0.0151, - "step": 1060 - }, - { - "epoch": 6.50920245398773, - "grad_norm": 1.7288825511932373, - "learning_rate": 1.3628776679114516e-06, - "loss": 0.0114, - "step": 1061 - }, - { - "epoch": 6.515337423312883, - "grad_norm": 1.8437966108322144, - "learning_rate": 1.3585886724808934e-06, - "loss": 0.0117, - "step": 1062 - }, - { - "epoch": 6.521472392638037, - "grad_norm": 3.073568344116211, - "learning_rate": 1.3543039170574022e-06, - "loss": 0.0381, - "step": 1063 - }, - { - "epoch": 6.52760736196319, - "grad_norm": 1.6069157123565674, - "learning_rate": 1.350023417557581e-06, - "loss": 0.0072, - "step": 1064 - }, - { - "epoch": 6.533742331288344, - "grad_norm": 2.48502779006958, - "learning_rate": 1.345747189882228e-06, - "loss": 0.0302, - "step": 1065 - }, - { - "epoch": 6.539877300613497, - "grad_norm": 1.6879143714904785, - "learning_rate": 1.3414752499162676e-06, - "loss": 0.0095, - "step": 1066 - }, - { - "epoch": 6.54601226993865, - "grad_norm": 2.2126848697662354, - "learning_rate": 1.3372076135287005e-06, - "loss": 0.067, - "step": 1067 - }, - { - "epoch": 6.552147239263804, - "grad_norm": 2.157269239425659, - "learning_rate": 1.33294429657254e-06, - "loss": 0.0203, - "step": 1068 - }, - { - "epoch": 6.558282208588957, - "grad_norm": 2.725158452987671, - "learning_rate": 1.3286853148847523e-06, - "loss": 0.0217, - "step": 1069 - }, - { - "epoch": 6.564417177914111, - "grad_norm": 2.478426456451416, - "learning_rate": 1.3244306842862007e-06, - "loss": 0.0223, - "step": 1070 - }, - { - "epoch": 6.570552147239264, - "grad_norm": 2.349463939666748, - "learning_rate": 1.3201804205815872e-06, - "loss": 0.027, - "step": 1071 - }, - { - "epoch": 6.576687116564417, - "grad_norm": 2.049593210220337, - "learning_rate": 1.3159345395593876e-06, - "loss": 0.0212, - "step": 1072 - }, - { - "epoch": 6.58282208588957, - "grad_norm": 2.3445141315460205, - "learning_rate": 1.3116930569918024e-06, - "loss": 0.0182, - "step": 1073 - }, - { - "epoch": 6.588957055214724, - "grad_norm": 3.756135940551758, - "learning_rate": 1.3074559886346886e-06, - "loss": 0.1187, - "step": 1074 - }, - { - "epoch": 6.595092024539877, - "grad_norm": 2.4747114181518555, - "learning_rate": 1.3032233502275089e-06, - "loss": 0.0103, - "step": 1075 - }, - { - "epoch": 6.601226993865031, - "grad_norm": 2.0029311180114746, - "learning_rate": 1.2989951574932693e-06, - "loss": 0.0115, - "step": 1076 - }, - { - "epoch": 6.6073619631901845, - "grad_norm": 2.007141351699829, - "learning_rate": 1.2947714261384602e-06, - "loss": 0.0155, - "step": 1077 - }, - { - "epoch": 6.613496932515337, - "grad_norm": 1.5075048208236694, - "learning_rate": 1.2905521718530012e-06, - "loss": 0.0125, - "step": 1078 - }, - { - "epoch": 6.61963190184049, - "grad_norm": 1.9235132932662964, - "learning_rate": 1.2863374103101784e-06, - "loss": 0.0181, - "step": 1079 - }, - { - "epoch": 6.625766871165644, - "grad_norm": 1.7235040664672852, - "learning_rate": 1.2821271571665912e-06, - "loss": 0.0102, - "step": 1080 - }, - { - "epoch": 6.631901840490798, - "grad_norm": 3.503974676132202, - "learning_rate": 1.277921428062091e-06, - "loss": 0.0969, - "step": 1081 - }, - { - "epoch": 6.638036809815951, - "grad_norm": 2.4633288383483887, - "learning_rate": 1.2737202386197222e-06, - "loss": 0.0383, - "step": 1082 - }, - { - "epoch": 6.644171779141105, - "grad_norm": 2.332341432571411, - "learning_rate": 1.2695236044456672e-06, - "loss": 0.0184, - "step": 1083 - }, - { - "epoch": 6.6503067484662575, - "grad_norm": 2.8279805183410645, - "learning_rate": 1.2653315411291867e-06, - "loss": 0.0327, - "step": 1084 - }, - { - "epoch": 6.656441717791411, - "grad_norm": 2.444810628890991, - "learning_rate": 1.2611440642425617e-06, - "loss": 0.0399, - "step": 1085 - }, - { - "epoch": 6.662576687116564, - "grad_norm": 2.9304957389831543, - "learning_rate": 1.2569611893410374e-06, - "loss": 0.0385, - "step": 1086 - }, - { - "epoch": 6.668711656441718, - "grad_norm": 2.1244678497314453, - "learning_rate": 1.2527829319627604e-06, - "loss": 0.0123, - "step": 1087 - }, - { - "epoch": 6.674846625766871, - "grad_norm": 2.129033327102661, - "learning_rate": 1.248609307628729e-06, - "loss": 0.0302, - "step": 1088 - }, - { - "epoch": 6.680981595092025, - "grad_norm": 5.788925647735596, - "learning_rate": 1.2444403318427268e-06, - "loss": 0.0296, - "step": 1089 - }, - { - "epoch": 6.6871165644171775, - "grad_norm": 5.127935886383057, - "learning_rate": 1.2402760200912725e-06, - "loss": 0.1532, - "step": 1090 - }, - { - "epoch": 6.693251533742331, - "grad_norm": 2.2610318660736084, - "learning_rate": 1.2361163878435594e-06, - "loss": 0.0126, - "step": 1091 - }, - { - "epoch": 6.699386503067485, - "grad_norm": 1.7913328409194946, - "learning_rate": 1.2319614505513953e-06, - "loss": 0.0086, - "step": 1092 - }, - { - "epoch": 6.705521472392638, - "grad_norm": 1.5961267948150635, - "learning_rate": 1.227811223649149e-06, - "loss": 0.0041, - "step": 1093 - }, - { - "epoch": 6.711656441717792, - "grad_norm": 1.441754937171936, - "learning_rate": 1.2236657225536938e-06, - "loss": 0.0103, - "step": 1094 - }, - { - "epoch": 6.717791411042945, - "grad_norm": 1.4393174648284912, - "learning_rate": 1.2195249626643432e-06, - "loss": 0.0063, - "step": 1095 - }, - { - "epoch": 6.723926380368098, - "grad_norm": 3.199451208114624, - "learning_rate": 1.2153889593628032e-06, - "loss": 0.0571, - "step": 1096 - }, - { - "epoch": 6.730061349693251, - "grad_norm": 2.1796770095825195, - "learning_rate": 1.211257728013107e-06, - "loss": 0.0269, - "step": 1097 - }, - { - "epoch": 6.736196319018405, - "grad_norm": 3.1798806190490723, - "learning_rate": 1.2071312839615634e-06, - "loss": 0.0396, - "step": 1098 - }, - { - "epoch": 6.742331288343558, - "grad_norm": 3.063633680343628, - "learning_rate": 1.2030096425366985e-06, - "loss": 0.0261, - "step": 1099 - }, - { - "epoch": 6.748466257668712, - "grad_norm": 1.860409140586853, - "learning_rate": 1.1988928190491948e-06, - "loss": 0.013, - "step": 1100 - }, - { - "epoch": 6.754601226993865, - "grad_norm": 1.9303224086761475, - "learning_rate": 1.1947808287918406e-06, - "loss": 0.0113, - "step": 1101 - }, - { - "epoch": 6.7607361963190185, - "grad_norm": 2.1432337760925293, - "learning_rate": 1.19067368703947e-06, - "loss": 0.0195, - "step": 1102 - }, - { - "epoch": 6.766871165644172, - "grad_norm": 1.8998470306396484, - "learning_rate": 1.1865714090489038e-06, - "loss": 0.0105, - "step": 1103 - }, - { - "epoch": 6.773006134969325, - "grad_norm": 2.3260247707366943, - "learning_rate": 1.1824740100588991e-06, - "loss": 0.0554, - "step": 1104 - }, - { - "epoch": 6.779141104294479, - "grad_norm": 1.9272006750106812, - "learning_rate": 1.1783815052900848e-06, - "loss": 0.0118, - "step": 1105 - }, - { - "epoch": 6.785276073619632, - "grad_norm": 3.1646785736083984, - "learning_rate": 1.1742939099449126e-06, - "loss": 0.0901, - "step": 1106 - }, - { - "epoch": 6.791411042944786, - "grad_norm": 3.357422351837158, - "learning_rate": 1.1702112392075966e-06, - "loss": 0.0833, - "step": 1107 - }, - { - "epoch": 6.7975460122699385, - "grad_norm": 1.4302526712417603, - "learning_rate": 1.1661335082440545e-06, - "loss": 0.0078, - "step": 1108 - }, - { - "epoch": 6.803680981595092, - "grad_norm": 1.3046417236328125, - "learning_rate": 1.1620607322018587e-06, - "loss": 0.0092, - "step": 1109 - }, - { - "epoch": 6.809815950920245, - "grad_norm": 2.084237813949585, - "learning_rate": 1.1579929262101712e-06, - "loss": 0.0283, - "step": 1110 - }, - { - "epoch": 6.815950920245399, - "grad_norm": 1.9403250217437744, - "learning_rate": 1.153930105379695e-06, - "loss": 0.0066, - "step": 1111 - }, - { - "epoch": 6.822085889570552, - "grad_norm": 2.282449722290039, - "learning_rate": 1.1498722848026142e-06, - "loss": 0.0402, - "step": 1112 - }, - { - "epoch": 6.828220858895706, - "grad_norm": 1.9357627630233765, - "learning_rate": 1.1458194795525354e-06, - "loss": 0.0101, - "step": 1113 - }, - { - "epoch": 6.8343558282208585, - "grad_norm": 2.0236339569091797, - "learning_rate": 1.1417717046844385e-06, - "loss": 0.0109, - "step": 1114 - }, - { - "epoch": 6.840490797546012, - "grad_norm": 2.386857032775879, - "learning_rate": 1.137728975234615e-06, - "loss": 0.0297, - "step": 1115 - }, - { - "epoch": 6.846625766871165, - "grad_norm": 2.2477970123291016, - "learning_rate": 1.1336913062206157e-06, - "loss": 0.0393, - "step": 1116 - }, - { - "epoch": 6.852760736196319, - "grad_norm": 2.7217776775360107, - "learning_rate": 1.129658712641192e-06, - "loss": 0.0269, - "step": 1117 - }, - { - "epoch": 6.858895705521473, - "grad_norm": 2.6717259883880615, - "learning_rate": 1.125631209476241e-06, - "loss": 0.0708, - "step": 1118 - }, - { - "epoch": 6.865030674846626, - "grad_norm": 2.951939344406128, - "learning_rate": 1.1216088116867524e-06, - "loss": 0.0835, - "step": 1119 - }, - { - "epoch": 6.871165644171779, - "grad_norm": 1.9705166816711426, - "learning_rate": 1.1175915342147486e-06, - "loss": 0.0107, - "step": 1120 - }, - { - "epoch": 6.877300613496932, - "grad_norm": 2.4005937576293945, - "learning_rate": 1.1135793919832336e-06, - "loss": 0.0139, - "step": 1121 - }, - { - "epoch": 6.883435582822086, - "grad_norm": 2.277463674545288, - "learning_rate": 1.1095723998961353e-06, - "loss": 0.0154, - "step": 1122 - }, - { - "epoch": 6.889570552147239, - "grad_norm": 1.5026034116744995, - "learning_rate": 1.1055705728382482e-06, - "loss": 0.0072, - "step": 1123 - }, - { - "epoch": 6.895705521472393, - "grad_norm": 1.9540379047393799, - "learning_rate": 1.1015739256751826e-06, - "loss": 0.0202, - "step": 1124 - }, - { - "epoch": 6.901840490797546, - "grad_norm": 2.3090603351593018, - "learning_rate": 1.0975824732533066e-06, - "loss": 0.0559, - "step": 1125 - }, - { - "epoch": 6.9079754601226995, - "grad_norm": 2.100283622741699, - "learning_rate": 1.09359623039969e-06, - "loss": 0.0385, - "step": 1126 - }, - { - "epoch": 6.914110429447852, - "grad_norm": 2.4120566844940186, - "learning_rate": 1.0896152119220525e-06, - "loss": 0.0535, - "step": 1127 - }, - { - "epoch": 6.920245398773006, - "grad_norm": 2.003495454788208, - "learning_rate": 1.0856394326087045e-06, - "loss": 0.0104, - "step": 1128 - }, - { - "epoch": 6.92638036809816, - "grad_norm": 1.6565535068511963, - "learning_rate": 1.0816689072284962e-06, - "loss": 0.0121, - "step": 1129 - }, - { - "epoch": 6.932515337423313, - "grad_norm": 1.6503472328186035, - "learning_rate": 1.0777036505307616e-06, - "loss": 0.0056, - "step": 1130 - }, - { - "epoch": 6.938650306748467, - "grad_norm": 2.600112199783325, - "learning_rate": 1.0737436772452602e-06, - "loss": 0.0198, - "step": 1131 - }, - { - "epoch": 6.9447852760736195, - "grad_norm": 1.6668883562088013, - "learning_rate": 1.0697890020821292e-06, - "loss": 0.0077, - "step": 1132 - }, - { - "epoch": 6.950920245398773, - "grad_norm": 2.729172706604004, - "learning_rate": 1.0658396397318203e-06, - "loss": 0.0329, - "step": 1133 - }, - { - "epoch": 6.957055214723926, - "grad_norm": 1.5219136476516724, - "learning_rate": 1.061895604865053e-06, - "loss": 0.0113, - "step": 1134 - }, - { - "epoch": 6.96319018404908, - "grad_norm": 3.8395588397979736, - "learning_rate": 1.057956912132757e-06, - "loss": 0.0376, - "step": 1135 - }, - { - "epoch": 6.969325153374233, - "grad_norm": 2.4347221851348877, - "learning_rate": 1.054023576166014e-06, - "loss": 0.0517, - "step": 1136 - }, - { - "epoch": 6.975460122699387, - "grad_norm": 3.079165458679199, - "learning_rate": 1.0500956115760105e-06, - "loss": 0.0373, - "step": 1137 - }, - { - "epoch": 6.9815950920245395, - "grad_norm": 1.9391908645629883, - "learning_rate": 1.0461730329539794e-06, - "loss": 0.019, - "step": 1138 - }, - { - "epoch": 6.987730061349693, - "grad_norm": 1.8693119287490845, - "learning_rate": 1.0422558548711434e-06, - "loss": 0.0073, - "step": 1139 - }, - { - "epoch": 6.993865030674847, - "grad_norm": 3.0920307636260986, - "learning_rate": 1.0383440918786684e-06, - "loss": 0.0099, - "step": 1140 - }, - { - "epoch": 7.0, - "grad_norm": 3.184906244277954, - "learning_rate": 1.0344377585076e-06, - "loss": 0.0218, - "step": 1141 - }, - { - "epoch": 7.006134969325154, - "grad_norm": 0.7609673142433167, - "learning_rate": 1.0305368692688175e-06, - "loss": 0.0024, - "step": 1142 - }, - { - "epoch": 7.012269938650307, - "grad_norm": 1.1493247747421265, - "learning_rate": 1.0266414386529775e-06, - "loss": 0.0059, - "step": 1143 - }, - { - "epoch": 7.0184049079754605, - "grad_norm": 3.534796953201294, - "learning_rate": 1.0227514811304556e-06, - "loss": 0.0843, - "step": 1144 - }, - { - "epoch": 7.024539877300613, - "grad_norm": 1.1876507997512817, - "learning_rate": 1.0188670111513002e-06, - "loss": 0.0098, - "step": 1145 - }, - { - "epoch": 7.030674846625767, - "grad_norm": 1.2825753688812256, - "learning_rate": 1.0149880431451736e-06, - "loss": 0.0042, - "step": 1146 - }, - { - "epoch": 7.03680981595092, - "grad_norm": 0.6842563152313232, - "learning_rate": 1.0111145915213e-06, - "loss": 0.003, - "step": 1147 - }, - { - "epoch": 7.042944785276074, - "grad_norm": 0.6310113072395325, - "learning_rate": 1.0072466706684127e-06, - "loss": 0.0027, - "step": 1148 - }, - { - "epoch": 7.049079754601227, - "grad_norm": 1.484761357307434, - "learning_rate": 1.0033842949546974e-06, - "loss": 0.0105, - "step": 1149 - }, - { - "epoch": 7.0552147239263805, - "grad_norm": 1.9790291786193848, - "learning_rate": 9.995274787277445e-07, - "loss": 0.0233, - "step": 1150 - }, - { - "epoch": 7.061349693251533, - "grad_norm": 1.1398522853851318, - "learning_rate": 9.956762363144892e-07, - "loss": 0.0031, - "step": 1151 - }, - { - "epoch": 7.067484662576687, - "grad_norm": 1.0574359893798828, - "learning_rate": 9.918305820211643e-07, - "loss": 0.0047, - "step": 1152 - }, - { - "epoch": 7.07361963190184, - "grad_norm": 2.463972330093384, - "learning_rate": 9.879905301332439e-07, - "loss": 0.0334, - "step": 1153 - }, - { - "epoch": 7.079754601226994, - "grad_norm": 1.4698575735092163, - "learning_rate": 9.84156094915389e-07, - "loss": 0.0191, - "step": 1154 - }, - { - "epoch": 7.085889570552148, - "grad_norm": 1.2635239362716675, - "learning_rate": 9.803272906113978e-07, - "loss": 0.0045, - "step": 1155 - }, - { - "epoch": 7.0920245398773005, - "grad_norm": 1.7271842956542969, - "learning_rate": 9.765041314441529e-07, - "loss": 0.0042, - "step": 1156 - }, - { - "epoch": 7.098159509202454, - "grad_norm": 1.5738918781280518, - "learning_rate": 9.72686631615563e-07, - "loss": 0.0066, - "step": 1157 - }, - { - "epoch": 7.104294478527607, - "grad_norm": 1.3097981214523315, - "learning_rate": 9.688748053065179e-07, - "loss": 0.0058, - "step": 1158 - }, - { - "epoch": 7.110429447852761, - "grad_norm": 2.076064348220825, - "learning_rate": 9.65068666676828e-07, - "loss": 0.0067, - "step": 1159 - }, - { - "epoch": 7.116564417177914, - "grad_norm": 1.1589064598083496, - "learning_rate": 9.612682298651792e-07, - "loss": 0.0052, - "step": 1160 - }, - { - "epoch": 7.122699386503068, - "grad_norm": 1.6450324058532715, - "learning_rate": 9.574735089890765e-07, - "loss": 0.0035, - "step": 1161 - }, - { - "epoch": 7.128834355828221, - "grad_norm": 1.6968387365341187, - "learning_rate": 9.53684518144789e-07, - "loss": 0.0126, - "step": 1162 - }, - { - "epoch": 7.134969325153374, - "grad_norm": 1.9047832489013672, - "learning_rate": 9.499012714073036e-07, - "loss": 0.0345, - "step": 1163 - }, - { - "epoch": 7.141104294478527, - "grad_norm": 1.7587796449661255, - "learning_rate": 9.461237828302666e-07, - "loss": 0.0144, - "step": 1164 - }, - { - "epoch": 7.147239263803681, - "grad_norm": 1.863775372505188, - "learning_rate": 9.423520664459374e-07, - "loss": 0.0135, - "step": 1165 - }, - { - "epoch": 7.153374233128835, - "grad_norm": 2.6580259799957275, - "learning_rate": 9.385861362651322e-07, - "loss": 0.0138, - "step": 1166 - }, - { - "epoch": 7.159509202453988, - "grad_norm": 2.086371421813965, - "learning_rate": 9.348260062771713e-07, - "loss": 0.0093, - "step": 1167 - }, - { - "epoch": 7.1656441717791415, - "grad_norm": 1.0806611776351929, - "learning_rate": 9.310716904498321e-07, - "loss": 0.003, - "step": 1168 - }, - { - "epoch": 7.171779141104294, - "grad_norm": 1.2487165927886963, - "learning_rate": 9.273232027292933e-07, - "loss": 0.0033, - "step": 1169 - }, - { - "epoch": 7.177914110429448, - "grad_norm": 1.0647703409194946, - "learning_rate": 9.235805570400813e-07, - "loss": 0.0024, - "step": 1170 - }, - { - "epoch": 7.184049079754601, - "grad_norm": 1.6039917469024658, - "learning_rate": 9.198437672850249e-07, - "loss": 0.0118, - "step": 1171 - }, - { - "epoch": 7.190184049079755, - "grad_norm": 2.199977159500122, - "learning_rate": 9.161128473451967e-07, - "loss": 0.0173, - "step": 1172 - }, - { - "epoch": 7.196319018404908, - "grad_norm": 2.51725697517395, - "learning_rate": 9.123878110798662e-07, - "loss": 0.0142, - "step": 1173 - }, - { - "epoch": 7.2024539877300615, - "grad_norm": 1.841742753982544, - "learning_rate": 9.086686723264474e-07, - "loss": 0.012, - "step": 1174 - }, - { - "epoch": 7.208588957055214, - "grad_norm": 1.212876319885254, - "learning_rate": 9.049554449004447e-07, - "loss": 0.0055, - "step": 1175 - }, - { - "epoch": 7.214723926380368, - "grad_norm": 1.3728275299072266, - "learning_rate": 9.012481425954053e-07, - "loss": 0.0043, - "step": 1176 - }, - { - "epoch": 7.220858895705521, - "grad_norm": 2.3055357933044434, - "learning_rate": 8.97546779182866e-07, - "loss": 0.0443, - "step": 1177 - }, - { - "epoch": 7.226993865030675, - "grad_norm": 2.017620801925659, - "learning_rate": 8.938513684123024e-07, - "loss": 0.0082, - "step": 1178 - }, - { - "epoch": 7.233128834355828, - "grad_norm": 1.5641282796859741, - "learning_rate": 8.901619240110781e-07, - "loss": 0.0071, - "step": 1179 - }, - { - "epoch": 7.2392638036809815, - "grad_norm": 1.3781960010528564, - "learning_rate": 8.864784596843917e-07, - "loss": 0.0056, - "step": 1180 - }, - { - "epoch": 7.245398773006135, - "grad_norm": 1.23178231716156, - "learning_rate": 8.828009891152301e-07, - "loss": 0.0076, - "step": 1181 - }, - { - "epoch": 7.251533742331288, - "grad_norm": 2.809582233428955, - "learning_rate": 8.791295259643126e-07, - "loss": 0.0141, - "step": 1182 - }, - { - "epoch": 7.257668711656442, - "grad_norm": 1.6520317792892456, - "learning_rate": 8.754640838700443e-07, - "loss": 0.01, - "step": 1183 - }, - { - "epoch": 7.263803680981595, - "grad_norm": 1.411852478981018, - "learning_rate": 8.718046764484648e-07, - "loss": 0.009, - "step": 1184 - }, - { - "epoch": 7.269938650306749, - "grad_norm": 2.9334425926208496, - "learning_rate": 8.681513172931935e-07, - "loss": 0.0291, - "step": 1185 - }, - { - "epoch": 7.276073619631902, - "grad_norm": 1.4273028373718262, - "learning_rate": 8.64504019975386e-07, - "loss": 0.0064, - "step": 1186 - }, - { - "epoch": 7.282208588957055, - "grad_norm": 1.9486448764801025, - "learning_rate": 8.608627980436765e-07, - "loss": 0.0135, - "step": 1187 - }, - { - "epoch": 7.288343558282208, - "grad_norm": 1.3740493059158325, - "learning_rate": 8.572276650241329e-07, - "loss": 0.0061, - "step": 1188 - }, - { - "epoch": 7.294478527607362, - "grad_norm": 1.3352797031402588, - "learning_rate": 8.535986344202057e-07, - "loss": 0.0051, - "step": 1189 - }, - { - "epoch": 7.300613496932515, - "grad_norm": 1.0336774587631226, - "learning_rate": 8.499757197126732e-07, - "loss": 0.0052, - "step": 1190 - }, - { - "epoch": 7.306748466257669, - "grad_norm": 1.1450837850570679, - "learning_rate": 8.463589343595976e-07, - "loss": 0.0111, - "step": 1191 - }, - { - "epoch": 7.3128834355828225, - "grad_norm": 2.504876136779785, - "learning_rate": 8.427482917962734e-07, - "loss": 0.0279, - "step": 1192 - }, - { - "epoch": 7.319018404907975, - "grad_norm": 1.569841980934143, - "learning_rate": 8.391438054351725e-07, - "loss": 0.0105, - "step": 1193 - }, - { - "epoch": 7.325153374233129, - "grad_norm": 1.218538761138916, - "learning_rate": 8.355454886659026e-07, - "loss": 0.0028, - "step": 1194 - }, - { - "epoch": 7.331288343558282, - "grad_norm": 2.084049940109253, - "learning_rate": 8.319533548551492e-07, - "loss": 0.0102, - "step": 1195 - }, - { - "epoch": 7.337423312883436, - "grad_norm": 2.326167345046997, - "learning_rate": 8.28367417346633e-07, - "loss": 0.0396, - "step": 1196 - }, - { - "epoch": 7.343558282208589, - "grad_norm": 1.2704310417175293, - "learning_rate": 8.247876894610568e-07, - "loss": 0.006, - "step": 1197 - }, - { - "epoch": 7.3496932515337425, - "grad_norm": 1.358012318611145, - "learning_rate": 8.212141844960544e-07, - "loss": 0.0075, - "step": 1198 - }, - { - "epoch": 7.355828220858895, - "grad_norm": 1.5145729780197144, - "learning_rate": 8.17646915726146e-07, - "loss": 0.0042, - "step": 1199 - }, - { - "epoch": 7.361963190184049, - "grad_norm": 1.203041911125183, - "learning_rate": 8.140858964026849e-07, - "loss": 0.0032, - "step": 1200 - }, - { - "epoch": 7.368098159509202, - "grad_norm": 3.031280279159546, - "learning_rate": 8.105311397538085e-07, - "loss": 0.032, - "step": 1201 - }, - { - "epoch": 7.374233128834356, - "grad_norm": 1.416698694229126, - "learning_rate": 8.069826589843929e-07, - "loss": 0.0185, - "step": 1202 - }, - { - "epoch": 7.38036809815951, - "grad_norm": 0.9656457901000977, - "learning_rate": 8.034404672759977e-07, - "loss": 0.0034, - "step": 1203 - }, - { - "epoch": 7.386503067484663, - "grad_norm": 1.7239291667938232, - "learning_rate": 7.99904577786823e-07, - "loss": 0.034, - "step": 1204 - }, - { - "epoch": 7.392638036809816, - "grad_norm": 1.1560636758804321, - "learning_rate": 7.963750036516585e-07, - "loss": 0.005, - "step": 1205 - }, - { - "epoch": 7.398773006134969, - "grad_norm": 1.057456374168396, - "learning_rate": 7.928517579818312e-07, - "loss": 0.0073, - "step": 1206 - }, - { - "epoch": 7.404907975460123, - "grad_norm": 1.4066674709320068, - "learning_rate": 7.893348538651635e-07, - "loss": 0.015, - "step": 1207 - }, - { - "epoch": 7.411042944785276, - "grad_norm": 1.1061445474624634, - "learning_rate": 7.858243043659161e-07, - "loss": 0.004, - "step": 1208 - }, - { - "epoch": 7.41717791411043, - "grad_norm": 0.9575282335281372, - "learning_rate": 7.823201225247496e-07, - "loss": 0.003, - "step": 1209 - }, - { - "epoch": 7.423312883435583, - "grad_norm": 1.3790507316589355, - "learning_rate": 7.788223213586677e-07, - "loss": 0.0096, - "step": 1210 - }, - { - "epoch": 7.429447852760736, - "grad_norm": 1.1366883516311646, - "learning_rate": 7.753309138609705e-07, - "loss": 0.006, - "step": 1211 - }, - { - "epoch": 7.435582822085889, - "grad_norm": 2.2659928798675537, - "learning_rate": 7.71845913001211e-07, - "loss": 0.0074, - "step": 1212 - }, - { - "epoch": 7.441717791411043, - "grad_norm": 1.2541831731796265, - "learning_rate": 7.683673317251392e-07, - "loss": 0.0051, - "step": 1213 - }, - { - "epoch": 7.447852760736196, - "grad_norm": 1.5959513187408447, - "learning_rate": 7.648951829546619e-07, - "loss": 0.0271, - "step": 1214 - }, - { - "epoch": 7.45398773006135, - "grad_norm": 1.368452548980713, - "learning_rate": 7.6142947958779e-07, - "loss": 0.0155, - "step": 1215 - }, - { - "epoch": 7.460122699386503, - "grad_norm": 1.1851825714111328, - "learning_rate": 7.579702344985899e-07, - "loss": 0.0032, - "step": 1216 - }, - { - "epoch": 7.466257668711656, - "grad_norm": 1.419812560081482, - "learning_rate": 7.545174605371403e-07, - "loss": 0.0037, - "step": 1217 - }, - { - "epoch": 7.47239263803681, - "grad_norm": 1.0817372798919678, - "learning_rate": 7.510711705294782e-07, - "loss": 0.0064, - "step": 1218 - }, - { - "epoch": 7.478527607361963, - "grad_norm": 1.0459797382354736, - "learning_rate": 7.476313772775578e-07, - "loss": 0.0055, - "step": 1219 - }, - { - "epoch": 7.484662576687117, - "grad_norm": 1.4481663703918457, - "learning_rate": 7.441980935591986e-07, - "loss": 0.0049, - "step": 1220 - }, - { - "epoch": 7.49079754601227, - "grad_norm": 1.7337101697921753, - "learning_rate": 7.407713321280377e-07, - "loss": 0.0123, - "step": 1221 - }, - { - "epoch": 7.4969325153374236, - "grad_norm": 1.3378303050994873, - "learning_rate": 7.373511057134855e-07, - "loss": 0.0056, - "step": 1222 - }, - { - "epoch": 7.5030674846625764, - "grad_norm": 2.4353835582733154, - "learning_rate": 7.339374270206772e-07, - "loss": 0.0155, - "step": 1223 - }, - { - "epoch": 7.50920245398773, - "grad_norm": 2.2856571674346924, - "learning_rate": 7.305303087304227e-07, - "loss": 0.0303, - "step": 1224 - }, - { - "epoch": 7.515337423312883, - "grad_norm": 1.0627055168151855, - "learning_rate": 7.271297634991651e-07, - "loss": 0.0018, - "step": 1225 - }, - { - "epoch": 7.521472392638037, - "grad_norm": 1.2120238542556763, - "learning_rate": 7.237358039589271e-07, - "loss": 0.0064, - "step": 1226 - }, - { - "epoch": 7.52760736196319, - "grad_norm": 1.1861765384674072, - "learning_rate": 7.203484427172702e-07, - "loss": 0.0025, - "step": 1227 - }, - { - "epoch": 7.533742331288344, - "grad_norm": 1.6700332164764404, - "learning_rate": 7.169676923572447e-07, - "loss": 0.0067, - "step": 1228 - }, - { - "epoch": 7.539877300613497, - "grad_norm": 1.4527982473373413, - "learning_rate": 7.135935654373416e-07, - "loss": 0.0082, - "step": 1229 - }, - { - "epoch": 7.54601226993865, - "grad_norm": 1.1425046920776367, - "learning_rate": 7.102260744914499e-07, - "loss": 0.0042, - "step": 1230 - }, - { - "epoch": 7.552147239263804, - "grad_norm": 2.0762295722961426, - "learning_rate": 7.068652320288081e-07, - "loss": 0.0374, - "step": 1231 - }, - { - "epoch": 7.558282208588957, - "grad_norm": 1.2008321285247803, - "learning_rate": 7.035110505339546e-07, - "loss": 0.0022, - "step": 1232 - }, - { - "epoch": 7.564417177914111, - "grad_norm": 1.262100338935852, - "learning_rate": 7.001635424666878e-07, - "loss": 0.006, - "step": 1233 - }, - { - "epoch": 7.570552147239264, - "grad_norm": 1.8173811435699463, - "learning_rate": 6.968227202620137e-07, - "loss": 0.0137, - "step": 1234 - }, - { - "epoch": 7.576687116564417, - "grad_norm": 1.6977999210357666, - "learning_rate": 6.934885963301033e-07, - "loss": 0.0216, - "step": 1235 - }, - { - "epoch": 7.58282208588957, - "grad_norm": 0.7084318399429321, - "learning_rate": 6.901611830562469e-07, - "loss": 0.0027, - "step": 1236 - }, - { - "epoch": 7.588957055214724, - "grad_norm": 2.0332374572753906, - "learning_rate": 6.868404928008035e-07, - "loss": 0.0391, - "step": 1237 - }, - { - "epoch": 7.595092024539877, - "grad_norm": 1.235734224319458, - "learning_rate": 6.835265378991613e-07, - "loss": 0.0053, - "step": 1238 - }, - { - "epoch": 7.601226993865031, - "grad_norm": 2.687920331954956, - "learning_rate": 6.802193306616858e-07, - "loss": 0.0395, - "step": 1239 - }, - { - "epoch": 7.6073619631901845, - "grad_norm": 1.4211101531982422, - "learning_rate": 6.769188833736781e-07, - "loss": 0.0055, - "step": 1240 - }, - { - "epoch": 7.613496932515337, - "grad_norm": 2.4542644023895264, - "learning_rate": 6.736252082953307e-07, - "loss": 0.0072, - "step": 1241 - }, - { - "epoch": 7.61963190184049, - "grad_norm": 1.2946943044662476, - "learning_rate": 6.703383176616743e-07, - "loss": 0.0046, - "step": 1242 - }, - { - "epoch": 7.625766871165644, - "grad_norm": 3.8073277473449707, - "learning_rate": 6.670582236825421e-07, - "loss": 0.0742, - "step": 1243 - }, - { - "epoch": 7.631901840490798, - "grad_norm": 1.4291348457336426, - "learning_rate": 6.637849385425157e-07, - "loss": 0.0069, - "step": 1244 - }, - { - "epoch": 7.638036809815951, - "grad_norm": 1.1767655611038208, - "learning_rate": 6.605184744008866e-07, - "loss": 0.0031, - "step": 1245 - }, - { - "epoch": 7.644171779141105, - "grad_norm": 1.837077260017395, - "learning_rate": 6.572588433916082e-07, - "loss": 0.0316, - "step": 1246 - }, - { - "epoch": 7.6503067484662575, - "grad_norm": 1.9157041311264038, - "learning_rate": 6.540060576232488e-07, - "loss": 0.0472, - "step": 1247 - }, - { - "epoch": 7.656441717791411, - "grad_norm": 1.7347630262374878, - "learning_rate": 6.507601291789515e-07, - "loss": 0.0059, - "step": 1248 - }, - { - "epoch": 7.662576687116564, - "grad_norm": 0.9757588505744934, - "learning_rate": 6.475210701163828e-07, - "loss": 0.0023, - "step": 1249 - }, - { - "epoch": 7.668711656441718, - "grad_norm": 1.9460281133651733, - "learning_rate": 6.442888924676951e-07, - "loss": 0.0207, - "step": 1250 - }, - { - "epoch": 7.674846625766871, - "grad_norm": 0.7517938613891602, - "learning_rate": 6.410636082394772e-07, - "loss": 0.002, - "step": 1251 - }, - { - "epoch": 7.680981595092025, - "grad_norm": 1.0631566047668457, - "learning_rate": 6.378452294127091e-07, - "loss": 0.0038, - "step": 1252 - }, - { - "epoch": 7.6871165644171775, - "grad_norm": 0.9524463415145874, - "learning_rate": 6.346337679427214e-07, - "loss": 0.0024, - "step": 1253 - }, - { - "epoch": 7.693251533742331, - "grad_norm": 1.3653123378753662, - "learning_rate": 6.314292357591489e-07, - "loss": 0.0027, - "step": 1254 - }, - { - "epoch": 7.699386503067485, - "grad_norm": 1.2446377277374268, - "learning_rate": 6.282316447658837e-07, - "loss": 0.0048, - "step": 1255 - }, - { - "epoch": 7.705521472392638, - "grad_norm": 1.716244101524353, - "learning_rate": 6.250410068410367e-07, - "loss": 0.0064, - "step": 1256 - }, - { - "epoch": 7.711656441717792, - "grad_norm": 1.7151219844818115, - "learning_rate": 6.218573338368869e-07, - "loss": 0.0056, - "step": 1257 - }, - { - "epoch": 7.717791411042945, - "grad_norm": 1.8013248443603516, - "learning_rate": 6.186806375798429e-07, - "loss": 0.0073, - "step": 1258 - }, - { - "epoch": 7.723926380368098, - "grad_norm": 1.051620602607727, - "learning_rate": 6.155109298703968e-07, - "loss": 0.0043, - "step": 1259 - }, - { - "epoch": 7.730061349693251, - "grad_norm": 1.5731337070465088, - "learning_rate": 6.123482224830787e-07, - "loss": 0.0108, - "step": 1260 - }, - { - "epoch": 7.736196319018405, - "grad_norm": 2.232144832611084, - "learning_rate": 6.091925271664156e-07, - "loss": 0.0337, - "step": 1261 - }, - { - "epoch": 7.742331288343558, - "grad_norm": 1.072678565979004, - "learning_rate": 6.060438556428877e-07, - "loss": 0.0019, - "step": 1262 - }, - { - "epoch": 7.748466257668712, - "grad_norm": 2.3631110191345215, - "learning_rate": 6.02902219608881e-07, - "loss": 0.0089, - "step": 1263 - }, - { - "epoch": 7.754601226993865, - "grad_norm": 1.1171438694000244, - "learning_rate": 5.997676307346504e-07, - "loss": 0.0045, - "step": 1264 - }, - { - "epoch": 7.7607361963190185, - "grad_norm": 0.7839979529380798, - "learning_rate": 5.966401006642689e-07, - "loss": 0.0028, - "step": 1265 - }, - { - "epoch": 7.766871165644172, - "grad_norm": 1.5938968658447266, - "learning_rate": 5.93519641015591e-07, - "loss": 0.009, - "step": 1266 - }, - { - "epoch": 7.773006134969325, - "grad_norm": 1.2980104684829712, - "learning_rate": 5.904062633802066e-07, - "loss": 0.0168, - "step": 1267 - }, - { - "epoch": 7.779141104294479, - "grad_norm": 1.177626371383667, - "learning_rate": 5.872999793233952e-07, - "loss": 0.0029, - "step": 1268 - }, - { - "epoch": 7.785276073619632, - "grad_norm": 2.0138931274414062, - "learning_rate": 5.842008003840891e-07, - "loss": 0.015, - "step": 1269 - }, - { - "epoch": 7.791411042944786, - "grad_norm": 1.7204387187957764, - "learning_rate": 5.811087380748245e-07, - "loss": 0.011, - "step": 1270 - }, - { - "epoch": 7.7975460122699385, - "grad_norm": 1.506241798400879, - "learning_rate": 5.780238038817035e-07, - "loss": 0.0057, - "step": 1271 - }, - { - "epoch": 7.803680981595092, - "grad_norm": 2.0950393676757812, - "learning_rate": 5.74946009264348e-07, - "loss": 0.0131, - "step": 1272 - }, - { - "epoch": 7.809815950920245, - "grad_norm": 2.1451432704925537, - "learning_rate": 5.71875365655859e-07, - "loss": 0.0088, - "step": 1273 - }, - { - "epoch": 7.815950920245399, - "grad_norm": 0.9690236449241638, - "learning_rate": 5.688118844627746e-07, - "loss": 0.0033, - "step": 1274 - }, - { - "epoch": 7.822085889570552, - "grad_norm": 1.5690608024597168, - "learning_rate": 5.657555770650241e-07, - "loss": 0.0206, - "step": 1275 - }, - { - "epoch": 7.828220858895706, - "grad_norm": 1.8220988512039185, - "learning_rate": 5.627064548158903e-07, - "loss": 0.0096, - "step": 1276 - }, - { - "epoch": 7.8343558282208585, - "grad_norm": 2.3800559043884277, - "learning_rate": 5.596645290419653e-07, - "loss": 0.008, - "step": 1277 - }, - { - "epoch": 7.840490797546012, - "grad_norm": 0.7775714993476868, - "learning_rate": 5.566298110431068e-07, - "loss": 0.0016, - "step": 1278 - }, - { - "epoch": 7.846625766871165, - "grad_norm": 1.1196876764297485, - "learning_rate": 5.536023120924e-07, - "loss": 0.0033, - "step": 1279 - }, - { - "epoch": 7.852760736196319, - "grad_norm": 1.3722344636917114, - "learning_rate": 5.505820434361108e-07, - "loss": 0.0084, - "step": 1280 - }, - { - "epoch": 7.858895705521473, - "grad_norm": 1.2068676948547363, - "learning_rate": 5.47569016293649e-07, - "loss": 0.0049, - "step": 1281 - }, - { - "epoch": 7.865030674846626, - "grad_norm": 1.096085548400879, - "learning_rate": 5.445632418575239e-07, - "loss": 0.0019, - "step": 1282 - }, - { - "epoch": 7.871165644171779, - "grad_norm": 1.3178106546401978, - "learning_rate": 5.415647312933015e-07, - "loss": 0.0062, - "step": 1283 - }, - { - "epoch": 7.877300613496932, - "grad_norm": 1.2884724140167236, - "learning_rate": 5.385734957395664e-07, - "loss": 0.0081, - "step": 1284 - }, - { - "epoch": 7.883435582822086, - "grad_norm": 0.9866589307785034, - "learning_rate": 5.355895463078789e-07, - "loss": 0.0048, - "step": 1285 - }, - { - "epoch": 7.889570552147239, - "grad_norm": 1.5396437644958496, - "learning_rate": 5.326128940827313e-07, - "loss": 0.0088, - "step": 1286 - }, - { - "epoch": 7.895705521472393, - "grad_norm": 1.1183607578277588, - "learning_rate": 5.296435501215116e-07, - "loss": 0.0043, - "step": 1287 - }, - { - "epoch": 7.901840490797546, - "grad_norm": 1.5337073802947998, - "learning_rate": 5.266815254544572e-07, - "loss": 0.0099, - "step": 1288 - }, - { - "epoch": 7.9079754601226995, - "grad_norm": 1.8188867568969727, - "learning_rate": 5.237268310846183e-07, - "loss": 0.0086, - "step": 1289 - }, - { - "epoch": 7.914110429447852, - "grad_norm": 1.972072720527649, - "learning_rate": 5.207794779878156e-07, - "loss": 0.0442, - "step": 1290 - }, - { - "epoch": 7.920245398773006, - "grad_norm": 1.1226261854171753, - "learning_rate": 5.178394771125969e-07, - "loss": 0.0071, - "step": 1291 - }, - { - "epoch": 7.92638036809816, - "grad_norm": 1.5612869262695312, - "learning_rate": 5.149068393802009e-07, - "loss": 0.0192, - "step": 1292 - }, - { - "epoch": 7.932515337423313, - "grad_norm": 1.1532280445098877, - "learning_rate": 5.119815756845123e-07, - "loss": 0.0032, - "step": 1293 - }, - { - "epoch": 7.938650306748467, - "grad_norm": 1.8807255029678345, - "learning_rate": 5.090636968920252e-07, - "loss": 0.0139, - "step": 1294 - }, - { - "epoch": 7.9447852760736195, - "grad_norm": 1.3027002811431885, - "learning_rate": 5.061532138418013e-07, - "loss": 0.0071, - "step": 1295 - }, - { - "epoch": 7.950920245398773, - "grad_norm": 1.584154486656189, - "learning_rate": 5.032501373454266e-07, - "loss": 0.0056, - "step": 1296 - }, - { - "epoch": 7.957055214723926, - "grad_norm": 1.7631733417510986, - "learning_rate": 5.003544781869762e-07, - "loss": 0.0239, - "step": 1297 - }, - { - "epoch": 7.96319018404908, - "grad_norm": 1.9462637901306152, - "learning_rate": 4.974662471229727e-07, - "loss": 0.0336, - "step": 1298 - }, - { - "epoch": 7.969325153374233, - "grad_norm": 1.9697695970535278, - "learning_rate": 4.945854548823425e-07, - "loss": 0.0049, - "step": 1299 - }, - { - "epoch": 7.975460122699387, - "grad_norm": 1.066036581993103, - "learning_rate": 4.917121121663823e-07, - "loss": 0.0103, - "step": 1300 - }, - { - "epoch": 7.9815950920245395, - "grad_norm": 1.0865890979766846, - "learning_rate": 4.888462296487129e-07, - "loss": 0.0036, - "step": 1301 - }, - { - "epoch": 7.987730061349693, - "grad_norm": 1.7804820537567139, - "learning_rate": 4.859878179752448e-07, - "loss": 0.0119, - "step": 1302 - }, - { - "epoch": 7.993865030674847, - "grad_norm": 2.735875129699707, - "learning_rate": 4.83136887764136e-07, - "loss": 0.0365, - "step": 1303 - }, - { - "epoch": 8.0, - "grad_norm": 1.316243290901184, - "learning_rate": 4.802934496057527e-07, - "loss": 0.0046, - "step": 1304 - } - ], - "logging_steps": 1, - "max_steps": 1630, - "num_input_tokens_seen": 0, - "num_train_epochs": 10, - "save_steps": 206, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 3.229804144130785e+17, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1467/chat_template.jinja b/metallama3_8b/limo_filtered_correct/checkpoint-1467/chat_template.jinja deleted file mode 100644 index 39bd0c9f7fe30aea14eda194fee17703da4a4dbf..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1467/chat_template.jinja +++ /dev/null @@ -1,5 +0,0 @@ -{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> - -'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> - -' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1467/config.json b/metallama3_8b/limo_filtered_correct/checkpoint-1467/config.json deleted file mode 100644 index ec5612543540085e09eed37e81b17ae51d1a6973..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1467/config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 128000, - "eos_token_id": 128009, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 8192, - "mlp_bias": false, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 500000.0, - "tie_word_embeddings": false, - "torch_dtype": "float32", - "transformers_version": "4.55.0", - "use_cache": false, - "vocab_size": 128256 -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1467/generation_config.json b/metallama3_8b/limo_filtered_correct/checkpoint-1467/generation_config.json deleted file mode 100644 index f53ccb516e57388491adda6b9950bcfa872e93ae..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1467/generation_config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 128000, - "eos_token_id": 128009, - "transformers_version": "4.55.0", - "use_cache": false -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1467/model-00001-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-1467/model-00001-of-00007.safetensors deleted file mode 100644 index 7e029aa5ded1a3d7eb6ca9e6cfdc9c487847d20b..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1467/model-00001-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4890d2a1cc96b0ca1be39953a87720daf167230c1ed8afafc4c2d81fb6b1eeb0 -size 4886466168 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1467/model-00002-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-1467/model-00002-of-00007.safetensors deleted file mode 100644 index f9ac57a7052ee1e7cc611671ca1813aa4d0ffd64..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1467/model-00002-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a83c0c6f820e66a3d7bb9c758f990ea05815b8cd89d75da00b912421865fb687 -size 4832007448 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1467/model-00003-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-1467/model-00003-of-00007.safetensors deleted file mode 100644 index 6049018463b719b342e33a561983b281f6d61c7c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1467/model-00003-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:803c57b693ed9f79ce92d4f1a9b18918ba196f454044d53fe77939e0743fc6c7 -size 4999813112 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1467/model-00004-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-1467/model-00004-of-00007.safetensors deleted file mode 100644 index 587b1816e50b6ee5c610ebcb0be5d42cdae5afa5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1467/model-00004-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:02969a3b979f32bf5a6ffdbe1d845bca930077a6b49fac1d228e5c8a6a698f0c -size 4999813128 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1467/model-00005-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-1467/model-00005-of-00007.safetensors deleted file mode 100644 index cd31c5f58f0c76bfdacfc8ed96f038c90937b40a..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1467/model-00005-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:65da0257e31df1fa401086d27032e8824a577393c199060cfc31a7ca135e027a -size 4832007496 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1467/model-00006-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-1467/model-00006-of-00007.safetensors deleted file mode 100644 index 9216f3218daa1c92e82e4ceb3f4a8f2482aa2994..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1467/model-00006-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a52381e88aa042391ba2b3257c8d596e2edf36917fed37432ce1e0afab4ec27f -size 4999813120 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1467/model-00007-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-1467/model-00007-of-00007.safetensors deleted file mode 100644 index 5375a8712755dcb6c10b7fd52cd20a2c097afe30..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1467/model-00007-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ffafffec8ed3206924973080e66a5b21730a20c4dc2c00a4d97513d77ba0c629 -size 2571158184 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1467/model.safetensors.index.json b/metallama3_8b/limo_filtered_correct/checkpoint-1467/model.safetensors.index.json deleted file mode 100644 index 30d31d54f352f0c71ad48745af612a088822fa48..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1467/model.safetensors.index.json +++ /dev/null @@ -1,299 +0,0 @@ -{ - "metadata": { - "total_parameters": 2007565312, - "total_size": 32121044992 - }, - "weight_map": { - "lm_head.weight": "model-00007-of-00007.safetensors", - "model.embed_tokens.weight": "model-00001-of-00007.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.norm.weight": "model-00007-of-00007.safetensors" - } -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1467/rng_state_0.pth b/metallama3_8b/limo_filtered_correct/checkpoint-1467/rng_state_0.pth deleted file mode 100644 index c52ec8f5d66c6a990609422386c047d0c3ed3970..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1467/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:543ef05f530d40ee20b8d626b07a69b86597aca643e48897571062f973efe84f -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1467/rng_state_1.pth b/metallama3_8b/limo_filtered_correct/checkpoint-1467/rng_state_1.pth deleted file mode 100644 index 7e4ae755d2c391c6486028b2ab09f40e1e5b6b3f..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1467/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7a23f732e43838ce0398d2636885ac16badbb9bcbc04d1406069ba3027bc5ae0 -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1467/rng_state_2.pth b/metallama3_8b/limo_filtered_correct/checkpoint-1467/rng_state_2.pth deleted file mode 100644 index 47425e0477082be97b4d8dda14c0159e7914ebb0..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1467/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e10cce960e7068b051c05e35ed6160656be9091c63f13796ac2ed7e9c84e5a72 -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1467/rng_state_3.pth b/metallama3_8b/limo_filtered_correct/checkpoint-1467/rng_state_3.pth deleted file mode 100644 index adaf9621fc3ca0a14f99862b58c3bebc5b7168e3..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1467/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b6f6049e212b1df5cefc5d834afcd8cc052c73f1457449e9fe8a38d514f54078 -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1467/scheduler.pt b/metallama3_8b/limo_filtered_correct/checkpoint-1467/scheduler.pt deleted file mode 100644 index c70f44dde2ffa72b0f171c29f4e715507baa0c46..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1467/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:463242075713d9e5e13e2488eb97423539bb1ee0748d3dcc34e543121edf3016 -size 1064 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1467/special_tokens_map.json b/metallama3_8b/limo_filtered_correct/checkpoint-1467/special_tokens_map.json deleted file mode 100644 index 14daf4588e61b4e4983af0fccaba4d5500c0977c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1467/special_tokens_map.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "additional_special_tokens": [ - { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } - ], - "bos_token": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": "<|eot_id|>" -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1467/tokenizer.json b/metallama3_8b/limo_filtered_correct/checkpoint-1467/tokenizer.json deleted file mode 100644 index 172311123ab62378f1f6d90f3068a676b7d939ed..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1467/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c1dcab308e7cf5970ea38815e0a62887d705c5b436f869ca27a5dcdd40c36a6 -size 17210148 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1467/tokenizer_config.json b/metallama3_8b/limo_filtered_correct/checkpoint-1467/tokenizer_config.json deleted file mode 100644 index 6739fcd129e717b71b64001dcb25a03c143d66f5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1467/tokenizer_config.json +++ /dev/null @@ -1,2076 +0,0 @@ -{ - "added_tokens_decoder": { - "128000": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128001": { - "content": "<|end_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128002": { - "content": "<|reserved_special_token_0|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128003": { - "content": "<|reserved_special_token_1|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128004": { - "content": "<|reserved_special_token_2|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128005": { - "content": "<|reserved_special_token_3|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128006": { - "content": "<|start_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128007": { - "content": "<|end_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128008": { - "content": "<|reserved_special_token_4|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128009": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128010": { - "content": "<|reserved_special_token_5|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128011": { - "content": "<|reserved_special_token_6|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128012": { - "content": "<|reserved_special_token_7|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128013": { - "content": "<|reserved_special_token_8|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128014": { - "content": "<|reserved_special_token_9|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128015": { - "content": "<|reserved_special_token_10|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128016": { - "content": "<|reserved_special_token_11|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128017": { - "content": "<|reserved_special_token_12|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128018": { - "content": "<|reserved_special_token_13|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128019": { - "content": "<|reserved_special_token_14|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128020": { - "content": "<|reserved_special_token_15|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128021": { - "content": "<|reserved_special_token_16|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128022": { - "content": "<|reserved_special_token_17|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128023": { - "content": "<|reserved_special_token_18|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128024": { - "content": "<|reserved_special_token_19|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128025": { - "content": "<|reserved_special_token_20|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128026": { - "content": "<|reserved_special_token_21|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128027": { - "content": "<|reserved_special_token_22|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128028": { - "content": "<|reserved_special_token_23|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128029": { - "content": "<|reserved_special_token_24|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128030": { - "content": "<|reserved_special_token_25|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128031": { - "content": "<|reserved_special_token_26|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128032": { - "content": "<|reserved_special_token_27|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128033": { - "content": "<|reserved_special_token_28|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128034": { - "content": "<|reserved_special_token_29|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128035": { - "content": "<|reserved_special_token_30|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128036": { - "content": "<|reserved_special_token_31|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128037": { - "content": "<|reserved_special_token_32|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128038": { - "content": "<|reserved_special_token_33|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128039": { - "content": "<|reserved_special_token_34|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128040": { - "content": "<|reserved_special_token_35|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128041": { - "content": "<|reserved_special_token_36|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128042": { - "content": "<|reserved_special_token_37|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128043": { - "content": "<|reserved_special_token_38|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128044": { - "content": "<|reserved_special_token_39|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128045": { - "content": "<|reserved_special_token_40|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128046": { - "content": "<|reserved_special_token_41|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128047": { - "content": "<|reserved_special_token_42|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128048": { - "content": "<|reserved_special_token_43|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128049": { - "content": "<|reserved_special_token_44|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128050": { - "content": "<|reserved_special_token_45|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128051": { - "content": "<|reserved_special_token_46|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128052": { - "content": "<|reserved_special_token_47|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128053": { - "content": "<|reserved_special_token_48|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128054": { - "content": "<|reserved_special_token_49|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128055": { - "content": "<|reserved_special_token_50|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128056": { - "content": "<|reserved_special_token_51|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128057": { - "content": "<|reserved_special_token_52|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128058": { - "content": "<|reserved_special_token_53|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128059": { - "content": "<|reserved_special_token_54|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128060": { - "content": "<|reserved_special_token_55|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128061": { - "content": "<|reserved_special_token_56|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128062": { - "content": "<|reserved_special_token_57|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128063": { - "content": "<|reserved_special_token_58|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128064": { - "content": "<|reserved_special_token_59|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128065": { - "content": "<|reserved_special_token_60|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128066": { - "content": "<|reserved_special_token_61|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128067": { - "content": "<|reserved_special_token_62|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128068": { - "content": "<|reserved_special_token_63|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128069": { - "content": "<|reserved_special_token_64|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128070": { - "content": "<|reserved_special_token_65|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128071": { - "content": "<|reserved_special_token_66|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128072": { - "content": "<|reserved_special_token_67|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128073": { - "content": "<|reserved_special_token_68|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128074": { - "content": "<|reserved_special_token_69|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128075": { - "content": "<|reserved_special_token_70|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128076": { - "content": "<|reserved_special_token_71|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128077": { - "content": "<|reserved_special_token_72|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128078": { - "content": "<|reserved_special_token_73|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128079": { - "content": "<|reserved_special_token_74|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128080": { - "content": "<|reserved_special_token_75|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128081": { - "content": "<|reserved_special_token_76|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128082": { - "content": "<|reserved_special_token_77|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128083": { - "content": "<|reserved_special_token_78|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128084": { - "content": "<|reserved_special_token_79|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128085": { - "content": "<|reserved_special_token_80|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128086": { - "content": "<|reserved_special_token_81|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128087": { - "content": "<|reserved_special_token_82|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128088": { - "content": "<|reserved_special_token_83|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128089": { - "content": "<|reserved_special_token_84|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128090": { - "content": "<|reserved_special_token_85|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128091": { - "content": "<|reserved_special_token_86|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128092": { - "content": "<|reserved_special_token_87|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128093": { - "content": "<|reserved_special_token_88|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128094": { - "content": "<|reserved_special_token_89|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128095": { - "content": "<|reserved_special_token_90|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128096": { - "content": "<|reserved_special_token_91|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128097": { - "content": "<|reserved_special_token_92|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128098": { - "content": "<|reserved_special_token_93|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128099": { - "content": "<|reserved_special_token_94|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128100": { - "content": "<|reserved_special_token_95|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128101": { - "content": "<|reserved_special_token_96|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128102": { - "content": "<|reserved_special_token_97|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128103": { - "content": "<|reserved_special_token_98|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128104": { - "content": "<|reserved_special_token_99|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128105": { - "content": "<|reserved_special_token_100|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128106": { - "content": "<|reserved_special_token_101|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128107": { - "content": "<|reserved_special_token_102|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128108": { - "content": "<|reserved_special_token_103|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128109": { - "content": "<|reserved_special_token_104|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128110": { - "content": "<|reserved_special_token_105|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128111": { - "content": "<|reserved_special_token_106|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128112": { - "content": "<|reserved_special_token_107|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128113": { - "content": "<|reserved_special_token_108|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128114": { - "content": "<|reserved_special_token_109|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128115": { - "content": "<|reserved_special_token_110|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128116": { - "content": "<|reserved_special_token_111|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128117": { - "content": "<|reserved_special_token_112|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128118": { - "content": "<|reserved_special_token_113|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128119": { - "content": "<|reserved_special_token_114|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128120": { - "content": "<|reserved_special_token_115|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128121": { - "content": "<|reserved_special_token_116|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128122": { - "content": "<|reserved_special_token_117|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128123": { - "content": "<|reserved_special_token_118|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128124": { - "content": "<|reserved_special_token_119|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128125": { - "content": "<|reserved_special_token_120|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128126": { - "content": "<|reserved_special_token_121|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128127": { - "content": "<|reserved_special_token_122|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128128": { - "content": "<|reserved_special_token_123|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128129": { - "content": "<|reserved_special_token_124|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128130": { - "content": "<|reserved_special_token_125|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128131": { - "content": "<|reserved_special_token_126|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128132": { - "content": "<|reserved_special_token_127|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128133": { - "content": "<|reserved_special_token_128|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128134": { - "content": "<|reserved_special_token_129|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128135": { - "content": "<|reserved_special_token_130|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128136": { - "content": "<|reserved_special_token_131|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128137": { - "content": "<|reserved_special_token_132|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128138": { - "content": "<|reserved_special_token_133|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128139": { - "content": "<|reserved_special_token_134|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128140": { - "content": "<|reserved_special_token_135|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128141": { - "content": "<|reserved_special_token_136|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128142": { - "content": "<|reserved_special_token_137|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128143": { - "content": "<|reserved_special_token_138|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128144": { - "content": "<|reserved_special_token_139|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128145": { - "content": "<|reserved_special_token_140|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128146": { - "content": "<|reserved_special_token_141|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128147": { - "content": "<|reserved_special_token_142|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128148": { - "content": "<|reserved_special_token_143|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128149": { - "content": "<|reserved_special_token_144|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128150": { - "content": "<|reserved_special_token_145|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128151": { - "content": "<|reserved_special_token_146|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128152": { - "content": "<|reserved_special_token_147|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128153": { - "content": "<|reserved_special_token_148|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128154": { - "content": "<|reserved_special_token_149|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128155": { - "content": "<|reserved_special_token_150|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128156": { - "content": "<|reserved_special_token_151|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128157": { - "content": "<|reserved_special_token_152|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128158": { - "content": "<|reserved_special_token_153|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128159": { - "content": "<|reserved_special_token_154|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128160": { - "content": "<|reserved_special_token_155|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128161": { - "content": "<|reserved_special_token_156|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128162": { - "content": "<|reserved_special_token_157|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128163": { - "content": "<|reserved_special_token_158|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128164": { - "content": "<|reserved_special_token_159|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128165": { - "content": "<|reserved_special_token_160|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128166": { - "content": "<|reserved_special_token_161|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128167": { - "content": "<|reserved_special_token_162|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128168": { - "content": "<|reserved_special_token_163|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128169": { - "content": "<|reserved_special_token_164|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128170": { - "content": "<|reserved_special_token_165|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128171": { - "content": "<|reserved_special_token_166|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128172": { - "content": "<|reserved_special_token_167|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128173": { - "content": "<|reserved_special_token_168|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128174": { - "content": "<|reserved_special_token_169|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128175": { - "content": "<|reserved_special_token_170|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128176": { - "content": "<|reserved_special_token_171|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128177": { - "content": "<|reserved_special_token_172|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128178": { - "content": "<|reserved_special_token_173|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128179": { - "content": "<|reserved_special_token_174|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128180": { - "content": "<|reserved_special_token_175|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128181": { - "content": "<|reserved_special_token_176|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128182": { - "content": "<|reserved_special_token_177|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128183": { - "content": "<|reserved_special_token_178|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128184": { - "content": "<|reserved_special_token_179|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128185": { - "content": "<|reserved_special_token_180|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128186": { - "content": "<|reserved_special_token_181|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128187": { - "content": "<|reserved_special_token_182|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128188": { - "content": "<|reserved_special_token_183|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128189": { - "content": "<|reserved_special_token_184|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128190": { - "content": "<|reserved_special_token_185|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128191": { - "content": "<|reserved_special_token_186|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128192": { - "content": "<|reserved_special_token_187|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128193": { - "content": "<|reserved_special_token_188|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128194": { - "content": "<|reserved_special_token_189|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128195": { - "content": "<|reserved_special_token_190|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128196": { - "content": "<|reserved_special_token_191|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128197": { - "content": "<|reserved_special_token_192|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128198": { - "content": "<|reserved_special_token_193|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128199": { - "content": "<|reserved_special_token_194|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128200": { - "content": "<|reserved_special_token_195|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128201": { - "content": "<|reserved_special_token_196|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128202": { - "content": "<|reserved_special_token_197|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128203": { - "content": "<|reserved_special_token_198|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128204": { - "content": "<|reserved_special_token_199|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128205": { - "content": "<|reserved_special_token_200|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128206": { - "content": "<|reserved_special_token_201|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128207": { - "content": "<|reserved_special_token_202|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128208": { - "content": "<|reserved_special_token_203|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128209": { - "content": "<|reserved_special_token_204|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128210": { - "content": "<|reserved_special_token_205|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128211": { - "content": "<|reserved_special_token_206|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128212": { - "content": "<|reserved_special_token_207|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128213": { - "content": "<|reserved_special_token_208|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128214": { - "content": "<|reserved_special_token_209|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128215": { - "content": "<|reserved_special_token_210|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128216": { - "content": "<|reserved_special_token_211|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128217": { - "content": "<|reserved_special_token_212|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128218": { - "content": "<|reserved_special_token_213|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128219": { - "content": "<|reserved_special_token_214|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128220": { - "content": "<|reserved_special_token_215|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128221": { - "content": "<|reserved_special_token_216|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128222": { - "content": "<|reserved_special_token_217|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128223": { - "content": "<|reserved_special_token_218|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128224": { - "content": "<|reserved_special_token_219|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128225": { - "content": "<|reserved_special_token_220|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128226": { - "content": "<|reserved_special_token_221|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128227": { - "content": "<|reserved_special_token_222|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128228": { - "content": "<|reserved_special_token_223|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128229": { - "content": "<|reserved_special_token_224|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128230": { - "content": "<|reserved_special_token_225|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128231": { - "content": "<|reserved_special_token_226|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128232": { - "content": "<|reserved_special_token_227|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128233": { - "content": "<|reserved_special_token_228|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128234": { - "content": "<|reserved_special_token_229|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128235": { - "content": "<|reserved_special_token_230|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128236": { - "content": "<|reserved_special_token_231|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128237": { - "content": "<|reserved_special_token_232|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128238": { - "content": "<|reserved_special_token_233|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128239": { - "content": "<|reserved_special_token_234|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128240": { - "content": "<|reserved_special_token_235|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128241": { - "content": "<|reserved_special_token_236|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128242": { - "content": "<|reserved_special_token_237|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128243": { - "content": "<|reserved_special_token_238|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128244": { - "content": "<|reserved_special_token_239|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128245": { - "content": "<|reserved_special_token_240|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128246": { - "content": "<|reserved_special_token_241|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128247": { - "content": "<|reserved_special_token_242|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128248": { - "content": "<|reserved_special_token_243|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128249": { - "content": "<|reserved_special_token_244|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128250": { - "content": "<|reserved_special_token_245|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128251": { - "content": "<|reserved_special_token_246|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128252": { - "content": "<|reserved_special_token_247|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128253": { - "content": "<|reserved_special_token_248|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128254": { - "content": "<|reserved_special_token_249|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128255": { - "content": "<|reserved_special_token_250|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128256": { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - } - }, - "additional_special_tokens": [ - "<|eom_id|>" - ], - "bos_token": "<|begin_of_text|>", - "clean_up_tokenization_spaces": true, - "eos_token": "<|eot_id|>", - "extra_special_tokens": {}, - "model_input_names": [ - "input_ids", - "attention_mask" - ], - "model_max_length": 1000000000000000019884624838656, - "pad_token": "<|eot_id|>", - "padding_side": "right", - "split_special_tokens": false, - "tokenizer_class": "PreTrainedTokenizerFast" -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1467/trainer_state.json b/metallama3_8b/limo_filtered_correct/checkpoint-1467/trainer_state.json deleted file mode 100644 index cd228e9c02575a49094195fb80a59a687dac0074..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1467/trainer_state.json +++ /dev/null @@ -1,10303 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 9.0, - "eval_steps": 500, - "global_step": 1467, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.006134969325153374, - "grad_norm": 5.908512115478516, - "learning_rate": 5e-06, - "loss": 0.9606, - "step": 1 - }, - { - "epoch": 0.012269938650306749, - "grad_norm": 4.304474353790283, - "learning_rate": 4.999995356617983e-06, - "loss": 0.8609, - "step": 2 - }, - { - "epoch": 0.018404907975460124, - "grad_norm": 5.63697624206543, - "learning_rate": 4.999981426489179e-06, - "loss": 1.3543, - "step": 3 - }, - { - "epoch": 0.024539877300613498, - "grad_norm": 3.6674246788024902, - "learning_rate": 4.999958209665336e-06, - "loss": 0.787, - "step": 4 - }, - { - "epoch": 0.03067484662576687, - "grad_norm": 48.14854431152344, - "learning_rate": 4.999925706232695e-06, - "loss": 1.7786, - "step": 5 - }, - { - "epoch": 0.03680981595092025, - "grad_norm": 7.8689866065979, - "learning_rate": 4.999883916312e-06, - "loss": 1.2175, - "step": 6 - }, - { - "epoch": 0.04294478527607362, - "grad_norm": 5.119968891143799, - "learning_rate": 4.9998328400584864e-06, - "loss": 0.8998, - "step": 7 - }, - { - "epoch": 0.049079754601226995, - "grad_norm": 3.730757713317871, - "learning_rate": 4.999772477661888e-06, - "loss": 0.8419, - "step": 8 - }, - { - "epoch": 0.05521472392638037, - "grad_norm": 27.314565658569336, - "learning_rate": 4.999702829346432e-06, - "loss": 1.7948, - "step": 9 - }, - { - "epoch": 0.06134969325153374, - "grad_norm": 3.822697162628174, - "learning_rate": 4.999623895370843e-06, - "loss": 1.0461, - "step": 10 - }, - { - "epoch": 0.06748466257668712, - "grad_norm": 4.71220588684082, - "learning_rate": 4.999535676028338e-06, - "loss": 1.0, - "step": 11 - }, - { - "epoch": 0.0736196319018405, - "grad_norm": 3.2378087043762207, - "learning_rate": 4.999438171646624e-06, - "loss": 0.9475, - "step": 12 - }, - { - "epoch": 0.07975460122699386, - "grad_norm": 3.475543737411499, - "learning_rate": 4.999331382587901e-06, - "loss": 0.8654, - "step": 13 - }, - { - "epoch": 0.08588957055214724, - "grad_norm": 10.06365966796875, - "learning_rate": 4.999215309248861e-06, - "loss": 1.2042, - "step": 14 - }, - { - "epoch": 0.09202453987730061, - "grad_norm": 3.785153865814209, - "learning_rate": 4.999089952060681e-06, - "loss": 0.8846, - "step": 15 - }, - { - "epoch": 0.09815950920245399, - "grad_norm": 2.944488048553467, - "learning_rate": 4.998955311489025e-06, - "loss": 0.8805, - "step": 16 - }, - { - "epoch": 0.10429447852760736, - "grad_norm": 39.89304733276367, - "learning_rate": 4.998811388034046e-06, - "loss": 1.5882, - "step": 17 - }, - { - "epoch": 0.11042944785276074, - "grad_norm": 3.5883963108062744, - "learning_rate": 4.9986581822303746e-06, - "loss": 0.9222, - "step": 18 - }, - { - "epoch": 0.1165644171779141, - "grad_norm": 6.972247123718262, - "learning_rate": 4.998495694647127e-06, - "loss": 1.4088, - "step": 19 - }, - { - "epoch": 0.12269938650306748, - "grad_norm": 3.948991298675537, - "learning_rate": 4.998323925887895e-06, - "loss": 1.454, - "step": 20 - }, - { - "epoch": 0.12883435582822086, - "grad_norm": 3.8690035343170166, - "learning_rate": 4.998142876590749e-06, - "loss": 0.6335, - "step": 21 - }, - { - "epoch": 0.13496932515337423, - "grad_norm": 5.243765830993652, - "learning_rate": 4.997952547428236e-06, - "loss": 0.6725, - "step": 22 - }, - { - "epoch": 0.1411042944785276, - "grad_norm": 3.5994043350219727, - "learning_rate": 4.997752939107372e-06, - "loss": 0.7814, - "step": 23 - }, - { - "epoch": 0.147239263803681, - "grad_norm": 4.06965970993042, - "learning_rate": 4.997544052369642e-06, - "loss": 0.9683, - "step": 24 - }, - { - "epoch": 0.15337423312883436, - "grad_norm": 3.3247246742248535, - "learning_rate": 4.997325887990999e-06, - "loss": 0.9414, - "step": 25 - }, - { - "epoch": 0.15950920245398773, - "grad_norm": 5.811742782592773, - "learning_rate": 4.997098446781861e-06, - "loss": 0.8894, - "step": 26 - }, - { - "epoch": 0.1656441717791411, - "grad_norm": 2.661334753036499, - "learning_rate": 4.996861729587103e-06, - "loss": 0.7708, - "step": 27 - }, - { - "epoch": 0.17177914110429449, - "grad_norm": 2.863943576812744, - "learning_rate": 4.996615737286061e-06, - "loss": 0.6995, - "step": 28 - }, - { - "epoch": 0.17791411042944785, - "grad_norm": 20.376733779907227, - "learning_rate": 4.996360470792524e-06, - "loss": 1.2563, - "step": 29 - }, - { - "epoch": 0.18404907975460122, - "grad_norm": 3.62265682220459, - "learning_rate": 4.996095931054731e-06, - "loss": 0.7266, - "step": 30 - }, - { - "epoch": 0.1901840490797546, - "grad_norm": 3.915076732635498, - "learning_rate": 4.9958221190553705e-06, - "loss": 0.9227, - "step": 31 - }, - { - "epoch": 0.19631901840490798, - "grad_norm": 3.129855155944824, - "learning_rate": 4.995539035811572e-06, - "loss": 0.701, - "step": 32 - }, - { - "epoch": 0.20245398773006135, - "grad_norm": 2.7532224655151367, - "learning_rate": 4.9952466823749076e-06, - "loss": 0.6491, - "step": 33 - }, - { - "epoch": 0.2085889570552147, - "grad_norm": 2.8444128036499023, - "learning_rate": 4.9949450598313835e-06, - "loss": 0.8029, - "step": 34 - }, - { - "epoch": 0.2147239263803681, - "grad_norm": 2.57743239402771, - "learning_rate": 4.994634169301439e-06, - "loss": 0.8785, - "step": 35 - }, - { - "epoch": 0.22085889570552147, - "grad_norm": 3.280055284500122, - "learning_rate": 4.994314011939941e-06, - "loss": 1.034, - "step": 36 - }, - { - "epoch": 0.22699386503067484, - "grad_norm": 2.455838680267334, - "learning_rate": 4.99398458893618e-06, - "loss": 0.8557, - "step": 37 - }, - { - "epoch": 0.2331288343558282, - "grad_norm": 4.72681188583374, - "learning_rate": 4.993645901513865e-06, - "loss": 1.1904, - "step": 38 - }, - { - "epoch": 0.2392638036809816, - "grad_norm": 3.0585641860961914, - "learning_rate": 4.993297950931121e-06, - "loss": 0.7668, - "step": 39 - }, - { - "epoch": 0.24539877300613497, - "grad_norm": 2.4603540897369385, - "learning_rate": 4.9929407384804806e-06, - "loss": 0.8812, - "step": 40 - }, - { - "epoch": 0.25153374233128833, - "grad_norm": 2.9702436923980713, - "learning_rate": 4.992574265488883e-06, - "loss": 0.8878, - "step": 41 - }, - { - "epoch": 0.25766871165644173, - "grad_norm": 2.6973602771759033, - "learning_rate": 4.9921985333176694e-06, - "loss": 0.7251, - "step": 42 - }, - { - "epoch": 0.26380368098159507, - "grad_norm": 2.5542335510253906, - "learning_rate": 4.991813543362572e-06, - "loss": 0.6638, - "step": 43 - }, - { - "epoch": 0.26993865030674846, - "grad_norm": 3.7530782222747803, - "learning_rate": 4.991419297053716e-06, - "loss": 1.0725, - "step": 44 - }, - { - "epoch": 0.27607361963190186, - "grad_norm": 2.6483025550842285, - "learning_rate": 4.991015795855611e-06, - "loss": 0.7238, - "step": 45 - }, - { - "epoch": 0.2822085889570552, - "grad_norm": 3.434422492980957, - "learning_rate": 4.990603041267144e-06, - "loss": 0.9188, - "step": 46 - }, - { - "epoch": 0.2883435582822086, - "grad_norm": 2.914340019226074, - "learning_rate": 4.990181034821578e-06, - "loss": 0.6158, - "step": 47 - }, - { - "epoch": 0.294478527607362, - "grad_norm": 2.7211625576019287, - "learning_rate": 4.98974977808654e-06, - "loss": 0.7165, - "step": 48 - }, - { - "epoch": 0.3006134969325153, - "grad_norm": 2.8414249420166016, - "learning_rate": 4.989309272664026e-06, - "loss": 0.7277, - "step": 49 - }, - { - "epoch": 0.3067484662576687, - "grad_norm": 3.683204412460327, - "learning_rate": 4.988859520190381e-06, - "loss": 0.9793, - "step": 50 - }, - { - "epoch": 0.3128834355828221, - "grad_norm": 3.1732583045959473, - "learning_rate": 4.988400522336304e-06, - "loss": 0.8966, - "step": 51 - }, - { - "epoch": 0.31901840490797545, - "grad_norm": 2.7789194583892822, - "learning_rate": 4.9879322808068365e-06, - "loss": 0.8191, - "step": 52 - }, - { - "epoch": 0.32515337423312884, - "grad_norm": 2.754816770553589, - "learning_rate": 4.987454797341358e-06, - "loss": 0.6308, - "step": 53 - }, - { - "epoch": 0.3312883435582822, - "grad_norm": 2.730104684829712, - "learning_rate": 4.98696807371358e-06, - "loss": 0.8226, - "step": 54 - }, - { - "epoch": 0.3374233128834356, - "grad_norm": 3.2225449085235596, - "learning_rate": 4.986472111731536e-06, - "loss": 0.9184, - "step": 55 - }, - { - "epoch": 0.34355828220858897, - "grad_norm": 3.2684760093688965, - "learning_rate": 4.985966913237581e-06, - "loss": 0.6593, - "step": 56 - }, - { - "epoch": 0.3496932515337423, - "grad_norm": 2.43105411529541, - "learning_rate": 4.985452480108376e-06, - "loss": 0.6994, - "step": 57 - }, - { - "epoch": 0.3558282208588957, - "grad_norm": 7.366360664367676, - "learning_rate": 4.984928814254889e-06, - "loss": 1.1374, - "step": 58 - }, - { - "epoch": 0.3619631901840491, - "grad_norm": 2.81864333152771, - "learning_rate": 4.984395917622387e-06, - "loss": 0.8097, - "step": 59 - }, - { - "epoch": 0.36809815950920244, - "grad_norm": 3.1107730865478516, - "learning_rate": 4.9838537921904206e-06, - "loss": 0.8511, - "step": 60 - }, - { - "epoch": 0.37423312883435583, - "grad_norm": 2.460545301437378, - "learning_rate": 4.9833024399728295e-06, - "loss": 0.898, - "step": 61 - }, - { - "epoch": 0.3803680981595092, - "grad_norm": 2.921992778778076, - "learning_rate": 4.982741863017722e-06, - "loss": 0.6671, - "step": 62 - }, - { - "epoch": 0.38650306748466257, - "grad_norm": 3.3006443977355957, - "learning_rate": 4.982172063407479e-06, - "loss": 1.0559, - "step": 63 - }, - { - "epoch": 0.39263803680981596, - "grad_norm": 2.642587661743164, - "learning_rate": 4.9815930432587365e-06, - "loss": 0.6663, - "step": 64 - }, - { - "epoch": 0.3987730061349693, - "grad_norm": 2.905898094177246, - "learning_rate": 4.981004804722384e-06, - "loss": 0.6895, - "step": 65 - }, - { - "epoch": 0.4049079754601227, - "grad_norm": 2.9174182415008545, - "learning_rate": 4.980407349983556e-06, - "loss": 0.7982, - "step": 66 - }, - { - "epoch": 0.4110429447852761, - "grad_norm": 2.214322805404663, - "learning_rate": 4.979800681261619e-06, - "loss": 0.6808, - "step": 67 - }, - { - "epoch": 0.4171779141104294, - "grad_norm": 2.7152462005615234, - "learning_rate": 4.9791848008101705e-06, - "loss": 0.567, - "step": 68 - }, - { - "epoch": 0.4233128834355828, - "grad_norm": 2.5657734870910645, - "learning_rate": 4.978559710917024e-06, - "loss": 0.7745, - "step": 69 - }, - { - "epoch": 0.4294478527607362, - "grad_norm": 3.9103832244873047, - "learning_rate": 4.977925413904205e-06, - "loss": 0.9815, - "step": 70 - }, - { - "epoch": 0.43558282208588955, - "grad_norm": 4.610236644744873, - "learning_rate": 4.9772819121279395e-06, - "loss": 1.164, - "step": 71 - }, - { - "epoch": 0.44171779141104295, - "grad_norm": 3.01170015335083, - "learning_rate": 4.976629207978648e-06, - "loss": 0.7587, - "step": 72 - }, - { - "epoch": 0.44785276073619634, - "grad_norm": 3.175889253616333, - "learning_rate": 4.975967303880933e-06, - "loss": 0.58, - "step": 73 - }, - { - "epoch": 0.4539877300613497, - "grad_norm": 2.503741502761841, - "learning_rate": 4.975296202293575e-06, - "loss": 0.7253, - "step": 74 - }, - { - "epoch": 0.4601226993865031, - "grad_norm": 2.6778078079223633, - "learning_rate": 4.974615905709518e-06, - "loss": 0.7352, - "step": 75 - }, - { - "epoch": 0.4662576687116564, - "grad_norm": 5.950812816619873, - "learning_rate": 4.973926416655863e-06, - "loss": 1.0643, - "step": 76 - }, - { - "epoch": 0.4723926380368098, - "grad_norm": 3.0165305137634277, - "learning_rate": 4.973227737693858e-06, - "loss": 0.6699, - "step": 77 - }, - { - "epoch": 0.4785276073619632, - "grad_norm": 4.793259620666504, - "learning_rate": 4.972519871418894e-06, - "loss": 1.0315, - "step": 78 - }, - { - "epoch": 0.48466257668711654, - "grad_norm": 3.632815361022949, - "learning_rate": 4.971802820460481e-06, - "loss": 0.7003, - "step": 79 - }, - { - "epoch": 0.49079754601226994, - "grad_norm": 3.077507734298706, - "learning_rate": 4.971076587482254e-06, - "loss": 0.6776, - "step": 80 - }, - { - "epoch": 0.49693251533742333, - "grad_norm": 3.3886241912841797, - "learning_rate": 4.970341175181957e-06, - "loss": 0.7422, - "step": 81 - }, - { - "epoch": 0.5030674846625767, - "grad_norm": 2.71288800239563, - "learning_rate": 4.969596586291425e-06, - "loss": 0.7471, - "step": 82 - }, - { - "epoch": 0.50920245398773, - "grad_norm": 2.777920961380005, - "learning_rate": 4.968842823576592e-06, - "loss": 0.8111, - "step": 83 - }, - { - "epoch": 0.5153374233128835, - "grad_norm": 6.496985912322998, - "learning_rate": 4.968079889837461e-06, - "loss": 0.9965, - "step": 84 - }, - { - "epoch": 0.5214723926380368, - "grad_norm": 2.6163430213928223, - "learning_rate": 4.967307787908108e-06, - "loss": 0.6833, - "step": 85 - }, - { - "epoch": 0.5276073619631901, - "grad_norm": 3.244098663330078, - "learning_rate": 4.966526520656663e-06, - "loss": 0.8373, - "step": 86 - }, - { - "epoch": 0.5337423312883436, - "grad_norm": 2.9027860164642334, - "learning_rate": 4.965736090985305e-06, - "loss": 0.8529, - "step": 87 - }, - { - "epoch": 0.5398773006134969, - "grad_norm": 2.3786230087280273, - "learning_rate": 4.964936501830246e-06, - "loss": 0.6577, - "step": 88 - }, - { - "epoch": 0.5460122699386503, - "grad_norm": 7.3099045753479, - "learning_rate": 4.964127756161727e-06, - "loss": 1.1184, - "step": 89 - }, - { - "epoch": 0.5521472392638037, - "grad_norm": 3.068873167037964, - "learning_rate": 4.963309856983998e-06, - "loss": 0.7906, - "step": 90 - }, - { - "epoch": 0.558282208588957, - "grad_norm": 3.082547426223755, - "learning_rate": 4.9624828073353144e-06, - "loss": 0.8107, - "step": 91 - }, - { - "epoch": 0.5644171779141104, - "grad_norm": 2.4586973190307617, - "learning_rate": 4.961646610287922e-06, - "loss": 0.7421, - "step": 92 - }, - { - "epoch": 0.5705521472392638, - "grad_norm": 2.779277801513672, - "learning_rate": 4.960801268948047e-06, - "loss": 0.7134, - "step": 93 - }, - { - "epoch": 0.5766871165644172, - "grad_norm": 3.2255213260650635, - "learning_rate": 4.959946786455882e-06, - "loss": 0.5875, - "step": 94 - }, - { - "epoch": 0.5828220858895705, - "grad_norm": 2.783395528793335, - "learning_rate": 4.959083165985581e-06, - "loss": 0.6595, - "step": 95 - }, - { - "epoch": 0.588957055214724, - "grad_norm": 2.240114212036133, - "learning_rate": 4.958210410745237e-06, - "loss": 0.793, - "step": 96 - }, - { - "epoch": 0.5950920245398773, - "grad_norm": 2.9399421215057373, - "learning_rate": 4.957328523976879e-06, - "loss": 0.5896, - "step": 97 - }, - { - "epoch": 0.6012269938650306, - "grad_norm": 3.4449355602264404, - "learning_rate": 4.956437508956458e-06, - "loss": 0.8658, - "step": 98 - }, - { - "epoch": 0.6073619631901841, - "grad_norm": 4.273710250854492, - "learning_rate": 4.9555373689938325e-06, - "loss": 0.8316, - "step": 99 - }, - { - "epoch": 0.6134969325153374, - "grad_norm": 3.4222047328948975, - "learning_rate": 4.954628107432757e-06, - "loss": 1.0613, - "step": 100 - }, - { - "epoch": 0.6196319018404908, - "grad_norm": 2.5318963527679443, - "learning_rate": 4.95370972765087e-06, - "loss": 0.7194, - "step": 101 - }, - { - "epoch": 0.6257668711656442, - "grad_norm": 2.7852585315704346, - "learning_rate": 4.952782233059683e-06, - "loss": 0.5927, - "step": 102 - }, - { - "epoch": 0.6319018404907976, - "grad_norm": 2.6532323360443115, - "learning_rate": 4.951845627104565e-06, - "loss": 0.8505, - "step": 103 - }, - { - "epoch": 0.6380368098159509, - "grad_norm": 2.3213467597961426, - "learning_rate": 4.95089991326473e-06, - "loss": 0.8682, - "step": 104 - }, - { - "epoch": 0.6441717791411042, - "grad_norm": 2.607992649078369, - "learning_rate": 4.9499450950532305e-06, - "loss": 0.8735, - "step": 105 - }, - { - "epoch": 0.6503067484662577, - "grad_norm": 3.9820072650909424, - "learning_rate": 4.94898117601693e-06, - "loss": 1.0571, - "step": 106 - }, - { - "epoch": 0.656441717791411, - "grad_norm": 3.3878824710845947, - "learning_rate": 4.948008159736507e-06, - "loss": 0.7831, - "step": 107 - }, - { - "epoch": 0.6625766871165644, - "grad_norm": 2.6935670375823975, - "learning_rate": 4.94702604982643e-06, - "loss": 0.5968, - "step": 108 - }, - { - "epoch": 0.6687116564417178, - "grad_norm": 2.78190016746521, - "learning_rate": 4.9460348499349485e-06, - "loss": 0.7504, - "step": 109 - }, - { - "epoch": 0.6748466257668712, - "grad_norm": 2.973083972930908, - "learning_rate": 4.945034563744077e-06, - "loss": 0.6728, - "step": 110 - }, - { - "epoch": 0.6809815950920245, - "grad_norm": 2.631803512573242, - "learning_rate": 4.944025194969586e-06, - "loss": 0.609, - "step": 111 - }, - { - "epoch": 0.6871165644171779, - "grad_norm": 2.7443883419036865, - "learning_rate": 4.9430067473609825e-06, - "loss": 0.8713, - "step": 112 - }, - { - "epoch": 0.6932515337423313, - "grad_norm": 2.543769121170044, - "learning_rate": 4.941979224701499e-06, - "loss": 0.8035, - "step": 113 - }, - { - "epoch": 0.6993865030674846, - "grad_norm": 3.7799901962280273, - "learning_rate": 4.94094263080808e-06, - "loss": 0.9341, - "step": 114 - }, - { - "epoch": 0.7055214723926381, - "grad_norm": 3.1234734058380127, - "learning_rate": 4.939896969531367e-06, - "loss": 1.1066, - "step": 115 - }, - { - "epoch": 0.7116564417177914, - "grad_norm": 2.356036424636841, - "learning_rate": 4.938842244755683e-06, - "loss": 0.853, - "step": 116 - }, - { - "epoch": 0.7177914110429447, - "grad_norm": 3.6231274604797363, - "learning_rate": 4.937778460399022e-06, - "loss": 0.9116, - "step": 117 - }, - { - "epoch": 0.7239263803680982, - "grad_norm": 3.1277005672454834, - "learning_rate": 4.936705620413028e-06, - "loss": 0.5888, - "step": 118 - }, - { - "epoch": 0.7300613496932515, - "grad_norm": 2.7338361740112305, - "learning_rate": 4.935623728782986e-06, - "loss": 0.592, - "step": 119 - }, - { - "epoch": 0.7361963190184049, - "grad_norm": 2.748363733291626, - "learning_rate": 4.934532789527805e-06, - "loss": 0.8713, - "step": 120 - }, - { - "epoch": 0.7423312883435583, - "grad_norm": 4.460031986236572, - "learning_rate": 4.933432806700004e-06, - "loss": 0.6791, - "step": 121 - }, - { - "epoch": 0.7484662576687117, - "grad_norm": 2.392911911010742, - "learning_rate": 4.932323784385693e-06, - "loss": 0.7531, - "step": 122 - }, - { - "epoch": 0.754601226993865, - "grad_norm": 2.7804384231567383, - "learning_rate": 4.931205726704566e-06, - "loss": 0.7547, - "step": 123 - }, - { - "epoch": 0.7607361963190185, - "grad_norm": 2.7664780616760254, - "learning_rate": 4.930078637809878e-06, - "loss": 0.7849, - "step": 124 - }, - { - "epoch": 0.7668711656441718, - "grad_norm": 2.592808723449707, - "learning_rate": 4.928942521888431e-06, - "loss": 0.7015, - "step": 125 - }, - { - "epoch": 0.7730061349693251, - "grad_norm": 2.7080585956573486, - "learning_rate": 4.927797383160561e-06, - "loss": 1.0028, - "step": 126 - }, - { - "epoch": 0.7791411042944786, - "grad_norm": 2.7941503524780273, - "learning_rate": 4.926643225880123e-06, - "loss": 0.602, - "step": 127 - }, - { - "epoch": 0.7852760736196319, - "grad_norm": 3.2796623706817627, - "learning_rate": 4.925480054334471e-06, - "loss": 0.7473, - "step": 128 - }, - { - "epoch": 0.7914110429447853, - "grad_norm": 2.7623610496520996, - "learning_rate": 4.924307872844444e-06, - "loss": 1.0573, - "step": 129 - }, - { - "epoch": 0.7975460122699386, - "grad_norm": 2.6224453449249268, - "learning_rate": 4.923126685764351e-06, - "loss": 0.7399, - "step": 130 - }, - { - "epoch": 0.803680981595092, - "grad_norm": 17.736326217651367, - "learning_rate": 4.921936497481956e-06, - "loss": 0.9548, - "step": 131 - }, - { - "epoch": 0.8098159509202454, - "grad_norm": 2.504213333129883, - "learning_rate": 4.920737312418456e-06, - "loss": 0.6748, - "step": 132 - }, - { - "epoch": 0.8159509202453987, - "grad_norm": 3.617077350616455, - "learning_rate": 4.919529135028473e-06, - "loss": 0.8431, - "step": 133 - }, - { - "epoch": 0.8220858895705522, - "grad_norm": 2.6559832096099854, - "learning_rate": 4.918311969800027e-06, - "loss": 0.7243, - "step": 134 - }, - { - "epoch": 0.8282208588957055, - "grad_norm": 2.7539305686950684, - "learning_rate": 4.917085821254532e-06, - "loss": 0.7845, - "step": 135 - }, - { - "epoch": 0.8343558282208589, - "grad_norm": 3.3587615489959717, - "learning_rate": 4.915850693946766e-06, - "loss": 0.4891, - "step": 136 - }, - { - "epoch": 0.8404907975460123, - "grad_norm": 3.064354181289673, - "learning_rate": 4.914606592464865e-06, - "loss": 0.7917, - "step": 137 - }, - { - "epoch": 0.8466257668711656, - "grad_norm": 3.2505199909210205, - "learning_rate": 4.9133535214303e-06, - "loss": 0.9681, - "step": 138 - }, - { - "epoch": 0.852760736196319, - "grad_norm": 3.8027830123901367, - "learning_rate": 4.91209148549786e-06, - "loss": 0.9275, - "step": 139 - }, - { - "epoch": 0.8588957055214724, - "grad_norm": 2.4154372215270996, - "learning_rate": 4.910820489355637e-06, - "loss": 0.7259, - "step": 140 - }, - { - "epoch": 0.8650306748466258, - "grad_norm": 2.892462968826294, - "learning_rate": 4.909540537725007e-06, - "loss": 0.6061, - "step": 141 - }, - { - "epoch": 0.8711656441717791, - "grad_norm": 3.3398196697235107, - "learning_rate": 4.908251635360616e-06, - "loss": 1.0559, - "step": 142 - }, - { - "epoch": 0.8773006134969326, - "grad_norm": 3.022512197494507, - "learning_rate": 4.906953787050354e-06, - "loss": 0.7372, - "step": 143 - }, - { - "epoch": 0.8834355828220859, - "grad_norm": 2.658661365509033, - "learning_rate": 4.905646997615347e-06, - "loss": 0.6234, - "step": 144 - }, - { - "epoch": 0.8895705521472392, - "grad_norm": 3.454400062561035, - "learning_rate": 4.904331271909932e-06, - "loss": 0.8066, - "step": 145 - }, - { - "epoch": 0.8957055214723927, - "grad_norm": 3.1300277709960938, - "learning_rate": 4.903006614821645e-06, - "loss": 0.6861, - "step": 146 - }, - { - "epoch": 0.901840490797546, - "grad_norm": 2.362537145614624, - "learning_rate": 4.901673031271194e-06, - "loss": 0.6112, - "step": 147 - }, - { - "epoch": 0.9079754601226994, - "grad_norm": 3.375577688217163, - "learning_rate": 4.900330526212451e-06, - "loss": 0.6314, - "step": 148 - }, - { - "epoch": 0.9141104294478528, - "grad_norm": 2.955656051635742, - "learning_rate": 4.898979104632427e-06, - "loss": 0.889, - "step": 149 - }, - { - "epoch": 0.9202453987730062, - "grad_norm": 2.9285926818847656, - "learning_rate": 4.897618771551255e-06, - "loss": 0.6406, - "step": 150 - }, - { - "epoch": 0.9263803680981595, - "grad_norm": 2.131819725036621, - "learning_rate": 4.8962495320221714e-06, - "loss": 0.6368, - "step": 151 - }, - { - "epoch": 0.9325153374233128, - "grad_norm": 2.780649185180664, - "learning_rate": 4.8948713911315e-06, - "loss": 0.8642, - "step": 152 - }, - { - "epoch": 0.9386503067484663, - "grad_norm": 2.941500186920166, - "learning_rate": 4.8934843539986266e-06, - "loss": 0.714, - "step": 153 - }, - { - "epoch": 0.9447852760736196, - "grad_norm": 2.7729203701019287, - "learning_rate": 4.892088425775986e-06, - "loss": 0.8365, - "step": 154 - }, - { - "epoch": 0.950920245398773, - "grad_norm": 2.6887171268463135, - "learning_rate": 4.890683611649041e-06, - "loss": 0.7937, - "step": 155 - }, - { - "epoch": 0.9570552147239264, - "grad_norm": 3.7638463973999023, - "learning_rate": 4.8892699168362626e-06, - "loss": 0.7485, - "step": 156 - }, - { - "epoch": 0.9631901840490797, - "grad_norm": 2.8132755756378174, - "learning_rate": 4.887847346589111e-06, - "loss": 0.6467, - "step": 157 - }, - { - "epoch": 0.9693251533742331, - "grad_norm": 2.652247190475464, - "learning_rate": 4.886415906192015e-06, - "loss": 0.4651, - "step": 158 - }, - { - "epoch": 0.9754601226993865, - "grad_norm": 2.5854647159576416, - "learning_rate": 4.884975600962355e-06, - "loss": 0.8756, - "step": 159 - }, - { - "epoch": 0.9815950920245399, - "grad_norm": 3.1630544662475586, - "learning_rate": 4.883526436250441e-06, - "loss": 0.7339, - "step": 160 - }, - { - "epoch": 0.9877300613496932, - "grad_norm": 2.84452748298645, - "learning_rate": 4.8820684174394935e-06, - "loss": 0.7808, - "step": 161 - }, - { - "epoch": 0.9938650306748467, - "grad_norm": 3.604048490524292, - "learning_rate": 4.880601549945622e-06, - "loss": 0.96, - "step": 162 - }, - { - "epoch": 1.0, - "grad_norm": 2.302924871444702, - "learning_rate": 4.879125839217808e-06, - "loss": 0.8122, - "step": 163 - }, - { - "epoch": 1.0061349693251533, - "grad_norm": 3.1254405975341797, - "learning_rate": 4.8776412907378845e-06, - "loss": 0.7307, - "step": 164 - }, - { - "epoch": 1.0122699386503067, - "grad_norm": 2.745603322982788, - "learning_rate": 4.8761479100205085e-06, - "loss": 0.7554, - "step": 165 - }, - { - "epoch": 1.01840490797546, - "grad_norm": 2.494840145111084, - "learning_rate": 4.874645702613152e-06, - "loss": 0.4372, - "step": 166 - }, - { - "epoch": 1.0245398773006136, - "grad_norm": 2.3526735305786133, - "learning_rate": 4.873134674096072e-06, - "loss": 0.3597, - "step": 167 - }, - { - "epoch": 1.030674846625767, - "grad_norm": 2.945887804031372, - "learning_rate": 4.871614830082297e-06, - "loss": 0.5854, - "step": 168 - }, - { - "epoch": 1.0368098159509203, - "grad_norm": 3.5723934173583984, - "learning_rate": 4.870086176217597e-06, - "loss": 0.7978, - "step": 169 - }, - { - "epoch": 1.0429447852760736, - "grad_norm": 3.2997145652770996, - "learning_rate": 4.868548718180473e-06, - "loss": 0.5593, - "step": 170 - }, - { - "epoch": 1.049079754601227, - "grad_norm": 3.4120635986328125, - "learning_rate": 4.867002461682129e-06, - "loss": 0.4083, - "step": 171 - }, - { - "epoch": 1.0552147239263803, - "grad_norm": 2.697617292404175, - "learning_rate": 4.8654474124664505e-06, - "loss": 0.4752, - "step": 172 - }, - { - "epoch": 1.0613496932515338, - "grad_norm": 5.082247734069824, - "learning_rate": 4.863883576309991e-06, - "loss": 0.7435, - "step": 173 - }, - { - "epoch": 1.0674846625766872, - "grad_norm": 2.773864984512329, - "learning_rate": 4.8623109590219395e-06, - "loss": 0.4612, - "step": 174 - }, - { - "epoch": 1.0736196319018405, - "grad_norm": 3.429703712463379, - "learning_rate": 4.860729566444106e-06, - "loss": 0.4644, - "step": 175 - }, - { - "epoch": 1.0797546012269938, - "grad_norm": 2.997938394546509, - "learning_rate": 4.8591394044508985e-06, - "loss": 0.4852, - "step": 176 - }, - { - "epoch": 1.0858895705521472, - "grad_norm": 2.549513339996338, - "learning_rate": 4.857540478949302e-06, - "loss": 0.4574, - "step": 177 - }, - { - "epoch": 1.0920245398773005, - "grad_norm": 3.459400177001953, - "learning_rate": 4.855932795878852e-06, - "loss": 0.8095, - "step": 178 - }, - { - "epoch": 1.098159509202454, - "grad_norm": 2.8103644847869873, - "learning_rate": 4.854316361211619e-06, - "loss": 0.4578, - "step": 179 - }, - { - "epoch": 1.1042944785276074, - "grad_norm": 2.631221055984497, - "learning_rate": 4.852691180952183e-06, - "loss": 0.5473, - "step": 180 - }, - { - "epoch": 1.1104294478527608, - "grad_norm": 3.189946174621582, - "learning_rate": 4.851057261137608e-06, - "loss": 0.4313, - "step": 181 - }, - { - "epoch": 1.116564417177914, - "grad_norm": 2.891418933868408, - "learning_rate": 4.8494146078374274e-06, - "loss": 0.4197, - "step": 182 - }, - { - "epoch": 1.1226993865030674, - "grad_norm": 3.239637613296509, - "learning_rate": 4.847763227153612e-06, - "loss": 0.5865, - "step": 183 - }, - { - "epoch": 1.1288343558282208, - "grad_norm": 2.484644651412964, - "learning_rate": 4.846103125220557e-06, - "loss": 0.3866, - "step": 184 - }, - { - "epoch": 1.1349693251533743, - "grad_norm": 3.1045992374420166, - "learning_rate": 4.844434308205052e-06, - "loss": 0.5357, - "step": 185 - }, - { - "epoch": 1.1411042944785277, - "grad_norm": 2.648472309112549, - "learning_rate": 4.842756782306261e-06, - "loss": 0.4783, - "step": 186 - }, - { - "epoch": 1.147239263803681, - "grad_norm": 2.5685644149780273, - "learning_rate": 4.841070553755697e-06, - "loss": 0.3733, - "step": 187 - }, - { - "epoch": 1.1533742331288344, - "grad_norm": 3.7727200984954834, - "learning_rate": 4.839375628817205e-06, - "loss": 0.6039, - "step": 188 - }, - { - "epoch": 1.1595092024539877, - "grad_norm": 2.8237369060516357, - "learning_rate": 4.837672013786931e-06, - "loss": 0.5372, - "step": 189 - }, - { - "epoch": 1.165644171779141, - "grad_norm": 3.0312252044677734, - "learning_rate": 4.835959714993305e-06, - "loss": 0.5162, - "step": 190 - }, - { - "epoch": 1.1717791411042944, - "grad_norm": 2.821498394012451, - "learning_rate": 4.8342387387970105e-06, - "loss": 0.4537, - "step": 191 - }, - { - "epoch": 1.177914110429448, - "grad_norm": 2.7834129333496094, - "learning_rate": 4.832509091590968e-06, - "loss": 0.6165, - "step": 192 - }, - { - "epoch": 1.1840490797546013, - "grad_norm": 2.9274091720581055, - "learning_rate": 4.830770779800309e-06, - "loss": 0.7475, - "step": 193 - }, - { - "epoch": 1.1901840490797546, - "grad_norm": 2.813945770263672, - "learning_rate": 4.829023809882349e-06, - "loss": 0.4629, - "step": 194 - }, - { - "epoch": 1.196319018404908, - "grad_norm": 2.27876877784729, - "learning_rate": 4.827268188326567e-06, - "loss": 0.5208, - "step": 195 - }, - { - "epoch": 1.2024539877300613, - "grad_norm": 2.8444204330444336, - "learning_rate": 4.825503921654582e-06, - "loss": 0.6521, - "step": 196 - }, - { - "epoch": 1.2085889570552146, - "grad_norm": 3.3730578422546387, - "learning_rate": 4.823731016420122e-06, - "loss": 0.7491, - "step": 197 - }, - { - "epoch": 1.2147239263803682, - "grad_norm": 2.9717822074890137, - "learning_rate": 4.821949479209011e-06, - "loss": 0.3866, - "step": 198 - }, - { - "epoch": 1.2208588957055215, - "grad_norm": 2.6570653915405273, - "learning_rate": 4.820159316639133e-06, - "loss": 0.499, - "step": 199 - }, - { - "epoch": 1.2269938650306749, - "grad_norm": 2.819960117340088, - "learning_rate": 4.818360535360418e-06, - "loss": 0.556, - "step": 200 - }, - { - "epoch": 1.2331288343558282, - "grad_norm": 2.7912111282348633, - "learning_rate": 4.816553142054806e-06, - "loss": 0.3433, - "step": 201 - }, - { - "epoch": 1.2392638036809815, - "grad_norm": 2.6427981853485107, - "learning_rate": 4.814737143436232e-06, - "loss": 0.8808, - "step": 202 - }, - { - "epoch": 1.2453987730061349, - "grad_norm": 2.5917580127716064, - "learning_rate": 4.812912546250595e-06, - "loss": 0.5718, - "step": 203 - }, - { - "epoch": 1.2515337423312882, - "grad_norm": 3.770759344100952, - "learning_rate": 4.81107935727574e-06, - "loss": 0.9743, - "step": 204 - }, - { - "epoch": 1.2576687116564418, - "grad_norm": 2.558248996734619, - "learning_rate": 4.809237583321421e-06, - "loss": 0.2821, - "step": 205 - }, - { - "epoch": 1.2638036809815951, - "grad_norm": 2.692087173461914, - "learning_rate": 4.807387231229287e-06, - "loss": 0.7524, - "step": 206 - }, - { - "epoch": 1.2699386503067485, - "grad_norm": 2.661738157272339, - "learning_rate": 4.8055283078728525e-06, - "loss": 0.4304, - "step": 207 - }, - { - "epoch": 1.2760736196319018, - "grad_norm": 2.9232122898101807, - "learning_rate": 4.803660820157468e-06, - "loss": 0.6986, - "step": 208 - }, - { - "epoch": 1.2822085889570551, - "grad_norm": 2.665097951889038, - "learning_rate": 4.801784775020303e-06, - "loss": 0.7112, - "step": 209 - }, - { - "epoch": 1.2883435582822087, - "grad_norm": 2.4504497051239014, - "learning_rate": 4.799900179430312e-06, - "loss": 0.4125, - "step": 210 - }, - { - "epoch": 1.294478527607362, - "grad_norm": 3.076204538345337, - "learning_rate": 4.798007040388212e-06, - "loss": 0.7057, - "step": 211 - }, - { - "epoch": 1.3006134969325154, - "grad_norm": 2.406977653503418, - "learning_rate": 4.7961053649264585e-06, - "loss": 0.708, - "step": 212 - }, - { - "epoch": 1.3067484662576687, - "grad_norm": 2.6545324325561523, - "learning_rate": 4.794195160109215e-06, - "loss": 0.7608, - "step": 213 - }, - { - "epoch": 1.312883435582822, - "grad_norm": 4.3817033767700195, - "learning_rate": 4.7922764330323315e-06, - "loss": 0.4779, - "step": 214 - }, - { - "epoch": 1.3190184049079754, - "grad_norm": 3.534566879272461, - "learning_rate": 4.790349190823313e-06, - "loss": 0.5464, - "step": 215 - }, - { - "epoch": 1.3251533742331287, - "grad_norm": 3.0323140621185303, - "learning_rate": 4.788413440641297e-06, - "loss": 0.6198, - "step": 216 - }, - { - "epoch": 1.331288343558282, - "grad_norm": 2.612746238708496, - "learning_rate": 4.786469189677026e-06, - "loss": 0.6695, - "step": 217 - }, - { - "epoch": 1.3374233128834356, - "grad_norm": 3.0299434661865234, - "learning_rate": 4.784516445152821e-06, - "loss": 0.4902, - "step": 218 - }, - { - "epoch": 1.343558282208589, - "grad_norm": 3.4521942138671875, - "learning_rate": 4.78255521432255e-06, - "loss": 0.7411, - "step": 219 - }, - { - "epoch": 1.3496932515337423, - "grad_norm": 2.6712653636932373, - "learning_rate": 4.780585504471612e-06, - "loss": 0.8767, - "step": 220 - }, - { - "epoch": 1.3558282208588956, - "grad_norm": 2.5099475383758545, - "learning_rate": 4.778607322916896e-06, - "loss": 0.4266, - "step": 221 - }, - { - "epoch": 1.3619631901840492, - "grad_norm": 2.641799211502075, - "learning_rate": 4.776620677006766e-06, - "loss": 0.4982, - "step": 222 - }, - { - "epoch": 1.3680981595092025, - "grad_norm": 3.1119771003723145, - "learning_rate": 4.7746255741210256e-06, - "loss": 0.6012, - "step": 223 - }, - { - "epoch": 1.3742331288343559, - "grad_norm": 3.9957170486450195, - "learning_rate": 4.772622021670897e-06, - "loss": 0.7585, - "step": 224 - }, - { - "epoch": 1.3803680981595092, - "grad_norm": 3.1070823669433594, - "learning_rate": 4.770610027098983e-06, - "loss": 0.5266, - "step": 225 - }, - { - "epoch": 1.3865030674846626, - "grad_norm": 2.7630460262298584, - "learning_rate": 4.7685895978792564e-06, - "loss": 0.6261, - "step": 226 - }, - { - "epoch": 1.392638036809816, - "grad_norm": 2.6509556770324707, - "learning_rate": 4.766560741517014e-06, - "loss": 0.7081, - "step": 227 - }, - { - "epoch": 1.3987730061349692, - "grad_norm": 3.0212976932525635, - "learning_rate": 4.76452346554886e-06, - "loss": 0.5041, - "step": 228 - }, - { - "epoch": 1.4049079754601226, - "grad_norm": 3.0454728603363037, - "learning_rate": 4.762477777542676e-06, - "loss": 0.49, - "step": 229 - }, - { - "epoch": 1.4110429447852761, - "grad_norm": 3.4296791553497314, - "learning_rate": 4.7604236850975905e-06, - "loss": 0.7056, - "step": 230 - }, - { - "epoch": 1.4171779141104295, - "grad_norm": 4.1885600090026855, - "learning_rate": 4.7583611958439514e-06, - "loss": 0.7762, - "step": 231 - }, - { - "epoch": 1.4233128834355828, - "grad_norm": 3.065854072570801, - "learning_rate": 4.7562903174433e-06, - "loss": 0.5347, - "step": 232 - }, - { - "epoch": 1.4294478527607362, - "grad_norm": 2.793851852416992, - "learning_rate": 4.75421105758834e-06, - "loss": 0.503, - "step": 233 - }, - { - "epoch": 1.4355828220858895, - "grad_norm": 3.123730421066284, - "learning_rate": 4.752123424002908e-06, - "loss": 0.5081, - "step": 234 - }, - { - "epoch": 1.441717791411043, - "grad_norm": 3.230161666870117, - "learning_rate": 4.750027424441949e-06, - "loss": 0.7523, - "step": 235 - }, - { - "epoch": 1.4478527607361964, - "grad_norm": 2.4970247745513916, - "learning_rate": 4.747923066691487e-06, - "loss": 0.5575, - "step": 236 - }, - { - "epoch": 1.4539877300613497, - "grad_norm": 2.9880685806274414, - "learning_rate": 4.745810358568588e-06, - "loss": 0.7264, - "step": 237 - }, - { - "epoch": 1.460122699386503, - "grad_norm": 2.555328369140625, - "learning_rate": 4.743689307921342e-06, - "loss": 0.4545, - "step": 238 - }, - { - "epoch": 1.4662576687116564, - "grad_norm": 3.144932746887207, - "learning_rate": 4.741559922628828e-06, - "loss": 0.5429, - "step": 239 - }, - { - "epoch": 1.4723926380368098, - "grad_norm": 3.059807062149048, - "learning_rate": 4.739422210601085e-06, - "loss": 0.5086, - "step": 240 - }, - { - "epoch": 1.478527607361963, - "grad_norm": 3.374303102493286, - "learning_rate": 4.7372761797790836e-06, - "loss": 0.6109, - "step": 241 - }, - { - "epoch": 1.4846625766871164, - "grad_norm": 2.4506947994232178, - "learning_rate": 4.735121838134697e-06, - "loss": 0.4317, - "step": 242 - }, - { - "epoch": 1.49079754601227, - "grad_norm": 2.9039974212646484, - "learning_rate": 4.732959193670672e-06, - "loss": 0.6414, - "step": 243 - }, - { - "epoch": 1.4969325153374233, - "grad_norm": 2.9412453174591064, - "learning_rate": 4.730788254420593e-06, - "loss": 0.5166, - "step": 244 - }, - { - "epoch": 1.5030674846625767, - "grad_norm": 2.500716209411621, - "learning_rate": 4.728609028448862e-06, - "loss": 0.4982, - "step": 245 - }, - { - "epoch": 1.50920245398773, - "grad_norm": 2.4233803749084473, - "learning_rate": 4.726421523850662e-06, - "loss": 0.7552, - "step": 246 - }, - { - "epoch": 1.5153374233128836, - "grad_norm": 2.357003688812256, - "learning_rate": 4.7242257487519275e-06, - "loss": 0.4365, - "step": 247 - }, - { - "epoch": 1.521472392638037, - "grad_norm": 2.6406495571136475, - "learning_rate": 4.722021711309317e-06, - "loss": 0.6002, - "step": 248 - }, - { - "epoch": 1.5276073619631902, - "grad_norm": 2.736884832382202, - "learning_rate": 4.7198094197101826e-06, - "loss": 0.4993, - "step": 249 - }, - { - "epoch": 1.5337423312883436, - "grad_norm": 3.5238845348358154, - "learning_rate": 4.7175888821725335e-06, - "loss": 0.4637, - "step": 250 - }, - { - "epoch": 1.539877300613497, - "grad_norm": 3.3783695697784424, - "learning_rate": 4.715360106945015e-06, - "loss": 0.9711, - "step": 251 - }, - { - "epoch": 1.5460122699386503, - "grad_norm": 2.9685862064361572, - "learning_rate": 4.713123102306869e-06, - "loss": 0.5452, - "step": 252 - }, - { - "epoch": 1.5521472392638036, - "grad_norm": 3.143733263015747, - "learning_rate": 4.710877876567912e-06, - "loss": 0.5034, - "step": 253 - }, - { - "epoch": 1.558282208588957, - "grad_norm": 2.8005623817443848, - "learning_rate": 4.708624438068494e-06, - "loss": 0.4236, - "step": 254 - }, - { - "epoch": 1.5644171779141103, - "grad_norm": 2.66581130027771, - "learning_rate": 4.706362795179476e-06, - "loss": 0.6095, - "step": 255 - }, - { - "epoch": 1.5705521472392638, - "grad_norm": 4.598043441772461, - "learning_rate": 4.7040929563021975e-06, - "loss": 0.738, - "step": 256 - }, - { - "epoch": 1.5766871165644172, - "grad_norm": 3.5643506050109863, - "learning_rate": 4.70181492986844e-06, - "loss": 0.6726, - "step": 257 - }, - { - "epoch": 1.5828220858895705, - "grad_norm": 2.865339994430542, - "learning_rate": 4.699528724340401e-06, - "loss": 0.4862, - "step": 258 - }, - { - "epoch": 1.588957055214724, - "grad_norm": 2.95529842376709, - "learning_rate": 4.6972343482106615e-06, - "loss": 0.5003, - "step": 259 - }, - { - "epoch": 1.5950920245398774, - "grad_norm": 2.45206356048584, - "learning_rate": 4.6949318100021546e-06, - "loss": 0.6734, - "step": 260 - }, - { - "epoch": 1.6012269938650308, - "grad_norm": 2.6789939403533936, - "learning_rate": 4.6926211182681295e-06, - "loss": 0.5639, - "step": 261 - }, - { - "epoch": 1.607361963190184, - "grad_norm": 3.307732582092285, - "learning_rate": 4.690302281592128e-06, - "loss": 0.7032, - "step": 262 - }, - { - "epoch": 1.6134969325153374, - "grad_norm": 2.8950445652008057, - "learning_rate": 4.687975308587944e-06, - "loss": 0.4937, - "step": 263 - }, - { - "epoch": 1.6196319018404908, - "grad_norm": 2.969377040863037, - "learning_rate": 4.685640207899598e-06, - "loss": 0.5829, - "step": 264 - }, - { - "epoch": 1.6257668711656441, - "grad_norm": 3.106433391571045, - "learning_rate": 4.683296988201301e-06, - "loss": 0.3805, - "step": 265 - }, - { - "epoch": 1.6319018404907975, - "grad_norm": 3.5599050521850586, - "learning_rate": 4.680945658197425e-06, - "loss": 0.7939, - "step": 266 - }, - { - "epoch": 1.6380368098159508, - "grad_norm": 5.008603096008301, - "learning_rate": 4.6785862266224695e-06, - "loss": 0.7511, - "step": 267 - }, - { - "epoch": 1.6441717791411041, - "grad_norm": 3.1393773555755615, - "learning_rate": 4.676218702241026e-06, - "loss": 0.8984, - "step": 268 - }, - { - "epoch": 1.6503067484662577, - "grad_norm": 3.0241408348083496, - "learning_rate": 4.673843093847753e-06, - "loss": 0.5473, - "step": 269 - }, - { - "epoch": 1.656441717791411, - "grad_norm": 2.9029417037963867, - "learning_rate": 4.6714594102673355e-06, - "loss": 0.6626, - "step": 270 - }, - { - "epoch": 1.6625766871165644, - "grad_norm": 3.4709246158599854, - "learning_rate": 4.669067660354456e-06, - "loss": 0.5015, - "step": 271 - }, - { - "epoch": 1.668711656441718, - "grad_norm": 2.988635778427124, - "learning_rate": 4.666667852993761e-06, - "loss": 0.5384, - "step": 272 - }, - { - "epoch": 1.6748466257668713, - "grad_norm": 3.418140411376953, - "learning_rate": 4.664259997099829e-06, - "loss": 0.7491, - "step": 273 - }, - { - "epoch": 1.6809815950920246, - "grad_norm": 2.592416763305664, - "learning_rate": 4.661844101617135e-06, - "loss": 0.6451, - "step": 274 - }, - { - "epoch": 1.687116564417178, - "grad_norm": 3.1174306869506836, - "learning_rate": 4.6594201755200205e-06, - "loss": 0.6299, - "step": 275 - }, - { - "epoch": 1.6932515337423313, - "grad_norm": 2.6569998264312744, - "learning_rate": 4.656988227812658e-06, - "loss": 0.4477, - "step": 276 - }, - { - "epoch": 1.6993865030674846, - "grad_norm": 3.5733959674835205, - "learning_rate": 4.654548267529015e-06, - "loss": 0.5473, - "step": 277 - }, - { - "epoch": 1.705521472392638, - "grad_norm": 2.7240824699401855, - "learning_rate": 4.652100303732827e-06, - "loss": 0.496, - "step": 278 - }, - { - "epoch": 1.7116564417177913, - "grad_norm": 4.1965460777282715, - "learning_rate": 4.64964434551756e-06, - "loss": 0.932, - "step": 279 - }, - { - "epoch": 1.7177914110429446, - "grad_norm": 2.3237173557281494, - "learning_rate": 4.647180402006372e-06, - "loss": 0.4648, - "step": 280 - }, - { - "epoch": 1.7239263803680982, - "grad_norm": 3.395045042037964, - "learning_rate": 4.644708482352093e-06, - "loss": 0.7237, - "step": 281 - }, - { - "epoch": 1.7300613496932515, - "grad_norm": 3.238593816757202, - "learning_rate": 4.6422285957371735e-06, - "loss": 0.5531, - "step": 282 - }, - { - "epoch": 1.7361963190184049, - "grad_norm": 3.9651403427124023, - "learning_rate": 4.639740751373663e-06, - "loss": 0.6706, - "step": 283 - }, - { - "epoch": 1.7423312883435584, - "grad_norm": 3.0042061805725098, - "learning_rate": 4.63724495850317e-06, - "loss": 0.56, - "step": 284 - }, - { - "epoch": 1.7484662576687118, - "grad_norm": 3.094310760498047, - "learning_rate": 4.634741226396832e-06, - "loss": 0.6138, - "step": 285 - }, - { - "epoch": 1.7546012269938651, - "grad_norm": 2.838168144226074, - "learning_rate": 4.632229564355275e-06, - "loss": 0.4908, - "step": 286 - }, - { - "epoch": 1.7607361963190185, - "grad_norm": 3.3452796936035156, - "learning_rate": 4.629709981708586e-06, - "loss": 0.8181, - "step": 287 - }, - { - "epoch": 1.7668711656441718, - "grad_norm": 2.6630783081054688, - "learning_rate": 4.6271824878162704e-06, - "loss": 0.5625, - "step": 288 - }, - { - "epoch": 1.7730061349693251, - "grad_norm": 2.583650588989258, - "learning_rate": 4.624647092067226e-06, - "loss": 0.3416, - "step": 289 - }, - { - "epoch": 1.7791411042944785, - "grad_norm": 2.73132586479187, - "learning_rate": 4.622103803879702e-06, - "loss": 0.3889, - "step": 290 - }, - { - "epoch": 1.7852760736196318, - "grad_norm": 4.1010260581970215, - "learning_rate": 4.619552632701263e-06, - "loss": 0.611, - "step": 291 - }, - { - "epoch": 1.7914110429447851, - "grad_norm": 4.53068208694458, - "learning_rate": 4.61699358800876e-06, - "loss": 0.7219, - "step": 292 - }, - { - "epoch": 1.7975460122699385, - "grad_norm": 3.4877254962921143, - "learning_rate": 4.614426679308291e-06, - "loss": 0.6402, - "step": 293 - }, - { - "epoch": 1.803680981595092, - "grad_norm": 2.9445226192474365, - "learning_rate": 4.611851916135166e-06, - "loss": 0.509, - "step": 294 - }, - { - "epoch": 1.8098159509202454, - "grad_norm": 2.6622228622436523, - "learning_rate": 4.609269308053872e-06, - "loss": 0.6167, - "step": 295 - }, - { - "epoch": 1.8159509202453987, - "grad_norm": 3.131530523300171, - "learning_rate": 4.606678864658039e-06, - "loss": 0.8039, - "step": 296 - }, - { - "epoch": 1.8220858895705523, - "grad_norm": 3.212188482284546, - "learning_rate": 4.604080595570399e-06, - "loss": 0.5754, - "step": 297 - }, - { - "epoch": 1.8282208588957056, - "grad_norm": 3.522850275039673, - "learning_rate": 4.601474510442759e-06, - "loss": 0.4432, - "step": 298 - }, - { - "epoch": 1.834355828220859, - "grad_norm": 2.5877151489257812, - "learning_rate": 4.598860618955957e-06, - "loss": 0.6541, - "step": 299 - }, - { - "epoch": 1.8404907975460123, - "grad_norm": 2.803833484649658, - "learning_rate": 4.596238930819832e-06, - "loss": 0.5824, - "step": 300 - }, - { - "epoch": 1.8466257668711656, - "grad_norm": 2.7125494480133057, - "learning_rate": 4.5936094557731815e-06, - "loss": 0.6976, - "step": 301 - }, - { - "epoch": 1.852760736196319, - "grad_norm": 3.6549370288848877, - "learning_rate": 4.590972203583732e-06, - "loss": 0.7105, - "step": 302 - }, - { - "epoch": 1.8588957055214723, - "grad_norm": 3.3241944313049316, - "learning_rate": 4.588327184048099e-06, - "loss": 0.7446, - "step": 303 - }, - { - "epoch": 1.8650306748466257, - "grad_norm": 2.8388822078704834, - "learning_rate": 4.585674406991752e-06, - "loss": 0.4926, - "step": 304 - }, - { - "epoch": 1.871165644171779, - "grad_norm": 2.9760420322418213, - "learning_rate": 4.5830138822689755e-06, - "loss": 0.7368, - "step": 305 - }, - { - "epoch": 1.8773006134969326, - "grad_norm": 2.5437633991241455, - "learning_rate": 4.5803456197628374e-06, - "loss": 0.4678, - "step": 306 - }, - { - "epoch": 1.883435582822086, - "grad_norm": 3.0044775009155273, - "learning_rate": 4.577669629385145e-06, - "loss": 0.4241, - "step": 307 - }, - { - "epoch": 1.8895705521472392, - "grad_norm": 2.6150901317596436, - "learning_rate": 4.574985921076418e-06, - "loss": 0.5327, - "step": 308 - }, - { - "epoch": 1.8957055214723928, - "grad_norm": 2.4425182342529297, - "learning_rate": 4.572294504805841e-06, - "loss": 0.7504, - "step": 309 - }, - { - "epoch": 1.9018404907975461, - "grad_norm": 2.9920194149017334, - "learning_rate": 4.569595390571232e-06, - "loss": 0.5194, - "step": 310 - }, - { - "epoch": 1.9079754601226995, - "grad_norm": 2.701087713241577, - "learning_rate": 4.566888588399007e-06, - "loss": 0.6862, - "step": 311 - }, - { - "epoch": 1.9141104294478528, - "grad_norm": 7.628893852233887, - "learning_rate": 4.564174108344139e-06, - "loss": 0.6867, - "step": 312 - }, - { - "epoch": 1.9202453987730062, - "grad_norm": 2.712947130203247, - "learning_rate": 4.561451960490123e-06, - "loss": 0.6942, - "step": 313 - }, - { - "epoch": 1.9263803680981595, - "grad_norm": 3.0063202381134033, - "learning_rate": 4.558722154948937e-06, - "loss": 0.6346, - "step": 314 - }, - { - "epoch": 1.9325153374233128, - "grad_norm": 2.957218647003174, - "learning_rate": 4.5559847018610034e-06, - "loss": 0.464, - "step": 315 - }, - { - "epoch": 1.9386503067484662, - "grad_norm": 3.322282552719116, - "learning_rate": 4.553239611395156e-06, - "loss": 0.6334, - "step": 316 - }, - { - "epoch": 1.9447852760736195, - "grad_norm": 3.0638647079467773, - "learning_rate": 4.550486893748596e-06, - "loss": 0.4227, - "step": 317 - }, - { - "epoch": 1.9509202453987728, - "grad_norm": 3.079087257385254, - "learning_rate": 4.547726559146862e-06, - "loss": 0.3719, - "step": 318 - }, - { - "epoch": 1.9570552147239264, - "grad_norm": 2.409914255142212, - "learning_rate": 4.544958617843782e-06, - "loss": 0.3331, - "step": 319 - }, - { - "epoch": 1.9631901840490797, - "grad_norm": 3.3441262245178223, - "learning_rate": 4.542183080121444e-06, - "loss": 0.6931, - "step": 320 - }, - { - "epoch": 1.969325153374233, - "grad_norm": 2.6624436378479004, - "learning_rate": 4.539399956290152e-06, - "loss": 0.6578, - "step": 321 - }, - { - "epoch": 1.9754601226993866, - "grad_norm": 3.463789224624634, - "learning_rate": 4.536609256688396e-06, - "loss": 0.5748, - "step": 322 - }, - { - "epoch": 1.98159509202454, - "grad_norm": 3.6827807426452637, - "learning_rate": 4.533810991682799e-06, - "loss": 0.5249, - "step": 323 - }, - { - "epoch": 1.9877300613496933, - "grad_norm": 4.125547409057617, - "learning_rate": 4.531005171668093e-06, - "loss": 0.3065, - "step": 324 - }, - { - "epoch": 1.9938650306748467, - "grad_norm": 2.935978412628174, - "learning_rate": 4.528191807067074e-06, - "loss": 0.5523, - "step": 325 - }, - { - "epoch": 2.0, - "grad_norm": 2.654388427734375, - "learning_rate": 4.525370908330564e-06, - "loss": 0.4157, - "step": 326 - }, - { - "epoch": 2.0061349693251533, - "grad_norm": 3.213925838470459, - "learning_rate": 4.522542485937369e-06, - "loss": 0.4243, - "step": 327 - }, - { - "epoch": 2.0122699386503067, - "grad_norm": 3.5483286380767822, - "learning_rate": 4.519706550394248e-06, - "loss": 0.4137, - "step": 328 - }, - { - "epoch": 2.01840490797546, - "grad_norm": 3.32084059715271, - "learning_rate": 4.516863112235864e-06, - "loss": 0.5389, - "step": 329 - }, - { - "epoch": 2.0245398773006134, - "grad_norm": 3.427666425704956, - "learning_rate": 4.514012182024756e-06, - "loss": 0.285, - "step": 330 - }, - { - "epoch": 2.0306748466257667, - "grad_norm": 3.3269975185394287, - "learning_rate": 4.511153770351288e-06, - "loss": 0.4877, - "step": 331 - }, - { - "epoch": 2.03680981595092, - "grad_norm": 5.258850574493408, - "learning_rate": 4.508287887833619e-06, - "loss": 0.5168, - "step": 332 - }, - { - "epoch": 2.042944785276074, - "grad_norm": 4.316092491149902, - "learning_rate": 4.505414545117658e-06, - "loss": 0.4791, - "step": 333 - }, - { - "epoch": 2.049079754601227, - "grad_norm": 3.952056884765625, - "learning_rate": 4.502533752877028e-06, - "loss": 0.3014, - "step": 334 - }, - { - "epoch": 2.0552147239263805, - "grad_norm": 4.0617194175720215, - "learning_rate": 4.499645521813024e-06, - "loss": 0.4313, - "step": 335 - }, - { - "epoch": 2.061349693251534, - "grad_norm": 3.7869274616241455, - "learning_rate": 4.496749862654574e-06, - "loss": 0.4807, - "step": 336 - }, - { - "epoch": 2.067484662576687, - "grad_norm": 3.8181991577148438, - "learning_rate": 4.4938467861582e-06, - "loss": 0.4002, - "step": 337 - }, - { - "epoch": 2.0736196319018405, - "grad_norm": 3.8289854526519775, - "learning_rate": 4.490936303107975e-06, - "loss": 0.618, - "step": 338 - }, - { - "epoch": 2.079754601226994, - "grad_norm": 3.121443271636963, - "learning_rate": 4.488018424315488e-06, - "loss": 0.4203, - "step": 339 - }, - { - "epoch": 2.085889570552147, - "grad_norm": 3.141782283782959, - "learning_rate": 4.4850931606198e-06, - "loss": 0.3618, - "step": 340 - }, - { - "epoch": 2.0920245398773005, - "grad_norm": 3.1279287338256836, - "learning_rate": 4.482160522887404e-06, - "loss": 0.4571, - "step": 341 - }, - { - "epoch": 2.098159509202454, - "grad_norm": 3.2418482303619385, - "learning_rate": 4.479220522012185e-06, - "loss": 0.2674, - "step": 342 - }, - { - "epoch": 2.104294478527607, - "grad_norm": 10.230683326721191, - "learning_rate": 4.476273168915382e-06, - "loss": 0.5479, - "step": 343 - }, - { - "epoch": 2.1104294478527605, - "grad_norm": 3.588361978530884, - "learning_rate": 4.473318474545544e-06, - "loss": 0.3654, - "step": 344 - }, - { - "epoch": 2.116564417177914, - "grad_norm": 3.0913164615631104, - "learning_rate": 4.470356449878489e-06, - "loss": 0.2704, - "step": 345 - }, - { - "epoch": 2.1226993865030677, - "grad_norm": 3.972447633743286, - "learning_rate": 4.467387105917269e-06, - "loss": 0.3029, - "step": 346 - }, - { - "epoch": 2.128834355828221, - "grad_norm": 3.7174713611602783, - "learning_rate": 4.464410453692122e-06, - "loss": 0.6536, - "step": 347 - }, - { - "epoch": 2.1349693251533743, - "grad_norm": 3.9333994388580322, - "learning_rate": 4.461426504260434e-06, - "loss": 0.3806, - "step": 348 - }, - { - "epoch": 2.1411042944785277, - "grad_norm": 4.752816200256348, - "learning_rate": 4.458435268706699e-06, - "loss": 0.4019, - "step": 349 - }, - { - "epoch": 2.147239263803681, - "grad_norm": 2.505603790283203, - "learning_rate": 4.455436758142477e-06, - "loss": 0.2348, - "step": 350 - }, - { - "epoch": 2.1533742331288344, - "grad_norm": 3.3050570487976074, - "learning_rate": 4.452430983706351e-06, - "loss": 0.505, - "step": 351 - }, - { - "epoch": 2.1595092024539877, - "grad_norm": 5.387442588806152, - "learning_rate": 4.44941795656389e-06, - "loss": 0.399, - "step": 352 - }, - { - "epoch": 2.165644171779141, - "grad_norm": 3.4759480953216553, - "learning_rate": 4.446397687907601e-06, - "loss": 0.5664, - "step": 353 - }, - { - "epoch": 2.1717791411042944, - "grad_norm": 2.949445962905884, - "learning_rate": 4.4433701889568935e-06, - "loss": 0.2128, - "step": 354 - }, - { - "epoch": 2.1779141104294477, - "grad_norm": 3.2884252071380615, - "learning_rate": 4.440335470958035e-06, - "loss": 0.3138, - "step": 355 - }, - { - "epoch": 2.184049079754601, - "grad_norm": 3.1605632305145264, - "learning_rate": 4.437293545184111e-06, - "loss": 0.349, - "step": 356 - }, - { - "epoch": 2.190184049079755, - "grad_norm": 2.9996821880340576, - "learning_rate": 4.434244422934976e-06, - "loss": 0.343, - "step": 357 - }, - { - "epoch": 2.196319018404908, - "grad_norm": 3.6373324394226074, - "learning_rate": 4.431188115537226e-06, - "loss": 0.5656, - "step": 358 - }, - { - "epoch": 2.2024539877300615, - "grad_norm": 4.667621612548828, - "learning_rate": 4.428124634344141e-06, - "loss": 0.2335, - "step": 359 - }, - { - "epoch": 2.208588957055215, - "grad_norm": 3.815484046936035, - "learning_rate": 4.425053990735653e-06, - "loss": 0.2173, - "step": 360 - }, - { - "epoch": 2.214723926380368, - "grad_norm": 4.689478874206543, - "learning_rate": 4.421976196118297e-06, - "loss": 0.5071, - "step": 361 - }, - { - "epoch": 2.2208588957055215, - "grad_norm": 4.016942024230957, - "learning_rate": 4.4188912619251765e-06, - "loss": 0.384, - "step": 362 - }, - { - "epoch": 2.226993865030675, - "grad_norm": 3.5336828231811523, - "learning_rate": 4.415799199615912e-06, - "loss": 0.3133, - "step": 363 - }, - { - "epoch": 2.233128834355828, - "grad_norm": 2.9195592403411865, - "learning_rate": 4.4127000206766055e-06, - "loss": 0.3847, - "step": 364 - }, - { - "epoch": 2.2392638036809815, - "grad_norm": 2.6843531131744385, - "learning_rate": 4.409593736619795e-06, - "loss": 0.3539, - "step": 365 - }, - { - "epoch": 2.245398773006135, - "grad_norm": 2.8692703247070312, - "learning_rate": 4.40648035898441e-06, - "loss": 0.3664, - "step": 366 - }, - { - "epoch": 2.2515337423312882, - "grad_norm": 2.820422649383545, - "learning_rate": 4.403359899335732e-06, - "loss": 0.4606, - "step": 367 - }, - { - "epoch": 2.2576687116564416, - "grad_norm": 3.8641669750213623, - "learning_rate": 4.400232369265351e-06, - "loss": 0.2931, - "step": 368 - }, - { - "epoch": 2.263803680981595, - "grad_norm": 2.75347638130188, - "learning_rate": 4.39709778039112e-06, - "loss": 0.3393, - "step": 369 - }, - { - "epoch": 2.2699386503067487, - "grad_norm": 15.150428771972656, - "learning_rate": 4.393956144357113e-06, - "loss": 0.65, - "step": 370 - }, - { - "epoch": 2.276073619631902, - "grad_norm": 2.4876065254211426, - "learning_rate": 4.390807472833585e-06, - "loss": 0.372, - "step": 371 - }, - { - "epoch": 2.2822085889570554, - "grad_norm": 2.7328054904937744, - "learning_rate": 4.3876517775169216e-06, - "loss": 0.2802, - "step": 372 - }, - { - "epoch": 2.2883435582822087, - "grad_norm": 2.903221368789673, - "learning_rate": 4.384489070129604e-06, - "loss": 0.1964, - "step": 373 - }, - { - "epoch": 2.294478527607362, - "grad_norm": 3.9368724822998047, - "learning_rate": 4.381319362420158e-06, - "loss": 0.4272, - "step": 374 - }, - { - "epoch": 2.3006134969325154, - "grad_norm": 5.431981086730957, - "learning_rate": 4.378142666163114e-06, - "loss": 0.4513, - "step": 375 - }, - { - "epoch": 2.3067484662576687, - "grad_norm": 3.661733627319336, - "learning_rate": 4.374958993158965e-06, - "loss": 0.6087, - "step": 376 - }, - { - "epoch": 2.312883435582822, - "grad_norm": 3.004450559616089, - "learning_rate": 4.371768355234116e-06, - "loss": 0.2206, - "step": 377 - }, - { - "epoch": 2.3190184049079754, - "grad_norm": 4.3785576820373535, - "learning_rate": 4.368570764240852e-06, - "loss": 0.6055, - "step": 378 - }, - { - "epoch": 2.3251533742331287, - "grad_norm": 3.4699394702911377, - "learning_rate": 4.365366232057279e-06, - "loss": 0.6286, - "step": 379 - }, - { - "epoch": 2.331288343558282, - "grad_norm": 2.6862998008728027, - "learning_rate": 4.3621547705872915e-06, - "loss": 0.2622, - "step": 380 - }, - { - "epoch": 2.3374233128834354, - "grad_norm": 3.056382179260254, - "learning_rate": 4.358936391760524e-06, - "loss": 0.3439, - "step": 381 - }, - { - "epoch": 2.3435582822085887, - "grad_norm": 2.6211307048797607, - "learning_rate": 4.355711107532305e-06, - "loss": 0.3677, - "step": 382 - }, - { - "epoch": 2.3496932515337425, - "grad_norm": 2.682060956954956, - "learning_rate": 4.3524789298836175e-06, - "loss": 0.3068, - "step": 383 - }, - { - "epoch": 2.355828220858896, - "grad_norm": 3.482539415359497, - "learning_rate": 4.349239870821049e-06, - "loss": 0.3737, - "step": 384 - }, - { - "epoch": 2.361963190184049, - "grad_norm": 2.8645472526550293, - "learning_rate": 4.345993942376752e-06, - "loss": 0.2837, - "step": 385 - }, - { - "epoch": 2.3680981595092025, - "grad_norm": 3.6142354011535645, - "learning_rate": 4.342741156608392e-06, - "loss": 0.6162, - "step": 386 - }, - { - "epoch": 2.374233128834356, - "grad_norm": 3.0748162269592285, - "learning_rate": 4.3394815255991135e-06, - "loss": 0.2986, - "step": 387 - }, - { - "epoch": 2.3803680981595092, - "grad_norm": 5.090906620025635, - "learning_rate": 4.336215061457485e-06, - "loss": 0.5383, - "step": 388 - }, - { - "epoch": 2.3865030674846626, - "grad_norm": 3.9235823154449463, - "learning_rate": 4.332941776317458e-06, - "loss": 0.4179, - "step": 389 - }, - { - "epoch": 2.392638036809816, - "grad_norm": 3.482926368713379, - "learning_rate": 4.329661682338325e-06, - "loss": 0.3938, - "step": 390 - }, - { - "epoch": 2.3987730061349692, - "grad_norm": 4.274583339691162, - "learning_rate": 4.32637479170467e-06, - "loss": 0.3349, - "step": 391 - }, - { - "epoch": 2.4049079754601226, - "grad_norm": 3.326012372970581, - "learning_rate": 4.323081116626322e-06, - "loss": 0.3336, - "step": 392 - }, - { - "epoch": 2.411042944785276, - "grad_norm": 3.174591541290283, - "learning_rate": 4.319780669338316e-06, - "loss": 0.2983, - "step": 393 - }, - { - "epoch": 2.4171779141104293, - "grad_norm": 3.9073634147644043, - "learning_rate": 4.31647346210084e-06, - "loss": 0.8401, - "step": 394 - }, - { - "epoch": 2.4233128834355826, - "grad_norm": 3.4787721633911133, - "learning_rate": 4.313159507199197e-06, - "loss": 0.2583, - "step": 395 - }, - { - "epoch": 2.4294478527607364, - "grad_norm": 3.19903564453125, - "learning_rate": 4.309838816943755e-06, - "loss": 0.2861, - "step": 396 - }, - { - "epoch": 2.4355828220858897, - "grad_norm": 3.184246778488159, - "learning_rate": 4.306511403669897e-06, - "loss": 0.2956, - "step": 397 - }, - { - "epoch": 2.441717791411043, - "grad_norm": 3.8991878032684326, - "learning_rate": 4.303177279737988e-06, - "loss": 0.5378, - "step": 398 - }, - { - "epoch": 2.4478527607361964, - "grad_norm": 3.411949872970581, - "learning_rate": 4.299836457533313e-06, - "loss": 0.3423, - "step": 399 - }, - { - "epoch": 2.4539877300613497, - "grad_norm": 3.445502996444702, - "learning_rate": 4.296488949466046e-06, - "loss": 0.5608, - "step": 400 - }, - { - "epoch": 2.460122699386503, - "grad_norm": 3.066798210144043, - "learning_rate": 4.293134767971193e-06, - "loss": 0.3214, - "step": 401 - }, - { - "epoch": 2.4662576687116564, - "grad_norm": 3.0581583976745605, - "learning_rate": 4.28977392550855e-06, - "loss": 0.5117, - "step": 402 - }, - { - "epoch": 2.4723926380368098, - "grad_norm": 4.207413673400879, - "learning_rate": 4.286406434562659e-06, - "loss": 0.2666, - "step": 403 - }, - { - "epoch": 2.478527607361963, - "grad_norm": 2.9934990406036377, - "learning_rate": 4.283032307642756e-06, - "loss": 0.2878, - "step": 404 - }, - { - "epoch": 2.4846625766871164, - "grad_norm": 3.800593614578247, - "learning_rate": 4.2796515572827305e-06, - "loss": 0.2619, - "step": 405 - }, - { - "epoch": 2.4907975460122698, - "grad_norm": 3.2029523849487305, - "learning_rate": 4.276264196041074e-06, - "loss": 0.1735, - "step": 406 - }, - { - "epoch": 2.4969325153374236, - "grad_norm": 3.515634059906006, - "learning_rate": 4.2728702365008356e-06, - "loss": 0.4741, - "step": 407 - }, - { - "epoch": 2.5030674846625764, - "grad_norm": 3.8354873657226562, - "learning_rate": 4.269469691269577e-06, - "loss": 0.3713, - "step": 408 - }, - { - "epoch": 2.5092024539877302, - "grad_norm": 3.902904510498047, - "learning_rate": 4.266062572979323e-06, - "loss": 0.5189, - "step": 409 - }, - { - "epoch": 2.5153374233128836, - "grad_norm": 3.3276097774505615, - "learning_rate": 4.262648894286515e-06, - "loss": 0.2461, - "step": 410 - }, - { - "epoch": 2.521472392638037, - "grad_norm": 2.9457011222839355, - "learning_rate": 4.259228667871963e-06, - "loss": 0.3013, - "step": 411 - }, - { - "epoch": 2.5276073619631902, - "grad_norm": 2.8941617012023926, - "learning_rate": 4.255801906440803e-06, - "loss": 0.2784, - "step": 412 - }, - { - "epoch": 2.5337423312883436, - "grad_norm": 2.949399471282959, - "learning_rate": 4.252368622722443e-06, - "loss": 0.457, - "step": 413 - }, - { - "epoch": 2.539877300613497, - "grad_norm": 3.342108726501465, - "learning_rate": 4.248928829470522e-06, - "loss": 0.487, - "step": 414 - }, - { - "epoch": 2.5460122699386503, - "grad_norm": 3.9556386470794678, - "learning_rate": 4.245482539462861e-06, - "loss": 0.6118, - "step": 415 - }, - { - "epoch": 2.5521472392638036, - "grad_norm": 3.6936280727386475, - "learning_rate": 4.242029765501411e-06, - "loss": 0.6131, - "step": 416 - }, - { - "epoch": 2.558282208588957, - "grad_norm": 2.79897403717041, - "learning_rate": 4.2385705204122104e-06, - "loss": 0.4209, - "step": 417 - }, - { - "epoch": 2.5644171779141103, - "grad_norm": 4.093318462371826, - "learning_rate": 4.235104817045338e-06, - "loss": 0.5375, - "step": 418 - }, - { - "epoch": 2.5705521472392636, - "grad_norm": 3.138263463973999, - "learning_rate": 4.231632668274861e-06, - "loss": 0.4682, - "step": 419 - }, - { - "epoch": 2.5766871165644174, - "grad_norm": 3.1465651988983154, - "learning_rate": 4.22815408699879e-06, - "loss": 0.2522, - "step": 420 - }, - { - "epoch": 2.5828220858895703, - "grad_norm": 3.5166101455688477, - "learning_rate": 4.22466908613903e-06, - "loss": 0.4776, - "step": 421 - }, - { - "epoch": 2.588957055214724, - "grad_norm": 2.8498189449310303, - "learning_rate": 4.221177678641333e-06, - "loss": 0.3067, - "step": 422 - }, - { - "epoch": 2.5950920245398774, - "grad_norm": 2.8046035766601562, - "learning_rate": 4.217679877475251e-06, - "loss": 0.2402, - "step": 423 - }, - { - "epoch": 2.6012269938650308, - "grad_norm": 4.204788684844971, - "learning_rate": 4.214175695634084e-06, - "loss": 0.2608, - "step": 424 - }, - { - "epoch": 2.607361963190184, - "grad_norm": 2.5569400787353516, - "learning_rate": 4.210665146134838e-06, - "loss": 0.2801, - "step": 425 - }, - { - "epoch": 2.6134969325153374, - "grad_norm": 3.5359091758728027, - "learning_rate": 4.20714824201817e-06, - "loss": 0.2027, - "step": 426 - }, - { - "epoch": 2.6196319018404908, - "grad_norm": 3.5132668018341064, - "learning_rate": 4.203624996348343e-06, - "loss": 0.4253, - "step": 427 - }, - { - "epoch": 2.625766871165644, - "grad_norm": 3.5076472759246826, - "learning_rate": 4.200095422213177e-06, - "loss": 0.3014, - "step": 428 - }, - { - "epoch": 2.6319018404907975, - "grad_norm": 3.6501238346099854, - "learning_rate": 4.196559532724004e-06, - "loss": 0.6526, - "step": 429 - }, - { - "epoch": 2.638036809815951, - "grad_norm": 2.849924325942993, - "learning_rate": 4.193017341015608e-06, - "loss": 0.4487, - "step": 430 - }, - { - "epoch": 2.644171779141104, - "grad_norm": 3.2228448390960693, - "learning_rate": 4.189468860246192e-06, - "loss": 0.5386, - "step": 431 - }, - { - "epoch": 2.6503067484662575, - "grad_norm": 2.532102108001709, - "learning_rate": 4.185914103597316e-06, - "loss": 0.3034, - "step": 432 - }, - { - "epoch": 2.6564417177914113, - "grad_norm": 2.862720251083374, - "learning_rate": 4.182353084273855e-06, - "loss": 0.5862, - "step": 433 - }, - { - "epoch": 2.662576687116564, - "grad_norm": 3.4617464542388916, - "learning_rate": 4.178785815503946e-06, - "loss": 0.3954, - "step": 434 - }, - { - "epoch": 2.668711656441718, - "grad_norm": 2.627758741378784, - "learning_rate": 4.1752123105389444e-06, - "loss": 0.4367, - "step": 435 - }, - { - "epoch": 2.6748466257668713, - "grad_norm": 3.2868380546569824, - "learning_rate": 4.171632582653368e-06, - "loss": 0.2997, - "step": 436 - }, - { - "epoch": 2.6809815950920246, - "grad_norm": 3.4260897636413574, - "learning_rate": 4.168046645144851e-06, - "loss": 0.3354, - "step": 437 - }, - { - "epoch": 2.687116564417178, - "grad_norm": 3.1415748596191406, - "learning_rate": 4.164454511334098e-06, - "loss": 0.5538, - "step": 438 - }, - { - "epoch": 2.6932515337423313, - "grad_norm": 3.3700919151306152, - "learning_rate": 4.160856194564828e-06, - "loss": 0.5731, - "step": 439 - }, - { - "epoch": 2.6993865030674846, - "grad_norm": 3.146968364715576, - "learning_rate": 4.157251708203728e-06, - "loss": 0.4429, - "step": 440 - }, - { - "epoch": 2.705521472392638, - "grad_norm": 3.7495830059051514, - "learning_rate": 4.153641065640402e-06, - "loss": 0.6361, - "step": 441 - }, - { - "epoch": 2.7116564417177913, - "grad_norm": 3.426499128341675, - "learning_rate": 4.150024280287327e-06, - "loss": 0.2418, - "step": 442 - }, - { - "epoch": 2.7177914110429446, - "grad_norm": 3.213719606399536, - "learning_rate": 4.146401365579795e-06, - "loss": 0.2549, - "step": 443 - }, - { - "epoch": 2.7239263803680984, - "grad_norm": 3.457742929458618, - "learning_rate": 4.142772334975868e-06, - "loss": 0.3822, - "step": 444 - }, - { - "epoch": 2.7300613496932513, - "grad_norm": 3.130410671234131, - "learning_rate": 4.139137201956324e-06, - "loss": 0.3107, - "step": 445 - }, - { - "epoch": 2.736196319018405, - "grad_norm": 2.7337112426757812, - "learning_rate": 4.1354959800246155e-06, - "loss": 0.2829, - "step": 446 - }, - { - "epoch": 2.7423312883435584, - "grad_norm": 3.427006483078003, - "learning_rate": 4.131848682706807e-06, - "loss": 0.3045, - "step": 447 - }, - { - "epoch": 2.7484662576687118, - "grad_norm": 3.3742318153381348, - "learning_rate": 4.128195323551536e-06, - "loss": 0.316, - "step": 448 - }, - { - "epoch": 2.754601226993865, - "grad_norm": 3.086738109588623, - "learning_rate": 4.1245359161299555e-06, - "loss": 0.5278, - "step": 449 - }, - { - "epoch": 2.7607361963190185, - "grad_norm": 3.4609954357147217, - "learning_rate": 4.120870474035687e-06, - "loss": 0.447, - "step": 450 - }, - { - "epoch": 2.766871165644172, - "grad_norm": 3.552663803100586, - "learning_rate": 4.1171990108847705e-06, - "loss": 0.6127, - "step": 451 - }, - { - "epoch": 2.773006134969325, - "grad_norm": 4.413427352905273, - "learning_rate": 4.113521540315609e-06, - "loss": 0.3304, - "step": 452 - }, - { - "epoch": 2.7791411042944785, - "grad_norm": 3.3408143520355225, - "learning_rate": 4.109838075988922e-06, - "loss": 0.5871, - "step": 453 - }, - { - "epoch": 2.785276073619632, - "grad_norm": 3.0659773349761963, - "learning_rate": 4.106148631587697e-06, - "loss": 0.3578, - "step": 454 - }, - { - "epoch": 2.791411042944785, - "grad_norm": 3.2854816913604736, - "learning_rate": 4.102453220817134e-06, - "loss": 0.4685, - "step": 455 - }, - { - "epoch": 2.7975460122699385, - "grad_norm": 3.4940855503082275, - "learning_rate": 4.098751857404595e-06, - "loss": 0.2818, - "step": 456 - }, - { - "epoch": 2.8036809815950923, - "grad_norm": 2.4630730152130127, - "learning_rate": 4.0950445550995566e-06, - "loss": 0.3497, - "step": 457 - }, - { - "epoch": 2.809815950920245, - "grad_norm": 3.3870959281921387, - "learning_rate": 4.091331327673554e-06, - "loss": 0.4954, - "step": 458 - }, - { - "epoch": 2.815950920245399, - "grad_norm": 2.3676836490631104, - "learning_rate": 4.087612188920135e-06, - "loss": 0.3884, - "step": 459 - }, - { - "epoch": 2.8220858895705523, - "grad_norm": 3.2477807998657227, - "learning_rate": 4.083887152654804e-06, - "loss": 0.375, - "step": 460 - }, - { - "epoch": 2.8282208588957056, - "grad_norm": 3.295673131942749, - "learning_rate": 4.080156232714976e-06, - "loss": 0.3272, - "step": 461 - }, - { - "epoch": 2.834355828220859, - "grad_norm": 2.800847291946411, - "learning_rate": 4.07641944295992e-06, - "loss": 0.2936, - "step": 462 - }, - { - "epoch": 2.8404907975460123, - "grad_norm": 3.443336009979248, - "learning_rate": 4.072676797270708e-06, - "loss": 0.2363, - "step": 463 - }, - { - "epoch": 2.8466257668711656, - "grad_norm": 3.1334242820739746, - "learning_rate": 4.0689283095501684e-06, - "loss": 0.4827, - "step": 464 - }, - { - "epoch": 2.852760736196319, - "grad_norm": 3.950672149658203, - "learning_rate": 4.06517399372283e-06, - "loss": 0.3163, - "step": 465 - }, - { - "epoch": 2.8588957055214723, - "grad_norm": 4.243579387664795, - "learning_rate": 4.061413863734869e-06, - "loss": 0.2827, - "step": 466 - }, - { - "epoch": 2.8650306748466257, - "grad_norm": 4.076017379760742, - "learning_rate": 4.057647933554063e-06, - "loss": 0.3466, - "step": 467 - }, - { - "epoch": 2.871165644171779, - "grad_norm": 2.846989631652832, - "learning_rate": 4.053876217169734e-06, - "loss": 0.4632, - "step": 468 - }, - { - "epoch": 2.8773006134969323, - "grad_norm": 2.74981689453125, - "learning_rate": 4.050098728592698e-06, - "loss": 0.2001, - "step": 469 - }, - { - "epoch": 2.883435582822086, - "grad_norm": 3.062068462371826, - "learning_rate": 4.046315481855211e-06, - "loss": 0.5425, - "step": 470 - }, - { - "epoch": 2.889570552147239, - "grad_norm": 2.8630964756011963, - "learning_rate": 4.0425264910109245e-06, - "loss": 0.424, - "step": 471 - }, - { - "epoch": 2.895705521472393, - "grad_norm": 3.537442922592163, - "learning_rate": 4.03873177013482e-06, - "loss": 0.2443, - "step": 472 - }, - { - "epoch": 2.901840490797546, - "grad_norm": 3.128535270690918, - "learning_rate": 4.034931333323173e-06, - "loss": 0.3734, - "step": 473 - }, - { - "epoch": 2.9079754601226995, - "grad_norm": 3.021897792816162, - "learning_rate": 4.031125194693484e-06, - "loss": 0.3762, - "step": 474 - }, - { - "epoch": 2.914110429447853, - "grad_norm": 3.0943546295166016, - "learning_rate": 4.0273133683844375e-06, - "loss": 0.3721, - "step": 475 - }, - { - "epoch": 2.920245398773006, - "grad_norm": 3.443448305130005, - "learning_rate": 4.023495868555848e-06, - "loss": 0.2868, - "step": 476 - }, - { - "epoch": 2.9263803680981595, - "grad_norm": 2.865227222442627, - "learning_rate": 4.0196727093886024e-06, - "loss": 0.5086, - "step": 477 - }, - { - "epoch": 2.932515337423313, - "grad_norm": 3.1272058486938477, - "learning_rate": 4.015843905084612e-06, - "loss": 0.4616, - "step": 478 - }, - { - "epoch": 2.938650306748466, - "grad_norm": 3.0584447383880615, - "learning_rate": 4.012009469866756e-06, - "loss": 0.403, - "step": 479 - }, - { - "epoch": 2.9447852760736195, - "grad_norm": 4.42616081237793, - "learning_rate": 4.008169417978836e-06, - "loss": 0.5801, - "step": 480 - }, - { - "epoch": 2.950920245398773, - "grad_norm": 2.8444535732269287, - "learning_rate": 4.004323763685511e-06, - "loss": 0.5808, - "step": 481 - }, - { - "epoch": 2.957055214723926, - "grad_norm": 2.591719627380371, - "learning_rate": 4.0004725212722565e-06, - "loss": 0.2584, - "step": 482 - }, - { - "epoch": 2.96319018404908, - "grad_norm": 2.5496113300323486, - "learning_rate": 3.996615705045302e-06, - "loss": 0.462, - "step": 483 - }, - { - "epoch": 2.969325153374233, - "grad_norm": 2.9932925701141357, - "learning_rate": 3.992753329331588e-06, - "loss": 0.3502, - "step": 484 - }, - { - "epoch": 2.9754601226993866, - "grad_norm": 3.136871337890625, - "learning_rate": 3.9888854084786995e-06, - "loss": 0.5989, - "step": 485 - }, - { - "epoch": 2.98159509202454, - "grad_norm": 3.6654274463653564, - "learning_rate": 3.985011956854826e-06, - "loss": 0.6772, - "step": 486 - }, - { - "epoch": 2.9877300613496933, - "grad_norm": 2.5398948192596436, - "learning_rate": 3.9811329888487004e-06, - "loss": 0.4192, - "step": 487 - }, - { - "epoch": 2.9938650306748467, - "grad_norm": 4.89943790435791, - "learning_rate": 3.977248518869545e-06, - "loss": 0.4031, - "step": 488 - }, - { - "epoch": 3.0, - "grad_norm": 3.4729995727539062, - "learning_rate": 3.973358561347024e-06, - "loss": 0.7764, - "step": 489 - }, - { - "epoch": 3.0061349693251533, - "grad_norm": 5.331607818603516, - "learning_rate": 3.969463130731183e-06, - "loss": 0.3267, - "step": 490 - }, - { - "epoch": 3.0122699386503067, - "grad_norm": 3.453650712966919, - "learning_rate": 3.965562241492401e-06, - "loss": 0.2719, - "step": 491 - }, - { - "epoch": 3.01840490797546, - "grad_norm": 3.232313632965088, - "learning_rate": 3.9616559081213335e-06, - "loss": 0.1825, - "step": 492 - }, - { - "epoch": 3.0245398773006134, - "grad_norm": 3.4860260486602783, - "learning_rate": 3.957744145128858e-06, - "loss": 0.1854, - "step": 493 - }, - { - "epoch": 3.0306748466257667, - "grad_norm": 3.4357805252075195, - "learning_rate": 3.953826967046021e-06, - "loss": 0.2224, - "step": 494 - }, - { - "epoch": 3.03680981595092, - "grad_norm": 4.557503700256348, - "learning_rate": 3.9499043884239894e-06, - "loss": 0.349, - "step": 495 - }, - { - "epoch": 3.042944785276074, - "grad_norm": 4.685214042663574, - "learning_rate": 3.945976423833987e-06, - "loss": 0.175, - "step": 496 - }, - { - "epoch": 3.049079754601227, - "grad_norm": 3.7430171966552734, - "learning_rate": 3.942043087867244e-06, - "loss": 0.2773, - "step": 497 - }, - { - "epoch": 3.0552147239263805, - "grad_norm": 3.756450653076172, - "learning_rate": 3.938104395134947e-06, - "loss": 0.4445, - "step": 498 - }, - { - "epoch": 3.061349693251534, - "grad_norm": 4.049175262451172, - "learning_rate": 3.9341603602681805e-06, - "loss": 0.3046, - "step": 499 - }, - { - "epoch": 3.067484662576687, - "grad_norm": 3.7689461708068848, - "learning_rate": 3.930210997917871e-06, - "loss": 0.2544, - "step": 500 - }, - { - "epoch": 3.0736196319018405, - "grad_norm": 4.027602195739746, - "learning_rate": 3.92625632275474e-06, - "loss": 0.3154, - "step": 501 - }, - { - "epoch": 3.079754601226994, - "grad_norm": 2.8449292182922363, - "learning_rate": 3.922296349469239e-06, - "loss": 0.2804, - "step": 502 - }, - { - "epoch": 3.085889570552147, - "grad_norm": 2.9555234909057617, - "learning_rate": 3.918331092771505e-06, - "loss": 0.2393, - "step": 503 - }, - { - "epoch": 3.0920245398773005, - "grad_norm": 2.621042013168335, - "learning_rate": 3.914360567391296e-06, - "loss": 0.1403, - "step": 504 - }, - { - "epoch": 3.098159509202454, - "grad_norm": 3.2348620891571045, - "learning_rate": 3.910384788077949e-06, - "loss": 0.1537, - "step": 505 - }, - { - "epoch": 3.104294478527607, - "grad_norm": 3.030179977416992, - "learning_rate": 3.906403769600311e-06, - "loss": 0.2921, - "step": 506 - }, - { - "epoch": 3.1104294478527605, - "grad_norm": 3.146428346633911, - "learning_rate": 3.902417526746694e-06, - "loss": 0.2036, - "step": 507 - }, - { - "epoch": 3.116564417177914, - "grad_norm": 3.6201512813568115, - "learning_rate": 3.898426074324818e-06, - "loss": 0.2655, - "step": 508 - }, - { - "epoch": 3.1226993865030677, - "grad_norm": 3.7674012184143066, - "learning_rate": 3.8944294271617524e-06, - "loss": 0.3938, - "step": 509 - }, - { - "epoch": 3.128834355828221, - "grad_norm": 4.54722785949707, - "learning_rate": 3.890427600103865e-06, - "loss": 0.3051, - "step": 510 - }, - { - "epoch": 3.1349693251533743, - "grad_norm": 4.228236675262451, - "learning_rate": 3.886420608016767e-06, - "loss": 0.3719, - "step": 511 - }, - { - "epoch": 3.1411042944785277, - "grad_norm": 4.355110168457031, - "learning_rate": 3.882408465785252e-06, - "loss": 0.1863, - "step": 512 - }, - { - "epoch": 3.147239263803681, - "grad_norm": 3.451460838317871, - "learning_rate": 3.878391188313249e-06, - "loss": 0.1479, - "step": 513 - }, - { - "epoch": 3.1533742331288344, - "grad_norm": 4.395524501800537, - "learning_rate": 3.87436879052376e-06, - "loss": 0.238, - "step": 514 - }, - { - "epoch": 3.1595092024539877, - "grad_norm": 2.940717935562134, - "learning_rate": 3.870341287358809e-06, - "loss": 0.2069, - "step": 515 - }, - { - "epoch": 3.165644171779141, - "grad_norm": 2.5817320346832275, - "learning_rate": 3.8663086937793845e-06, - "loss": 0.1189, - "step": 516 - }, - { - "epoch": 3.1717791411042944, - "grad_norm": 3.9863343238830566, - "learning_rate": 3.862271024765385e-06, - "loss": 0.3434, - "step": 517 - }, - { - "epoch": 3.1779141104294477, - "grad_norm": 3.609004259109497, - "learning_rate": 3.8582282953155626e-06, - "loss": 0.1602, - "step": 518 - }, - { - "epoch": 3.184049079754601, - "grad_norm": 3.207533121109009, - "learning_rate": 3.854180520447465e-06, - "loss": 0.3452, - "step": 519 - }, - { - "epoch": 3.190184049079755, - "grad_norm": 3.593388795852661, - "learning_rate": 3.850127715197387e-06, - "loss": 0.2832, - "step": 520 - }, - { - "epoch": 3.196319018404908, - "grad_norm": 3.409064531326294, - "learning_rate": 3.846069894620306e-06, - "loss": 0.1481, - "step": 521 - }, - { - "epoch": 3.2024539877300615, - "grad_norm": 3.461498737335205, - "learning_rate": 3.84200707378983e-06, - "loss": 0.1283, - "step": 522 - }, - { - "epoch": 3.208588957055215, - "grad_norm": 3.708467483520508, - "learning_rate": 3.8379392677981434e-06, - "loss": 0.2468, - "step": 523 - }, - { - "epoch": 3.214723926380368, - "grad_norm": 2.802381753921509, - "learning_rate": 3.833866491755947e-06, - "loss": 0.2685, - "step": 524 - }, - { - "epoch": 3.2208588957055215, - "grad_norm": 3.0787744522094727, - "learning_rate": 3.8297887607924044e-06, - "loss": 0.2595, - "step": 525 - }, - { - "epoch": 3.226993865030675, - "grad_norm": 3.3952548503875732, - "learning_rate": 3.825706090055088e-06, - "loss": 0.4099, - "step": 526 - }, - { - "epoch": 3.233128834355828, - "grad_norm": 3.3497085571289062, - "learning_rate": 3.821618494709916e-06, - "loss": 0.287, - "step": 527 - }, - { - "epoch": 3.2392638036809815, - "grad_norm": 4.050611972808838, - "learning_rate": 3.817525989941102e-06, - "loss": 0.2369, - "step": 528 - }, - { - "epoch": 3.245398773006135, - "grad_norm": 2.87642240524292, - "learning_rate": 3.8134285909510972e-06, - "loss": 0.2751, - "step": 529 - }, - { - "epoch": 3.2515337423312882, - "grad_norm": 3.821941614151001, - "learning_rate": 3.8093263129605305e-06, - "loss": 0.2363, - "step": 530 - }, - { - "epoch": 3.2576687116564416, - "grad_norm": 2.8066117763519287, - "learning_rate": 3.80521917120816e-06, - "loss": 0.094, - "step": 531 - }, - { - "epoch": 3.263803680981595, - "grad_norm": 3.849768877029419, - "learning_rate": 3.801107180950806e-06, - "loss": 0.4117, - "step": 532 - }, - { - "epoch": 3.2699386503067487, - "grad_norm": 2.4161250591278076, - "learning_rate": 3.7969903574633028e-06, - "loss": 0.1183, - "step": 533 - }, - { - "epoch": 3.276073619631902, - "grad_norm": 3.6743111610412598, - "learning_rate": 3.792868716038437e-06, - "loss": 0.2296, - "step": 534 - }, - { - "epoch": 3.2822085889570554, - "grad_norm": 4.378123760223389, - "learning_rate": 3.7887422719868937e-06, - "loss": 0.2678, - "step": 535 - }, - { - "epoch": 3.2883435582822087, - "grad_norm": 4.816481590270996, - "learning_rate": 3.784611040637198e-06, - "loss": 0.4887, - "step": 536 - }, - { - "epoch": 3.294478527607362, - "grad_norm": 3.5712430477142334, - "learning_rate": 3.7804750373356576e-06, - "loss": 0.3827, - "step": 537 - }, - { - "epoch": 3.3006134969325154, - "grad_norm": 3.6877355575561523, - "learning_rate": 3.776334277446307e-06, - "loss": 0.3233, - "step": 538 - }, - { - "epoch": 3.3067484662576687, - "grad_norm": 3.442706346511841, - "learning_rate": 3.7721887763508512e-06, - "loss": 0.1256, - "step": 539 - }, - { - "epoch": 3.312883435582822, - "grad_norm": 3.9265615940093994, - "learning_rate": 3.7680385494486053e-06, - "loss": 0.3845, - "step": 540 - }, - { - "epoch": 3.3190184049079754, - "grad_norm": 3.5030126571655273, - "learning_rate": 3.7638836121564414e-06, - "loss": 0.2905, - "step": 541 - }, - { - "epoch": 3.3251533742331287, - "grad_norm": 3.6685378551483154, - "learning_rate": 3.7597239799087283e-06, - "loss": 0.3561, - "step": 542 - }, - { - "epoch": 3.331288343558282, - "grad_norm": 3.8484046459198, - "learning_rate": 3.7555596681572736e-06, - "loss": 0.1157, - "step": 543 - }, - { - "epoch": 3.3374233128834354, - "grad_norm": 3.7977402210235596, - "learning_rate": 3.751390692371272e-06, - "loss": 0.3049, - "step": 544 - }, - { - "epoch": 3.3435582822085887, - "grad_norm": 3.4409852027893066, - "learning_rate": 3.7472170680372398e-06, - "loss": 0.1626, - "step": 545 - }, - { - "epoch": 3.3496932515337425, - "grad_norm": 3.801541328430176, - "learning_rate": 3.7430388106589632e-06, - "loss": 0.2414, - "step": 546 - }, - { - "epoch": 3.355828220858896, - "grad_norm": 4.025203704833984, - "learning_rate": 3.738855935757438e-06, - "loss": 0.3441, - "step": 547 - }, - { - "epoch": 3.361963190184049, - "grad_norm": 4.242798805236816, - "learning_rate": 3.7346684588708135e-06, - "loss": 0.5244, - "step": 548 - }, - { - "epoch": 3.3680981595092025, - "grad_norm": 3.0516819953918457, - "learning_rate": 3.7304763955543332e-06, - "loss": 0.1984, - "step": 549 - }, - { - "epoch": 3.374233128834356, - "grad_norm": 3.894667625427246, - "learning_rate": 3.726279761380279e-06, - "loss": 0.2715, - "step": 550 - }, - { - "epoch": 3.3803680981595092, - "grad_norm": 3.171208143234253, - "learning_rate": 3.72207857193791e-06, - "loss": 0.1537, - "step": 551 - }, - { - "epoch": 3.3865030674846626, - "grad_norm": 4.344860553741455, - "learning_rate": 3.7178728428334092e-06, - "loss": 0.2388, - "step": 552 - }, - { - "epoch": 3.392638036809816, - "grad_norm": 2.766317367553711, - "learning_rate": 3.7136625896898226e-06, - "loss": 0.1726, - "step": 553 - }, - { - "epoch": 3.3987730061349692, - "grad_norm": 3.550662040710449, - "learning_rate": 3.7094478281470003e-06, - "loss": 0.2942, - "step": 554 - }, - { - "epoch": 3.4049079754601226, - "grad_norm": 3.4576945304870605, - "learning_rate": 3.7052285738615412e-06, - "loss": 0.1665, - "step": 555 - }, - { - "epoch": 3.411042944785276, - "grad_norm": 4.026793003082275, - "learning_rate": 3.7010048425067317e-06, - "loss": 0.3954, - "step": 556 - }, - { - "epoch": 3.4171779141104293, - "grad_norm": 4.600133419036865, - "learning_rate": 3.696776649772492e-06, - "loss": 0.3207, - "step": 557 - }, - { - "epoch": 3.4233128834355826, - "grad_norm": 4.747331142425537, - "learning_rate": 3.692544011365312e-06, - "loss": 0.1325, - "step": 558 - }, - { - "epoch": 3.4294478527607364, - "grad_norm": 3.781464099884033, - "learning_rate": 3.6883069430081986e-06, - "loss": 0.1644, - "step": 559 - }, - { - "epoch": 3.4355828220858897, - "grad_norm": 2.905986785888672, - "learning_rate": 3.6840654604406135e-06, - "loss": 0.2469, - "step": 560 - }, - { - "epoch": 3.441717791411043, - "grad_norm": 2.3747711181640625, - "learning_rate": 3.679819579418414e-06, - "loss": 0.1146, - "step": 561 - }, - { - "epoch": 3.4478527607361964, - "grad_norm": 3.2683632373809814, - "learning_rate": 3.6755693157137995e-06, - "loss": 0.3236, - "step": 562 - }, - { - "epoch": 3.4539877300613497, - "grad_norm": 3.7750496864318848, - "learning_rate": 3.6713146851152487e-06, - "loss": 0.399, - "step": 563 - }, - { - "epoch": 3.460122699386503, - "grad_norm": 3.3912384510040283, - "learning_rate": 3.667055703427461e-06, - "loss": 0.1259, - "step": 564 - }, - { - "epoch": 3.4662576687116564, - "grad_norm": 3.0224430561065674, - "learning_rate": 3.6627923864713e-06, - "loss": 0.1835, - "step": 565 - }, - { - "epoch": 3.4723926380368098, - "grad_norm": 3.642258405685425, - "learning_rate": 3.658524750083733e-06, - "loss": 0.2763, - "step": 566 - }, - { - "epoch": 3.478527607361963, - "grad_norm": 3.409890651702881, - "learning_rate": 3.654252810117773e-06, - "loss": 0.2496, - "step": 567 - }, - { - "epoch": 3.4846625766871164, - "grad_norm": 3.0416476726531982, - "learning_rate": 3.6499765824424195e-06, - "loss": 0.1287, - "step": 568 - }, - { - "epoch": 3.4907975460122698, - "grad_norm": 3.1963987350463867, - "learning_rate": 3.6456960829425987e-06, - "loss": 0.1747, - "step": 569 - }, - { - "epoch": 3.4969325153374236, - "grad_norm": 3.198448657989502, - "learning_rate": 3.641411327519107e-06, - "loss": 0.1913, - "step": 570 - }, - { - "epoch": 3.5030674846625764, - "grad_norm": 3.7023441791534424, - "learning_rate": 3.6371223320885492e-06, - "loss": 0.3224, - "step": 571 - }, - { - "epoch": 3.5092024539877302, - "grad_norm": 4.54288387298584, - "learning_rate": 3.6328291125832803e-06, - "loss": 0.2364, - "step": 572 - }, - { - "epoch": 3.5153374233128836, - "grad_norm": 3.5064890384674072, - "learning_rate": 3.628531684951347e-06, - "loss": 0.2552, - "step": 573 - }, - { - "epoch": 3.521472392638037, - "grad_norm": 3.987583875656128, - "learning_rate": 3.6242300651564276e-06, - "loss": 0.3232, - "step": 574 - }, - { - "epoch": 3.5276073619631902, - "grad_norm": 3.179642915725708, - "learning_rate": 3.6199242691777745e-06, - "loss": 0.32, - "step": 575 - }, - { - "epoch": 3.5337423312883436, - "grad_norm": 3.3078157901763916, - "learning_rate": 3.6156143130101516e-06, - "loss": 0.2922, - "step": 576 - }, - { - "epoch": 3.539877300613497, - "grad_norm": 3.1628613471984863, - "learning_rate": 3.6113002126637765e-06, - "loss": 0.2005, - "step": 577 - }, - { - "epoch": 3.5460122699386503, - "grad_norm": 3.4515540599823, - "learning_rate": 3.606981984164263e-06, - "loss": 0.2138, - "step": 578 - }, - { - "epoch": 3.5521472392638036, - "grad_norm": 5.132473945617676, - "learning_rate": 3.6026596435525578e-06, - "loss": 0.4382, - "step": 579 - }, - { - "epoch": 3.558282208588957, - "grad_norm": 3.397614002227783, - "learning_rate": 3.5983332068848855e-06, - "loss": 0.3326, - "step": 580 - }, - { - "epoch": 3.5644171779141103, - "grad_norm": 4.79497766494751, - "learning_rate": 3.5940026902326825e-06, - "loss": 0.4748, - "step": 581 - }, - { - "epoch": 3.5705521472392636, - "grad_norm": 3.7675018310546875, - "learning_rate": 3.5896681096825446e-06, - "loss": 0.2692, - "step": 582 - }, - { - "epoch": 3.5766871165644174, - "grad_norm": 3.0637521743774414, - "learning_rate": 3.5853294813361614e-06, - "loss": 0.3658, - "step": 583 - }, - { - "epoch": 3.5828220858895703, - "grad_norm": 2.8949790000915527, - "learning_rate": 3.5809868213102623e-06, - "loss": 0.1661, - "step": 584 - }, - { - "epoch": 3.588957055214724, - "grad_norm": 3.163419246673584, - "learning_rate": 3.5766401457365485e-06, - "loss": 0.1233, - "step": 585 - }, - { - "epoch": 3.5950920245398774, - "grad_norm": 3.1787965297698975, - "learning_rate": 3.5722894707616417e-06, - "loss": 0.278, - "step": 586 - }, - { - "epoch": 3.6012269938650308, - "grad_norm": 2.9397857189178467, - "learning_rate": 3.5679348125470175e-06, - "loss": 0.1541, - "step": 587 - }, - { - "epoch": 3.607361963190184, - "grad_norm": 3.2690396308898926, - "learning_rate": 3.56357618726895e-06, - "loss": 0.1575, - "step": 588 - }, - { - "epoch": 3.6134969325153374, - "grad_norm": 5.444014072418213, - "learning_rate": 3.5592136111184483e-06, - "loss": 0.8079, - "step": 589 - }, - { - "epoch": 3.6196319018404908, - "grad_norm": 3.1688313484191895, - "learning_rate": 3.554847100301199e-06, - "loss": 0.341, - "step": 590 - }, - { - "epoch": 3.625766871165644, - "grad_norm": 2.469212532043457, - "learning_rate": 3.550476671037505e-06, - "loss": 0.1625, - "step": 591 - }, - { - "epoch": 3.6319018404907975, - "grad_norm": 3.3956527709960938, - "learning_rate": 3.546102339562223e-06, - "loss": 0.199, - "step": 592 - }, - { - "epoch": 3.638036809815951, - "grad_norm": 2.7287702560424805, - "learning_rate": 3.5417241221247078e-06, - "loss": 0.1493, - "step": 593 - }, - { - "epoch": 3.644171779141104, - "grad_norm": 3.5046865940093994, - "learning_rate": 3.5373420349887477e-06, - "loss": 0.2765, - "step": 594 - }, - { - "epoch": 3.6503067484662575, - "grad_norm": 3.121476650238037, - "learning_rate": 3.5329560944325065e-06, - "loss": 0.2833, - "step": 595 - }, - { - "epoch": 3.6564417177914113, - "grad_norm": 3.276463270187378, - "learning_rate": 3.528566316748462e-06, - "loss": 0.1237, - "step": 596 - }, - { - "epoch": 3.662576687116564, - "grad_norm": 3.382840633392334, - "learning_rate": 3.524172718243347e-06, - "loss": 0.1599, - "step": 597 - }, - { - "epoch": 3.668711656441718, - "grad_norm": 4.801311492919922, - "learning_rate": 3.5197753152380854e-06, - "loss": 0.2997, - "step": 598 - }, - { - "epoch": 3.6748466257668713, - "grad_norm": 4.117336273193359, - "learning_rate": 3.515374124067736e-06, - "loss": 0.2021, - "step": 599 - }, - { - "epoch": 3.6809815950920246, - "grad_norm": 3.611438035964966, - "learning_rate": 3.5109691610814263e-06, - "loss": 0.1726, - "step": 600 - }, - { - "epoch": 3.687116564417178, - "grad_norm": 4.5179972648620605, - "learning_rate": 3.5065604426422995e-06, - "loss": 0.1377, - "step": 601 - }, - { - "epoch": 3.6932515337423313, - "grad_norm": 3.561061382293701, - "learning_rate": 3.502147985127445e-06, - "loss": 0.1497, - "step": 602 - }, - { - "epoch": 3.6993865030674846, - "grad_norm": 3.3497917652130127, - "learning_rate": 3.4977318049278443e-06, - "loss": 0.1589, - "step": 603 - }, - { - "epoch": 3.705521472392638, - "grad_norm": 3.2725470066070557, - "learning_rate": 3.4933119184483065e-06, - "loss": 0.1364, - "step": 604 - }, - { - "epoch": 3.7116564417177913, - "grad_norm": 3.228956460952759, - "learning_rate": 3.4888883421074076e-06, - "loss": 0.177, - "step": 605 - }, - { - "epoch": 3.7177914110429446, - "grad_norm": 3.7648911476135254, - "learning_rate": 3.484461092337434e-06, - "loss": 0.122, - "step": 606 - }, - { - "epoch": 3.7239263803680984, - "grad_norm": 3.5322585105895996, - "learning_rate": 3.4800301855843137e-06, - "loss": 0.2664, - "step": 607 - }, - { - "epoch": 3.7300613496932513, - "grad_norm": 2.951073169708252, - "learning_rate": 3.4755956383075613e-06, - "loss": 0.12, - "step": 608 - }, - { - "epoch": 3.736196319018405, - "grad_norm": 3.0577664375305176, - "learning_rate": 3.471157466980214e-06, - "loss": 0.3926, - "step": 609 - }, - { - "epoch": 3.7423312883435584, - "grad_norm": 4.089846134185791, - "learning_rate": 3.466715688088772e-06, - "loss": 0.6233, - "step": 610 - }, - { - "epoch": 3.7484662576687118, - "grad_norm": 3.081340789794922, - "learning_rate": 3.462270318133136e-06, - "loss": 0.2456, - "step": 611 - }, - { - "epoch": 3.754601226993865, - "grad_norm": 3.034712553024292, - "learning_rate": 3.4578213736265474e-06, - "loss": 0.2683, - "step": 612 - }, - { - "epoch": 3.7607361963190185, - "grad_norm": 3.459815740585327, - "learning_rate": 3.4533688710955255e-06, - "loss": 0.3796, - "step": 613 - }, - { - "epoch": 3.766871165644172, - "grad_norm": 3.523737907409668, - "learning_rate": 3.448912827079805e-06, - "loss": 0.3326, - "step": 614 - }, - { - "epoch": 3.773006134969325, - "grad_norm": 3.333219289779663, - "learning_rate": 3.4444532581322793e-06, - "loss": 0.206, - "step": 615 - }, - { - "epoch": 3.7791411042944785, - "grad_norm": 3.582387685775757, - "learning_rate": 3.4399901808189327e-06, - "loss": 0.244, - "step": 616 - }, - { - "epoch": 3.785276073619632, - "grad_norm": 3.4887266159057617, - "learning_rate": 3.435523611718785e-06, - "loss": 0.1796, - "step": 617 - }, - { - "epoch": 3.791411042944785, - "grad_norm": 4.89408016204834, - "learning_rate": 3.4310535674238242e-06, - "loss": 0.188, - "step": 618 - }, - { - "epoch": 3.7975460122699385, - "grad_norm": 4.338910102844238, - "learning_rate": 3.42658006453895e-06, - "loss": 0.3039, - "step": 619 - }, - { - "epoch": 3.8036809815950923, - "grad_norm": 4.107708930969238, - "learning_rate": 3.4221031196819083e-06, - "loss": 0.3383, - "step": 620 - }, - { - "epoch": 3.809815950920245, - "grad_norm": 3.698777675628662, - "learning_rate": 3.4176227494832305e-06, - "loss": 0.1721, - "step": 621 - }, - { - "epoch": 3.815950920245399, - "grad_norm": 2.6659226417541504, - "learning_rate": 3.413138970586174e-06, - "loss": 0.2211, - "step": 622 - }, - { - "epoch": 3.8220858895705523, - "grad_norm": 3.2398436069488525, - "learning_rate": 3.4086517996466574e-06, - "loss": 0.1871, - "step": 623 - }, - { - "epoch": 3.8282208588957056, - "grad_norm": 4.9128804206848145, - "learning_rate": 3.404161253333199e-06, - "loss": 0.3874, - "step": 624 - }, - { - "epoch": 3.834355828220859, - "grad_norm": 3.508789300918579, - "learning_rate": 3.3996673483268573e-06, - "loss": 0.1739, - "step": 625 - }, - { - "epoch": 3.8404907975460123, - "grad_norm": 3.3016927242279053, - "learning_rate": 3.3951701013211665e-06, - "loss": 0.274, - "step": 626 - }, - { - "epoch": 3.8466257668711656, - "grad_norm": 3.8941333293914795, - "learning_rate": 3.3906695290220736e-06, - "loss": 0.3568, - "step": 627 - }, - { - "epoch": 3.852760736196319, - "grad_norm": 3.512354850769043, - "learning_rate": 3.3861656481478816e-06, - "loss": 0.157, - "step": 628 - }, - { - "epoch": 3.8588957055214723, - "grad_norm": 3.482649326324463, - "learning_rate": 3.3816584754291814e-06, - "loss": 0.1218, - "step": 629 - }, - { - "epoch": 3.8650306748466257, - "grad_norm": 3.1490275859832764, - "learning_rate": 3.377148027608793e-06, - "loss": 0.2234, - "step": 630 - }, - { - "epoch": 3.871165644171779, - "grad_norm": 3.2172653675079346, - "learning_rate": 3.3726343214417023e-06, - "loss": 0.3329, - "step": 631 - }, - { - "epoch": 3.8773006134969323, - "grad_norm": 4.167707443237305, - "learning_rate": 3.3681173736949984e-06, - "loss": 0.1384, - "step": 632 - }, - { - "epoch": 3.883435582822086, - "grad_norm": 3.4743919372558594, - "learning_rate": 3.3635972011478134e-06, - "loss": 0.3807, - "step": 633 - }, - { - "epoch": 3.889570552147239, - "grad_norm": 3.6892173290252686, - "learning_rate": 3.3590738205912566e-06, - "loss": 0.194, - "step": 634 - }, - { - "epoch": 3.895705521472393, - "grad_norm": 3.262967824935913, - "learning_rate": 3.354547248828356e-06, - "loss": 0.202, - "step": 635 - }, - { - "epoch": 3.901840490797546, - "grad_norm": 3.8871562480926514, - "learning_rate": 3.3500175026739916e-06, - "loss": 0.2471, - "step": 636 - }, - { - "epoch": 3.9079754601226995, - "grad_norm": 3.5097084045410156, - "learning_rate": 3.3454845989548385e-06, - "loss": 0.1112, - "step": 637 - }, - { - "epoch": 3.914110429447853, - "grad_norm": 4.163944721221924, - "learning_rate": 3.3409485545092995e-06, - "loss": 0.3368, - "step": 638 - }, - { - "epoch": 3.920245398773006, - "grad_norm": 3.6405045986175537, - "learning_rate": 3.336409386187444e-06, - "loss": 0.1863, - "step": 639 - }, - { - "epoch": 3.9263803680981595, - "grad_norm": 3.2477526664733887, - "learning_rate": 3.331867110850946e-06, - "loss": 0.1491, - "step": 640 - }, - { - "epoch": 3.932515337423313, - "grad_norm": 3.933753490447998, - "learning_rate": 3.327321745373021e-06, - "loss": 0.2484, - "step": 641 - }, - { - "epoch": 3.938650306748466, - "grad_norm": 3.2475059032440186, - "learning_rate": 3.322773306638364e-06, - "loss": 0.2126, - "step": 642 - }, - { - "epoch": 3.9447852760736195, - "grad_norm": 2.628467321395874, - "learning_rate": 3.318221811543086e-06, - "loss": 0.1649, - "step": 643 - }, - { - "epoch": 3.950920245398773, - "grad_norm": 3.2612411975860596, - "learning_rate": 3.313667276994651e-06, - "loss": 0.1442, - "step": 644 - }, - { - "epoch": 3.957055214723926, - "grad_norm": 3.8058395385742188, - "learning_rate": 3.309109719911814e-06, - "loss": 0.359, - "step": 645 - }, - { - "epoch": 3.96319018404908, - "grad_norm": 3.3450071811676025, - "learning_rate": 3.304549157224558e-06, - "loss": 0.4042, - "step": 646 - }, - { - "epoch": 3.969325153374233, - "grad_norm": 3.079601287841797, - "learning_rate": 3.299985605874031e-06, - "loss": 0.1699, - "step": 647 - }, - { - "epoch": 3.9754601226993866, - "grad_norm": 3.8963980674743652, - "learning_rate": 3.295419082812483e-06, - "loss": 0.1888, - "step": 648 - }, - { - "epoch": 3.98159509202454, - "grad_norm": 3.307405948638916, - "learning_rate": 3.2908496050032024e-06, - "loss": 0.2824, - "step": 649 - }, - { - "epoch": 3.9877300613496933, - "grad_norm": 3.227478265762329, - "learning_rate": 3.2862771894204544e-06, - "loss": 0.3038, - "step": 650 - }, - { - "epoch": 3.9938650306748467, - "grad_norm": 4.046506881713867, - "learning_rate": 3.2817018530494164e-06, - "loss": 0.3266, - "step": 651 - }, - { - "epoch": 4.0, - "grad_norm": 7.775874614715576, - "learning_rate": 3.277123612886116e-06, - "loss": 0.2998, - "step": 652 - }, - { - "epoch": 4.006134969325154, - "grad_norm": 3.146462917327881, - "learning_rate": 3.272542485937369e-06, - "loss": 0.2764, - "step": 653 - }, - { - "epoch": 4.012269938650307, - "grad_norm": 3.0539863109588623, - "learning_rate": 3.2679584892207118e-06, - "loss": 0.1157, - "step": 654 - }, - { - "epoch": 4.0184049079754605, - "grad_norm": 3.634021520614624, - "learning_rate": 3.263371639764343e-06, - "loss": 0.0707, - "step": 655 - }, - { - "epoch": 4.024539877300613, - "grad_norm": 3.3474650382995605, - "learning_rate": 3.2587819546070596e-06, - "loss": 0.1067, - "step": 656 - }, - { - "epoch": 4.030674846625767, - "grad_norm": 4.409244537353516, - "learning_rate": 3.254189450798189e-06, - "loss": 0.0564, - "step": 657 - }, - { - "epoch": 4.03680981595092, - "grad_norm": 3.0446252822875977, - "learning_rate": 3.2495941453975312e-06, - "loss": 0.0535, - "step": 658 - }, - { - "epoch": 4.042944785276074, - "grad_norm": 4.014753818511963, - "learning_rate": 3.2449960554752935e-06, - "loss": 0.1245, - "step": 659 - }, - { - "epoch": 4.049079754601227, - "grad_norm": 3.188062906265259, - "learning_rate": 3.240395198112026e-06, - "loss": 0.0626, - "step": 660 - }, - { - "epoch": 4.0552147239263805, - "grad_norm": 3.006086826324463, - "learning_rate": 3.2357915903985605e-06, - "loss": 0.1198, - "step": 661 - }, - { - "epoch": 4.061349693251533, - "grad_norm": 2.8865551948547363, - "learning_rate": 3.2311852494359423e-06, - "loss": 0.0454, - "step": 662 - }, - { - "epoch": 4.067484662576687, - "grad_norm": 4.2888007164001465, - "learning_rate": 3.226576192335373e-06, - "loss": 0.2064, - "step": 663 - }, - { - "epoch": 4.07361963190184, - "grad_norm": 3.1414525508880615, - "learning_rate": 3.2219644362181436e-06, - "loss": 0.2183, - "step": 664 - }, - { - "epoch": 4.079754601226994, - "grad_norm": 2.556277275085449, - "learning_rate": 3.21734999821557e-06, - "loss": 0.0516, - "step": 665 - }, - { - "epoch": 4.085889570552148, - "grad_norm": 2.698118209838867, - "learning_rate": 3.2127328954689307e-06, - "loss": 0.0613, - "step": 666 - }, - { - "epoch": 4.0920245398773005, - "grad_norm": 2.869919538497925, - "learning_rate": 3.2081131451294025e-06, - "loss": 0.0583, - "step": 667 - }, - { - "epoch": 4.098159509202454, - "grad_norm": 3.8786919116973877, - "learning_rate": 3.2034907643579988e-06, - "loss": 0.0766, - "step": 668 - }, - { - "epoch": 4.104294478527607, - "grad_norm": 4.224637031555176, - "learning_rate": 3.1988657703255043e-06, - "loss": 0.1099, - "step": 669 - }, - { - "epoch": 4.110429447852761, - "grad_norm": 4.671669006347656, - "learning_rate": 3.194238180212409e-06, - "loss": 0.1663, - "step": 670 - }, - { - "epoch": 4.116564417177914, - "grad_norm": 3.2484257221221924, - "learning_rate": 3.1896080112088477e-06, - "loss": 0.0587, - "step": 671 - }, - { - "epoch": 4.122699386503068, - "grad_norm": 2.4808075428009033, - "learning_rate": 3.184975280514536e-06, - "loss": 0.0579, - "step": 672 - }, - { - "epoch": 4.128834355828221, - "grad_norm": 3.7106919288635254, - "learning_rate": 3.1803400053387044e-06, - "loss": 0.1083, - "step": 673 - }, - { - "epoch": 4.134969325153374, - "grad_norm": 3.008970260620117, - "learning_rate": 3.175702202900036e-06, - "loss": 0.1355, - "step": 674 - }, - { - "epoch": 4.141104294478527, - "grad_norm": 3.2640793323516846, - "learning_rate": 3.1710618904266006e-06, - "loss": 0.092, - "step": 675 - }, - { - "epoch": 4.147239263803681, - "grad_norm": 3.08042049407959, - "learning_rate": 3.166419085155793e-06, - "loss": 0.0563, - "step": 676 - }, - { - "epoch": 4.153374233128835, - "grad_norm": 2.993530511856079, - "learning_rate": 3.1617738043342695e-06, - "loss": 0.1773, - "step": 677 - }, - { - "epoch": 4.159509202453988, - "grad_norm": 2.6218204498291016, - "learning_rate": 3.157126065217879e-06, - "loss": 0.0489, - "step": 678 - }, - { - "epoch": 4.1656441717791415, - "grad_norm": 4.3173723220825195, - "learning_rate": 3.152475885071606e-06, - "loss": 0.1333, - "step": 679 - }, - { - "epoch": 4.171779141104294, - "grad_norm": 3.659149408340454, - "learning_rate": 3.147823281169498e-06, - "loss": 0.1501, - "step": 680 - }, - { - "epoch": 4.177914110429448, - "grad_norm": 3.0953338146209717, - "learning_rate": 3.143168270794612e-06, - "loss": 0.1067, - "step": 681 - }, - { - "epoch": 4.184049079754601, - "grad_norm": 3.5693907737731934, - "learning_rate": 3.1385108712389394e-06, - "loss": 0.2499, - "step": 682 - }, - { - "epoch": 4.190184049079755, - "grad_norm": 3.3022868633270264, - "learning_rate": 3.1338510998033484e-06, - "loss": 0.1748, - "step": 683 - }, - { - "epoch": 4.196319018404908, - "grad_norm": 3.7468113899230957, - "learning_rate": 3.129188973797519e-06, - "loss": 0.201, - "step": 684 - }, - { - "epoch": 4.2024539877300615, - "grad_norm": 2.8381078243255615, - "learning_rate": 3.124524510539875e-06, - "loss": 0.0735, - "step": 685 - }, - { - "epoch": 4.208588957055214, - "grad_norm": 2.84706974029541, - "learning_rate": 3.119857727357527e-06, - "loss": 0.1806, - "step": 686 - }, - { - "epoch": 4.214723926380368, - "grad_norm": 3.8130292892456055, - "learning_rate": 3.1151886415861993e-06, - "loss": 0.1811, - "step": 687 - }, - { - "epoch": 4.220858895705521, - "grad_norm": 3.528895378112793, - "learning_rate": 3.1105172705701708e-06, - "loss": 0.1634, - "step": 688 - }, - { - "epoch": 4.226993865030675, - "grad_norm": 5.028727054595947, - "learning_rate": 3.1058436316622103e-06, - "loss": 0.1625, - "step": 689 - }, - { - "epoch": 4.233128834355828, - "grad_norm": 4.606889247894287, - "learning_rate": 3.1011677422235093e-06, - "loss": 0.1791, - "step": 690 - }, - { - "epoch": 4.2392638036809815, - "grad_norm": 3.3620636463165283, - "learning_rate": 3.0964896196236217e-06, - "loss": 0.2233, - "step": 691 - }, - { - "epoch": 4.245398773006135, - "grad_norm": 3.7845852375030518, - "learning_rate": 3.0918092812403954e-06, - "loss": 0.1142, - "step": 692 - }, - { - "epoch": 4.251533742331288, - "grad_norm": 3.1204118728637695, - "learning_rate": 3.0871267444599098e-06, - "loss": 0.096, - "step": 693 - }, - { - "epoch": 4.257668711656442, - "grad_norm": 3.686067819595337, - "learning_rate": 3.0824420266764093e-06, - "loss": 0.2749, - "step": 694 - }, - { - "epoch": 4.263803680981595, - "grad_norm": 3.1680829524993896, - "learning_rate": 3.077755145292243e-06, - "loss": 0.2504, - "step": 695 - }, - { - "epoch": 4.269938650306749, - "grad_norm": 3.3179469108581543, - "learning_rate": 3.0730661177177957e-06, - "loss": 0.1324, - "step": 696 - }, - { - "epoch": 4.276073619631902, - "grad_norm": 3.1186370849609375, - "learning_rate": 3.0683749613714238e-06, - "loss": 0.0691, - "step": 697 - }, - { - "epoch": 4.282208588957055, - "grad_norm": 3.086834192276001, - "learning_rate": 3.063681693679391e-06, - "loss": 0.1026, - "step": 698 - }, - { - "epoch": 4.288343558282208, - "grad_norm": 4.629584312438965, - "learning_rate": 3.0589863320758063e-06, - "loss": 0.2646, - "step": 699 - }, - { - "epoch": 4.294478527607362, - "grad_norm": 3.9641213417053223, - "learning_rate": 3.0542888940025562e-06, - "loss": 0.1711, - "step": 700 - }, - { - "epoch": 4.300613496932515, - "grad_norm": 3.75014328956604, - "learning_rate": 3.0495893969092395e-06, - "loss": 0.0589, - "step": 701 - }, - { - "epoch": 4.306748466257669, - "grad_norm": 3.603290319442749, - "learning_rate": 3.044887858253105e-06, - "loss": 0.2244, - "step": 702 - }, - { - "epoch": 4.3128834355828225, - "grad_norm": 3.79404616355896, - "learning_rate": 3.040184295498984e-06, - "loss": 0.1506, - "step": 703 - }, - { - "epoch": 4.319018404907975, - "grad_norm": 3.0890021324157715, - "learning_rate": 3.035478726119228e-06, - "loss": 0.2343, - "step": 704 - }, - { - "epoch": 4.325153374233129, - "grad_norm": 3.6688191890716553, - "learning_rate": 3.0307711675936426e-06, - "loss": 0.0518, - "step": 705 - }, - { - "epoch": 4.331288343558282, - "grad_norm": 5.1836700439453125, - "learning_rate": 3.0260616374094208e-06, - "loss": 0.2363, - "step": 706 - }, - { - "epoch": 4.337423312883436, - "grad_norm": 2.7123284339904785, - "learning_rate": 3.0213501530610807e-06, - "loss": 0.0848, - "step": 707 - }, - { - "epoch": 4.343558282208589, - "grad_norm": 3.5661890506744385, - "learning_rate": 3.0166367320504005e-06, - "loss": 0.149, - "step": 708 - }, - { - "epoch": 4.3496932515337425, - "grad_norm": 3.6454737186431885, - "learning_rate": 3.0119213918863515e-06, - "loss": 0.1133, - "step": 709 - }, - { - "epoch": 4.355828220858895, - "grad_norm": 3.7534968852996826, - "learning_rate": 3.0072041500850343e-06, - "loss": 0.1358, - "step": 710 - }, - { - "epoch": 4.361963190184049, - "grad_norm": 3.40387225151062, - "learning_rate": 3.0024850241696128e-06, - "loss": 0.0706, - "step": 711 - }, - { - "epoch": 4.368098159509202, - "grad_norm": 3.250471591949463, - "learning_rate": 2.9977640316702512e-06, - "loss": 0.1977, - "step": 712 - }, - { - "epoch": 4.374233128834356, - "grad_norm": 3.417781352996826, - "learning_rate": 2.993041190124047e-06, - "loss": 0.2622, - "step": 713 - }, - { - "epoch": 4.38036809815951, - "grad_norm": 2.628434181213379, - "learning_rate": 2.9883165170749657e-06, - "loss": 0.1487, - "step": 714 - }, - { - "epoch": 4.386503067484663, - "grad_norm": 3.240264892578125, - "learning_rate": 2.9835900300737763e-06, - "loss": 0.0822, - "step": 715 - }, - { - "epoch": 4.392638036809816, - "grad_norm": 6.575517177581787, - "learning_rate": 2.9788617466779884e-06, - "loss": 0.3668, - "step": 716 - }, - { - "epoch": 4.398773006134969, - "grad_norm": 4.699089050292969, - "learning_rate": 2.974131684451781e-06, - "loss": 0.2432, - "step": 717 - }, - { - "epoch": 4.404907975460123, - "grad_norm": 2.9815752506256104, - "learning_rate": 2.9693998609659443e-06, - "loss": 0.0689, - "step": 718 - }, - { - "epoch": 4.411042944785276, - "grad_norm": 4.192755222320557, - "learning_rate": 2.9646662937978082e-06, - "loss": 0.1897, - "step": 719 - }, - { - "epoch": 4.41717791411043, - "grad_norm": 2.9729068279266357, - "learning_rate": 2.9599310005311824e-06, - "loss": 0.0457, - "step": 720 - }, - { - "epoch": 4.423312883435583, - "grad_norm": 4.234438896179199, - "learning_rate": 2.9551939987562866e-06, - "loss": 0.2307, - "step": 721 - }, - { - "epoch": 4.429447852760736, - "grad_norm": 3.3982434272766113, - "learning_rate": 2.950455306069688e-06, - "loss": 0.0637, - "step": 722 - }, - { - "epoch": 4.435582822085889, - "grad_norm": 4.539764404296875, - "learning_rate": 2.9457149400742357e-06, - "loss": 0.1924, - "step": 723 - }, - { - "epoch": 4.441717791411043, - "grad_norm": 4.039684772491455, - "learning_rate": 2.940972918378993e-06, - "loss": 0.1275, - "step": 724 - }, - { - "epoch": 4.447852760736196, - "grad_norm": 4.340360641479492, - "learning_rate": 2.936229258599174e-06, - "loss": 0.123, - "step": 725 - }, - { - "epoch": 4.45398773006135, - "grad_norm": 2.8720109462738037, - "learning_rate": 2.93148397835608e-06, - "loss": 0.0555, - "step": 726 - }, - { - "epoch": 4.460122699386503, - "grad_norm": 4.227811336517334, - "learning_rate": 2.926737095277029e-06, - "loss": 0.0991, - "step": 727 - }, - { - "epoch": 4.466257668711656, - "grad_norm": 2.8079142570495605, - "learning_rate": 2.921988626995295e-06, - "loss": 0.0628, - "step": 728 - }, - { - "epoch": 4.47239263803681, - "grad_norm": 4.195122241973877, - "learning_rate": 2.9172385911500385e-06, - "loss": 0.2333, - "step": 729 - }, - { - "epoch": 4.478527607361963, - "grad_norm": 3.223794460296631, - "learning_rate": 2.9124870053862447e-06, - "loss": 0.1317, - "step": 730 - }, - { - "epoch": 4.484662576687117, - "grad_norm": 3.5533759593963623, - "learning_rate": 2.907733887354657e-06, - "loss": 0.2285, - "step": 731 - }, - { - "epoch": 4.49079754601227, - "grad_norm": 3.535673141479492, - "learning_rate": 2.9029792547117088e-06, - "loss": 0.096, - "step": 732 - }, - { - "epoch": 4.4969325153374236, - "grad_norm": 4.031703948974609, - "learning_rate": 2.898223125119461e-06, - "loss": 0.1505, - "step": 733 - }, - { - "epoch": 4.5030674846625764, - "grad_norm": 2.823413610458374, - "learning_rate": 2.893465516245534e-06, - "loss": 0.0327, - "step": 734 - }, - { - "epoch": 4.50920245398773, - "grad_norm": 3.516738176345825, - "learning_rate": 2.8887064457630453e-06, - "loss": 0.0743, - "step": 735 - }, - { - "epoch": 4.515337423312883, - "grad_norm": 3.5523500442504883, - "learning_rate": 2.8839459313505407e-06, - "loss": 0.1768, - "step": 736 - }, - { - "epoch": 4.521472392638037, - "grad_norm": 3.2433223724365234, - "learning_rate": 2.879183990691929e-06, - "loss": 0.1598, - "step": 737 - }, - { - "epoch": 4.52760736196319, - "grad_norm": 3.0156848430633545, - "learning_rate": 2.8744206414764185e-06, - "loss": 0.0829, - "step": 738 - }, - { - "epoch": 4.533742331288344, - "grad_norm": 4.359529495239258, - "learning_rate": 2.8696559013984488e-06, - "loss": 0.1169, - "step": 739 - }, - { - "epoch": 4.539877300613497, - "grad_norm": 2.3862433433532715, - "learning_rate": 2.8648897881576274e-06, - "loss": 0.0962, - "step": 740 - }, - { - "epoch": 4.54601226993865, - "grad_norm": 2.7100136280059814, - "learning_rate": 2.8601223194586613e-06, - "loss": 0.1204, - "step": 741 - }, - { - "epoch": 4.552147239263804, - "grad_norm": 3.8116140365600586, - "learning_rate": 2.8553535130112935e-06, - "loss": 0.0685, - "step": 742 - }, - { - "epoch": 4.558282208588957, - "grad_norm": 2.9640142917633057, - "learning_rate": 2.850583386530235e-06, - "loss": 0.0692, - "step": 743 - }, - { - "epoch": 4.564417177914111, - "grad_norm": 3.264592170715332, - "learning_rate": 2.8458119577351035e-06, - "loss": 0.2128, - "step": 744 - }, - { - "epoch": 4.570552147239264, - "grad_norm": 3.230497360229492, - "learning_rate": 2.841039244350351e-06, - "loss": 0.2409, - "step": 745 - }, - { - "epoch": 4.576687116564417, - "grad_norm": 4.41513204574585, - "learning_rate": 2.8362652641052024e-06, - "loss": 0.1878, - "step": 746 - }, - { - "epoch": 4.58282208588957, - "grad_norm": 3.047248601913452, - "learning_rate": 2.83149003473359e-06, - "loss": 0.1303, - "step": 747 - }, - { - "epoch": 4.588957055214724, - "grad_norm": 2.399754047393799, - "learning_rate": 2.8267135739740836e-06, - "loss": 0.0577, - "step": 748 - }, - { - "epoch": 4.595092024539877, - "grad_norm": 4.608038425445557, - "learning_rate": 2.8219358995698307e-06, - "loss": 0.2329, - "step": 749 - }, - { - "epoch": 4.601226993865031, - "grad_norm": 3.537644147872925, - "learning_rate": 2.8171570292684846e-06, - "loss": 0.1329, - "step": 750 - }, - { - "epoch": 4.6073619631901845, - "grad_norm": 2.8099827766418457, - "learning_rate": 2.8123769808221407e-06, - "loss": 0.1512, - "step": 751 - }, - { - "epoch": 4.613496932515337, - "grad_norm": 3.3169758319854736, - "learning_rate": 2.8075957719872724e-06, - "loss": 0.1267, - "step": 752 - }, - { - "epoch": 4.61963190184049, - "grad_norm": 3.578435182571411, - "learning_rate": 2.8028134205246633e-06, - "loss": 0.147, - "step": 753 - }, - { - "epoch": 4.625766871165644, - "grad_norm": 3.544437885284424, - "learning_rate": 2.7980299441993415e-06, - "loss": 0.0947, - "step": 754 - }, - { - "epoch": 4.631901840490798, - "grad_norm": 3.798776388168335, - "learning_rate": 2.793245360780512e-06, - "loss": 0.1498, - "step": 755 - }, - { - "epoch": 4.638036809815951, - "grad_norm": 3.634991407394409, - "learning_rate": 2.788459688041495e-06, - "loss": 0.2504, - "step": 756 - }, - { - "epoch": 4.644171779141105, - "grad_norm": 20.123680114746094, - "learning_rate": 2.783672943759655e-06, - "loss": 0.2091, - "step": 757 - }, - { - "epoch": 4.6503067484662575, - "grad_norm": 3.9357221126556396, - "learning_rate": 2.778885145716339e-06, - "loss": 0.2045, - "step": 758 - }, - { - "epoch": 4.656441717791411, - "grad_norm": 3.3035309314727783, - "learning_rate": 2.7740963116968063e-06, - "loss": 0.1416, - "step": 759 - }, - { - "epoch": 4.662576687116564, - "grad_norm": 3.096985101699829, - "learning_rate": 2.7693064594901646e-06, - "loss": 0.0455, - "step": 760 - }, - { - "epoch": 4.668711656441718, - "grad_norm": 2.9855458736419678, - "learning_rate": 2.7645156068893075e-06, - "loss": 0.1496, - "step": 761 - }, - { - "epoch": 4.674846625766871, - "grad_norm": 3.9140093326568604, - "learning_rate": 2.759723771690839e-06, - "loss": 0.2061, - "step": 762 - }, - { - "epoch": 4.680981595092025, - "grad_norm": 3.590569496154785, - "learning_rate": 2.754930971695019e-06, - "loss": 0.1017, - "step": 763 - }, - { - "epoch": 4.6871165644171775, - "grad_norm": 3.527254581451416, - "learning_rate": 2.750137224705687e-06, - "loss": 0.1979, - "step": 764 - }, - { - "epoch": 4.693251533742331, - "grad_norm": 4.198459148406982, - "learning_rate": 2.745342548530202e-06, - "loss": 0.1667, - "step": 765 - }, - { - "epoch": 4.699386503067485, - "grad_norm": 2.0246167182922363, - "learning_rate": 2.7405469609793746e-06, - "loss": 0.0346, - "step": 766 - }, - { - "epoch": 4.705521472392638, - "grad_norm": 3.2045300006866455, - "learning_rate": 2.7357504798674004e-06, - "loss": 0.0596, - "step": 767 - }, - { - "epoch": 4.711656441717792, - "grad_norm": 2.736985921859741, - "learning_rate": 2.730953123011796e-06, - "loss": 0.0384, - "step": 768 - }, - { - "epoch": 4.717791411042945, - "grad_norm": 3.0621395111083984, - "learning_rate": 2.726154908233328e-06, - "loss": 0.0558, - "step": 769 - }, - { - "epoch": 4.723926380368098, - "grad_norm": 3.2280497550964355, - "learning_rate": 2.721355853355953e-06, - "loss": 0.2272, - "step": 770 - }, - { - "epoch": 4.730061349693251, - "grad_norm": 3.342226028442383, - "learning_rate": 2.716555976206748e-06, - "loss": 0.074, - "step": 771 - }, - { - "epoch": 4.736196319018405, - "grad_norm": 4.328624248504639, - "learning_rate": 2.7117552946158415e-06, - "loss": 0.1034, - "step": 772 - }, - { - "epoch": 4.742331288343558, - "grad_norm": 2.980215311050415, - "learning_rate": 2.706953826416353e-06, - "loss": 0.1199, - "step": 773 - }, - { - "epoch": 4.748466257668712, - "grad_norm": 2.622478485107422, - "learning_rate": 2.702151589444324e-06, - "loss": 0.0467, - "step": 774 - }, - { - "epoch": 4.754601226993865, - "grad_norm": 2.9958693981170654, - "learning_rate": 2.6973486015386507e-06, - "loss": 0.143, - "step": 775 - }, - { - "epoch": 4.7607361963190185, - "grad_norm": 4.548511505126953, - "learning_rate": 2.6925448805410197e-06, - "loss": 0.3594, - "step": 776 - }, - { - "epoch": 4.766871165644172, - "grad_norm": 3.3429481983184814, - "learning_rate": 2.6877404442958393e-06, - "loss": 0.1397, - "step": 777 - }, - { - "epoch": 4.773006134969325, - "grad_norm": 2.5820136070251465, - "learning_rate": 2.682935310650177e-06, - "loss": 0.054, - "step": 778 - }, - { - "epoch": 4.779141104294479, - "grad_norm": 4.047626495361328, - "learning_rate": 2.6781294974536886e-06, - "loss": 0.1284, - "step": 779 - }, - { - "epoch": 4.785276073619632, - "grad_norm": 3.0227510929107666, - "learning_rate": 2.673323022558557e-06, - "loss": 0.1441, - "step": 780 - }, - { - "epoch": 4.791411042944786, - "grad_norm": 4.731313705444336, - "learning_rate": 2.6685159038194202e-06, - "loss": 0.2859, - "step": 781 - }, - { - "epoch": 4.7975460122699385, - "grad_norm": 3.880655288696289, - "learning_rate": 2.6637081590933096e-06, - "loss": 0.1524, - "step": 782 - }, - { - "epoch": 4.803680981595092, - "grad_norm": 2.375474452972412, - "learning_rate": 2.6588998062395803e-06, - "loss": 0.0338, - "step": 783 - }, - { - "epoch": 4.809815950920245, - "grad_norm": 3.3587446212768555, - "learning_rate": 2.6540908631198498e-06, - "loss": 0.0755, - "step": 784 - }, - { - "epoch": 4.815950920245399, - "grad_norm": 2.767686367034912, - "learning_rate": 2.6492813475979243e-06, - "loss": 0.0631, - "step": 785 - }, - { - "epoch": 4.822085889570552, - "grad_norm": 3.88670015335083, - "learning_rate": 2.6444712775397397e-06, - "loss": 0.0853, - "step": 786 - }, - { - "epoch": 4.828220858895706, - "grad_norm": 3.543276309967041, - "learning_rate": 2.639660670813288e-06, - "loss": 0.1895, - "step": 787 - }, - { - "epoch": 4.8343558282208585, - "grad_norm": 3.659323215484619, - "learning_rate": 2.6348495452885598e-06, - "loss": 0.1745, - "step": 788 - }, - { - "epoch": 4.840490797546012, - "grad_norm": 3.0955021381378174, - "learning_rate": 2.630037918837468e-06, - "loss": 0.0846, - "step": 789 - }, - { - "epoch": 4.846625766871165, - "grad_norm": 3.4473249912261963, - "learning_rate": 2.6252258093337892e-06, - "loss": 0.0808, - "step": 790 - }, - { - "epoch": 4.852760736196319, - "grad_norm": 3.937120199203491, - "learning_rate": 2.6204132346530936e-06, - "loss": 0.2054, - "step": 791 - }, - { - "epoch": 4.858895705521473, - "grad_norm": 4.052806854248047, - "learning_rate": 2.6156002126726788e-06, - "loss": 0.1679, - "step": 792 - }, - { - "epoch": 4.865030674846626, - "grad_norm": 2.6694889068603516, - "learning_rate": 2.6107867612715043e-06, - "loss": 0.0534, - "step": 793 - }, - { - "epoch": 4.871165644171779, - "grad_norm": 3.594649076461792, - "learning_rate": 2.6059728983301267e-06, - "loss": 0.0899, - "step": 794 - }, - { - "epoch": 4.877300613496932, - "grad_norm": 2.7796030044555664, - "learning_rate": 2.601158641730629e-06, - "loss": 0.0596, - "step": 795 - }, - { - "epoch": 4.883435582822086, - "grad_norm": 4.618961334228516, - "learning_rate": 2.5963440093565567e-06, - "loss": 0.3858, - "step": 796 - }, - { - "epoch": 4.889570552147239, - "grad_norm": 3.0783939361572266, - "learning_rate": 2.5915290190928518e-06, - "loss": 0.12, - "step": 797 - }, - { - "epoch": 4.895705521472393, - "grad_norm": 4.078456878662109, - "learning_rate": 2.586713688825786e-06, - "loss": 0.1278, - "step": 798 - }, - { - "epoch": 4.901840490797546, - "grad_norm": 2.9439120292663574, - "learning_rate": 2.5818980364428935e-06, - "loss": 0.0847, - "step": 799 - }, - { - "epoch": 4.9079754601226995, - "grad_norm": 5.140681743621826, - "learning_rate": 2.5770820798329055e-06, - "loss": 0.1718, - "step": 800 - }, - { - "epoch": 4.914110429447852, - "grad_norm": 3.450190305709839, - "learning_rate": 2.572265836885682e-06, - "loss": 0.0895, - "step": 801 - }, - { - "epoch": 4.920245398773006, - "grad_norm": 3.1145224571228027, - "learning_rate": 2.567449325492149e-06, - "loss": 0.0652, - "step": 802 - }, - { - "epoch": 4.92638036809816, - "grad_norm": 2.851768732070923, - "learning_rate": 2.5626325635442283e-06, - "loss": 0.0877, - "step": 803 - }, - { - "epoch": 4.932515337423313, - "grad_norm": 3.3392980098724365, - "learning_rate": 2.5578155689347716e-06, - "loss": 0.2028, - "step": 804 - }, - { - "epoch": 4.938650306748467, - "grad_norm": 3.012439250946045, - "learning_rate": 2.5529983595574964e-06, - "loss": 0.031, - "step": 805 - }, - { - "epoch": 4.9447852760736195, - "grad_norm": 2.7732717990875244, - "learning_rate": 2.548180953306918e-06, - "loss": 0.0415, - "step": 806 - }, - { - "epoch": 4.950920245398773, - "grad_norm": 3.0423903465270996, - "learning_rate": 2.5433633680782817e-06, - "loss": 0.1188, - "step": 807 - }, - { - "epoch": 4.957055214723926, - "grad_norm": 5.056387901306152, - "learning_rate": 2.538545621767498e-06, - "loss": 0.1703, - "step": 808 - }, - { - "epoch": 4.96319018404908, - "grad_norm": 4.052585124969482, - "learning_rate": 2.533727732271077e-06, - "loss": 0.1455, - "step": 809 - }, - { - "epoch": 4.969325153374233, - "grad_norm": 3.4507904052734375, - "learning_rate": 2.5289097174860593e-06, - "loss": 0.0617, - "step": 810 - }, - { - "epoch": 4.975460122699387, - "grad_norm": 2.908266305923462, - "learning_rate": 2.524091595309952e-06, - "loss": 0.1173, - "step": 811 - }, - { - "epoch": 4.9815950920245395, - "grad_norm": 2.5857458114624023, - "learning_rate": 2.519273383640661e-06, - "loss": 0.0538, - "step": 812 - }, - { - "epoch": 4.987730061349693, - "grad_norm": 3.3518428802490234, - "learning_rate": 2.5144551003764227e-06, - "loss": 0.211, - "step": 813 - }, - { - "epoch": 4.993865030674847, - "grad_norm": 3.137981653213501, - "learning_rate": 2.509636763415742e-06, - "loss": 0.0944, - "step": 814 - }, - { - "epoch": 5.0, - "grad_norm": 2.8854241371154785, - "learning_rate": 2.5048183906573227e-06, - "loss": 0.098, - "step": 815 - }, - { - "epoch": 5.006134969325154, - "grad_norm": 3.508527994155884, - "learning_rate": 2.5e-06, - "loss": 0.1102, - "step": 816 - }, - { - "epoch": 5.012269938650307, - "grad_norm": 2.448152542114258, - "learning_rate": 2.495181609342678e-06, - "loss": 0.0712, - "step": 817 - }, - { - "epoch": 5.0184049079754605, - "grad_norm": 3.105818748474121, - "learning_rate": 2.4903632365842587e-06, - "loss": 0.0414, - "step": 818 - }, - { - "epoch": 5.024539877300613, - "grad_norm": 3.8048601150512695, - "learning_rate": 2.4855448996235777e-06, - "loss": 0.0894, - "step": 819 - }, - { - "epoch": 5.030674846625767, - "grad_norm": 3.259834051132202, - "learning_rate": 2.48072661635934e-06, - "loss": 0.0796, - "step": 820 - }, - { - "epoch": 5.03680981595092, - "grad_norm": 2.822364568710327, - "learning_rate": 2.475908404690049e-06, - "loss": 0.0349, - "step": 821 - }, - { - "epoch": 5.042944785276074, - "grad_norm": 4.78808069229126, - "learning_rate": 2.4710902825139415e-06, - "loss": 0.2529, - "step": 822 - }, - { - "epoch": 5.049079754601227, - "grad_norm": 3.5420572757720947, - "learning_rate": 2.466272267728924e-06, - "loss": 0.1405, - "step": 823 - }, - { - "epoch": 5.0552147239263805, - "grad_norm": 2.500713348388672, - "learning_rate": 2.461454378232503e-06, - "loss": 0.0408, - "step": 824 - }, - { - "epoch": 5.061349693251533, - "grad_norm": 3.266291618347168, - "learning_rate": 2.4566366319217196e-06, - "loss": 0.0338, - "step": 825 - }, - { - "epoch": 5.067484662576687, - "grad_norm": 4.071012020111084, - "learning_rate": 2.4518190466930837e-06, - "loss": 0.06, - "step": 826 - }, - { - "epoch": 5.07361963190184, - "grad_norm": 4.3747172355651855, - "learning_rate": 2.4470016404425045e-06, - "loss": 0.1184, - "step": 827 - }, - { - "epoch": 5.079754601226994, - "grad_norm": 3.92030668258667, - "learning_rate": 2.4421844310652296e-06, - "loss": 0.1369, - "step": 828 - }, - { - "epoch": 5.085889570552148, - "grad_norm": 3.3482303619384766, - "learning_rate": 2.437367436455773e-06, - "loss": 0.1166, - "step": 829 - }, - { - "epoch": 5.0920245398773005, - "grad_norm": 3.429368019104004, - "learning_rate": 2.4325506745078524e-06, - "loss": 0.1214, - "step": 830 - }, - { - "epoch": 5.098159509202454, - "grad_norm": 3.4915647506713867, - "learning_rate": 2.427734163114319e-06, - "loss": 0.0454, - "step": 831 - }, - { - "epoch": 5.104294478527607, - "grad_norm": 3.1721251010894775, - "learning_rate": 2.4229179201670954e-06, - "loss": 0.0431, - "step": 832 - }, - { - "epoch": 5.110429447852761, - "grad_norm": 2.552578926086426, - "learning_rate": 2.418101963557107e-06, - "loss": 0.0347, - "step": 833 - }, - { - "epoch": 5.116564417177914, - "grad_norm": 3.518169403076172, - "learning_rate": 2.413286311174214e-06, - "loss": 0.1555, - "step": 834 - }, - { - "epoch": 5.122699386503068, - "grad_norm": 2.4452908039093018, - "learning_rate": 2.4084709809071487e-06, - "loss": 0.035, - "step": 835 - }, - { - "epoch": 5.128834355828221, - "grad_norm": 3.5366528034210205, - "learning_rate": 2.403655990643444e-06, - "loss": 0.0798, - "step": 836 - }, - { - "epoch": 5.134969325153374, - "grad_norm": 2.300065040588379, - "learning_rate": 2.398841358269371e-06, - "loss": 0.0178, - "step": 837 - }, - { - "epoch": 5.141104294478527, - "grad_norm": 2.851393699645996, - "learning_rate": 2.3940271016698733e-06, - "loss": 0.0447, - "step": 838 - }, - { - "epoch": 5.147239263803681, - "grad_norm": 4.085958957672119, - "learning_rate": 2.3892132387284956e-06, - "loss": 0.1626, - "step": 839 - }, - { - "epoch": 5.153374233128835, - "grad_norm": 3.4240522384643555, - "learning_rate": 2.384399787327322e-06, - "loss": 0.0914, - "step": 840 - }, - { - "epoch": 5.159509202453988, - "grad_norm": 4.111586570739746, - "learning_rate": 2.3795867653469072e-06, - "loss": 0.0784, - "step": 841 - }, - { - "epoch": 5.1656441717791415, - "grad_norm": 2.3306312561035156, - "learning_rate": 2.374774190666211e-06, - "loss": 0.0216, - "step": 842 - }, - { - "epoch": 5.171779141104294, - "grad_norm": 2.5006275177001953, - "learning_rate": 2.3699620811625327e-06, - "loss": 0.0516, - "step": 843 - }, - { - "epoch": 5.177914110429448, - "grad_norm": 3.1680967807769775, - "learning_rate": 2.365150454711441e-06, - "loss": 0.0517, - "step": 844 - }, - { - "epoch": 5.184049079754601, - "grad_norm": 1.817044734954834, - "learning_rate": 2.3603393291867122e-06, - "loss": 0.0264, - "step": 845 - }, - { - "epoch": 5.190184049079755, - "grad_norm": 4.445211887359619, - "learning_rate": 2.355528722460261e-06, - "loss": 0.1079, - "step": 846 - }, - { - "epoch": 5.196319018404908, - "grad_norm": 2.918304681777954, - "learning_rate": 2.350718652402076e-06, - "loss": 0.0633, - "step": 847 - }, - { - "epoch": 5.2024539877300615, - "grad_norm": 3.6307432651519775, - "learning_rate": 2.345909136880151e-06, - "loss": 0.1013, - "step": 848 - }, - { - "epoch": 5.208588957055214, - "grad_norm": 3.5696842670440674, - "learning_rate": 2.34110019376042e-06, - "loss": 0.0199, - "step": 849 - }, - { - "epoch": 5.214723926380368, - "grad_norm": 2.2214856147766113, - "learning_rate": 2.336291840906691e-06, - "loss": 0.0288, - "step": 850 - }, - { - "epoch": 5.220858895705521, - "grad_norm": 2.5375778675079346, - "learning_rate": 2.3314840961805806e-06, - "loss": 0.0142, - "step": 851 - }, - { - "epoch": 5.226993865030675, - "grad_norm": 3.0093517303466797, - "learning_rate": 2.326676977441444e-06, - "loss": 0.0911, - "step": 852 - }, - { - "epoch": 5.233128834355828, - "grad_norm": 2.7067151069641113, - "learning_rate": 2.3218705025463118e-06, - "loss": 0.0315, - "step": 853 - }, - { - "epoch": 5.2392638036809815, - "grad_norm": 3.1892940998077393, - "learning_rate": 2.3170646893498237e-06, - "loss": 0.1344, - "step": 854 - }, - { - "epoch": 5.245398773006135, - "grad_norm": 2.8909313678741455, - "learning_rate": 2.312259555704161e-06, - "loss": 0.034, - "step": 855 - }, - { - "epoch": 5.251533742331288, - "grad_norm": 5.097650051116943, - "learning_rate": 2.3074551194589816e-06, - "loss": 0.1889, - "step": 856 - }, - { - "epoch": 5.257668711656442, - "grad_norm": 3.8511006832122803, - "learning_rate": 2.3026513984613506e-06, - "loss": 0.0794, - "step": 857 - }, - { - "epoch": 5.263803680981595, - "grad_norm": 2.2874133586883545, - "learning_rate": 2.297848410555677e-06, - "loss": 0.0238, - "step": 858 - }, - { - "epoch": 5.269938650306749, - "grad_norm": 3.504723310470581, - "learning_rate": 2.293046173583648e-06, - "loss": 0.0369, - "step": 859 - }, - { - "epoch": 5.276073619631902, - "grad_norm": 3.2108154296875, - "learning_rate": 2.28824470538416e-06, - "loss": 0.0677, - "step": 860 - }, - { - "epoch": 5.282208588957055, - "grad_norm": 2.2249386310577393, - "learning_rate": 2.2834440237932537e-06, - "loss": 0.0244, - "step": 861 - }, - { - "epoch": 5.288343558282208, - "grad_norm": 3.141784191131592, - "learning_rate": 2.2786441466440474e-06, - "loss": 0.0628, - "step": 862 - }, - { - "epoch": 5.294478527607362, - "grad_norm": 3.5597352981567383, - "learning_rate": 2.2738450917666727e-06, - "loss": 0.0914, - "step": 863 - }, - { - "epoch": 5.300613496932515, - "grad_norm": 2.991966962814331, - "learning_rate": 2.269046876988204e-06, - "loss": 0.0546, - "step": 864 - }, - { - "epoch": 5.306748466257669, - "grad_norm": 3.100776195526123, - "learning_rate": 2.2642495201325995e-06, - "loss": 0.0473, - "step": 865 - }, - { - "epoch": 5.3128834355828225, - "grad_norm": 2.541754722595215, - "learning_rate": 2.259453039020626e-06, - "loss": 0.0613, - "step": 866 - }, - { - "epoch": 5.319018404907975, - "grad_norm": 2.8117194175720215, - "learning_rate": 2.2546574514697985e-06, - "loss": 0.0533, - "step": 867 - }, - { - "epoch": 5.325153374233129, - "grad_norm": 2.5676379203796387, - "learning_rate": 2.249862775294313e-06, - "loss": 0.018, - "step": 868 - }, - { - "epoch": 5.331288343558282, - "grad_norm": 2.5297701358795166, - "learning_rate": 2.245069028304981e-06, - "loss": 0.0246, - "step": 869 - }, - { - "epoch": 5.337423312883436, - "grad_norm": 2.199498176574707, - "learning_rate": 2.240276228309161e-06, - "loss": 0.0551, - "step": 870 - }, - { - "epoch": 5.343558282208589, - "grad_norm": 2.5793557167053223, - "learning_rate": 2.2354843931106933e-06, - "loss": 0.0258, - "step": 871 - }, - { - "epoch": 5.3496932515337425, - "grad_norm": 3.352058172225952, - "learning_rate": 2.230693540509836e-06, - "loss": 0.0228, - "step": 872 - }, - { - "epoch": 5.355828220858895, - "grad_norm": 2.900599956512451, - "learning_rate": 2.225903688303195e-06, - "loss": 0.0586, - "step": 873 - }, - { - "epoch": 5.361963190184049, - "grad_norm": 3.3317267894744873, - "learning_rate": 2.221114854283662e-06, - "loss": 0.0733, - "step": 874 - }, - { - "epoch": 5.368098159509202, - "grad_norm": 2.79304575920105, - "learning_rate": 2.2163270562403453e-06, - "loss": 0.0251, - "step": 875 - }, - { - "epoch": 5.374233128834356, - "grad_norm": 3.8596227169036865, - "learning_rate": 2.211540311958506e-06, - "loss": 0.0957, - "step": 876 - }, - { - "epoch": 5.38036809815951, - "grad_norm": 2.7464358806610107, - "learning_rate": 2.2067546392194888e-06, - "loss": 0.0457, - "step": 877 - }, - { - "epoch": 5.386503067484663, - "grad_norm": 2.3359906673431396, - "learning_rate": 2.2019700558006598e-06, - "loss": 0.0218, - "step": 878 - }, - { - "epoch": 5.392638036809816, - "grad_norm": 3.2412452697753906, - "learning_rate": 2.197186579475337e-06, - "loss": 0.0494, - "step": 879 - }, - { - "epoch": 5.398773006134969, - "grad_norm": 3.930197238922119, - "learning_rate": 2.1924042280127284e-06, - "loss": 0.0803, - "step": 880 - }, - { - "epoch": 5.404907975460123, - "grad_norm": 2.5752930641174316, - "learning_rate": 2.1876230191778598e-06, - "loss": 0.0356, - "step": 881 - }, - { - "epoch": 5.411042944785276, - "grad_norm": 5.507393836975098, - "learning_rate": 2.182842970731516e-06, - "loss": 0.1245, - "step": 882 - }, - { - "epoch": 5.41717791411043, - "grad_norm": 2.416719436645508, - "learning_rate": 2.17806410043017e-06, - "loss": 0.0224, - "step": 883 - }, - { - "epoch": 5.423312883435583, - "grad_norm": 2.500429630279541, - "learning_rate": 2.173286426025917e-06, - "loss": 0.0499, - "step": 884 - }, - { - "epoch": 5.429447852760736, - "grad_norm": 2.8843860626220703, - "learning_rate": 2.168509965266411e-06, - "loss": 0.075, - "step": 885 - }, - { - "epoch": 5.435582822085889, - "grad_norm": 2.3187198638916016, - "learning_rate": 2.1637347358947984e-06, - "loss": 0.065, - "step": 886 - }, - { - "epoch": 5.441717791411043, - "grad_norm": 2.7135889530181885, - "learning_rate": 2.15896075564965e-06, - "loss": 0.0848, - "step": 887 - }, - { - "epoch": 5.447852760736196, - "grad_norm": 1.751846194267273, - "learning_rate": 2.1541880422648978e-06, - "loss": 0.0112, - "step": 888 - }, - { - "epoch": 5.45398773006135, - "grad_norm": 3.113271713256836, - "learning_rate": 2.1494166134697655e-06, - "loss": 0.077, - "step": 889 - }, - { - "epoch": 5.460122699386503, - "grad_norm": 2.711318016052246, - "learning_rate": 2.1446464869887077e-06, - "loss": 0.03, - "step": 890 - }, - { - "epoch": 5.466257668711656, - "grad_norm": 1.8012003898620605, - "learning_rate": 2.13987768054134e-06, - "loss": 0.0141, - "step": 891 - }, - { - "epoch": 5.47239263803681, - "grad_norm": 2.0968120098114014, - "learning_rate": 2.135110211842374e-06, - "loss": 0.0147, - "step": 892 - }, - { - "epoch": 5.478527607361963, - "grad_norm": 3.1689956188201904, - "learning_rate": 2.1303440986015525e-06, - "loss": 0.1123, - "step": 893 - }, - { - "epoch": 5.484662576687117, - "grad_norm": 4.512697219848633, - "learning_rate": 2.1255793585235827e-06, - "loss": 0.0359, - "step": 894 - }, - { - "epoch": 5.49079754601227, - "grad_norm": 3.5739688873291016, - "learning_rate": 2.120816009308071e-06, - "loss": 0.0635, - "step": 895 - }, - { - "epoch": 5.4969325153374236, - "grad_norm": 4.556554317474365, - "learning_rate": 2.1160540686494597e-06, - "loss": 0.1104, - "step": 896 - }, - { - "epoch": 5.5030674846625764, - "grad_norm": 2.2047064304351807, - "learning_rate": 2.1112935542369546e-06, - "loss": 0.0187, - "step": 897 - }, - { - "epoch": 5.50920245398773, - "grad_norm": 3.0289857387542725, - "learning_rate": 2.106534483754466e-06, - "loss": 0.0874, - "step": 898 - }, - { - "epoch": 5.515337423312883, - "grad_norm": 2.7090444564819336, - "learning_rate": 2.1017768748805396e-06, - "loss": 0.0301, - "step": 899 - }, - { - "epoch": 5.521472392638037, - "grad_norm": 3.0662643909454346, - "learning_rate": 2.0970207452882917e-06, - "loss": 0.1192, - "step": 900 - }, - { - "epoch": 5.52760736196319, - "grad_norm": 2.869401454925537, - "learning_rate": 2.0922661126453436e-06, - "loss": 0.0803, - "step": 901 - }, - { - "epoch": 5.533742331288344, - "grad_norm": 2.229947328567505, - "learning_rate": 2.0875129946137557e-06, - "loss": 0.0186, - "step": 902 - }, - { - "epoch": 5.539877300613497, - "grad_norm": 3.3460421562194824, - "learning_rate": 2.0827614088499624e-06, - "loss": 0.0499, - "step": 903 - }, - { - "epoch": 5.54601226993865, - "grad_norm": 1.9324007034301758, - "learning_rate": 2.0780113730047056e-06, - "loss": 0.0322, - "step": 904 - }, - { - "epoch": 5.552147239263804, - "grad_norm": 2.761482000350952, - "learning_rate": 2.0732629047229712e-06, - "loss": 0.0265, - "step": 905 - }, - { - "epoch": 5.558282208588957, - "grad_norm": 2.4173266887664795, - "learning_rate": 2.0685160216439205e-06, - "loss": 0.0229, - "step": 906 - }, - { - "epoch": 5.564417177914111, - "grad_norm": 2.503661632537842, - "learning_rate": 2.0637707414008267e-06, - "loss": 0.0266, - "step": 907 - }, - { - "epoch": 5.570552147239264, - "grad_norm": 2.312236785888672, - "learning_rate": 2.0590270816210077e-06, - "loss": 0.018, - "step": 908 - }, - { - "epoch": 5.576687116564417, - "grad_norm": 2.569575548171997, - "learning_rate": 2.0542850599257647e-06, - "loss": 0.0377, - "step": 909 - }, - { - "epoch": 5.58282208588957, - "grad_norm": 3.520341157913208, - "learning_rate": 2.0495446939303122e-06, - "loss": 0.1224, - "step": 910 - }, - { - "epoch": 5.588957055214724, - "grad_norm": 3.231363296508789, - "learning_rate": 2.044806001243714e-06, - "loss": 0.1457, - "step": 911 - }, - { - "epoch": 5.595092024539877, - "grad_norm": 3.3211300373077393, - "learning_rate": 2.040068999468818e-06, - "loss": 0.0429, - "step": 912 - }, - { - "epoch": 5.601226993865031, - "grad_norm": 3.3712961673736572, - "learning_rate": 2.035333706202192e-06, - "loss": 0.0634, - "step": 913 - }, - { - "epoch": 5.6073619631901845, - "grad_norm": 2.480177402496338, - "learning_rate": 2.0306001390340565e-06, - "loss": 0.0178, - "step": 914 - }, - { - "epoch": 5.613496932515337, - "grad_norm": 2.9777421951293945, - "learning_rate": 2.02586831554822e-06, - "loss": 0.037, - "step": 915 - }, - { - "epoch": 5.61963190184049, - "grad_norm": 2.9129085540771484, - "learning_rate": 2.021138253322012e-06, - "loss": 0.125, - "step": 916 - }, - { - "epoch": 5.625766871165644, - "grad_norm": 4.041767597198486, - "learning_rate": 2.016409969926224e-06, - "loss": 0.1897, - "step": 917 - }, - { - "epoch": 5.631901840490798, - "grad_norm": 4.088902950286865, - "learning_rate": 2.0116834829250355e-06, - "loss": 0.0546, - "step": 918 - }, - { - "epoch": 5.638036809815951, - "grad_norm": 3.8629167079925537, - "learning_rate": 2.0069588098759545e-06, - "loss": 0.0911, - "step": 919 - }, - { - "epoch": 5.644171779141105, - "grad_norm": 2.616830825805664, - "learning_rate": 2.00223596832975e-06, - "loss": 0.0527, - "step": 920 - }, - { - "epoch": 5.6503067484662575, - "grad_norm": 1.9370782375335693, - "learning_rate": 1.9975149758303885e-06, - "loss": 0.0384, - "step": 921 - }, - { - "epoch": 5.656441717791411, - "grad_norm": 3.7839455604553223, - "learning_rate": 1.992795849914967e-06, - "loss": 0.1033, - "step": 922 - }, - { - "epoch": 5.662576687116564, - "grad_norm": 3.870729923248291, - "learning_rate": 1.9880786081136498e-06, - "loss": 0.08, - "step": 923 - }, - { - "epoch": 5.668711656441718, - "grad_norm": 3.4394288063049316, - "learning_rate": 1.9833632679496008e-06, - "loss": 0.0819, - "step": 924 - }, - { - "epoch": 5.674846625766871, - "grad_norm": 3.1659159660339355, - "learning_rate": 1.97864984693892e-06, - "loss": 0.117, - "step": 925 - }, - { - "epoch": 5.680981595092025, - "grad_norm": 2.2375190258026123, - "learning_rate": 1.97393836259058e-06, - "loss": 0.0215, - "step": 926 - }, - { - "epoch": 5.6871165644171775, - "grad_norm": 3.9375314712524414, - "learning_rate": 1.969228832406358e-06, - "loss": 0.1422, - "step": 927 - }, - { - "epoch": 5.693251533742331, - "grad_norm": 3.1969058513641357, - "learning_rate": 1.964521273880772e-06, - "loss": 0.0538, - "step": 928 - }, - { - "epoch": 5.699386503067485, - "grad_norm": 3.5990066528320312, - "learning_rate": 1.9598157045010162e-06, - "loss": 0.114, - "step": 929 - }, - { - "epoch": 5.705521472392638, - "grad_norm": 3.1764235496520996, - "learning_rate": 1.9551121417468955e-06, - "loss": 0.053, - "step": 930 - }, - { - "epoch": 5.711656441717792, - "grad_norm": 4.1162309646606445, - "learning_rate": 1.9504106030907605e-06, - "loss": 0.0866, - "step": 931 - }, - { - "epoch": 5.717791411042945, - "grad_norm": 3.543071985244751, - "learning_rate": 1.945711105997444e-06, - "loss": 0.0908, - "step": 932 - }, - { - "epoch": 5.723926380368098, - "grad_norm": 4.136870384216309, - "learning_rate": 1.941013667924194e-06, - "loss": 0.0612, - "step": 933 - }, - { - "epoch": 5.730061349693251, - "grad_norm": 1.7658357620239258, - "learning_rate": 1.9363183063206097e-06, - "loss": 0.0283, - "step": 934 - }, - { - "epoch": 5.736196319018405, - "grad_norm": 3.9701411724090576, - "learning_rate": 1.931625038628577e-06, - "loss": 0.0948, - "step": 935 - }, - { - "epoch": 5.742331288343558, - "grad_norm": 3.0636157989501953, - "learning_rate": 1.9269338822822047e-06, - "loss": 0.0769, - "step": 936 - }, - { - "epoch": 5.748466257668712, - "grad_norm": 3.3671388626098633, - "learning_rate": 1.9222448547077573e-06, - "loss": 0.098, - "step": 937 - }, - { - "epoch": 5.754601226993865, - "grad_norm": 3.0725975036621094, - "learning_rate": 1.917557973323591e-06, - "loss": 0.0363, - "step": 938 - }, - { - "epoch": 5.7607361963190185, - "grad_norm": 2.5592041015625, - "learning_rate": 1.9128732555400915e-06, - "loss": 0.0205, - "step": 939 - }, - { - "epoch": 5.766871165644172, - "grad_norm": 2.835740804672241, - "learning_rate": 1.9081907187596054e-06, - "loss": 0.0548, - "step": 940 - }, - { - "epoch": 5.773006134969325, - "grad_norm": 3.3596746921539307, - "learning_rate": 1.9035103803763793e-06, - "loss": 0.0454, - "step": 941 - }, - { - "epoch": 5.779141104294479, - "grad_norm": 3.226579427719116, - "learning_rate": 1.8988322577764918e-06, - "loss": 0.0514, - "step": 942 - }, - { - "epoch": 5.785276073619632, - "grad_norm": 3.2044687271118164, - "learning_rate": 1.8941563683377905e-06, - "loss": 0.1361, - "step": 943 - }, - { - "epoch": 5.791411042944786, - "grad_norm": 1.8300527334213257, - "learning_rate": 1.8894827294298296e-06, - "loss": 0.0139, - "step": 944 - }, - { - "epoch": 5.7975460122699385, - "grad_norm": 2.503735303878784, - "learning_rate": 1.884811358413801e-06, - "loss": 0.0311, - "step": 945 - }, - { - "epoch": 5.803680981595092, - "grad_norm": 2.171309471130371, - "learning_rate": 1.8801422726424735e-06, - "loss": 0.0227, - "step": 946 - }, - { - "epoch": 5.809815950920245, - "grad_norm": 1.8116636276245117, - "learning_rate": 1.8754754894601252e-06, - "loss": 0.0157, - "step": 947 - }, - { - "epoch": 5.815950920245399, - "grad_norm": 3.1412570476531982, - "learning_rate": 1.870811026202482e-06, - "loss": 0.1093, - "step": 948 - }, - { - "epoch": 5.822085889570552, - "grad_norm": 2.3962290287017822, - "learning_rate": 1.8661489001966526e-06, - "loss": 0.021, - "step": 949 - }, - { - "epoch": 5.828220858895706, - "grad_norm": 4.169166564941406, - "learning_rate": 1.8614891287610621e-06, - "loss": 0.0663, - "step": 950 - }, - { - "epoch": 5.8343558282208585, - "grad_norm": 3.1181528568267822, - "learning_rate": 1.8568317292053894e-06, - "loss": 0.1008, - "step": 951 - }, - { - "epoch": 5.840490797546012, - "grad_norm": 3.5155029296875, - "learning_rate": 1.8521767188305023e-06, - "loss": 0.0451, - "step": 952 - }, - { - "epoch": 5.846625766871165, - "grad_norm": 2.975693702697754, - "learning_rate": 1.8475241149283957e-06, - "loss": 0.0561, - "step": 953 - }, - { - "epoch": 5.852760736196319, - "grad_norm": 2.1581289768218994, - "learning_rate": 1.842873934782122e-06, - "loss": 0.0265, - "step": 954 - }, - { - "epoch": 5.858895705521473, - "grad_norm": 2.6281228065490723, - "learning_rate": 1.8382261956657318e-06, - "loss": 0.1196, - "step": 955 - }, - { - "epoch": 5.865030674846626, - "grad_norm": 2.9569528102874756, - "learning_rate": 1.8335809148442074e-06, - "loss": 0.1356, - "step": 956 - }, - { - "epoch": 5.871165644171779, - "grad_norm": 2.450949192047119, - "learning_rate": 1.8289381095734005e-06, - "loss": 0.0444, - "step": 957 - }, - { - "epoch": 5.877300613496932, - "grad_norm": 2.1737027168273926, - "learning_rate": 1.8242977970999643e-06, - "loss": 0.0622, - "step": 958 - }, - { - "epoch": 5.883435582822086, - "grad_norm": 3.350647211074829, - "learning_rate": 1.8196599946612956e-06, - "loss": 0.0762, - "step": 959 - }, - { - "epoch": 5.889570552147239, - "grad_norm": 2.5031936168670654, - "learning_rate": 1.8150247194854642e-06, - "loss": 0.0207, - "step": 960 - }, - { - "epoch": 5.895705521472393, - "grad_norm": 3.7103707790374756, - "learning_rate": 1.8103919887911525e-06, - "loss": 0.1122, - "step": 961 - }, - { - "epoch": 5.901840490797546, - "grad_norm": 2.485322952270508, - "learning_rate": 1.8057618197875914e-06, - "loss": 0.0284, - "step": 962 - }, - { - "epoch": 5.9079754601226995, - "grad_norm": 1.903212547302246, - "learning_rate": 1.8011342296744961e-06, - "loss": 0.0239, - "step": 963 - }, - { - "epoch": 5.914110429447852, - "grad_norm": 3.015552520751953, - "learning_rate": 1.796509235642001e-06, - "loss": 0.0425, - "step": 964 - }, - { - "epoch": 5.920245398773006, - "grad_norm": 4.806198596954346, - "learning_rate": 1.7918868548705982e-06, - "loss": 0.2094, - "step": 965 - }, - { - "epoch": 5.92638036809816, - "grad_norm": 2.949596643447876, - "learning_rate": 1.7872671045310703e-06, - "loss": 0.0632, - "step": 966 - }, - { - "epoch": 5.932515337423313, - "grad_norm": 4.153099536895752, - "learning_rate": 1.782650001784431e-06, - "loss": 0.1411, - "step": 967 - }, - { - "epoch": 5.938650306748467, - "grad_norm": 3.4117565155029297, - "learning_rate": 1.7780355637818568e-06, - "loss": 0.0965, - "step": 968 - }, - { - "epoch": 5.9447852760736195, - "grad_norm": 2.533405303955078, - "learning_rate": 1.7734238076646277e-06, - "loss": 0.0568, - "step": 969 - }, - { - "epoch": 5.950920245398773, - "grad_norm": 2.3604726791381836, - "learning_rate": 1.7688147505640581e-06, - "loss": 0.0182, - "step": 970 - }, - { - "epoch": 5.957055214723926, - "grad_norm": 3.807424306869507, - "learning_rate": 1.7642084096014405e-06, - "loss": 0.0547, - "step": 971 - }, - { - "epoch": 5.96319018404908, - "grad_norm": 2.5735342502593994, - "learning_rate": 1.759604801887974e-06, - "loss": 0.0775, - "step": 972 - }, - { - "epoch": 5.969325153374233, - "grad_norm": 2.9217734336853027, - "learning_rate": 1.7550039445247069e-06, - "loss": 0.0541, - "step": 973 - }, - { - "epoch": 5.975460122699387, - "grad_norm": 2.793104410171509, - "learning_rate": 1.7504058546024694e-06, - "loss": 0.0257, - "step": 974 - }, - { - "epoch": 5.9815950920245395, - "grad_norm": 3.5610134601593018, - "learning_rate": 1.7458105492018114e-06, - "loss": 0.0767, - "step": 975 - }, - { - "epoch": 5.987730061349693, - "grad_norm": 2.0738015174865723, - "learning_rate": 1.7412180453929412e-06, - "loss": 0.025, - "step": 976 - }, - { - "epoch": 5.993865030674847, - "grad_norm": 2.1248421669006348, - "learning_rate": 1.736628360235657e-06, - "loss": 0.0183, - "step": 977 - }, - { - "epoch": 6.0, - "grad_norm": 2.901273727416992, - "learning_rate": 1.7320415107792893e-06, - "loss": 0.1369, - "step": 978 - }, - { - "epoch": 6.006134969325154, - "grad_norm": 3.815110683441162, - "learning_rate": 1.7274575140626318e-06, - "loss": 0.1011, - "step": 979 - }, - { - "epoch": 6.012269938650307, - "grad_norm": 2.421208381652832, - "learning_rate": 1.7228763871138845e-06, - "loss": 0.0105, - "step": 980 - }, - { - "epoch": 6.0184049079754605, - "grad_norm": 2.7103846073150635, - "learning_rate": 1.718298146950585e-06, - "loss": 0.0373, - "step": 981 - }, - { - "epoch": 6.024539877300613, - "grad_norm": 1.3751411437988281, - "learning_rate": 1.7137228105795473e-06, - "loss": 0.0072, - "step": 982 - }, - { - "epoch": 6.030674846625767, - "grad_norm": 1.5235071182250977, - "learning_rate": 1.7091503949967987e-06, - "loss": 0.0126, - "step": 983 - }, - { - "epoch": 6.03680981595092, - "grad_norm": 2.0652546882629395, - "learning_rate": 1.7045809171875183e-06, - "loss": 0.0198, - "step": 984 - }, - { - "epoch": 6.042944785276074, - "grad_norm": 2.010207176208496, - "learning_rate": 1.70001439412597e-06, - "loss": 0.0186, - "step": 985 - }, - { - "epoch": 6.049079754601227, - "grad_norm": 2.0444021224975586, - "learning_rate": 1.6954508427754435e-06, - "loss": 0.0197, - "step": 986 - }, - { - "epoch": 6.0552147239263805, - "grad_norm": 2.6540091037750244, - "learning_rate": 1.690890280088187e-06, - "loss": 0.0192, - "step": 987 - }, - { - "epoch": 6.061349693251533, - "grad_norm": 1.6479653120040894, - "learning_rate": 1.6863327230053506e-06, - "loss": 0.0105, - "step": 988 - }, - { - "epoch": 6.067484662576687, - "grad_norm": 2.4434754848480225, - "learning_rate": 1.6817781884569146e-06, - "loss": 0.0275, - "step": 989 - }, - { - "epoch": 6.07361963190184, - "grad_norm": 1.7472137212753296, - "learning_rate": 1.677226693361636e-06, - "loss": 0.0095, - "step": 990 - }, - { - "epoch": 6.079754601226994, - "grad_norm": 2.952821969985962, - "learning_rate": 1.6726782546269793e-06, - "loss": 0.0483, - "step": 991 - }, - { - "epoch": 6.085889570552148, - "grad_norm": 3.123959541320801, - "learning_rate": 1.6681328891490544e-06, - "loss": 0.0815, - "step": 992 - }, - { - "epoch": 6.0920245398773005, - "grad_norm": 2.9924800395965576, - "learning_rate": 1.663590613812556e-06, - "loss": 0.0216, - "step": 993 - }, - { - "epoch": 6.098159509202454, - "grad_norm": 2.417778730392456, - "learning_rate": 1.6590514454907007e-06, - "loss": 0.0243, - "step": 994 - }, - { - "epoch": 6.104294478527607, - "grad_norm": 2.0682942867279053, - "learning_rate": 1.6545154010451613e-06, - "loss": 0.0669, - "step": 995 - }, - { - "epoch": 6.110429447852761, - "grad_norm": 2.9801135063171387, - "learning_rate": 1.6499824973260086e-06, - "loss": 0.0309, - "step": 996 - }, - { - "epoch": 6.116564417177914, - "grad_norm": 1.5753487348556519, - "learning_rate": 1.645452751171645e-06, - "loss": 0.026, - "step": 997 - }, - { - "epoch": 6.122699386503068, - "grad_norm": 2.461124897003174, - "learning_rate": 1.6409261794087438e-06, - "loss": 0.0191, - "step": 998 - }, - { - "epoch": 6.128834355828221, - "grad_norm": 3.839308261871338, - "learning_rate": 1.6364027988521875e-06, - "loss": 0.045, - "step": 999 - }, - { - "epoch": 6.134969325153374, - "grad_norm": 2.9653189182281494, - "learning_rate": 1.6318826263050022e-06, - "loss": 0.0197, - "step": 1000 - }, - { - "epoch": 6.141104294478527, - "grad_norm": 1.1804074048995972, - "learning_rate": 1.6273656785582986e-06, - "loss": 0.0092, - "step": 1001 - }, - { - "epoch": 6.147239263803681, - "grad_norm": 1.9027175903320312, - "learning_rate": 1.6228519723912073e-06, - "loss": 0.0141, - "step": 1002 - }, - { - "epoch": 6.153374233128835, - "grad_norm": 1.831039309501648, - "learning_rate": 1.618341524570819e-06, - "loss": 0.0131, - "step": 1003 - }, - { - "epoch": 6.159509202453988, - "grad_norm": 2.547327756881714, - "learning_rate": 1.613834351852119e-06, - "loss": 0.0686, - "step": 1004 - }, - { - "epoch": 6.1656441717791415, - "grad_norm": 2.746947765350342, - "learning_rate": 1.6093304709779273e-06, - "loss": 0.036, - "step": 1005 - }, - { - "epoch": 6.171779141104294, - "grad_norm": 2.0104732513427734, - "learning_rate": 1.6048298986788345e-06, - "loss": 0.0216, - "step": 1006 - }, - { - "epoch": 6.177914110429448, - "grad_norm": 2.655977725982666, - "learning_rate": 1.6003326516731431e-06, - "loss": 0.024, - "step": 1007 - }, - { - "epoch": 6.184049079754601, - "grad_norm": 2.0733132362365723, - "learning_rate": 1.5958387466668015e-06, - "loss": 0.0133, - "step": 1008 - }, - { - "epoch": 6.190184049079755, - "grad_norm": 2.5398054122924805, - "learning_rate": 1.5913482003533437e-06, - "loss": 0.0331, - "step": 1009 - }, - { - "epoch": 6.196319018404908, - "grad_norm": 1.7983721494674683, - "learning_rate": 1.5868610294138264e-06, - "loss": 0.0111, - "step": 1010 - }, - { - "epoch": 6.2024539877300615, - "grad_norm": 1.7259647846221924, - "learning_rate": 1.58237725051677e-06, - "loss": 0.0112, - "step": 1011 - }, - { - "epoch": 6.208588957055214, - "grad_norm": 1.7722725868225098, - "learning_rate": 1.577896880318093e-06, - "loss": 0.0181, - "step": 1012 - }, - { - "epoch": 6.214723926380368, - "grad_norm": 3.633545398712158, - "learning_rate": 1.5734199354610513e-06, - "loss": 0.0135, - "step": 1013 - }, - { - "epoch": 6.220858895705521, - "grad_norm": 1.8951494693756104, - "learning_rate": 1.5689464325761764e-06, - "loss": 0.0163, - "step": 1014 - }, - { - "epoch": 6.226993865030675, - "grad_norm": 1.637170433998108, - "learning_rate": 1.564476388281216e-06, - "loss": 0.0068, - "step": 1015 - }, - { - "epoch": 6.233128834355828, - "grad_norm": 2.2963850498199463, - "learning_rate": 1.5600098191810682e-06, - "loss": 0.021, - "step": 1016 - }, - { - "epoch": 6.2392638036809815, - "grad_norm": 2.777996063232422, - "learning_rate": 1.555546741867722e-06, - "loss": 0.0349, - "step": 1017 - }, - { - "epoch": 6.245398773006135, - "grad_norm": 2.1580724716186523, - "learning_rate": 1.5510871729201953e-06, - "loss": 0.0626, - "step": 1018 - }, - { - "epoch": 6.251533742331288, - "grad_norm": 1.4158363342285156, - "learning_rate": 1.5466311289044755e-06, - "loss": 0.0082, - "step": 1019 - }, - { - "epoch": 6.257668711656442, - "grad_norm": 3.287564516067505, - "learning_rate": 1.5421786263734524e-06, - "loss": 0.0212, - "step": 1020 - }, - { - "epoch": 6.263803680981595, - "grad_norm": 2.4552016258239746, - "learning_rate": 1.5377296818668638e-06, - "loss": 0.0963, - "step": 1021 - }, - { - "epoch": 6.269938650306749, - "grad_norm": 1.877556562423706, - "learning_rate": 1.5332843119112285e-06, - "loss": 0.011, - "step": 1022 - }, - { - "epoch": 6.276073619631902, - "grad_norm": 3.720372438430786, - "learning_rate": 1.5288425330197864e-06, - "loss": 0.018, - "step": 1023 - }, - { - "epoch": 6.282208588957055, - "grad_norm": 1.9751925468444824, - "learning_rate": 1.5244043616924389e-06, - "loss": 0.0162, - "step": 1024 - }, - { - "epoch": 6.288343558282208, - "grad_norm": 2.5137453079223633, - "learning_rate": 1.5199698144156865e-06, - "loss": 0.0468, - "step": 1025 - }, - { - "epoch": 6.294478527607362, - "grad_norm": 2.111983299255371, - "learning_rate": 1.5155389076625663e-06, - "loss": 0.0064, - "step": 1026 - }, - { - "epoch": 6.300613496932515, - "grad_norm": 2.572223663330078, - "learning_rate": 1.5111116578925924e-06, - "loss": 0.035, - "step": 1027 - }, - { - "epoch": 6.306748466257669, - "grad_norm": 2.7881019115448, - "learning_rate": 1.5066880815516943e-06, - "loss": 0.0197, - "step": 1028 - }, - { - "epoch": 6.3128834355828225, - "grad_norm": 1.2287017107009888, - "learning_rate": 1.5022681950721565e-06, - "loss": 0.0059, - "step": 1029 - }, - { - "epoch": 6.319018404907975, - "grad_norm": 1.764028549194336, - "learning_rate": 1.4978520148725558e-06, - "loss": 0.006, - "step": 1030 - }, - { - "epoch": 6.325153374233129, - "grad_norm": 2.399787664413452, - "learning_rate": 1.4934395573577016e-06, - "loss": 0.0126, - "step": 1031 - }, - { - "epoch": 6.331288343558282, - "grad_norm": 1.9056172370910645, - "learning_rate": 1.4890308389185743e-06, - "loss": 0.0131, - "step": 1032 - }, - { - "epoch": 6.337423312883436, - "grad_norm": 1.7394744157791138, - "learning_rate": 1.484625875932265e-06, - "loss": 0.016, - "step": 1033 - }, - { - "epoch": 6.343558282208589, - "grad_norm": 4.352719306945801, - "learning_rate": 1.480224684761915e-06, - "loss": 0.1059, - "step": 1034 - }, - { - "epoch": 6.3496932515337425, - "grad_norm": 2.148385524749756, - "learning_rate": 1.4758272817566538e-06, - "loss": 0.0312, - "step": 1035 - }, - { - "epoch": 6.355828220858895, - "grad_norm": 2.483872175216675, - "learning_rate": 1.4714336832515386e-06, - "loss": 0.0215, - "step": 1036 - }, - { - "epoch": 6.361963190184049, - "grad_norm": 2.6151270866394043, - "learning_rate": 1.467043905567494e-06, - "loss": 0.0718, - "step": 1037 - }, - { - "epoch": 6.368098159509202, - "grad_norm": 2.554600954055786, - "learning_rate": 1.4626579650112533e-06, - "loss": 0.0166, - "step": 1038 - }, - { - "epoch": 6.374233128834356, - "grad_norm": 3.013974905014038, - "learning_rate": 1.4582758778752926e-06, - "loss": 0.0448, - "step": 1039 - }, - { - "epoch": 6.38036809815951, - "grad_norm": 2.1542789936065674, - "learning_rate": 1.4538976604377781e-06, - "loss": 0.0297, - "step": 1040 - }, - { - "epoch": 6.386503067484663, - "grad_norm": 3.4402377605438232, - "learning_rate": 1.449523328962496e-06, - "loss": 0.0409, - "step": 1041 - }, - { - "epoch": 6.392638036809816, - "grad_norm": 1.6200538873672485, - "learning_rate": 1.4451528996988018e-06, - "loss": 0.0127, - "step": 1042 - }, - { - "epoch": 6.398773006134969, - "grad_norm": 3.081733465194702, - "learning_rate": 1.4407863888815527e-06, - "loss": 0.0788, - "step": 1043 - }, - { - "epoch": 6.404907975460123, - "grad_norm": 1.9813143014907837, - "learning_rate": 1.436423812731051e-06, - "loss": 0.0082, - "step": 1044 - }, - { - "epoch": 6.411042944785276, - "grad_norm": 1.7354048490524292, - "learning_rate": 1.432065187452984e-06, - "loss": 0.0086, - "step": 1045 - }, - { - "epoch": 6.41717791411043, - "grad_norm": 1.8812576532363892, - "learning_rate": 1.4277105292383594e-06, - "loss": 0.04, - "step": 1046 - }, - { - "epoch": 6.423312883435583, - "grad_norm": 1.117837905883789, - "learning_rate": 1.4233598542634519e-06, - "loss": 0.0054, - "step": 1047 - }, - { - "epoch": 6.429447852760736, - "grad_norm": 1.9587867259979248, - "learning_rate": 1.4190131786897388e-06, - "loss": 0.0263, - "step": 1048 - }, - { - "epoch": 6.435582822085889, - "grad_norm": 1.2712376117706299, - "learning_rate": 1.4146705186638388e-06, - "loss": 0.0098, - "step": 1049 - }, - { - "epoch": 6.441717791411043, - "grad_norm": 2.6563849449157715, - "learning_rate": 1.410331890317457e-06, - "loss": 0.0322, - "step": 1050 - }, - { - "epoch": 6.447852760736196, - "grad_norm": 3.136518955230713, - "learning_rate": 1.4059973097673187e-06, - "loss": 0.0729, - "step": 1051 - }, - { - "epoch": 6.45398773006135, - "grad_norm": 1.3937572240829468, - "learning_rate": 1.4016667931151156e-06, - "loss": 0.0094, - "step": 1052 - }, - { - "epoch": 6.460122699386503, - "grad_norm": 1.7218928337097168, - "learning_rate": 1.3973403564474422e-06, - "loss": 0.0078, - "step": 1053 - }, - { - "epoch": 6.466257668711656, - "grad_norm": 2.35612416267395, - "learning_rate": 1.393018015835737e-06, - "loss": 0.0231, - "step": 1054 - }, - { - "epoch": 6.47239263803681, - "grad_norm": 1.96125066280365, - "learning_rate": 1.388699787336224e-06, - "loss": 0.0153, - "step": 1055 - }, - { - "epoch": 6.478527607361963, - "grad_norm": 2.1789233684539795, - "learning_rate": 1.3843856869898486e-06, - "loss": 0.0136, - "step": 1056 - }, - { - "epoch": 6.484662576687117, - "grad_norm": 3.1261701583862305, - "learning_rate": 1.3800757308222263e-06, - "loss": 0.0819, - "step": 1057 - }, - { - "epoch": 6.49079754601227, - "grad_norm": 2.93422794342041, - "learning_rate": 1.3757699348435726e-06, - "loss": 0.0658, - "step": 1058 - }, - { - "epoch": 6.4969325153374236, - "grad_norm": 2.1311776638031006, - "learning_rate": 1.3714683150486534e-06, - "loss": 0.0106, - "step": 1059 - }, - { - "epoch": 6.5030674846625764, - "grad_norm": 1.699877381324768, - "learning_rate": 1.3671708874167211e-06, - "loss": 0.0151, - "step": 1060 - }, - { - "epoch": 6.50920245398773, - "grad_norm": 1.7288825511932373, - "learning_rate": 1.3628776679114516e-06, - "loss": 0.0114, - "step": 1061 - }, - { - "epoch": 6.515337423312883, - "grad_norm": 1.8437966108322144, - "learning_rate": 1.3585886724808934e-06, - "loss": 0.0117, - "step": 1062 - }, - { - "epoch": 6.521472392638037, - "grad_norm": 3.073568344116211, - "learning_rate": 1.3543039170574022e-06, - "loss": 0.0381, - "step": 1063 - }, - { - "epoch": 6.52760736196319, - "grad_norm": 1.6069157123565674, - "learning_rate": 1.350023417557581e-06, - "loss": 0.0072, - "step": 1064 - }, - { - "epoch": 6.533742331288344, - "grad_norm": 2.48502779006958, - "learning_rate": 1.345747189882228e-06, - "loss": 0.0302, - "step": 1065 - }, - { - "epoch": 6.539877300613497, - "grad_norm": 1.6879143714904785, - "learning_rate": 1.3414752499162676e-06, - "loss": 0.0095, - "step": 1066 - }, - { - "epoch": 6.54601226993865, - "grad_norm": 2.2126848697662354, - "learning_rate": 1.3372076135287005e-06, - "loss": 0.067, - "step": 1067 - }, - { - "epoch": 6.552147239263804, - "grad_norm": 2.157269239425659, - "learning_rate": 1.33294429657254e-06, - "loss": 0.0203, - "step": 1068 - }, - { - "epoch": 6.558282208588957, - "grad_norm": 2.725158452987671, - "learning_rate": 1.3286853148847523e-06, - "loss": 0.0217, - "step": 1069 - }, - { - "epoch": 6.564417177914111, - "grad_norm": 2.478426456451416, - "learning_rate": 1.3244306842862007e-06, - "loss": 0.0223, - "step": 1070 - }, - { - "epoch": 6.570552147239264, - "grad_norm": 2.349463939666748, - "learning_rate": 1.3201804205815872e-06, - "loss": 0.027, - "step": 1071 - }, - { - "epoch": 6.576687116564417, - "grad_norm": 2.049593210220337, - "learning_rate": 1.3159345395593876e-06, - "loss": 0.0212, - "step": 1072 - }, - { - "epoch": 6.58282208588957, - "grad_norm": 2.3445141315460205, - "learning_rate": 1.3116930569918024e-06, - "loss": 0.0182, - "step": 1073 - }, - { - "epoch": 6.588957055214724, - "grad_norm": 3.756135940551758, - "learning_rate": 1.3074559886346886e-06, - "loss": 0.1187, - "step": 1074 - }, - { - "epoch": 6.595092024539877, - "grad_norm": 2.4747114181518555, - "learning_rate": 1.3032233502275089e-06, - "loss": 0.0103, - "step": 1075 - }, - { - "epoch": 6.601226993865031, - "grad_norm": 2.0029311180114746, - "learning_rate": 1.2989951574932693e-06, - "loss": 0.0115, - "step": 1076 - }, - { - "epoch": 6.6073619631901845, - "grad_norm": 2.007141351699829, - "learning_rate": 1.2947714261384602e-06, - "loss": 0.0155, - "step": 1077 - }, - { - "epoch": 6.613496932515337, - "grad_norm": 1.5075048208236694, - "learning_rate": 1.2905521718530012e-06, - "loss": 0.0125, - "step": 1078 - }, - { - "epoch": 6.61963190184049, - "grad_norm": 1.9235132932662964, - "learning_rate": 1.2863374103101784e-06, - "loss": 0.0181, - "step": 1079 - }, - { - "epoch": 6.625766871165644, - "grad_norm": 1.7235040664672852, - "learning_rate": 1.2821271571665912e-06, - "loss": 0.0102, - "step": 1080 - }, - { - "epoch": 6.631901840490798, - "grad_norm": 3.503974676132202, - "learning_rate": 1.277921428062091e-06, - "loss": 0.0969, - "step": 1081 - }, - { - "epoch": 6.638036809815951, - "grad_norm": 2.4633288383483887, - "learning_rate": 1.2737202386197222e-06, - "loss": 0.0383, - "step": 1082 - }, - { - "epoch": 6.644171779141105, - "grad_norm": 2.332341432571411, - "learning_rate": 1.2695236044456672e-06, - "loss": 0.0184, - "step": 1083 - }, - { - "epoch": 6.6503067484662575, - "grad_norm": 2.8279805183410645, - "learning_rate": 1.2653315411291867e-06, - "loss": 0.0327, - "step": 1084 - }, - { - "epoch": 6.656441717791411, - "grad_norm": 2.444810628890991, - "learning_rate": 1.2611440642425617e-06, - "loss": 0.0399, - "step": 1085 - }, - { - "epoch": 6.662576687116564, - "grad_norm": 2.9304957389831543, - "learning_rate": 1.2569611893410374e-06, - "loss": 0.0385, - "step": 1086 - }, - { - "epoch": 6.668711656441718, - "grad_norm": 2.1244678497314453, - "learning_rate": 1.2527829319627604e-06, - "loss": 0.0123, - "step": 1087 - }, - { - "epoch": 6.674846625766871, - "grad_norm": 2.129033327102661, - "learning_rate": 1.248609307628729e-06, - "loss": 0.0302, - "step": 1088 - }, - { - "epoch": 6.680981595092025, - "grad_norm": 5.788925647735596, - "learning_rate": 1.2444403318427268e-06, - "loss": 0.0296, - "step": 1089 - }, - { - "epoch": 6.6871165644171775, - "grad_norm": 5.127935886383057, - "learning_rate": 1.2402760200912725e-06, - "loss": 0.1532, - "step": 1090 - }, - { - "epoch": 6.693251533742331, - "grad_norm": 2.2610318660736084, - "learning_rate": 1.2361163878435594e-06, - "loss": 0.0126, - "step": 1091 - }, - { - "epoch": 6.699386503067485, - "grad_norm": 1.7913328409194946, - "learning_rate": 1.2319614505513953e-06, - "loss": 0.0086, - "step": 1092 - }, - { - "epoch": 6.705521472392638, - "grad_norm": 1.5961267948150635, - "learning_rate": 1.227811223649149e-06, - "loss": 0.0041, - "step": 1093 - }, - { - "epoch": 6.711656441717792, - "grad_norm": 1.441754937171936, - "learning_rate": 1.2236657225536938e-06, - "loss": 0.0103, - "step": 1094 - }, - { - "epoch": 6.717791411042945, - "grad_norm": 1.4393174648284912, - "learning_rate": 1.2195249626643432e-06, - "loss": 0.0063, - "step": 1095 - }, - { - "epoch": 6.723926380368098, - "grad_norm": 3.199451208114624, - "learning_rate": 1.2153889593628032e-06, - "loss": 0.0571, - "step": 1096 - }, - { - "epoch": 6.730061349693251, - "grad_norm": 2.1796770095825195, - "learning_rate": 1.211257728013107e-06, - "loss": 0.0269, - "step": 1097 - }, - { - "epoch": 6.736196319018405, - "grad_norm": 3.1798806190490723, - "learning_rate": 1.2071312839615634e-06, - "loss": 0.0396, - "step": 1098 - }, - { - "epoch": 6.742331288343558, - "grad_norm": 3.063633680343628, - "learning_rate": 1.2030096425366985e-06, - "loss": 0.0261, - "step": 1099 - }, - { - "epoch": 6.748466257668712, - "grad_norm": 1.860409140586853, - "learning_rate": 1.1988928190491948e-06, - "loss": 0.013, - "step": 1100 - }, - { - "epoch": 6.754601226993865, - "grad_norm": 1.9303224086761475, - "learning_rate": 1.1947808287918406e-06, - "loss": 0.0113, - "step": 1101 - }, - { - "epoch": 6.7607361963190185, - "grad_norm": 2.1432337760925293, - "learning_rate": 1.19067368703947e-06, - "loss": 0.0195, - "step": 1102 - }, - { - "epoch": 6.766871165644172, - "grad_norm": 1.8998470306396484, - "learning_rate": 1.1865714090489038e-06, - "loss": 0.0105, - "step": 1103 - }, - { - "epoch": 6.773006134969325, - "grad_norm": 2.3260247707366943, - "learning_rate": 1.1824740100588991e-06, - "loss": 0.0554, - "step": 1104 - }, - { - "epoch": 6.779141104294479, - "grad_norm": 1.9272006750106812, - "learning_rate": 1.1783815052900848e-06, - "loss": 0.0118, - "step": 1105 - }, - { - "epoch": 6.785276073619632, - "grad_norm": 3.1646785736083984, - "learning_rate": 1.1742939099449126e-06, - "loss": 0.0901, - "step": 1106 - }, - { - "epoch": 6.791411042944786, - "grad_norm": 3.357422351837158, - "learning_rate": 1.1702112392075966e-06, - "loss": 0.0833, - "step": 1107 - }, - { - "epoch": 6.7975460122699385, - "grad_norm": 1.4302526712417603, - "learning_rate": 1.1661335082440545e-06, - "loss": 0.0078, - "step": 1108 - }, - { - "epoch": 6.803680981595092, - "grad_norm": 1.3046417236328125, - "learning_rate": 1.1620607322018587e-06, - "loss": 0.0092, - "step": 1109 - }, - { - "epoch": 6.809815950920245, - "grad_norm": 2.084237813949585, - "learning_rate": 1.1579929262101712e-06, - "loss": 0.0283, - "step": 1110 - }, - { - "epoch": 6.815950920245399, - "grad_norm": 1.9403250217437744, - "learning_rate": 1.153930105379695e-06, - "loss": 0.0066, - "step": 1111 - }, - { - "epoch": 6.822085889570552, - "grad_norm": 2.282449722290039, - "learning_rate": 1.1498722848026142e-06, - "loss": 0.0402, - "step": 1112 - }, - { - "epoch": 6.828220858895706, - "grad_norm": 1.9357627630233765, - "learning_rate": 1.1458194795525354e-06, - "loss": 0.0101, - "step": 1113 - }, - { - "epoch": 6.8343558282208585, - "grad_norm": 2.0236339569091797, - "learning_rate": 1.1417717046844385e-06, - "loss": 0.0109, - "step": 1114 - }, - { - "epoch": 6.840490797546012, - "grad_norm": 2.386857032775879, - "learning_rate": 1.137728975234615e-06, - "loss": 0.0297, - "step": 1115 - }, - { - "epoch": 6.846625766871165, - "grad_norm": 2.2477970123291016, - "learning_rate": 1.1336913062206157e-06, - "loss": 0.0393, - "step": 1116 - }, - { - "epoch": 6.852760736196319, - "grad_norm": 2.7217776775360107, - "learning_rate": 1.129658712641192e-06, - "loss": 0.0269, - "step": 1117 - }, - { - "epoch": 6.858895705521473, - "grad_norm": 2.6717259883880615, - "learning_rate": 1.125631209476241e-06, - "loss": 0.0708, - "step": 1118 - }, - { - "epoch": 6.865030674846626, - "grad_norm": 2.951939344406128, - "learning_rate": 1.1216088116867524e-06, - "loss": 0.0835, - "step": 1119 - }, - { - "epoch": 6.871165644171779, - "grad_norm": 1.9705166816711426, - "learning_rate": 1.1175915342147486e-06, - "loss": 0.0107, - "step": 1120 - }, - { - "epoch": 6.877300613496932, - "grad_norm": 2.4005937576293945, - "learning_rate": 1.1135793919832336e-06, - "loss": 0.0139, - "step": 1121 - }, - { - "epoch": 6.883435582822086, - "grad_norm": 2.277463674545288, - "learning_rate": 1.1095723998961353e-06, - "loss": 0.0154, - "step": 1122 - }, - { - "epoch": 6.889570552147239, - "grad_norm": 1.5026034116744995, - "learning_rate": 1.1055705728382482e-06, - "loss": 0.0072, - "step": 1123 - }, - { - "epoch": 6.895705521472393, - "grad_norm": 1.9540379047393799, - "learning_rate": 1.1015739256751826e-06, - "loss": 0.0202, - "step": 1124 - }, - { - "epoch": 6.901840490797546, - "grad_norm": 2.3090603351593018, - "learning_rate": 1.0975824732533066e-06, - "loss": 0.0559, - "step": 1125 - }, - { - "epoch": 6.9079754601226995, - "grad_norm": 2.100283622741699, - "learning_rate": 1.09359623039969e-06, - "loss": 0.0385, - "step": 1126 - }, - { - "epoch": 6.914110429447852, - "grad_norm": 2.4120566844940186, - "learning_rate": 1.0896152119220525e-06, - "loss": 0.0535, - "step": 1127 - }, - { - "epoch": 6.920245398773006, - "grad_norm": 2.003495454788208, - "learning_rate": 1.0856394326087045e-06, - "loss": 0.0104, - "step": 1128 - }, - { - "epoch": 6.92638036809816, - "grad_norm": 1.6565535068511963, - "learning_rate": 1.0816689072284962e-06, - "loss": 0.0121, - "step": 1129 - }, - { - "epoch": 6.932515337423313, - "grad_norm": 1.6503472328186035, - "learning_rate": 1.0777036505307616e-06, - "loss": 0.0056, - "step": 1130 - }, - { - "epoch": 6.938650306748467, - "grad_norm": 2.600112199783325, - "learning_rate": 1.0737436772452602e-06, - "loss": 0.0198, - "step": 1131 - }, - { - "epoch": 6.9447852760736195, - "grad_norm": 1.6668883562088013, - "learning_rate": 1.0697890020821292e-06, - "loss": 0.0077, - "step": 1132 - }, - { - "epoch": 6.950920245398773, - "grad_norm": 2.729172706604004, - "learning_rate": 1.0658396397318203e-06, - "loss": 0.0329, - "step": 1133 - }, - { - "epoch": 6.957055214723926, - "grad_norm": 1.5219136476516724, - "learning_rate": 1.061895604865053e-06, - "loss": 0.0113, - "step": 1134 - }, - { - "epoch": 6.96319018404908, - "grad_norm": 3.8395588397979736, - "learning_rate": 1.057956912132757e-06, - "loss": 0.0376, - "step": 1135 - }, - { - "epoch": 6.969325153374233, - "grad_norm": 2.4347221851348877, - "learning_rate": 1.054023576166014e-06, - "loss": 0.0517, - "step": 1136 - }, - { - "epoch": 6.975460122699387, - "grad_norm": 3.079165458679199, - "learning_rate": 1.0500956115760105e-06, - "loss": 0.0373, - "step": 1137 - }, - { - "epoch": 6.9815950920245395, - "grad_norm": 1.9391908645629883, - "learning_rate": 1.0461730329539794e-06, - "loss": 0.019, - "step": 1138 - }, - { - "epoch": 6.987730061349693, - "grad_norm": 1.8693119287490845, - "learning_rate": 1.0422558548711434e-06, - "loss": 0.0073, - "step": 1139 - }, - { - "epoch": 6.993865030674847, - "grad_norm": 3.0920307636260986, - "learning_rate": 1.0383440918786684e-06, - "loss": 0.0099, - "step": 1140 - }, - { - "epoch": 7.0, - "grad_norm": 3.184906244277954, - "learning_rate": 1.0344377585076e-06, - "loss": 0.0218, - "step": 1141 - }, - { - "epoch": 7.006134969325154, - "grad_norm": 0.7609673142433167, - "learning_rate": 1.0305368692688175e-06, - "loss": 0.0024, - "step": 1142 - }, - { - "epoch": 7.012269938650307, - "grad_norm": 1.1493247747421265, - "learning_rate": 1.0266414386529775e-06, - "loss": 0.0059, - "step": 1143 - }, - { - "epoch": 7.0184049079754605, - "grad_norm": 3.534796953201294, - "learning_rate": 1.0227514811304556e-06, - "loss": 0.0843, - "step": 1144 - }, - { - "epoch": 7.024539877300613, - "grad_norm": 1.1876507997512817, - "learning_rate": 1.0188670111513002e-06, - "loss": 0.0098, - "step": 1145 - }, - { - "epoch": 7.030674846625767, - "grad_norm": 1.2825753688812256, - "learning_rate": 1.0149880431451736e-06, - "loss": 0.0042, - "step": 1146 - }, - { - "epoch": 7.03680981595092, - "grad_norm": 0.6842563152313232, - "learning_rate": 1.0111145915213e-06, - "loss": 0.003, - "step": 1147 - }, - { - "epoch": 7.042944785276074, - "grad_norm": 0.6310113072395325, - "learning_rate": 1.0072466706684127e-06, - "loss": 0.0027, - "step": 1148 - }, - { - "epoch": 7.049079754601227, - "grad_norm": 1.484761357307434, - "learning_rate": 1.0033842949546974e-06, - "loss": 0.0105, - "step": 1149 - }, - { - "epoch": 7.0552147239263805, - "grad_norm": 1.9790291786193848, - "learning_rate": 9.995274787277445e-07, - "loss": 0.0233, - "step": 1150 - }, - { - "epoch": 7.061349693251533, - "grad_norm": 1.1398522853851318, - "learning_rate": 9.956762363144892e-07, - "loss": 0.0031, - "step": 1151 - }, - { - "epoch": 7.067484662576687, - "grad_norm": 1.0574359893798828, - "learning_rate": 9.918305820211643e-07, - "loss": 0.0047, - "step": 1152 - }, - { - "epoch": 7.07361963190184, - "grad_norm": 2.463972330093384, - "learning_rate": 9.879905301332439e-07, - "loss": 0.0334, - "step": 1153 - }, - { - "epoch": 7.079754601226994, - "grad_norm": 1.4698575735092163, - "learning_rate": 9.84156094915389e-07, - "loss": 0.0191, - "step": 1154 - }, - { - "epoch": 7.085889570552148, - "grad_norm": 1.2635239362716675, - "learning_rate": 9.803272906113978e-07, - "loss": 0.0045, - "step": 1155 - }, - { - "epoch": 7.0920245398773005, - "grad_norm": 1.7271842956542969, - "learning_rate": 9.765041314441529e-07, - "loss": 0.0042, - "step": 1156 - }, - { - "epoch": 7.098159509202454, - "grad_norm": 1.5738918781280518, - "learning_rate": 9.72686631615563e-07, - "loss": 0.0066, - "step": 1157 - }, - { - "epoch": 7.104294478527607, - "grad_norm": 1.3097981214523315, - "learning_rate": 9.688748053065179e-07, - "loss": 0.0058, - "step": 1158 - }, - { - "epoch": 7.110429447852761, - "grad_norm": 2.076064348220825, - "learning_rate": 9.65068666676828e-07, - "loss": 0.0067, - "step": 1159 - }, - { - "epoch": 7.116564417177914, - "grad_norm": 1.1589064598083496, - "learning_rate": 9.612682298651792e-07, - "loss": 0.0052, - "step": 1160 - }, - { - "epoch": 7.122699386503068, - "grad_norm": 1.6450324058532715, - "learning_rate": 9.574735089890765e-07, - "loss": 0.0035, - "step": 1161 - }, - { - "epoch": 7.128834355828221, - "grad_norm": 1.6968387365341187, - "learning_rate": 9.53684518144789e-07, - "loss": 0.0126, - "step": 1162 - }, - { - "epoch": 7.134969325153374, - "grad_norm": 1.9047832489013672, - "learning_rate": 9.499012714073036e-07, - "loss": 0.0345, - "step": 1163 - }, - { - "epoch": 7.141104294478527, - "grad_norm": 1.7587796449661255, - "learning_rate": 9.461237828302666e-07, - "loss": 0.0144, - "step": 1164 - }, - { - "epoch": 7.147239263803681, - "grad_norm": 1.863775372505188, - "learning_rate": 9.423520664459374e-07, - "loss": 0.0135, - "step": 1165 - }, - { - "epoch": 7.153374233128835, - "grad_norm": 2.6580259799957275, - "learning_rate": 9.385861362651322e-07, - "loss": 0.0138, - "step": 1166 - }, - { - "epoch": 7.159509202453988, - "grad_norm": 2.086371421813965, - "learning_rate": 9.348260062771713e-07, - "loss": 0.0093, - "step": 1167 - }, - { - "epoch": 7.1656441717791415, - "grad_norm": 1.0806611776351929, - "learning_rate": 9.310716904498321e-07, - "loss": 0.003, - "step": 1168 - }, - { - "epoch": 7.171779141104294, - "grad_norm": 1.2487165927886963, - "learning_rate": 9.273232027292933e-07, - "loss": 0.0033, - "step": 1169 - }, - { - "epoch": 7.177914110429448, - "grad_norm": 1.0647703409194946, - "learning_rate": 9.235805570400813e-07, - "loss": 0.0024, - "step": 1170 - }, - { - "epoch": 7.184049079754601, - "grad_norm": 1.6039917469024658, - "learning_rate": 9.198437672850249e-07, - "loss": 0.0118, - "step": 1171 - }, - { - "epoch": 7.190184049079755, - "grad_norm": 2.199977159500122, - "learning_rate": 9.161128473451967e-07, - "loss": 0.0173, - "step": 1172 - }, - { - "epoch": 7.196319018404908, - "grad_norm": 2.51725697517395, - "learning_rate": 9.123878110798662e-07, - "loss": 0.0142, - "step": 1173 - }, - { - "epoch": 7.2024539877300615, - "grad_norm": 1.841742753982544, - "learning_rate": 9.086686723264474e-07, - "loss": 0.012, - "step": 1174 - }, - { - "epoch": 7.208588957055214, - "grad_norm": 1.212876319885254, - "learning_rate": 9.049554449004447e-07, - "loss": 0.0055, - "step": 1175 - }, - { - "epoch": 7.214723926380368, - "grad_norm": 1.3728275299072266, - "learning_rate": 9.012481425954053e-07, - "loss": 0.0043, - "step": 1176 - }, - { - "epoch": 7.220858895705521, - "grad_norm": 2.3055357933044434, - "learning_rate": 8.97546779182866e-07, - "loss": 0.0443, - "step": 1177 - }, - { - "epoch": 7.226993865030675, - "grad_norm": 2.017620801925659, - "learning_rate": 8.938513684123024e-07, - "loss": 0.0082, - "step": 1178 - }, - { - "epoch": 7.233128834355828, - "grad_norm": 1.5641282796859741, - "learning_rate": 8.901619240110781e-07, - "loss": 0.0071, - "step": 1179 - }, - { - "epoch": 7.2392638036809815, - "grad_norm": 1.3781960010528564, - "learning_rate": 8.864784596843917e-07, - "loss": 0.0056, - "step": 1180 - }, - { - "epoch": 7.245398773006135, - "grad_norm": 1.23178231716156, - "learning_rate": 8.828009891152301e-07, - "loss": 0.0076, - "step": 1181 - }, - { - "epoch": 7.251533742331288, - "grad_norm": 2.809582233428955, - "learning_rate": 8.791295259643126e-07, - "loss": 0.0141, - "step": 1182 - }, - { - "epoch": 7.257668711656442, - "grad_norm": 1.6520317792892456, - "learning_rate": 8.754640838700443e-07, - "loss": 0.01, - "step": 1183 - }, - { - "epoch": 7.263803680981595, - "grad_norm": 1.411852478981018, - "learning_rate": 8.718046764484648e-07, - "loss": 0.009, - "step": 1184 - }, - { - "epoch": 7.269938650306749, - "grad_norm": 2.9334425926208496, - "learning_rate": 8.681513172931935e-07, - "loss": 0.0291, - "step": 1185 - }, - { - "epoch": 7.276073619631902, - "grad_norm": 1.4273028373718262, - "learning_rate": 8.64504019975386e-07, - "loss": 0.0064, - "step": 1186 - }, - { - "epoch": 7.282208588957055, - "grad_norm": 1.9486448764801025, - "learning_rate": 8.608627980436765e-07, - "loss": 0.0135, - "step": 1187 - }, - { - "epoch": 7.288343558282208, - "grad_norm": 1.3740493059158325, - "learning_rate": 8.572276650241329e-07, - "loss": 0.0061, - "step": 1188 - }, - { - "epoch": 7.294478527607362, - "grad_norm": 1.3352797031402588, - "learning_rate": 8.535986344202057e-07, - "loss": 0.0051, - "step": 1189 - }, - { - "epoch": 7.300613496932515, - "grad_norm": 1.0336774587631226, - "learning_rate": 8.499757197126732e-07, - "loss": 0.0052, - "step": 1190 - }, - { - "epoch": 7.306748466257669, - "grad_norm": 1.1450837850570679, - "learning_rate": 8.463589343595976e-07, - "loss": 0.0111, - "step": 1191 - }, - { - "epoch": 7.3128834355828225, - "grad_norm": 2.504876136779785, - "learning_rate": 8.427482917962734e-07, - "loss": 0.0279, - "step": 1192 - }, - { - "epoch": 7.319018404907975, - "grad_norm": 1.569841980934143, - "learning_rate": 8.391438054351725e-07, - "loss": 0.0105, - "step": 1193 - }, - { - "epoch": 7.325153374233129, - "grad_norm": 1.218538761138916, - "learning_rate": 8.355454886659026e-07, - "loss": 0.0028, - "step": 1194 - }, - { - "epoch": 7.331288343558282, - "grad_norm": 2.084049940109253, - "learning_rate": 8.319533548551492e-07, - "loss": 0.0102, - "step": 1195 - }, - { - "epoch": 7.337423312883436, - "grad_norm": 2.326167345046997, - "learning_rate": 8.28367417346633e-07, - "loss": 0.0396, - "step": 1196 - }, - { - "epoch": 7.343558282208589, - "grad_norm": 1.2704310417175293, - "learning_rate": 8.247876894610568e-07, - "loss": 0.006, - "step": 1197 - }, - { - "epoch": 7.3496932515337425, - "grad_norm": 1.358012318611145, - "learning_rate": 8.212141844960544e-07, - "loss": 0.0075, - "step": 1198 - }, - { - "epoch": 7.355828220858895, - "grad_norm": 1.5145729780197144, - "learning_rate": 8.17646915726146e-07, - "loss": 0.0042, - "step": 1199 - }, - { - "epoch": 7.361963190184049, - "grad_norm": 1.203041911125183, - "learning_rate": 8.140858964026849e-07, - "loss": 0.0032, - "step": 1200 - }, - { - "epoch": 7.368098159509202, - "grad_norm": 3.031280279159546, - "learning_rate": 8.105311397538085e-07, - "loss": 0.032, - "step": 1201 - }, - { - "epoch": 7.374233128834356, - "grad_norm": 1.416698694229126, - "learning_rate": 8.069826589843929e-07, - "loss": 0.0185, - "step": 1202 - }, - { - "epoch": 7.38036809815951, - "grad_norm": 0.9656457901000977, - "learning_rate": 8.034404672759977e-07, - "loss": 0.0034, - "step": 1203 - }, - { - "epoch": 7.386503067484663, - "grad_norm": 1.7239291667938232, - "learning_rate": 7.99904577786823e-07, - "loss": 0.034, - "step": 1204 - }, - { - "epoch": 7.392638036809816, - "grad_norm": 1.1560636758804321, - "learning_rate": 7.963750036516585e-07, - "loss": 0.005, - "step": 1205 - }, - { - "epoch": 7.398773006134969, - "grad_norm": 1.057456374168396, - "learning_rate": 7.928517579818312e-07, - "loss": 0.0073, - "step": 1206 - }, - { - "epoch": 7.404907975460123, - "grad_norm": 1.4066674709320068, - "learning_rate": 7.893348538651635e-07, - "loss": 0.015, - "step": 1207 - }, - { - "epoch": 7.411042944785276, - "grad_norm": 1.1061445474624634, - "learning_rate": 7.858243043659161e-07, - "loss": 0.004, - "step": 1208 - }, - { - "epoch": 7.41717791411043, - "grad_norm": 0.9575282335281372, - "learning_rate": 7.823201225247496e-07, - "loss": 0.003, - "step": 1209 - }, - { - "epoch": 7.423312883435583, - "grad_norm": 1.3790507316589355, - "learning_rate": 7.788223213586677e-07, - "loss": 0.0096, - "step": 1210 - }, - { - "epoch": 7.429447852760736, - "grad_norm": 1.1366883516311646, - "learning_rate": 7.753309138609705e-07, - "loss": 0.006, - "step": 1211 - }, - { - "epoch": 7.435582822085889, - "grad_norm": 2.2659928798675537, - "learning_rate": 7.71845913001211e-07, - "loss": 0.0074, - "step": 1212 - }, - { - "epoch": 7.441717791411043, - "grad_norm": 1.2541831731796265, - "learning_rate": 7.683673317251392e-07, - "loss": 0.0051, - "step": 1213 - }, - { - "epoch": 7.447852760736196, - "grad_norm": 1.5959513187408447, - "learning_rate": 7.648951829546619e-07, - "loss": 0.0271, - "step": 1214 - }, - { - "epoch": 7.45398773006135, - "grad_norm": 1.368452548980713, - "learning_rate": 7.6142947958779e-07, - "loss": 0.0155, - "step": 1215 - }, - { - "epoch": 7.460122699386503, - "grad_norm": 1.1851825714111328, - "learning_rate": 7.579702344985899e-07, - "loss": 0.0032, - "step": 1216 - }, - { - "epoch": 7.466257668711656, - "grad_norm": 1.419812560081482, - "learning_rate": 7.545174605371403e-07, - "loss": 0.0037, - "step": 1217 - }, - { - "epoch": 7.47239263803681, - "grad_norm": 1.0817372798919678, - "learning_rate": 7.510711705294782e-07, - "loss": 0.0064, - "step": 1218 - }, - { - "epoch": 7.478527607361963, - "grad_norm": 1.0459797382354736, - "learning_rate": 7.476313772775578e-07, - "loss": 0.0055, - "step": 1219 - }, - { - "epoch": 7.484662576687117, - "grad_norm": 1.4481663703918457, - "learning_rate": 7.441980935591986e-07, - "loss": 0.0049, - "step": 1220 - }, - { - "epoch": 7.49079754601227, - "grad_norm": 1.7337101697921753, - "learning_rate": 7.407713321280377e-07, - "loss": 0.0123, - "step": 1221 - }, - { - "epoch": 7.4969325153374236, - "grad_norm": 1.3378303050994873, - "learning_rate": 7.373511057134855e-07, - "loss": 0.0056, - "step": 1222 - }, - { - "epoch": 7.5030674846625764, - "grad_norm": 2.4353835582733154, - "learning_rate": 7.339374270206772e-07, - "loss": 0.0155, - "step": 1223 - }, - { - "epoch": 7.50920245398773, - "grad_norm": 2.2856571674346924, - "learning_rate": 7.305303087304227e-07, - "loss": 0.0303, - "step": 1224 - }, - { - "epoch": 7.515337423312883, - "grad_norm": 1.0627055168151855, - "learning_rate": 7.271297634991651e-07, - "loss": 0.0018, - "step": 1225 - }, - { - "epoch": 7.521472392638037, - "grad_norm": 1.2120238542556763, - "learning_rate": 7.237358039589271e-07, - "loss": 0.0064, - "step": 1226 - }, - { - "epoch": 7.52760736196319, - "grad_norm": 1.1861765384674072, - "learning_rate": 7.203484427172702e-07, - "loss": 0.0025, - "step": 1227 - }, - { - "epoch": 7.533742331288344, - "grad_norm": 1.6700332164764404, - "learning_rate": 7.169676923572447e-07, - "loss": 0.0067, - "step": 1228 - }, - { - "epoch": 7.539877300613497, - "grad_norm": 1.4527982473373413, - "learning_rate": 7.135935654373416e-07, - "loss": 0.0082, - "step": 1229 - }, - { - "epoch": 7.54601226993865, - "grad_norm": 1.1425046920776367, - "learning_rate": 7.102260744914499e-07, - "loss": 0.0042, - "step": 1230 - }, - { - "epoch": 7.552147239263804, - "grad_norm": 2.0762295722961426, - "learning_rate": 7.068652320288081e-07, - "loss": 0.0374, - "step": 1231 - }, - { - "epoch": 7.558282208588957, - "grad_norm": 1.2008321285247803, - "learning_rate": 7.035110505339546e-07, - "loss": 0.0022, - "step": 1232 - }, - { - "epoch": 7.564417177914111, - "grad_norm": 1.262100338935852, - "learning_rate": 7.001635424666878e-07, - "loss": 0.006, - "step": 1233 - }, - { - "epoch": 7.570552147239264, - "grad_norm": 1.8173811435699463, - "learning_rate": 6.968227202620137e-07, - "loss": 0.0137, - "step": 1234 - }, - { - "epoch": 7.576687116564417, - "grad_norm": 1.6977999210357666, - "learning_rate": 6.934885963301033e-07, - "loss": 0.0216, - "step": 1235 - }, - { - "epoch": 7.58282208588957, - "grad_norm": 0.7084318399429321, - "learning_rate": 6.901611830562469e-07, - "loss": 0.0027, - "step": 1236 - }, - { - "epoch": 7.588957055214724, - "grad_norm": 2.0332374572753906, - "learning_rate": 6.868404928008035e-07, - "loss": 0.0391, - "step": 1237 - }, - { - "epoch": 7.595092024539877, - "grad_norm": 1.235734224319458, - "learning_rate": 6.835265378991613e-07, - "loss": 0.0053, - "step": 1238 - }, - { - "epoch": 7.601226993865031, - "grad_norm": 2.687920331954956, - "learning_rate": 6.802193306616858e-07, - "loss": 0.0395, - "step": 1239 - }, - { - "epoch": 7.6073619631901845, - "grad_norm": 1.4211101531982422, - "learning_rate": 6.769188833736781e-07, - "loss": 0.0055, - "step": 1240 - }, - { - "epoch": 7.613496932515337, - "grad_norm": 2.4542644023895264, - "learning_rate": 6.736252082953307e-07, - "loss": 0.0072, - "step": 1241 - }, - { - "epoch": 7.61963190184049, - "grad_norm": 1.2946943044662476, - "learning_rate": 6.703383176616743e-07, - "loss": 0.0046, - "step": 1242 - }, - { - "epoch": 7.625766871165644, - "grad_norm": 3.8073277473449707, - "learning_rate": 6.670582236825421e-07, - "loss": 0.0742, - "step": 1243 - }, - { - "epoch": 7.631901840490798, - "grad_norm": 1.4291348457336426, - "learning_rate": 6.637849385425157e-07, - "loss": 0.0069, - "step": 1244 - }, - { - "epoch": 7.638036809815951, - "grad_norm": 1.1767655611038208, - "learning_rate": 6.605184744008866e-07, - "loss": 0.0031, - "step": 1245 - }, - { - "epoch": 7.644171779141105, - "grad_norm": 1.837077260017395, - "learning_rate": 6.572588433916082e-07, - "loss": 0.0316, - "step": 1246 - }, - { - "epoch": 7.6503067484662575, - "grad_norm": 1.9157041311264038, - "learning_rate": 6.540060576232488e-07, - "loss": 0.0472, - "step": 1247 - }, - { - "epoch": 7.656441717791411, - "grad_norm": 1.7347630262374878, - "learning_rate": 6.507601291789515e-07, - "loss": 0.0059, - "step": 1248 - }, - { - "epoch": 7.662576687116564, - "grad_norm": 0.9757588505744934, - "learning_rate": 6.475210701163828e-07, - "loss": 0.0023, - "step": 1249 - }, - { - "epoch": 7.668711656441718, - "grad_norm": 1.9460281133651733, - "learning_rate": 6.442888924676951e-07, - "loss": 0.0207, - "step": 1250 - }, - { - "epoch": 7.674846625766871, - "grad_norm": 0.7517938613891602, - "learning_rate": 6.410636082394772e-07, - "loss": 0.002, - "step": 1251 - }, - { - "epoch": 7.680981595092025, - "grad_norm": 1.0631566047668457, - "learning_rate": 6.378452294127091e-07, - "loss": 0.0038, - "step": 1252 - }, - { - "epoch": 7.6871165644171775, - "grad_norm": 0.9524463415145874, - "learning_rate": 6.346337679427214e-07, - "loss": 0.0024, - "step": 1253 - }, - { - "epoch": 7.693251533742331, - "grad_norm": 1.3653123378753662, - "learning_rate": 6.314292357591489e-07, - "loss": 0.0027, - "step": 1254 - }, - { - "epoch": 7.699386503067485, - "grad_norm": 1.2446377277374268, - "learning_rate": 6.282316447658837e-07, - "loss": 0.0048, - "step": 1255 - }, - { - "epoch": 7.705521472392638, - "grad_norm": 1.716244101524353, - "learning_rate": 6.250410068410367e-07, - "loss": 0.0064, - "step": 1256 - }, - { - "epoch": 7.711656441717792, - "grad_norm": 1.7151219844818115, - "learning_rate": 6.218573338368869e-07, - "loss": 0.0056, - "step": 1257 - }, - { - "epoch": 7.717791411042945, - "grad_norm": 1.8013248443603516, - "learning_rate": 6.186806375798429e-07, - "loss": 0.0073, - "step": 1258 - }, - { - "epoch": 7.723926380368098, - "grad_norm": 1.051620602607727, - "learning_rate": 6.155109298703968e-07, - "loss": 0.0043, - "step": 1259 - }, - { - "epoch": 7.730061349693251, - "grad_norm": 1.5731337070465088, - "learning_rate": 6.123482224830787e-07, - "loss": 0.0108, - "step": 1260 - }, - { - "epoch": 7.736196319018405, - "grad_norm": 2.232144832611084, - "learning_rate": 6.091925271664156e-07, - "loss": 0.0337, - "step": 1261 - }, - { - "epoch": 7.742331288343558, - "grad_norm": 1.072678565979004, - "learning_rate": 6.060438556428877e-07, - "loss": 0.0019, - "step": 1262 - }, - { - "epoch": 7.748466257668712, - "grad_norm": 2.3631110191345215, - "learning_rate": 6.02902219608881e-07, - "loss": 0.0089, - "step": 1263 - }, - { - "epoch": 7.754601226993865, - "grad_norm": 1.1171438694000244, - "learning_rate": 5.997676307346504e-07, - "loss": 0.0045, - "step": 1264 - }, - { - "epoch": 7.7607361963190185, - "grad_norm": 0.7839979529380798, - "learning_rate": 5.966401006642689e-07, - "loss": 0.0028, - "step": 1265 - }, - { - "epoch": 7.766871165644172, - "grad_norm": 1.5938968658447266, - "learning_rate": 5.93519641015591e-07, - "loss": 0.009, - "step": 1266 - }, - { - "epoch": 7.773006134969325, - "grad_norm": 1.2980104684829712, - "learning_rate": 5.904062633802066e-07, - "loss": 0.0168, - "step": 1267 - }, - { - "epoch": 7.779141104294479, - "grad_norm": 1.177626371383667, - "learning_rate": 5.872999793233952e-07, - "loss": 0.0029, - "step": 1268 - }, - { - "epoch": 7.785276073619632, - "grad_norm": 2.0138931274414062, - "learning_rate": 5.842008003840891e-07, - "loss": 0.015, - "step": 1269 - }, - { - "epoch": 7.791411042944786, - "grad_norm": 1.7204387187957764, - "learning_rate": 5.811087380748245e-07, - "loss": 0.011, - "step": 1270 - }, - { - "epoch": 7.7975460122699385, - "grad_norm": 1.506241798400879, - "learning_rate": 5.780238038817035e-07, - "loss": 0.0057, - "step": 1271 - }, - { - "epoch": 7.803680981595092, - "grad_norm": 2.0950393676757812, - "learning_rate": 5.74946009264348e-07, - "loss": 0.0131, - "step": 1272 - }, - { - "epoch": 7.809815950920245, - "grad_norm": 2.1451432704925537, - "learning_rate": 5.71875365655859e-07, - "loss": 0.0088, - "step": 1273 - }, - { - "epoch": 7.815950920245399, - "grad_norm": 0.9690236449241638, - "learning_rate": 5.688118844627746e-07, - "loss": 0.0033, - "step": 1274 - }, - { - "epoch": 7.822085889570552, - "grad_norm": 1.5690608024597168, - "learning_rate": 5.657555770650241e-07, - "loss": 0.0206, - "step": 1275 - }, - { - "epoch": 7.828220858895706, - "grad_norm": 1.8220988512039185, - "learning_rate": 5.627064548158903e-07, - "loss": 0.0096, - "step": 1276 - }, - { - "epoch": 7.8343558282208585, - "grad_norm": 2.3800559043884277, - "learning_rate": 5.596645290419653e-07, - "loss": 0.008, - "step": 1277 - }, - { - "epoch": 7.840490797546012, - "grad_norm": 0.7775714993476868, - "learning_rate": 5.566298110431068e-07, - "loss": 0.0016, - "step": 1278 - }, - { - "epoch": 7.846625766871165, - "grad_norm": 1.1196876764297485, - "learning_rate": 5.536023120924e-07, - "loss": 0.0033, - "step": 1279 - }, - { - "epoch": 7.852760736196319, - "grad_norm": 1.3722344636917114, - "learning_rate": 5.505820434361108e-07, - "loss": 0.0084, - "step": 1280 - }, - { - "epoch": 7.858895705521473, - "grad_norm": 1.2068676948547363, - "learning_rate": 5.47569016293649e-07, - "loss": 0.0049, - "step": 1281 - }, - { - "epoch": 7.865030674846626, - "grad_norm": 1.096085548400879, - "learning_rate": 5.445632418575239e-07, - "loss": 0.0019, - "step": 1282 - }, - { - "epoch": 7.871165644171779, - "grad_norm": 1.3178106546401978, - "learning_rate": 5.415647312933015e-07, - "loss": 0.0062, - "step": 1283 - }, - { - "epoch": 7.877300613496932, - "grad_norm": 1.2884724140167236, - "learning_rate": 5.385734957395664e-07, - "loss": 0.0081, - "step": 1284 - }, - { - "epoch": 7.883435582822086, - "grad_norm": 0.9866589307785034, - "learning_rate": 5.355895463078789e-07, - "loss": 0.0048, - "step": 1285 - }, - { - "epoch": 7.889570552147239, - "grad_norm": 1.5396437644958496, - "learning_rate": 5.326128940827313e-07, - "loss": 0.0088, - "step": 1286 - }, - { - "epoch": 7.895705521472393, - "grad_norm": 1.1183607578277588, - "learning_rate": 5.296435501215116e-07, - "loss": 0.0043, - "step": 1287 - }, - { - "epoch": 7.901840490797546, - "grad_norm": 1.5337073802947998, - "learning_rate": 5.266815254544572e-07, - "loss": 0.0099, - "step": 1288 - }, - { - "epoch": 7.9079754601226995, - "grad_norm": 1.8188867568969727, - "learning_rate": 5.237268310846183e-07, - "loss": 0.0086, - "step": 1289 - }, - { - "epoch": 7.914110429447852, - "grad_norm": 1.972072720527649, - "learning_rate": 5.207794779878156e-07, - "loss": 0.0442, - "step": 1290 - }, - { - "epoch": 7.920245398773006, - "grad_norm": 1.1226261854171753, - "learning_rate": 5.178394771125969e-07, - "loss": 0.0071, - "step": 1291 - }, - { - "epoch": 7.92638036809816, - "grad_norm": 1.5612869262695312, - "learning_rate": 5.149068393802009e-07, - "loss": 0.0192, - "step": 1292 - }, - { - "epoch": 7.932515337423313, - "grad_norm": 1.1532280445098877, - "learning_rate": 5.119815756845123e-07, - "loss": 0.0032, - "step": 1293 - }, - { - "epoch": 7.938650306748467, - "grad_norm": 1.8807255029678345, - "learning_rate": 5.090636968920252e-07, - "loss": 0.0139, - "step": 1294 - }, - { - "epoch": 7.9447852760736195, - "grad_norm": 1.3027002811431885, - "learning_rate": 5.061532138418013e-07, - "loss": 0.0071, - "step": 1295 - }, - { - "epoch": 7.950920245398773, - "grad_norm": 1.584154486656189, - "learning_rate": 5.032501373454266e-07, - "loss": 0.0056, - "step": 1296 - }, - { - "epoch": 7.957055214723926, - "grad_norm": 1.7631733417510986, - "learning_rate": 5.003544781869762e-07, - "loss": 0.0239, - "step": 1297 - }, - { - "epoch": 7.96319018404908, - "grad_norm": 1.9462637901306152, - "learning_rate": 4.974662471229727e-07, - "loss": 0.0336, - "step": 1298 - }, - { - "epoch": 7.969325153374233, - "grad_norm": 1.9697695970535278, - "learning_rate": 4.945854548823425e-07, - "loss": 0.0049, - "step": 1299 - }, - { - "epoch": 7.975460122699387, - "grad_norm": 1.066036581993103, - "learning_rate": 4.917121121663823e-07, - "loss": 0.0103, - "step": 1300 - }, - { - "epoch": 7.9815950920245395, - "grad_norm": 1.0865890979766846, - "learning_rate": 4.888462296487129e-07, - "loss": 0.0036, - "step": 1301 - }, - { - "epoch": 7.987730061349693, - "grad_norm": 1.7804820537567139, - "learning_rate": 4.859878179752448e-07, - "loss": 0.0119, - "step": 1302 - }, - { - "epoch": 7.993865030674847, - "grad_norm": 2.735875129699707, - "learning_rate": 4.83136887764136e-07, - "loss": 0.0365, - "step": 1303 - }, - { - "epoch": 8.0, - "grad_norm": 1.316243290901184, - "learning_rate": 4.802934496057527e-07, - "loss": 0.0046, - "step": 1304 - }, - { - "epoch": 8.006134969325153, - "grad_norm": 2.192969560623169, - "learning_rate": 4.774575140626317e-07, - "loss": 0.0235, - "step": 1305 - }, - { - "epoch": 8.012269938650308, - "grad_norm": 0.9257994890213013, - "learning_rate": 4.746290916694368e-07, - "loss": 0.0029, - "step": 1306 - }, - { - "epoch": 8.01840490797546, - "grad_norm": 0.6933830380439758, - "learning_rate": 4.71808192932926e-07, - "loss": 0.0019, - "step": 1307 - }, - { - "epoch": 8.024539877300613, - "grad_norm": 0.4838462173938751, - "learning_rate": 4.6899482833190765e-07, - "loss": 0.0024, - "step": 1308 - }, - { - "epoch": 8.030674846625766, - "grad_norm": 1.1725589036941528, - "learning_rate": 4.661890083172019e-07, - "loss": 0.0166, - "step": 1309 - }, - { - "epoch": 8.036809815950921, - "grad_norm": 0.7732264399528503, - "learning_rate": 4.633907433116053e-07, - "loss": 0.0047, - "step": 1310 - }, - { - "epoch": 8.042944785276074, - "grad_norm": 0.6369810700416565, - "learning_rate": 4.6060004370984763e-07, - "loss": 0.0013, - "step": 1311 - }, - { - "epoch": 8.049079754601227, - "grad_norm": 0.6437183618545532, - "learning_rate": 4.5781691987855676e-07, - "loss": 0.0016, - "step": 1312 - }, - { - "epoch": 8.05521472392638, - "grad_norm": 0.40145647525787354, - "learning_rate": 4.5504138215621915e-07, - "loss": 0.0026, - "step": 1313 - }, - { - "epoch": 8.061349693251534, - "grad_norm": 1.1000946760177612, - "learning_rate": 4.5227344085313873e-07, - "loss": 0.002, - "step": 1314 - }, - { - "epoch": 8.067484662576687, - "grad_norm": 1.4580782651901245, - "learning_rate": 4.495131062514038e-07, - "loss": 0.0299, - "step": 1315 - }, - { - "epoch": 8.07361963190184, - "grad_norm": 0.9026187062263489, - "learning_rate": 4.467603886048452e-07, - "loss": 0.003, - "step": 1316 - }, - { - "epoch": 8.079754601226995, - "grad_norm": 1.2969629764556885, - "learning_rate": 4.440152981389972e-07, - "loss": 0.0129, - "step": 1317 - }, - { - "epoch": 8.085889570552148, - "grad_norm": 0.837665319442749, - "learning_rate": 4.412778450510641e-07, - "loss": 0.0086, - "step": 1318 - }, - { - "epoch": 8.0920245398773, - "grad_norm": 0.3426748216152191, - "learning_rate": 4.3854803950987736e-07, - "loss": 0.002, - "step": 1319 - }, - { - "epoch": 8.098159509202453, - "grad_norm": 0.8508721590042114, - "learning_rate": 4.358258916558611e-07, - "loss": 0.0016, - "step": 1320 - }, - { - "epoch": 8.104294478527608, - "grad_norm": 1.2476134300231934, - "learning_rate": 4.331114116009938e-07, - "loss": 0.0156, - "step": 1321 - }, - { - "epoch": 8.110429447852761, - "grad_norm": 1.036689281463623, - "learning_rate": 4.3040460942876896e-07, - "loss": 0.0021, - "step": 1322 - }, - { - "epoch": 8.116564417177914, - "grad_norm": 0.7747099995613098, - "learning_rate": 4.277054951941609e-07, - "loss": 0.0021, - "step": 1323 - }, - { - "epoch": 8.122699386503067, - "grad_norm": 1.2793506383895874, - "learning_rate": 4.250140789235829e-07, - "loss": 0.0036, - "step": 1324 - }, - { - "epoch": 8.128834355828221, - "grad_norm": 1.5389785766601562, - "learning_rate": 4.223303706148549e-07, - "loss": 0.0031, - "step": 1325 - }, - { - "epoch": 8.134969325153374, - "grad_norm": 1.549869179725647, - "learning_rate": 4.196543802371641e-07, - "loss": 0.0102, - "step": 1326 - }, - { - "epoch": 8.141104294478527, - "grad_norm": 0.862311065196991, - "learning_rate": 4.1698611773102525e-07, - "loss": 0.0023, - "step": 1327 - }, - { - "epoch": 8.14723926380368, - "grad_norm": 1.0216046571731567, - "learning_rate": 4.14325593008249e-07, - "loss": 0.0074, - "step": 1328 - }, - { - "epoch": 8.153374233128835, - "grad_norm": 0.8307499289512634, - "learning_rate": 4.1167281595190206e-07, - "loss": 0.0017, - "step": 1329 - }, - { - "epoch": 8.159509202453988, - "grad_norm": 0.5344944596290588, - "learning_rate": 4.090277964162692e-07, - "loss": 0.0013, - "step": 1330 - }, - { - "epoch": 8.16564417177914, - "grad_norm": 0.8608856201171875, - "learning_rate": 4.063905442268201e-07, - "loss": 0.0014, - "step": 1331 - }, - { - "epoch": 8.171779141104295, - "grad_norm": 0.33019620180130005, - "learning_rate": 4.037610691801694e-07, - "loss": 0.0009, - "step": 1332 - }, - { - "epoch": 8.177914110429448, - "grad_norm": 0.6515982747077942, - "learning_rate": 4.011393810440431e-07, - "loss": 0.0022, - "step": 1333 - }, - { - "epoch": 8.184049079754601, - "grad_norm": 0.9144461750984192, - "learning_rate": 3.985254895572413e-07, - "loss": 0.0024, - "step": 1334 - }, - { - "epoch": 8.190184049079754, - "grad_norm": 0.4078105390071869, - "learning_rate": 3.959194044296011e-07, - "loss": 0.0011, - "step": 1335 - }, - { - "epoch": 8.196319018404909, - "grad_norm": 0.7559608817100525, - "learning_rate": 3.9332113534196194e-07, - "loss": 0.0028, - "step": 1336 - }, - { - "epoch": 8.202453987730062, - "grad_norm": 1.3025604486465454, - "learning_rate": 3.907306919461279e-07, - "loss": 0.0228, - "step": 1337 - }, - { - "epoch": 8.208588957055214, - "grad_norm": 0.6984004974365234, - "learning_rate": 3.8814808386483385e-07, - "loss": 0.0027, - "step": 1338 - }, - { - "epoch": 8.214723926380367, - "grad_norm": 1.161498785018921, - "learning_rate": 3.855733206917095e-07, - "loss": 0.0037, - "step": 1339 - }, - { - "epoch": 8.220858895705522, - "grad_norm": 0.5357164740562439, - "learning_rate": 3.8300641199124024e-07, - "loss": 0.0011, - "step": 1340 - }, - { - "epoch": 8.226993865030675, - "grad_norm": 0.8089649677276611, - "learning_rate": 3.80447367298738e-07, - "loss": 0.0008, - "step": 1341 - }, - { - "epoch": 8.233128834355828, - "grad_norm": 0.4289240539073944, - "learning_rate": 3.77896196120299e-07, - "loss": 0.0012, - "step": 1342 - }, - { - "epoch": 8.239263803680982, - "grad_norm": 0.8666973114013672, - "learning_rate": 3.7535290793277364e-07, - "loss": 0.0047, - "step": 1343 - }, - { - "epoch": 8.245398773006135, - "grad_norm": 0.6841573715209961, - "learning_rate": 3.7281751218372965e-07, - "loss": 0.0007, - "step": 1344 - }, - { - "epoch": 8.251533742331288, - "grad_norm": 0.5588045716285706, - "learning_rate": 3.7029001829141457e-07, - "loss": 0.0018, - "step": 1345 - }, - { - "epoch": 8.257668711656441, - "grad_norm": 1.7257133722305298, - "learning_rate": 3.677704356447254e-07, - "loss": 0.0213, - "step": 1346 - }, - { - "epoch": 8.263803680981596, - "grad_norm": 0.2352600246667862, - "learning_rate": 3.6525877360316875e-07, - "loss": 0.0009, - "step": 1347 - }, - { - "epoch": 8.269938650306749, - "grad_norm": 0.9622183442115784, - "learning_rate": 3.627550414968303e-07, - "loss": 0.0132, - "step": 1348 - }, - { - "epoch": 8.276073619631902, - "grad_norm": 0.5367354154586792, - "learning_rate": 3.6025924862633814e-07, - "loss": 0.0006, - "step": 1349 - }, - { - "epoch": 8.282208588957054, - "grad_norm": 1.5134315490722656, - "learning_rate": 3.577714042628272e-07, - "loss": 0.01, - "step": 1350 - }, - { - "epoch": 8.28834355828221, - "grad_norm": 1.5052622556686401, - "learning_rate": 3.5529151764790715e-07, - "loss": 0.0031, - "step": 1351 - }, - { - "epoch": 8.294478527607362, - "grad_norm": 0.8776562809944153, - "learning_rate": 3.5281959799362775e-07, - "loss": 0.0053, - "step": 1352 - }, - { - "epoch": 8.300613496932515, - "grad_norm": 0.7919799089431763, - "learning_rate": 3.503556544824413e-07, - "loss": 0.0021, - "step": 1353 - }, - { - "epoch": 8.30674846625767, - "grad_norm": 0.7141364216804504, - "learning_rate": 3.4789969626717377e-07, - "loss": 0.0019, - "step": 1354 - }, - { - "epoch": 8.312883435582823, - "grad_norm": 1.7783756256103516, - "learning_rate": 3.454517324709858e-07, - "loss": 0.0019, - "step": 1355 - }, - { - "epoch": 8.319018404907975, - "grad_norm": 0.9534929394721985, - "learning_rate": 3.43011772187343e-07, - "loss": 0.0011, - "step": 1356 - }, - { - "epoch": 8.325153374233128, - "grad_norm": 0.4383384585380554, - "learning_rate": 3.405798244799799e-07, - "loss": 0.0006, - "step": 1357 - }, - { - "epoch": 8.331288343558283, - "grad_norm": 0.8582566976547241, - "learning_rate": 3.3815589838286535e-07, - "loss": 0.002, - "step": 1358 - }, - { - "epoch": 8.337423312883436, - "grad_norm": 0.8288223743438721, - "learning_rate": 3.3574000290017174e-07, - "loss": 0.002, - "step": 1359 - }, - { - "epoch": 8.343558282208589, - "grad_norm": 1.2074549198150635, - "learning_rate": 3.3333214700623976e-07, - "loss": 0.0153, - "step": 1360 - }, - { - "epoch": 8.349693251533742, - "grad_norm": 0.5359098315238953, - "learning_rate": 3.3093233964554464e-07, - "loss": 0.0014, - "step": 1361 - }, - { - "epoch": 8.355828220858896, - "grad_norm": 1.6650397777557373, - "learning_rate": 3.2854058973266547e-07, - "loss": 0.0107, - "step": 1362 - }, - { - "epoch": 8.36196319018405, - "grad_norm": 1.1784273386001587, - "learning_rate": 3.261569061522474e-07, - "loss": 0.0197, - "step": 1363 - }, - { - "epoch": 8.368098159509202, - "grad_norm": 0.6566861271858215, - "learning_rate": 3.237812977589738e-07, - "loss": 0.0009, - "step": 1364 - }, - { - "epoch": 8.374233128834355, - "grad_norm": 0.9043551683425903, - "learning_rate": 3.2141377337753105e-07, - "loss": 0.0026, - "step": 1365 - }, - { - "epoch": 8.38036809815951, - "grad_norm": 2.205872058868408, - "learning_rate": 3.190543418025749e-07, - "loss": 0.0533, - "step": 1366 - }, - { - "epoch": 8.386503067484663, - "grad_norm": 0.2918683886528015, - "learning_rate": 3.167030117986994e-07, - "loss": 0.0007, - "step": 1367 - }, - { - "epoch": 8.392638036809815, - "grad_norm": 0.5370535850524902, - "learning_rate": 3.143597921004027e-07, - "loss": 0.001, - "step": 1368 - }, - { - "epoch": 8.39877300613497, - "grad_norm": 1.353083610534668, - "learning_rate": 3.120246914120564e-07, - "loss": 0.002, - "step": 1369 - }, - { - "epoch": 8.404907975460123, - "grad_norm": 0.644607424736023, - "learning_rate": 3.096977184078731e-07, - "loss": 0.0025, - "step": 1370 - }, - { - "epoch": 8.411042944785276, - "grad_norm": 0.7351365089416504, - "learning_rate": 3.0737888173187067e-07, - "loss": 0.0014, - "step": 1371 - }, - { - "epoch": 8.417177914110429, - "grad_norm": 1.161787748336792, - "learning_rate": 3.050681899978464e-07, - "loss": 0.0149, - "step": 1372 - }, - { - "epoch": 8.423312883435583, - "grad_norm": 1.7568200826644897, - "learning_rate": 3.0276565178933847e-07, - "loss": 0.0178, - "step": 1373 - }, - { - "epoch": 8.429447852760736, - "grad_norm": 0.73989337682724, - "learning_rate": 3.004712756595993e-07, - "loss": 0.0053, - "step": 1374 - }, - { - "epoch": 8.43558282208589, - "grad_norm": 1.8425425291061401, - "learning_rate": 2.9818507013156085e-07, - "loss": 0.0013, - "step": 1375 - }, - { - "epoch": 8.441717791411042, - "grad_norm": 0.6374561786651611, - "learning_rate": 2.9590704369780313e-07, - "loss": 0.0039, - "step": 1376 - }, - { - "epoch": 8.447852760736197, - "grad_norm": 0.708151638507843, - "learning_rate": 2.9363720482052436e-07, - "loss": 0.0025, - "step": 1377 - }, - { - "epoch": 8.45398773006135, - "grad_norm": 1.2846306562423706, - "learning_rate": 2.91375561931507e-07, - "loss": 0.0033, - "step": 1378 - }, - { - "epoch": 8.460122699386503, - "grad_norm": 0.347720742225647, - "learning_rate": 2.89122123432089e-07, - "loss": 0.0006, - "step": 1379 - }, - { - "epoch": 8.466257668711656, - "grad_norm": 0.9626922607421875, - "learning_rate": 2.868768976931313e-07, - "loss": 0.001, - "step": 1380 - }, - { - "epoch": 8.47239263803681, - "grad_norm": 0.26909729838371277, - "learning_rate": 2.8463989305498596e-07, - "loss": 0.0008, - "step": 1381 - }, - { - "epoch": 8.478527607361963, - "grad_norm": 0.8750791549682617, - "learning_rate": 2.824111178274669e-07, - "loss": 0.0025, - "step": 1382 - }, - { - "epoch": 8.484662576687116, - "grad_norm": 1.1124992370605469, - "learning_rate": 2.801905802898183e-07, - "loss": 0.0031, - "step": 1383 - }, - { - "epoch": 8.49079754601227, - "grad_norm": 0.4871549904346466, - "learning_rate": 2.779782886906829e-07, - "loss": 0.0013, - "step": 1384 - }, - { - "epoch": 8.496932515337424, - "grad_norm": 0.5207282900810242, - "learning_rate": 2.7577425124807324e-07, - "loss": 0.0013, - "step": 1385 - }, - { - "epoch": 8.503067484662576, - "grad_norm": 1.8369935750961304, - "learning_rate": 2.7357847614933876e-07, - "loss": 0.0031, - "step": 1386 - }, - { - "epoch": 8.50920245398773, - "grad_norm": 0.6390517354011536, - "learning_rate": 2.713909715511384e-07, - "loss": 0.0045, - "step": 1387 - }, - { - "epoch": 8.515337423312884, - "grad_norm": 0.8618245124816895, - "learning_rate": 2.692117455794077e-07, - "loss": 0.0017, - "step": 1388 - }, - { - "epoch": 8.521472392638037, - "grad_norm": 0.8506134152412415, - "learning_rate": 2.6704080632932895e-07, - "loss": 0.0014, - "step": 1389 - }, - { - "epoch": 8.52760736196319, - "grad_norm": 0.42547252774238586, - "learning_rate": 2.6487816186530263e-07, - "loss": 0.002, - "step": 1390 - }, - { - "epoch": 8.533742331288344, - "grad_norm": 0.6425843834877014, - "learning_rate": 2.6272382022091704e-07, - "loss": 0.0028, - "step": 1391 - }, - { - "epoch": 8.539877300613497, - "grad_norm": 0.8287162780761719, - "learning_rate": 2.6057778939891614e-07, - "loss": 0.011, - "step": 1392 - }, - { - "epoch": 8.54601226993865, - "grad_norm": 1.0402963161468506, - "learning_rate": 2.584400773711737e-07, - "loss": 0.0037, - "step": 1393 - }, - { - "epoch": 8.552147239263803, - "grad_norm": 0.9785431623458862, - "learning_rate": 2.5631069207865926e-07, - "loss": 0.0023, - "step": 1394 - }, - { - "epoch": 8.558282208588958, - "grad_norm": 1.2661131620407104, - "learning_rate": 2.541896414314132e-07, - "loss": 0.0053, - "step": 1395 - }, - { - "epoch": 8.56441717791411, - "grad_norm": 0.2662440240383148, - "learning_rate": 2.520769333085141e-07, - "loss": 0.0008, - "step": 1396 - }, - { - "epoch": 8.570552147239264, - "grad_norm": 0.628510594367981, - "learning_rate": 2.4997257555805064e-07, - "loss": 0.001, - "step": 1397 - }, - { - "epoch": 8.576687116564417, - "grad_norm": 1.08578622341156, - "learning_rate": 2.4787657599709276e-07, - "loss": 0.0041, - "step": 1398 - }, - { - "epoch": 8.582822085889571, - "grad_norm": 0.8213603496551514, - "learning_rate": 2.4578894241166135e-07, - "loss": 0.0029, - "step": 1399 - }, - { - "epoch": 8.588957055214724, - "grad_norm": 0.5261257886886597, - "learning_rate": 2.4370968255670093e-07, - "loss": 0.001, - "step": 1400 - }, - { - "epoch": 8.595092024539877, - "grad_norm": 0.18139345943927765, - "learning_rate": 2.4163880415604913e-07, - "loss": 0.0005, - "step": 1401 - }, - { - "epoch": 8.60122699386503, - "grad_norm": 0.8317165970802307, - "learning_rate": 2.395763149024102e-07, - "loss": 0.0034, - "step": 1402 - }, - { - "epoch": 8.607361963190185, - "grad_norm": 1.272074580192566, - "learning_rate": 2.3752222245732454e-07, - "loss": 0.0036, - "step": 1403 - }, - { - "epoch": 8.613496932515337, - "grad_norm": 0.5556488633155823, - "learning_rate": 2.3547653445114032e-07, - "loss": 0.0013, - "step": 1404 - }, - { - "epoch": 8.61963190184049, - "grad_norm": 0.6546408534049988, - "learning_rate": 2.334392584829867e-07, - "loss": 0.0008, - "step": 1405 - }, - { - "epoch": 8.625766871165645, - "grad_norm": 2.021836996078491, - "learning_rate": 2.3141040212074445e-07, - "loss": 0.0198, - "step": 1406 - }, - { - "epoch": 8.631901840490798, - "grad_norm": 0.6017210483551025, - "learning_rate": 2.293899729010171e-07, - "loss": 0.0033, - "step": 1407 - }, - { - "epoch": 8.63803680981595, - "grad_norm": 0.315134733915329, - "learning_rate": 2.2737797832910498e-07, - "loss": 0.0007, - "step": 1408 - }, - { - "epoch": 8.644171779141104, - "grad_norm": 0.7090817093849182, - "learning_rate": 2.2537442587897474e-07, - "loss": 0.0045, - "step": 1409 - }, - { - "epoch": 8.650306748466258, - "grad_norm": 0.26951614022254944, - "learning_rate": 2.2337932299323434e-07, - "loss": 0.001, - "step": 1410 - }, - { - "epoch": 8.656441717791411, - "grad_norm": 0.21670447289943695, - "learning_rate": 2.2139267708310457e-07, - "loss": 0.0005, - "step": 1411 - }, - { - "epoch": 8.662576687116564, - "grad_norm": 1.070379376411438, - "learning_rate": 2.194144955283886e-07, - "loss": 0.0022, - "step": 1412 - }, - { - "epoch": 8.668711656441717, - "grad_norm": 0.7644438147544861, - "learning_rate": 2.1744478567744947e-07, - "loss": 0.0023, - "step": 1413 - }, - { - "epoch": 8.674846625766872, - "grad_norm": 1.053305983543396, - "learning_rate": 2.154835548471798e-07, - "loss": 0.0027, - "step": 1414 - }, - { - "epoch": 8.680981595092025, - "grad_norm": 0.5719135403633118, - "learning_rate": 2.1353081032297356e-07, - "loss": 0.0015, - "step": 1415 - }, - { - "epoch": 8.687116564417177, - "grad_norm": 0.3360785245895386, - "learning_rate": 2.1158655935870325e-07, - "loss": 0.0025, - "step": 1416 - }, - { - "epoch": 8.69325153374233, - "grad_norm": 0.867242693901062, - "learning_rate": 2.0965080917668744e-07, - "loss": 0.002, - "step": 1417 - }, - { - "epoch": 8.699386503067485, - "grad_norm": 1.1389360427856445, - "learning_rate": 2.077235669676689e-07, - "loss": 0.0023, - "step": 1418 - }, - { - "epoch": 8.705521472392638, - "grad_norm": 0.31157732009887695, - "learning_rate": 2.0580483989078525e-07, - "loss": 0.0005, - "step": 1419 - }, - { - "epoch": 8.71165644171779, - "grad_norm": 1.328353762626648, - "learning_rate": 2.0389463507354211e-07, - "loss": 0.0122, - "step": 1420 - }, - { - "epoch": 8.717791411042946, - "grad_norm": 0.13456307351589203, - "learning_rate": 2.0199295961178893e-07, - "loss": 0.0005, - "step": 1421 - }, - { - "epoch": 8.723926380368098, - "grad_norm": 0.7963683605194092, - "learning_rate": 2.000998205696894e-07, - "loss": 0.004, - "step": 1422 - }, - { - "epoch": 8.730061349693251, - "grad_norm": 0.1814875602722168, - "learning_rate": 1.9821522497969813e-07, - "loss": 0.0004, - "step": 1423 - }, - { - "epoch": 8.736196319018404, - "grad_norm": 0.4806751012802124, - "learning_rate": 1.9633917984253294e-07, - "loss": 0.001, - "step": 1424 - }, - { - "epoch": 8.742331288343559, - "grad_norm": 0.6554126143455505, - "learning_rate": 1.944716921271489e-07, - "loss": 0.0019, - "step": 1425 - }, - { - "epoch": 8.748466257668712, - "grad_norm": 0.7839532494544983, - "learning_rate": 1.9261276877071354e-07, - "loss": 0.0055, - "step": 1426 - }, - { - "epoch": 8.754601226993865, - "grad_norm": 1.1153522729873657, - "learning_rate": 1.9076241667857988e-07, - "loss": 0.0048, - "step": 1427 - }, - { - "epoch": 8.76073619631902, - "grad_norm": 1.4735853672027588, - "learning_rate": 1.8892064272426042e-07, - "loss": 0.0079, - "step": 1428 - }, - { - "epoch": 8.766871165644172, - "grad_norm": 0.9770727157592773, - "learning_rate": 1.8708745374940469e-07, - "loss": 0.0013, - "step": 1429 - }, - { - "epoch": 8.773006134969325, - "grad_norm": 1.5710560083389282, - "learning_rate": 1.8526285656376873e-07, - "loss": 0.0046, - "step": 1430 - }, - { - "epoch": 8.779141104294478, - "grad_norm": 0.9026464819908142, - "learning_rate": 1.8344685794519507e-07, - "loss": 0.006, - "step": 1431 - }, - { - "epoch": 8.785276073619633, - "grad_norm": 1.2195831537246704, - "learning_rate": 1.8163946463958276e-07, - "loss": 0.0094, - "step": 1432 - }, - { - "epoch": 8.791411042944786, - "grad_norm": 0.31636637449264526, - "learning_rate": 1.7984068336086652e-07, - "loss": 0.0009, - "step": 1433 - }, - { - "epoch": 8.797546012269938, - "grad_norm": 0.5591960549354553, - "learning_rate": 1.780505207909894e-07, - "loss": 0.0014, - "step": 1434 - }, - { - "epoch": 8.803680981595091, - "grad_norm": 0.5905728340148926, - "learning_rate": 1.7626898357987782e-07, - "loss": 0.0013, - "step": 1435 - }, - { - "epoch": 8.809815950920246, - "grad_norm": 1.0983483791351318, - "learning_rate": 1.744960783454186e-07, - "loss": 0.0024, - "step": 1436 - }, - { - "epoch": 8.815950920245399, - "grad_norm": 0.7398350238800049, - "learning_rate": 1.727318116734328e-07, - "loss": 0.0015, - "step": 1437 - }, - { - "epoch": 8.822085889570552, - "grad_norm": 0.4621620774269104, - "learning_rate": 1.7097619011765127e-07, - "loss": 0.0017, - "step": 1438 - }, - { - "epoch": 8.828220858895705, - "grad_norm": 0.8077200055122375, - "learning_rate": 1.6922922019969145e-07, - "loss": 0.0009, - "step": 1439 - }, - { - "epoch": 8.83435582822086, - "grad_norm": 0.7134829163551331, - "learning_rate": 1.6749090840903233e-07, - "loss": 0.0013, - "step": 1440 - }, - { - "epoch": 8.840490797546012, - "grad_norm": 1.2837457656860352, - "learning_rate": 1.6576126120299046e-07, - "loss": 0.0029, - "step": 1441 - }, - { - "epoch": 8.846625766871165, - "grad_norm": 0.8713163137435913, - "learning_rate": 1.6404028500669633e-07, - "loss": 0.0034, - "step": 1442 - }, - { - "epoch": 8.85276073619632, - "grad_norm": 0.5622571706771851, - "learning_rate": 1.6232798621306918e-07, - "loss": 0.0022, - "step": 1443 - }, - { - "epoch": 8.858895705521473, - "grad_norm": 2.460902214050293, - "learning_rate": 1.606243711827951e-07, - "loss": 0.0329, - "step": 1444 - }, - { - "epoch": 8.865030674846626, - "grad_norm": 1.5952033996582031, - "learning_rate": 1.5892944624430334e-07, - "loss": 0.0092, - "step": 1445 - }, - { - "epoch": 8.871165644171779, - "grad_norm": 0.16087445616722107, - "learning_rate": 1.5724321769374023e-07, - "loss": 0.0005, - "step": 1446 - }, - { - "epoch": 8.877300613496933, - "grad_norm": 0.33085283637046814, - "learning_rate": 1.5556569179494857e-07, - "loss": 0.0005, - "step": 1447 - }, - { - "epoch": 8.883435582822086, - "grad_norm": 0.15866753458976746, - "learning_rate": 1.538968747794431e-07, - "loss": 0.0004, - "step": 1448 - }, - { - "epoch": 8.889570552147239, - "grad_norm": 1.0744353532791138, - "learning_rate": 1.5223677284638805e-07, - "loss": 0.0046, - "step": 1449 - }, - { - "epoch": 8.895705521472392, - "grad_norm": 0.8372928500175476, - "learning_rate": 1.5058539216257356e-07, - "loss": 0.0048, - "step": 1450 - }, - { - "epoch": 8.901840490797547, - "grad_norm": 1.0015332698822021, - "learning_rate": 1.4894273886239208e-07, - "loss": 0.0027, - "step": 1451 - }, - { - "epoch": 8.9079754601227, - "grad_norm": 1.1478570699691772, - "learning_rate": 1.473088190478178e-07, - "loss": 0.0134, - "step": 1452 - }, - { - "epoch": 8.914110429447852, - "grad_norm": 0.8685131669044495, - "learning_rate": 1.4568363878838087e-07, - "loss": 0.0024, - "step": 1453 - }, - { - "epoch": 8.920245398773005, - "grad_norm": 0.46051493287086487, - "learning_rate": 1.4406720412114828e-07, - "loss": 0.0019, - "step": 1454 - }, - { - "epoch": 8.92638036809816, - "grad_norm": 0.75945645570755, - "learning_rate": 1.4245952105069905e-07, - "loss": 0.0015, - "step": 1455 - }, - { - "epoch": 8.932515337423313, - "grad_norm": 1.2880934476852417, - "learning_rate": 1.4086059554910186e-07, - "loss": 0.0045, - "step": 1456 - }, - { - "epoch": 8.938650306748466, - "grad_norm": 0.2242523580789566, - "learning_rate": 1.3927043355589476e-07, - "loss": 0.0011, - "step": 1457 - }, - { - "epoch": 8.94478527607362, - "grad_norm": 1.0341970920562744, - "learning_rate": 1.3768904097806153e-07, - "loss": 0.0019, - "step": 1458 - }, - { - "epoch": 8.950920245398773, - "grad_norm": 0.8955618739128113, - "learning_rate": 1.361164236900092e-07, - "loss": 0.0027, - "step": 1459 - }, - { - "epoch": 8.957055214723926, - "grad_norm": 1.3581833839416504, - "learning_rate": 1.3455258753354932e-07, - "loss": 0.0048, - "step": 1460 - }, - { - "epoch": 8.963190184049079, - "grad_norm": 1.5094419717788696, - "learning_rate": 1.3299753831787193e-07, - "loss": 0.0011, - "step": 1461 - }, - { - "epoch": 8.969325153374234, - "grad_norm": 0.5978104472160339, - "learning_rate": 1.3145128181952737e-07, - "loss": 0.0018, - "step": 1462 - }, - { - "epoch": 8.975460122699387, - "grad_norm": 0.7072922587394714, - "learning_rate": 1.2991382378240325e-07, - "loss": 0.0032, - "step": 1463 - }, - { - "epoch": 8.98159509202454, - "grad_norm": 0.5541467666625977, - "learning_rate": 1.2838516991770355e-07, - "loss": 0.001, - "step": 1464 - }, - { - "epoch": 8.987730061349692, - "grad_norm": 0.6946907043457031, - "learning_rate": 1.2686532590392763e-07, - "loss": 0.0024, - "step": 1465 - }, - { - "epoch": 8.993865030674847, - "grad_norm": 0.3228455185890198, - "learning_rate": 1.2535429738684822e-07, - "loss": 0.0007, - "step": 1466 - }, - { - "epoch": 9.0, - "grad_norm": 2.4403252601623535, - "learning_rate": 1.238520899794915e-07, - "loss": 0.0245, - "step": 1467 - } - ], - "logging_steps": 1, - "max_steps": 1630, - "num_input_tokens_seen": 0, - "num_train_epochs": 10, - "save_steps": 206, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 3.632019917168968e+17, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-163/chat_template.jinja b/metallama3_8b/limo_filtered_correct/checkpoint-163/chat_template.jinja deleted file mode 100644 index 39bd0c9f7fe30aea14eda194fee17703da4a4dbf..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-163/chat_template.jinja +++ /dev/null @@ -1,5 +0,0 @@ -{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> - -'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> - -' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-163/config.json b/metallama3_8b/limo_filtered_correct/checkpoint-163/config.json deleted file mode 100644 index ec5612543540085e09eed37e81b17ae51d1a6973..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-163/config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 128000, - "eos_token_id": 128009, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 8192, - "mlp_bias": false, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 500000.0, - "tie_word_embeddings": false, - "torch_dtype": "float32", - "transformers_version": "4.55.0", - "use_cache": false, - "vocab_size": 128256 -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-163/generation_config.json b/metallama3_8b/limo_filtered_correct/checkpoint-163/generation_config.json deleted file mode 100644 index f53ccb516e57388491adda6b9950bcfa872e93ae..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-163/generation_config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 128000, - "eos_token_id": 128009, - "transformers_version": "4.55.0", - "use_cache": false -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-163/model-00001-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-163/model-00001-of-00007.safetensors deleted file mode 100644 index 4f612f944da9efdbc01e361ba319ae2d70a72b12..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-163/model-00001-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1352a8808b78cf75f14044399318379444853cc369118a4e68c2621de4b2d489 -size 4886466168 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-163/model-00002-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-163/model-00002-of-00007.safetensors deleted file mode 100644 index a529b41f84ba1d298e44296c88abc109eb2df4b1..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-163/model-00002-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ee67f6394a8f4072ac09caabd34ba0db6c2ca9d8cf65443df2cd3140c3424e17 -size 4832007448 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-163/model-00003-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-163/model-00003-of-00007.safetensors deleted file mode 100644 index 69c452e2f467c2f9512b942911e66d41eb1f2687..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-163/model-00003-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ba7044bac97ef370ca21b49acfeb4eb91610bce213061186fa5f0c44387d0cf1 -size 4999813112 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-163/model-00004-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-163/model-00004-of-00007.safetensors deleted file mode 100644 index 10334c247112adffd327413b22c3d48fab85c206..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-163/model-00004-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7ec9a6af593c1d1ef3c30f2fd6a93246887e02fcddb54a743870727baee55a1c -size 4999813128 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-163/model-00005-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-163/model-00005-of-00007.safetensors deleted file mode 100644 index bd42cfd1a7c53743a682c6fad856d2fe48d10b04..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-163/model-00005-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:74031c7ffe38001b7f36c79f62db52972ed9855b43ed58c1efb26f9b1151d7df -size 4832007496 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-163/model-00006-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-163/model-00006-of-00007.safetensors deleted file mode 100644 index 502edf117ab4d1a33e81814f4d25421ff9a77de4..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-163/model-00006-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6921791f6e8813b057402ff42b735b2fe2b54b52d71c5e0b0c517969ee0df4a3 -size 4999813120 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-163/model-00007-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-163/model-00007-of-00007.safetensors deleted file mode 100644 index 1e11d2e551ce3b23f8b5bbdf3f00070401eb45ff..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-163/model-00007-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bba0bd20610cea781962bbb0b646d3ddbb829ea6161a87cf8fb483b3cd9c1855 -size 2571158184 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-163/model.safetensors.index.json b/metallama3_8b/limo_filtered_correct/checkpoint-163/model.safetensors.index.json deleted file mode 100644 index 30d31d54f352f0c71ad48745af612a088822fa48..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-163/model.safetensors.index.json +++ /dev/null @@ -1,299 +0,0 @@ -{ - "metadata": { - "total_parameters": 2007565312, - "total_size": 32121044992 - }, - "weight_map": { - "lm_head.weight": "model-00007-of-00007.safetensors", - "model.embed_tokens.weight": "model-00001-of-00007.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.norm.weight": "model-00007-of-00007.safetensors" - } -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-163/rng_state_0.pth b/metallama3_8b/limo_filtered_correct/checkpoint-163/rng_state_0.pth deleted file mode 100644 index 9c287de26f76b389db025ad109f0595b0b77fd22..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-163/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:92cc13315f24c28015d695b6cde08bb1cd6fea4cbc435998485ed6fbe4c91285 -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-163/rng_state_1.pth b/metallama3_8b/limo_filtered_correct/checkpoint-163/rng_state_1.pth deleted file mode 100644 index 132db267a0f5617620f48bc8eab9cc37a9aea13a..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-163/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f4c154b6a63e0b1f98f7d2847944398f99f1657d35e8eddf7fdf0ae2c24b0552 -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-163/rng_state_2.pth b/metallama3_8b/limo_filtered_correct/checkpoint-163/rng_state_2.pth deleted file mode 100644 index e85bf2eceab47cefd59df592648941c61c84eab1..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-163/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f784c6a9507b51189f2caffbd178ea9882103b75852e31c15f47fdae6a43af1d -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-163/rng_state_3.pth b/metallama3_8b/limo_filtered_correct/checkpoint-163/rng_state_3.pth deleted file mode 100644 index 423bb6c008eeb6875c659dd108c5f003758dbcb9..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-163/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:34b023e05bc2d12b91dc436d4922b990d50ec8dc56d40dc3e36b3bb34fc81341 -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-163/scheduler.pt b/metallama3_8b/limo_filtered_correct/checkpoint-163/scheduler.pt deleted file mode 100644 index b16c05e1aae13c3d7cd90b739baa69d046c7d74e..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-163/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0bbc8ba76f7469c0182470eda31d2126f04c1866e2fcb05b17f346f839cb5099 -size 1064 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-163/special_tokens_map.json b/metallama3_8b/limo_filtered_correct/checkpoint-163/special_tokens_map.json deleted file mode 100644 index 14daf4588e61b4e4983af0fccaba4d5500c0977c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-163/special_tokens_map.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "additional_special_tokens": [ - { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } - ], - "bos_token": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": "<|eot_id|>" -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-163/tokenizer.json b/metallama3_8b/limo_filtered_correct/checkpoint-163/tokenizer.json deleted file mode 100644 index 172311123ab62378f1f6d90f3068a676b7d939ed..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-163/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c1dcab308e7cf5970ea38815e0a62887d705c5b436f869ca27a5dcdd40c36a6 -size 17210148 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-163/tokenizer_config.json b/metallama3_8b/limo_filtered_correct/checkpoint-163/tokenizer_config.json deleted file mode 100644 index 6739fcd129e717b71b64001dcb25a03c143d66f5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-163/tokenizer_config.json +++ /dev/null @@ -1,2076 +0,0 @@ -{ - "added_tokens_decoder": { - "128000": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128001": { - "content": "<|end_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128002": { - "content": "<|reserved_special_token_0|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128003": { - "content": "<|reserved_special_token_1|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128004": { - "content": "<|reserved_special_token_2|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128005": { - "content": "<|reserved_special_token_3|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128006": { - "content": "<|start_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128007": { - "content": "<|end_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128008": { - "content": "<|reserved_special_token_4|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128009": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128010": { - "content": "<|reserved_special_token_5|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128011": { - "content": "<|reserved_special_token_6|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128012": { - "content": "<|reserved_special_token_7|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128013": { - "content": "<|reserved_special_token_8|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128014": { - "content": "<|reserved_special_token_9|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128015": { - "content": "<|reserved_special_token_10|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128016": { - "content": "<|reserved_special_token_11|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128017": { - "content": "<|reserved_special_token_12|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128018": { - "content": "<|reserved_special_token_13|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128019": { - "content": "<|reserved_special_token_14|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128020": { - "content": "<|reserved_special_token_15|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128021": { - "content": "<|reserved_special_token_16|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128022": { - "content": "<|reserved_special_token_17|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128023": { - "content": "<|reserved_special_token_18|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128024": { - "content": "<|reserved_special_token_19|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128025": { - "content": "<|reserved_special_token_20|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128026": { - "content": "<|reserved_special_token_21|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128027": { - "content": "<|reserved_special_token_22|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128028": { - "content": "<|reserved_special_token_23|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128029": { - "content": "<|reserved_special_token_24|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128030": { - "content": "<|reserved_special_token_25|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128031": { - "content": "<|reserved_special_token_26|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128032": { - "content": "<|reserved_special_token_27|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128033": { - "content": "<|reserved_special_token_28|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128034": { - "content": "<|reserved_special_token_29|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128035": { - "content": "<|reserved_special_token_30|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128036": { - "content": "<|reserved_special_token_31|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128037": { - "content": "<|reserved_special_token_32|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128038": { - "content": "<|reserved_special_token_33|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128039": { - "content": "<|reserved_special_token_34|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128040": { - "content": "<|reserved_special_token_35|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128041": { - "content": "<|reserved_special_token_36|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128042": { - "content": "<|reserved_special_token_37|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128043": { - "content": "<|reserved_special_token_38|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128044": { - "content": "<|reserved_special_token_39|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128045": { - "content": "<|reserved_special_token_40|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128046": { - "content": "<|reserved_special_token_41|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128047": { - "content": "<|reserved_special_token_42|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128048": { - "content": "<|reserved_special_token_43|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128049": { - "content": "<|reserved_special_token_44|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128050": { - "content": "<|reserved_special_token_45|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128051": { - "content": "<|reserved_special_token_46|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128052": { - "content": "<|reserved_special_token_47|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128053": { - "content": "<|reserved_special_token_48|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128054": { - "content": "<|reserved_special_token_49|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128055": { - "content": "<|reserved_special_token_50|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128056": { - "content": "<|reserved_special_token_51|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128057": { - "content": "<|reserved_special_token_52|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128058": { - "content": "<|reserved_special_token_53|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128059": { - "content": "<|reserved_special_token_54|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128060": { - "content": "<|reserved_special_token_55|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128061": { - "content": "<|reserved_special_token_56|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128062": { - "content": "<|reserved_special_token_57|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128063": { - "content": "<|reserved_special_token_58|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128064": { - "content": "<|reserved_special_token_59|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128065": { - "content": "<|reserved_special_token_60|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128066": { - "content": "<|reserved_special_token_61|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128067": { - "content": "<|reserved_special_token_62|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128068": { - "content": "<|reserved_special_token_63|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128069": { - "content": "<|reserved_special_token_64|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128070": { - "content": "<|reserved_special_token_65|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128071": { - "content": "<|reserved_special_token_66|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128072": { - "content": "<|reserved_special_token_67|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128073": { - "content": "<|reserved_special_token_68|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128074": { - "content": "<|reserved_special_token_69|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128075": { - "content": "<|reserved_special_token_70|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128076": { - "content": "<|reserved_special_token_71|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128077": { - "content": "<|reserved_special_token_72|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128078": { - "content": "<|reserved_special_token_73|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128079": { - "content": "<|reserved_special_token_74|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128080": { - "content": "<|reserved_special_token_75|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128081": { - "content": "<|reserved_special_token_76|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128082": { - "content": "<|reserved_special_token_77|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128083": { - "content": "<|reserved_special_token_78|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128084": { - "content": "<|reserved_special_token_79|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128085": { - "content": "<|reserved_special_token_80|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128086": { - "content": "<|reserved_special_token_81|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128087": { - "content": "<|reserved_special_token_82|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128088": { - "content": "<|reserved_special_token_83|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128089": { - "content": "<|reserved_special_token_84|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128090": { - "content": "<|reserved_special_token_85|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128091": { - "content": "<|reserved_special_token_86|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128092": { - "content": "<|reserved_special_token_87|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128093": { - "content": "<|reserved_special_token_88|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128094": { - "content": "<|reserved_special_token_89|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128095": { - "content": "<|reserved_special_token_90|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128096": { - "content": "<|reserved_special_token_91|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128097": { - "content": "<|reserved_special_token_92|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128098": { - "content": "<|reserved_special_token_93|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128099": { - "content": "<|reserved_special_token_94|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128100": { - "content": "<|reserved_special_token_95|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128101": { - "content": "<|reserved_special_token_96|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128102": { - "content": "<|reserved_special_token_97|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128103": { - "content": "<|reserved_special_token_98|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128104": { - "content": "<|reserved_special_token_99|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128105": { - "content": "<|reserved_special_token_100|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128106": { - "content": "<|reserved_special_token_101|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128107": { - "content": "<|reserved_special_token_102|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128108": { - "content": "<|reserved_special_token_103|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128109": { - "content": "<|reserved_special_token_104|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128110": { - "content": "<|reserved_special_token_105|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128111": { - "content": "<|reserved_special_token_106|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128112": { - "content": "<|reserved_special_token_107|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128113": { - "content": "<|reserved_special_token_108|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128114": { - "content": "<|reserved_special_token_109|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128115": { - "content": "<|reserved_special_token_110|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128116": { - "content": "<|reserved_special_token_111|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128117": { - "content": "<|reserved_special_token_112|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128118": { - "content": "<|reserved_special_token_113|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128119": { - "content": "<|reserved_special_token_114|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128120": { - "content": "<|reserved_special_token_115|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128121": { - "content": "<|reserved_special_token_116|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128122": { - "content": "<|reserved_special_token_117|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128123": { - "content": "<|reserved_special_token_118|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128124": { - "content": "<|reserved_special_token_119|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128125": { - "content": "<|reserved_special_token_120|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128126": { - "content": "<|reserved_special_token_121|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128127": { - "content": "<|reserved_special_token_122|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128128": { - "content": "<|reserved_special_token_123|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128129": { - "content": "<|reserved_special_token_124|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128130": { - "content": "<|reserved_special_token_125|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128131": { - "content": "<|reserved_special_token_126|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128132": { - "content": "<|reserved_special_token_127|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128133": { - "content": "<|reserved_special_token_128|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128134": { - "content": "<|reserved_special_token_129|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128135": { - "content": "<|reserved_special_token_130|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128136": { - "content": "<|reserved_special_token_131|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128137": { - "content": "<|reserved_special_token_132|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128138": { - "content": "<|reserved_special_token_133|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128139": { - "content": "<|reserved_special_token_134|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128140": { - "content": "<|reserved_special_token_135|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128141": { - "content": "<|reserved_special_token_136|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128142": { - "content": "<|reserved_special_token_137|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128143": { - "content": "<|reserved_special_token_138|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128144": { - "content": "<|reserved_special_token_139|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128145": { - "content": "<|reserved_special_token_140|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128146": { - "content": "<|reserved_special_token_141|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128147": { - "content": "<|reserved_special_token_142|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128148": { - "content": "<|reserved_special_token_143|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128149": { - "content": "<|reserved_special_token_144|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128150": { - "content": "<|reserved_special_token_145|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128151": { - "content": "<|reserved_special_token_146|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128152": { - "content": "<|reserved_special_token_147|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128153": { - "content": "<|reserved_special_token_148|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128154": { - "content": "<|reserved_special_token_149|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128155": { - "content": "<|reserved_special_token_150|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128156": { - "content": "<|reserved_special_token_151|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128157": { - "content": "<|reserved_special_token_152|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128158": { - "content": "<|reserved_special_token_153|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128159": { - "content": "<|reserved_special_token_154|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128160": { - "content": "<|reserved_special_token_155|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128161": { - "content": "<|reserved_special_token_156|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128162": { - "content": "<|reserved_special_token_157|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128163": { - "content": "<|reserved_special_token_158|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128164": { - "content": "<|reserved_special_token_159|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128165": { - "content": "<|reserved_special_token_160|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128166": { - "content": "<|reserved_special_token_161|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128167": { - "content": "<|reserved_special_token_162|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128168": { - "content": "<|reserved_special_token_163|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128169": { - "content": "<|reserved_special_token_164|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128170": { - "content": "<|reserved_special_token_165|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128171": { - "content": "<|reserved_special_token_166|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128172": { - "content": "<|reserved_special_token_167|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128173": { - "content": "<|reserved_special_token_168|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128174": { - "content": "<|reserved_special_token_169|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128175": { - "content": "<|reserved_special_token_170|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128176": { - "content": "<|reserved_special_token_171|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128177": { - "content": "<|reserved_special_token_172|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128178": { - "content": "<|reserved_special_token_173|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128179": { - "content": "<|reserved_special_token_174|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128180": { - "content": "<|reserved_special_token_175|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128181": { - "content": "<|reserved_special_token_176|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128182": { - "content": "<|reserved_special_token_177|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128183": { - "content": "<|reserved_special_token_178|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128184": { - "content": "<|reserved_special_token_179|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128185": { - "content": "<|reserved_special_token_180|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128186": { - "content": "<|reserved_special_token_181|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128187": { - "content": "<|reserved_special_token_182|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128188": { - "content": "<|reserved_special_token_183|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128189": { - "content": "<|reserved_special_token_184|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128190": { - "content": "<|reserved_special_token_185|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128191": { - "content": "<|reserved_special_token_186|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128192": { - "content": "<|reserved_special_token_187|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128193": { - "content": "<|reserved_special_token_188|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128194": { - "content": "<|reserved_special_token_189|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128195": { - "content": "<|reserved_special_token_190|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128196": { - "content": "<|reserved_special_token_191|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128197": { - "content": "<|reserved_special_token_192|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128198": { - "content": "<|reserved_special_token_193|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128199": { - "content": "<|reserved_special_token_194|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128200": { - "content": "<|reserved_special_token_195|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128201": { - "content": "<|reserved_special_token_196|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128202": { - "content": "<|reserved_special_token_197|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128203": { - "content": "<|reserved_special_token_198|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128204": { - "content": "<|reserved_special_token_199|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128205": { - "content": "<|reserved_special_token_200|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128206": { - "content": "<|reserved_special_token_201|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128207": { - "content": "<|reserved_special_token_202|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128208": { - "content": "<|reserved_special_token_203|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128209": { - "content": "<|reserved_special_token_204|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128210": { - "content": "<|reserved_special_token_205|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128211": { - "content": "<|reserved_special_token_206|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128212": { - "content": "<|reserved_special_token_207|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128213": { - "content": "<|reserved_special_token_208|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128214": { - "content": "<|reserved_special_token_209|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128215": { - "content": "<|reserved_special_token_210|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128216": { - "content": "<|reserved_special_token_211|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128217": { - "content": "<|reserved_special_token_212|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128218": { - "content": "<|reserved_special_token_213|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128219": { - "content": "<|reserved_special_token_214|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128220": { - "content": "<|reserved_special_token_215|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128221": { - "content": "<|reserved_special_token_216|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128222": { - "content": "<|reserved_special_token_217|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128223": { - "content": "<|reserved_special_token_218|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128224": { - "content": "<|reserved_special_token_219|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128225": { - "content": "<|reserved_special_token_220|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128226": { - "content": "<|reserved_special_token_221|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128227": { - "content": "<|reserved_special_token_222|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128228": { - "content": "<|reserved_special_token_223|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128229": { - "content": "<|reserved_special_token_224|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128230": { - "content": "<|reserved_special_token_225|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128231": { - "content": "<|reserved_special_token_226|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128232": { - "content": "<|reserved_special_token_227|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128233": { - "content": "<|reserved_special_token_228|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128234": { - "content": "<|reserved_special_token_229|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128235": { - "content": "<|reserved_special_token_230|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128236": { - "content": "<|reserved_special_token_231|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128237": { - "content": "<|reserved_special_token_232|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128238": { - "content": "<|reserved_special_token_233|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128239": { - "content": "<|reserved_special_token_234|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128240": { - "content": "<|reserved_special_token_235|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128241": { - "content": "<|reserved_special_token_236|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128242": { - "content": "<|reserved_special_token_237|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128243": { - "content": "<|reserved_special_token_238|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128244": { - "content": "<|reserved_special_token_239|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128245": { - "content": "<|reserved_special_token_240|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128246": { - "content": "<|reserved_special_token_241|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128247": { - "content": "<|reserved_special_token_242|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128248": { - "content": "<|reserved_special_token_243|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128249": { - "content": "<|reserved_special_token_244|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128250": { - "content": "<|reserved_special_token_245|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128251": { - "content": "<|reserved_special_token_246|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128252": { - "content": "<|reserved_special_token_247|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128253": { - "content": "<|reserved_special_token_248|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128254": { - "content": "<|reserved_special_token_249|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128255": { - "content": "<|reserved_special_token_250|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128256": { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - } - }, - "additional_special_tokens": [ - "<|eom_id|>" - ], - "bos_token": "<|begin_of_text|>", - "clean_up_tokenization_spaces": true, - "eos_token": "<|eot_id|>", - "extra_special_tokens": {}, - "model_input_names": [ - "input_ids", - "attention_mask" - ], - "model_max_length": 1000000000000000019884624838656, - "pad_token": "<|eot_id|>", - "padding_side": "right", - "split_special_tokens": false, - "tokenizer_class": "PreTrainedTokenizerFast" -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-163/trainer_state.json b/metallama3_8b/limo_filtered_correct/checkpoint-163/trainer_state.json deleted file mode 100644 index 6d1378e534430bd764af1f01625cf0b940470592..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-163/trainer_state.json +++ /dev/null @@ -1,1175 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 1.0, - "eval_steps": 500, - "global_step": 163, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.006134969325153374, - "grad_norm": 5.908512115478516, - "learning_rate": 5e-06, - "loss": 0.9606, - "step": 1 - }, - { - "epoch": 0.012269938650306749, - "grad_norm": 4.304474353790283, - "learning_rate": 4.999995356617983e-06, - "loss": 0.8609, - "step": 2 - }, - { - "epoch": 0.018404907975460124, - "grad_norm": 5.63697624206543, - "learning_rate": 4.999981426489179e-06, - "loss": 1.3543, - "step": 3 - }, - { - "epoch": 0.024539877300613498, - "grad_norm": 3.6674246788024902, - "learning_rate": 4.999958209665336e-06, - "loss": 0.787, - "step": 4 - }, - { - "epoch": 0.03067484662576687, - "grad_norm": 48.14854431152344, - "learning_rate": 4.999925706232695e-06, - "loss": 1.7786, - "step": 5 - }, - { - "epoch": 0.03680981595092025, - "grad_norm": 7.8689866065979, - "learning_rate": 4.999883916312e-06, - "loss": 1.2175, - "step": 6 - }, - { - "epoch": 0.04294478527607362, - "grad_norm": 5.119968891143799, - "learning_rate": 4.9998328400584864e-06, - "loss": 0.8998, - "step": 7 - }, - { - "epoch": 0.049079754601226995, - "grad_norm": 3.730757713317871, - "learning_rate": 4.999772477661888e-06, - "loss": 0.8419, - "step": 8 - }, - { - "epoch": 0.05521472392638037, - "grad_norm": 27.314565658569336, - "learning_rate": 4.999702829346432e-06, - "loss": 1.7948, - "step": 9 - }, - { - "epoch": 0.06134969325153374, - "grad_norm": 3.822697162628174, - "learning_rate": 4.999623895370843e-06, - "loss": 1.0461, - "step": 10 - }, - { - "epoch": 0.06748466257668712, - "grad_norm": 4.71220588684082, - "learning_rate": 4.999535676028338e-06, - "loss": 1.0, - "step": 11 - }, - { - "epoch": 0.0736196319018405, - "grad_norm": 3.2378087043762207, - "learning_rate": 4.999438171646624e-06, - "loss": 0.9475, - "step": 12 - }, - { - "epoch": 0.07975460122699386, - "grad_norm": 3.475543737411499, - "learning_rate": 4.999331382587901e-06, - "loss": 0.8654, - "step": 13 - }, - { - "epoch": 0.08588957055214724, - "grad_norm": 10.06365966796875, - "learning_rate": 4.999215309248861e-06, - "loss": 1.2042, - "step": 14 - }, - { - "epoch": 0.09202453987730061, - "grad_norm": 3.785153865814209, - "learning_rate": 4.999089952060681e-06, - "loss": 0.8846, - "step": 15 - }, - { - "epoch": 0.09815950920245399, - "grad_norm": 2.944488048553467, - "learning_rate": 4.998955311489025e-06, - "loss": 0.8805, - "step": 16 - }, - { - "epoch": 0.10429447852760736, - "grad_norm": 39.89304733276367, - "learning_rate": 4.998811388034046e-06, - "loss": 1.5882, - "step": 17 - }, - { - "epoch": 0.11042944785276074, - "grad_norm": 3.5883963108062744, - "learning_rate": 4.9986581822303746e-06, - "loss": 0.9222, - "step": 18 - }, - { - "epoch": 0.1165644171779141, - "grad_norm": 6.972247123718262, - "learning_rate": 4.998495694647127e-06, - "loss": 1.4088, - "step": 19 - }, - { - "epoch": 0.12269938650306748, - "grad_norm": 3.948991298675537, - "learning_rate": 4.998323925887895e-06, - "loss": 1.454, - "step": 20 - }, - { - "epoch": 0.12883435582822086, - "grad_norm": 3.8690035343170166, - "learning_rate": 4.998142876590749e-06, - "loss": 0.6335, - "step": 21 - }, - { - "epoch": 0.13496932515337423, - "grad_norm": 5.243765830993652, - "learning_rate": 4.997952547428236e-06, - "loss": 0.6725, - "step": 22 - }, - { - "epoch": 0.1411042944785276, - "grad_norm": 3.5994043350219727, - "learning_rate": 4.997752939107372e-06, - "loss": 0.7814, - "step": 23 - }, - { - "epoch": 0.147239263803681, - "grad_norm": 4.06965970993042, - "learning_rate": 4.997544052369642e-06, - "loss": 0.9683, - "step": 24 - }, - { - "epoch": 0.15337423312883436, - "grad_norm": 3.3247246742248535, - "learning_rate": 4.997325887990999e-06, - "loss": 0.9414, - "step": 25 - }, - { - "epoch": 0.15950920245398773, - "grad_norm": 5.811742782592773, - "learning_rate": 4.997098446781861e-06, - "loss": 0.8894, - "step": 26 - }, - { - "epoch": 0.1656441717791411, - "grad_norm": 2.661334753036499, - "learning_rate": 4.996861729587103e-06, - "loss": 0.7708, - "step": 27 - }, - { - "epoch": 0.17177914110429449, - "grad_norm": 2.863943576812744, - "learning_rate": 4.996615737286061e-06, - "loss": 0.6995, - "step": 28 - }, - { - "epoch": 0.17791411042944785, - "grad_norm": 20.376733779907227, - "learning_rate": 4.996360470792524e-06, - "loss": 1.2563, - "step": 29 - }, - { - "epoch": 0.18404907975460122, - "grad_norm": 3.62265682220459, - "learning_rate": 4.996095931054731e-06, - "loss": 0.7266, - "step": 30 - }, - { - "epoch": 0.1901840490797546, - "grad_norm": 3.915076732635498, - "learning_rate": 4.9958221190553705e-06, - "loss": 0.9227, - "step": 31 - }, - { - "epoch": 0.19631901840490798, - "grad_norm": 3.129855155944824, - "learning_rate": 4.995539035811572e-06, - "loss": 0.701, - "step": 32 - }, - { - "epoch": 0.20245398773006135, - "grad_norm": 2.7532224655151367, - "learning_rate": 4.9952466823749076e-06, - "loss": 0.6491, - "step": 33 - }, - { - "epoch": 0.2085889570552147, - "grad_norm": 2.8444128036499023, - "learning_rate": 4.9949450598313835e-06, - "loss": 0.8029, - "step": 34 - }, - { - "epoch": 0.2147239263803681, - "grad_norm": 2.57743239402771, - "learning_rate": 4.994634169301439e-06, - "loss": 0.8785, - "step": 35 - }, - { - "epoch": 0.22085889570552147, - "grad_norm": 3.280055284500122, - "learning_rate": 4.994314011939941e-06, - "loss": 1.034, - "step": 36 - }, - { - "epoch": 0.22699386503067484, - "grad_norm": 2.455838680267334, - "learning_rate": 4.99398458893618e-06, - "loss": 0.8557, - "step": 37 - }, - { - "epoch": 0.2331288343558282, - "grad_norm": 4.72681188583374, - "learning_rate": 4.993645901513865e-06, - "loss": 1.1904, - "step": 38 - }, - { - "epoch": 0.2392638036809816, - "grad_norm": 3.0585641860961914, - "learning_rate": 4.993297950931121e-06, - "loss": 0.7668, - "step": 39 - }, - { - "epoch": 0.24539877300613497, - "grad_norm": 2.4603540897369385, - "learning_rate": 4.9929407384804806e-06, - "loss": 0.8812, - "step": 40 - }, - { - "epoch": 0.25153374233128833, - "grad_norm": 2.9702436923980713, - "learning_rate": 4.992574265488883e-06, - "loss": 0.8878, - "step": 41 - }, - { - "epoch": 0.25766871165644173, - "grad_norm": 2.6973602771759033, - "learning_rate": 4.9921985333176694e-06, - "loss": 0.7251, - "step": 42 - }, - { - "epoch": 0.26380368098159507, - "grad_norm": 2.5542335510253906, - "learning_rate": 4.991813543362572e-06, - "loss": 0.6638, - "step": 43 - }, - { - "epoch": 0.26993865030674846, - "grad_norm": 3.7530782222747803, - "learning_rate": 4.991419297053716e-06, - "loss": 1.0725, - "step": 44 - }, - { - "epoch": 0.27607361963190186, - "grad_norm": 2.6483025550842285, - "learning_rate": 4.991015795855611e-06, - "loss": 0.7238, - "step": 45 - }, - { - "epoch": 0.2822085889570552, - "grad_norm": 3.434422492980957, - "learning_rate": 4.990603041267144e-06, - "loss": 0.9188, - "step": 46 - }, - { - "epoch": 0.2883435582822086, - "grad_norm": 2.914340019226074, - "learning_rate": 4.990181034821578e-06, - "loss": 0.6158, - "step": 47 - }, - { - "epoch": 0.294478527607362, - "grad_norm": 2.7211625576019287, - "learning_rate": 4.98974977808654e-06, - "loss": 0.7165, - "step": 48 - }, - { - "epoch": 0.3006134969325153, - "grad_norm": 2.8414249420166016, - "learning_rate": 4.989309272664026e-06, - "loss": 0.7277, - "step": 49 - }, - { - "epoch": 0.3067484662576687, - "grad_norm": 3.683204412460327, - "learning_rate": 4.988859520190381e-06, - "loss": 0.9793, - "step": 50 - }, - { - "epoch": 0.3128834355828221, - "grad_norm": 3.1732583045959473, - "learning_rate": 4.988400522336304e-06, - "loss": 0.8966, - "step": 51 - }, - { - "epoch": 0.31901840490797545, - "grad_norm": 2.7789194583892822, - "learning_rate": 4.9879322808068365e-06, - "loss": 0.8191, - "step": 52 - }, - { - "epoch": 0.32515337423312884, - "grad_norm": 2.754816770553589, - "learning_rate": 4.987454797341358e-06, - "loss": 0.6308, - "step": 53 - }, - { - "epoch": 0.3312883435582822, - "grad_norm": 2.730104684829712, - "learning_rate": 4.98696807371358e-06, - "loss": 0.8226, - "step": 54 - }, - { - "epoch": 0.3374233128834356, - "grad_norm": 3.2225449085235596, - "learning_rate": 4.986472111731536e-06, - "loss": 0.9184, - "step": 55 - }, - { - "epoch": 0.34355828220858897, - "grad_norm": 3.2684760093688965, - "learning_rate": 4.985966913237581e-06, - "loss": 0.6593, - "step": 56 - }, - { - "epoch": 0.3496932515337423, - "grad_norm": 2.43105411529541, - "learning_rate": 4.985452480108376e-06, - "loss": 0.6994, - "step": 57 - }, - { - "epoch": 0.3558282208588957, - "grad_norm": 7.366360664367676, - "learning_rate": 4.984928814254889e-06, - "loss": 1.1374, - "step": 58 - }, - { - "epoch": 0.3619631901840491, - "grad_norm": 2.81864333152771, - "learning_rate": 4.984395917622387e-06, - "loss": 0.8097, - "step": 59 - }, - { - "epoch": 0.36809815950920244, - "grad_norm": 3.1107730865478516, - "learning_rate": 4.9838537921904206e-06, - "loss": 0.8511, - "step": 60 - }, - { - "epoch": 0.37423312883435583, - "grad_norm": 2.460545301437378, - "learning_rate": 4.9833024399728295e-06, - "loss": 0.898, - "step": 61 - }, - { - "epoch": 0.3803680981595092, - "grad_norm": 2.921992778778076, - "learning_rate": 4.982741863017722e-06, - "loss": 0.6671, - "step": 62 - }, - { - "epoch": 0.38650306748466257, - "grad_norm": 3.3006443977355957, - "learning_rate": 4.982172063407479e-06, - "loss": 1.0559, - "step": 63 - }, - { - "epoch": 0.39263803680981596, - "grad_norm": 2.642587661743164, - "learning_rate": 4.9815930432587365e-06, - "loss": 0.6663, - "step": 64 - }, - { - "epoch": 0.3987730061349693, - "grad_norm": 2.905898094177246, - "learning_rate": 4.981004804722384e-06, - "loss": 0.6895, - "step": 65 - }, - { - "epoch": 0.4049079754601227, - "grad_norm": 2.9174182415008545, - "learning_rate": 4.980407349983556e-06, - "loss": 0.7982, - "step": 66 - }, - { - "epoch": 0.4110429447852761, - "grad_norm": 2.214322805404663, - "learning_rate": 4.979800681261619e-06, - "loss": 0.6808, - "step": 67 - }, - { - "epoch": 0.4171779141104294, - "grad_norm": 2.7152462005615234, - "learning_rate": 4.9791848008101705e-06, - "loss": 0.567, - "step": 68 - }, - { - "epoch": 0.4233128834355828, - "grad_norm": 2.5657734870910645, - "learning_rate": 4.978559710917024e-06, - "loss": 0.7745, - "step": 69 - }, - { - "epoch": 0.4294478527607362, - "grad_norm": 3.9103832244873047, - "learning_rate": 4.977925413904205e-06, - "loss": 0.9815, - "step": 70 - }, - { - "epoch": 0.43558282208588955, - "grad_norm": 4.610236644744873, - "learning_rate": 4.9772819121279395e-06, - "loss": 1.164, - "step": 71 - }, - { - "epoch": 0.44171779141104295, - "grad_norm": 3.01170015335083, - "learning_rate": 4.976629207978648e-06, - "loss": 0.7587, - "step": 72 - }, - { - "epoch": 0.44785276073619634, - "grad_norm": 3.175889253616333, - "learning_rate": 4.975967303880933e-06, - "loss": 0.58, - "step": 73 - }, - { - "epoch": 0.4539877300613497, - "grad_norm": 2.503741502761841, - "learning_rate": 4.975296202293575e-06, - "loss": 0.7253, - "step": 74 - }, - { - "epoch": 0.4601226993865031, - "grad_norm": 2.6778078079223633, - "learning_rate": 4.974615905709518e-06, - "loss": 0.7352, - "step": 75 - }, - { - "epoch": 0.4662576687116564, - "grad_norm": 5.950812816619873, - "learning_rate": 4.973926416655863e-06, - "loss": 1.0643, - "step": 76 - }, - { - "epoch": 0.4723926380368098, - "grad_norm": 3.0165305137634277, - "learning_rate": 4.973227737693858e-06, - "loss": 0.6699, - "step": 77 - }, - { - "epoch": 0.4785276073619632, - "grad_norm": 4.793259620666504, - "learning_rate": 4.972519871418894e-06, - "loss": 1.0315, - "step": 78 - }, - { - "epoch": 0.48466257668711654, - "grad_norm": 3.632815361022949, - "learning_rate": 4.971802820460481e-06, - "loss": 0.7003, - "step": 79 - }, - { - "epoch": 0.49079754601226994, - "grad_norm": 3.077507734298706, - "learning_rate": 4.971076587482254e-06, - "loss": 0.6776, - "step": 80 - }, - { - "epoch": 0.49693251533742333, - "grad_norm": 3.3886241912841797, - "learning_rate": 4.970341175181957e-06, - "loss": 0.7422, - "step": 81 - }, - { - "epoch": 0.5030674846625767, - "grad_norm": 2.71288800239563, - "learning_rate": 4.969596586291425e-06, - "loss": 0.7471, - "step": 82 - }, - { - "epoch": 0.50920245398773, - "grad_norm": 2.777920961380005, - "learning_rate": 4.968842823576592e-06, - "loss": 0.8111, - "step": 83 - }, - { - "epoch": 0.5153374233128835, - "grad_norm": 6.496985912322998, - "learning_rate": 4.968079889837461e-06, - "loss": 0.9965, - "step": 84 - }, - { - "epoch": 0.5214723926380368, - "grad_norm": 2.6163430213928223, - "learning_rate": 4.967307787908108e-06, - "loss": 0.6833, - "step": 85 - }, - { - "epoch": 0.5276073619631901, - "grad_norm": 3.244098663330078, - "learning_rate": 4.966526520656663e-06, - "loss": 0.8373, - "step": 86 - }, - { - "epoch": 0.5337423312883436, - "grad_norm": 2.9027860164642334, - "learning_rate": 4.965736090985305e-06, - "loss": 0.8529, - "step": 87 - }, - { - "epoch": 0.5398773006134969, - "grad_norm": 2.3786230087280273, - "learning_rate": 4.964936501830246e-06, - "loss": 0.6577, - "step": 88 - }, - { - "epoch": 0.5460122699386503, - "grad_norm": 7.3099045753479, - "learning_rate": 4.964127756161727e-06, - "loss": 1.1184, - "step": 89 - }, - { - "epoch": 0.5521472392638037, - "grad_norm": 3.068873167037964, - "learning_rate": 4.963309856983998e-06, - "loss": 0.7906, - "step": 90 - }, - { - "epoch": 0.558282208588957, - "grad_norm": 3.082547426223755, - "learning_rate": 4.9624828073353144e-06, - "loss": 0.8107, - "step": 91 - }, - { - "epoch": 0.5644171779141104, - "grad_norm": 2.4586973190307617, - "learning_rate": 4.961646610287922e-06, - "loss": 0.7421, - "step": 92 - }, - { - "epoch": 0.5705521472392638, - "grad_norm": 2.779277801513672, - "learning_rate": 4.960801268948047e-06, - "loss": 0.7134, - "step": 93 - }, - { - "epoch": 0.5766871165644172, - "grad_norm": 3.2255213260650635, - "learning_rate": 4.959946786455882e-06, - "loss": 0.5875, - "step": 94 - }, - { - "epoch": 0.5828220858895705, - "grad_norm": 2.783395528793335, - "learning_rate": 4.959083165985581e-06, - "loss": 0.6595, - "step": 95 - }, - { - "epoch": 0.588957055214724, - "grad_norm": 2.240114212036133, - "learning_rate": 4.958210410745237e-06, - "loss": 0.793, - "step": 96 - }, - { - "epoch": 0.5950920245398773, - "grad_norm": 2.9399421215057373, - "learning_rate": 4.957328523976879e-06, - "loss": 0.5896, - "step": 97 - }, - { - "epoch": 0.6012269938650306, - "grad_norm": 3.4449355602264404, - "learning_rate": 4.956437508956458e-06, - "loss": 0.8658, - "step": 98 - }, - { - "epoch": 0.6073619631901841, - "grad_norm": 4.273710250854492, - "learning_rate": 4.9555373689938325e-06, - "loss": 0.8316, - "step": 99 - }, - { - "epoch": 0.6134969325153374, - "grad_norm": 3.4222047328948975, - "learning_rate": 4.954628107432757e-06, - "loss": 1.0613, - "step": 100 - }, - { - "epoch": 0.6196319018404908, - "grad_norm": 2.5318963527679443, - "learning_rate": 4.95370972765087e-06, - "loss": 0.7194, - "step": 101 - }, - { - "epoch": 0.6257668711656442, - "grad_norm": 2.7852585315704346, - "learning_rate": 4.952782233059683e-06, - "loss": 0.5927, - "step": 102 - }, - { - "epoch": 0.6319018404907976, - "grad_norm": 2.6532323360443115, - "learning_rate": 4.951845627104565e-06, - "loss": 0.8505, - "step": 103 - }, - { - "epoch": 0.6380368098159509, - "grad_norm": 2.3213467597961426, - "learning_rate": 4.95089991326473e-06, - "loss": 0.8682, - "step": 104 - }, - { - "epoch": 0.6441717791411042, - "grad_norm": 2.607992649078369, - "learning_rate": 4.9499450950532305e-06, - "loss": 0.8735, - "step": 105 - }, - { - "epoch": 0.6503067484662577, - "grad_norm": 3.9820072650909424, - "learning_rate": 4.94898117601693e-06, - "loss": 1.0571, - "step": 106 - }, - { - "epoch": 0.656441717791411, - "grad_norm": 3.3878824710845947, - "learning_rate": 4.948008159736507e-06, - "loss": 0.7831, - "step": 107 - }, - { - "epoch": 0.6625766871165644, - "grad_norm": 2.6935670375823975, - "learning_rate": 4.94702604982643e-06, - "loss": 0.5968, - "step": 108 - }, - { - "epoch": 0.6687116564417178, - "grad_norm": 2.78190016746521, - "learning_rate": 4.9460348499349485e-06, - "loss": 0.7504, - "step": 109 - }, - { - "epoch": 0.6748466257668712, - "grad_norm": 2.973083972930908, - "learning_rate": 4.945034563744077e-06, - "loss": 0.6728, - "step": 110 - }, - { - "epoch": 0.6809815950920245, - "grad_norm": 2.631803512573242, - "learning_rate": 4.944025194969586e-06, - "loss": 0.609, - "step": 111 - }, - { - "epoch": 0.6871165644171779, - "grad_norm": 2.7443883419036865, - "learning_rate": 4.9430067473609825e-06, - "loss": 0.8713, - "step": 112 - }, - { - "epoch": 0.6932515337423313, - "grad_norm": 2.543769121170044, - "learning_rate": 4.941979224701499e-06, - "loss": 0.8035, - "step": 113 - }, - { - "epoch": 0.6993865030674846, - "grad_norm": 3.7799901962280273, - "learning_rate": 4.94094263080808e-06, - "loss": 0.9341, - "step": 114 - }, - { - "epoch": 0.7055214723926381, - "grad_norm": 3.1234734058380127, - "learning_rate": 4.939896969531367e-06, - "loss": 1.1066, - "step": 115 - }, - { - "epoch": 0.7116564417177914, - "grad_norm": 2.356036424636841, - "learning_rate": 4.938842244755683e-06, - "loss": 0.853, - "step": 116 - }, - { - "epoch": 0.7177914110429447, - "grad_norm": 3.6231274604797363, - "learning_rate": 4.937778460399022e-06, - "loss": 0.9116, - "step": 117 - }, - { - "epoch": 0.7239263803680982, - "grad_norm": 3.1277005672454834, - "learning_rate": 4.936705620413028e-06, - "loss": 0.5888, - "step": 118 - }, - { - "epoch": 0.7300613496932515, - "grad_norm": 2.7338361740112305, - "learning_rate": 4.935623728782986e-06, - "loss": 0.592, - "step": 119 - }, - { - "epoch": 0.7361963190184049, - "grad_norm": 2.748363733291626, - "learning_rate": 4.934532789527805e-06, - "loss": 0.8713, - "step": 120 - }, - { - "epoch": 0.7423312883435583, - "grad_norm": 4.460031986236572, - "learning_rate": 4.933432806700004e-06, - "loss": 0.6791, - "step": 121 - }, - { - "epoch": 0.7484662576687117, - "grad_norm": 2.392911911010742, - "learning_rate": 4.932323784385693e-06, - "loss": 0.7531, - "step": 122 - }, - { - "epoch": 0.754601226993865, - "grad_norm": 2.7804384231567383, - "learning_rate": 4.931205726704566e-06, - "loss": 0.7547, - "step": 123 - }, - { - "epoch": 0.7607361963190185, - "grad_norm": 2.7664780616760254, - "learning_rate": 4.930078637809878e-06, - "loss": 0.7849, - "step": 124 - }, - { - "epoch": 0.7668711656441718, - "grad_norm": 2.592808723449707, - "learning_rate": 4.928942521888431e-06, - "loss": 0.7015, - "step": 125 - }, - { - "epoch": 0.7730061349693251, - "grad_norm": 2.7080585956573486, - "learning_rate": 4.927797383160561e-06, - "loss": 1.0028, - "step": 126 - }, - { - "epoch": 0.7791411042944786, - "grad_norm": 2.7941503524780273, - "learning_rate": 4.926643225880123e-06, - "loss": 0.602, - "step": 127 - }, - { - "epoch": 0.7852760736196319, - "grad_norm": 3.2796623706817627, - "learning_rate": 4.925480054334471e-06, - "loss": 0.7473, - "step": 128 - }, - { - "epoch": 0.7914110429447853, - "grad_norm": 2.7623610496520996, - "learning_rate": 4.924307872844444e-06, - "loss": 1.0573, - "step": 129 - }, - { - "epoch": 0.7975460122699386, - "grad_norm": 2.6224453449249268, - "learning_rate": 4.923126685764351e-06, - "loss": 0.7399, - "step": 130 - }, - { - "epoch": 0.803680981595092, - "grad_norm": 17.736326217651367, - "learning_rate": 4.921936497481956e-06, - "loss": 0.9548, - "step": 131 - }, - { - "epoch": 0.8098159509202454, - "grad_norm": 2.504213333129883, - "learning_rate": 4.920737312418456e-06, - "loss": 0.6748, - "step": 132 - }, - { - "epoch": 0.8159509202453987, - "grad_norm": 3.617077350616455, - "learning_rate": 4.919529135028473e-06, - "loss": 0.8431, - "step": 133 - }, - { - "epoch": 0.8220858895705522, - "grad_norm": 2.6559832096099854, - "learning_rate": 4.918311969800027e-06, - "loss": 0.7243, - "step": 134 - }, - { - "epoch": 0.8282208588957055, - "grad_norm": 2.7539305686950684, - "learning_rate": 4.917085821254532e-06, - "loss": 0.7845, - "step": 135 - }, - { - "epoch": 0.8343558282208589, - "grad_norm": 3.3587615489959717, - "learning_rate": 4.915850693946766e-06, - "loss": 0.4891, - "step": 136 - }, - { - "epoch": 0.8404907975460123, - "grad_norm": 3.064354181289673, - "learning_rate": 4.914606592464865e-06, - "loss": 0.7917, - "step": 137 - }, - { - "epoch": 0.8466257668711656, - "grad_norm": 3.2505199909210205, - "learning_rate": 4.9133535214303e-06, - "loss": 0.9681, - "step": 138 - }, - { - "epoch": 0.852760736196319, - "grad_norm": 3.8027830123901367, - "learning_rate": 4.91209148549786e-06, - "loss": 0.9275, - "step": 139 - }, - { - "epoch": 0.8588957055214724, - "grad_norm": 2.4154372215270996, - "learning_rate": 4.910820489355637e-06, - "loss": 0.7259, - "step": 140 - }, - { - "epoch": 0.8650306748466258, - "grad_norm": 2.892462968826294, - "learning_rate": 4.909540537725007e-06, - "loss": 0.6061, - "step": 141 - }, - { - "epoch": 0.8711656441717791, - "grad_norm": 3.3398196697235107, - "learning_rate": 4.908251635360616e-06, - "loss": 1.0559, - "step": 142 - }, - { - "epoch": 0.8773006134969326, - "grad_norm": 3.022512197494507, - "learning_rate": 4.906953787050354e-06, - "loss": 0.7372, - "step": 143 - }, - { - "epoch": 0.8834355828220859, - "grad_norm": 2.658661365509033, - "learning_rate": 4.905646997615347e-06, - "loss": 0.6234, - "step": 144 - }, - { - "epoch": 0.8895705521472392, - "grad_norm": 3.454400062561035, - "learning_rate": 4.904331271909932e-06, - "loss": 0.8066, - "step": 145 - }, - { - "epoch": 0.8957055214723927, - "grad_norm": 3.1300277709960938, - "learning_rate": 4.903006614821645e-06, - "loss": 0.6861, - "step": 146 - }, - { - "epoch": 0.901840490797546, - "grad_norm": 2.362537145614624, - "learning_rate": 4.901673031271194e-06, - "loss": 0.6112, - "step": 147 - }, - { - "epoch": 0.9079754601226994, - "grad_norm": 3.375577688217163, - "learning_rate": 4.900330526212451e-06, - "loss": 0.6314, - "step": 148 - }, - { - "epoch": 0.9141104294478528, - "grad_norm": 2.955656051635742, - "learning_rate": 4.898979104632427e-06, - "loss": 0.889, - "step": 149 - }, - { - "epoch": 0.9202453987730062, - "grad_norm": 2.9285926818847656, - "learning_rate": 4.897618771551255e-06, - "loss": 0.6406, - "step": 150 - }, - { - "epoch": 0.9263803680981595, - "grad_norm": 2.131819725036621, - "learning_rate": 4.8962495320221714e-06, - "loss": 0.6368, - "step": 151 - }, - { - "epoch": 0.9325153374233128, - "grad_norm": 2.780649185180664, - "learning_rate": 4.8948713911315e-06, - "loss": 0.8642, - "step": 152 - }, - { - "epoch": 0.9386503067484663, - "grad_norm": 2.941500186920166, - "learning_rate": 4.8934843539986266e-06, - "loss": 0.714, - "step": 153 - }, - { - "epoch": 0.9447852760736196, - "grad_norm": 2.7729203701019287, - "learning_rate": 4.892088425775986e-06, - "loss": 0.8365, - "step": 154 - }, - { - "epoch": 0.950920245398773, - "grad_norm": 2.6887171268463135, - "learning_rate": 4.890683611649041e-06, - "loss": 0.7937, - "step": 155 - }, - { - "epoch": 0.9570552147239264, - "grad_norm": 3.7638463973999023, - "learning_rate": 4.8892699168362626e-06, - "loss": 0.7485, - "step": 156 - }, - { - "epoch": 0.9631901840490797, - "grad_norm": 2.8132755756378174, - "learning_rate": 4.887847346589111e-06, - "loss": 0.6467, - "step": 157 - }, - { - "epoch": 0.9693251533742331, - "grad_norm": 2.652247190475464, - "learning_rate": 4.886415906192015e-06, - "loss": 0.4651, - "step": 158 - }, - { - "epoch": 0.9754601226993865, - "grad_norm": 2.5854647159576416, - "learning_rate": 4.884975600962355e-06, - "loss": 0.8756, - "step": 159 - }, - { - "epoch": 0.9815950920245399, - "grad_norm": 3.1630544662475586, - "learning_rate": 4.883526436250441e-06, - "loss": 0.7339, - "step": 160 - }, - { - "epoch": 0.9877300613496932, - "grad_norm": 2.84452748298645, - "learning_rate": 4.8820684174394935e-06, - "loss": 0.7808, - "step": 161 - }, - { - "epoch": 0.9938650306748467, - "grad_norm": 3.604048490524292, - "learning_rate": 4.880601549945622e-06, - "loss": 0.96, - "step": 162 - }, - { - "epoch": 1.0, - "grad_norm": 2.302924871444702, - "learning_rate": 4.879125839217808e-06, - "loss": 0.8122, - "step": 163 - } - ], - "logging_steps": 1, - "max_steps": 1630, - "num_input_tokens_seen": 0, - "num_train_epochs": 10, - "save_steps": 206, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 4.029784817192141e+16, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1630/chat_template.jinja b/metallama3_8b/limo_filtered_correct/checkpoint-1630/chat_template.jinja deleted file mode 100644 index 39bd0c9f7fe30aea14eda194fee17703da4a4dbf..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1630/chat_template.jinja +++ /dev/null @@ -1,5 +0,0 @@ -{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> - -'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> - -' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1630/config.json b/metallama3_8b/limo_filtered_correct/checkpoint-1630/config.json deleted file mode 100644 index ec5612543540085e09eed37e81b17ae51d1a6973..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1630/config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 128000, - "eos_token_id": 128009, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 8192, - "mlp_bias": false, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 500000.0, - "tie_word_embeddings": false, - "torch_dtype": "float32", - "transformers_version": "4.55.0", - "use_cache": false, - "vocab_size": 128256 -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1630/generation_config.json b/metallama3_8b/limo_filtered_correct/checkpoint-1630/generation_config.json deleted file mode 100644 index f53ccb516e57388491adda6b9950bcfa872e93ae..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1630/generation_config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 128000, - "eos_token_id": 128009, - "transformers_version": "4.55.0", - "use_cache": false -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1630/model-00001-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-1630/model-00001-of-00007.safetensors deleted file mode 100644 index ef6a28c1e9e227311a947d238a462b8f1aea5688..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1630/model-00001-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:34d9387733813d5a6f8cf9e9e1500319b6f18f7239cd826153c357461c42879f -size 4886466168 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1630/model-00002-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-1630/model-00002-of-00007.safetensors deleted file mode 100644 index 5e86287f4f41b8de773a003247237b94722cb296..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1630/model-00002-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:69e312ac446f71fe286935413a92e19b0f116e972bb4ecca7ba373d35234e258 -size 4832007448 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1630/model-00003-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-1630/model-00003-of-00007.safetensors deleted file mode 100644 index d26dcf2f537cfdd35b71e7e5f3e5f77ed2ae968a..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1630/model-00003-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8397232e2e465a66601fe6839f1b4887250275646947034ff16ec493807ab154 -size 4999813112 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1630/model-00004-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-1630/model-00004-of-00007.safetensors deleted file mode 100644 index 327b5dccda8ad855f23f2ac1355d727fd228d9d9..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1630/model-00004-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:474e541c2f4198dbf42cb93fb0a0cdc9e27156ed753bacd7af9c6fca13cb52b4 -size 4999813128 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1630/model-00005-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-1630/model-00005-of-00007.safetensors deleted file mode 100644 index 76ac6d2a48b10e807ac6e61f11e719484922b8f3..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1630/model-00005-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f7bb3488785efc6e3418aead91bb86553e6fcfc4d713e959fdd2d62433567995 -size 4832007496 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1630/model-00006-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-1630/model-00006-of-00007.safetensors deleted file mode 100644 index 5e6d6e3eb089537dc914111fbe1c33915ab5f6a9..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1630/model-00006-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ac5b4994c94ffc85616db148a42e47b88df63502ff64855da8b056ba16bb73df -size 4999813120 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1630/model-00007-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-1630/model-00007-of-00007.safetensors deleted file mode 100644 index 186db3249e73f268d626d8f916e0599fe32bf3c2..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1630/model-00007-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:58a8aeb1d252ba89c6a9b427e6f9aabb7fc425ae5229b6cb42e79da45e892fe8 -size 2571158184 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1630/model.safetensors.index.json b/metallama3_8b/limo_filtered_correct/checkpoint-1630/model.safetensors.index.json deleted file mode 100644 index 30d31d54f352f0c71ad48745af612a088822fa48..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1630/model.safetensors.index.json +++ /dev/null @@ -1,299 +0,0 @@ -{ - "metadata": { - "total_parameters": 2007565312, - "total_size": 32121044992 - }, - "weight_map": { - "lm_head.weight": "model-00007-of-00007.safetensors", - "model.embed_tokens.weight": "model-00001-of-00007.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.norm.weight": "model-00007-of-00007.safetensors" - } -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1630/rng_state_0.pth b/metallama3_8b/limo_filtered_correct/checkpoint-1630/rng_state_0.pth deleted file mode 100644 index 0c73dd943c40497990387f5f3dacb08ddd27a929..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1630/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7dbc6521b0b64cb12d818506108fcf257a4089ca8a9b1e453776ed3e032e7176 -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1630/rng_state_1.pth b/metallama3_8b/limo_filtered_correct/checkpoint-1630/rng_state_1.pth deleted file mode 100644 index f57618444fac32f854b52c01ec2e258a65bc4d96..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1630/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2b13e3da1b0679cab1bab94f893e385a9a224d3335b5a6f62602f33c2be88d03 -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1630/rng_state_2.pth b/metallama3_8b/limo_filtered_correct/checkpoint-1630/rng_state_2.pth deleted file mode 100644 index fb07e7b31bd4b60cb7c279157d2ffa4f268cb36a..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1630/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6a24f0e0f117b5a8236e0d12594c0c358f41ef00068d4460002e95ad1cc3cb1c -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1630/rng_state_3.pth b/metallama3_8b/limo_filtered_correct/checkpoint-1630/rng_state_3.pth deleted file mode 100644 index 87c3a97e34da9032aeacc1dafd124d92425dabdd..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1630/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e46e4eab6c4a25d84ad36ddf1357401788adeeb6388c03cefa35a63b52ee7610 -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1630/scheduler.pt b/metallama3_8b/limo_filtered_correct/checkpoint-1630/scheduler.pt deleted file mode 100644 index 6b87d6b0875c0c9792a55876b2fe625c22e329f8..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1630/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fc42ff790b7e763aa57664f18c0bd94927dd976a623cabd755d52a2927b24c7a -size 1064 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1630/special_tokens_map.json b/metallama3_8b/limo_filtered_correct/checkpoint-1630/special_tokens_map.json deleted file mode 100644 index 14daf4588e61b4e4983af0fccaba4d5500c0977c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1630/special_tokens_map.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "additional_special_tokens": [ - { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } - ], - "bos_token": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": "<|eot_id|>" -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1630/tokenizer.json b/metallama3_8b/limo_filtered_correct/checkpoint-1630/tokenizer.json deleted file mode 100644 index 172311123ab62378f1f6d90f3068a676b7d939ed..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1630/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c1dcab308e7cf5970ea38815e0a62887d705c5b436f869ca27a5dcdd40c36a6 -size 17210148 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1630/tokenizer_config.json b/metallama3_8b/limo_filtered_correct/checkpoint-1630/tokenizer_config.json deleted file mode 100644 index 6739fcd129e717b71b64001dcb25a03c143d66f5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1630/tokenizer_config.json +++ /dev/null @@ -1,2076 +0,0 @@ -{ - "added_tokens_decoder": { - "128000": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128001": { - "content": "<|end_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128002": { - "content": "<|reserved_special_token_0|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128003": { - "content": "<|reserved_special_token_1|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128004": { - "content": "<|reserved_special_token_2|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128005": { - "content": "<|reserved_special_token_3|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128006": { - "content": "<|start_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128007": { - "content": "<|end_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128008": { - "content": "<|reserved_special_token_4|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128009": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128010": { - "content": "<|reserved_special_token_5|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128011": { - "content": "<|reserved_special_token_6|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128012": { - "content": "<|reserved_special_token_7|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128013": { - "content": "<|reserved_special_token_8|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128014": { - "content": "<|reserved_special_token_9|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128015": { - "content": "<|reserved_special_token_10|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128016": { - "content": "<|reserved_special_token_11|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128017": { - "content": "<|reserved_special_token_12|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128018": { - "content": "<|reserved_special_token_13|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128019": { - "content": "<|reserved_special_token_14|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128020": { - "content": "<|reserved_special_token_15|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128021": { - "content": "<|reserved_special_token_16|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128022": { - "content": "<|reserved_special_token_17|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128023": { - "content": "<|reserved_special_token_18|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128024": { - "content": "<|reserved_special_token_19|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128025": { - "content": "<|reserved_special_token_20|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128026": { - "content": "<|reserved_special_token_21|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128027": { - "content": "<|reserved_special_token_22|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128028": { - "content": "<|reserved_special_token_23|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128029": { - "content": "<|reserved_special_token_24|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128030": { - "content": "<|reserved_special_token_25|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128031": { - "content": "<|reserved_special_token_26|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128032": { - "content": "<|reserved_special_token_27|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128033": { - "content": "<|reserved_special_token_28|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128034": { - "content": "<|reserved_special_token_29|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128035": { - "content": "<|reserved_special_token_30|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128036": { - "content": "<|reserved_special_token_31|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128037": { - "content": "<|reserved_special_token_32|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128038": { - "content": "<|reserved_special_token_33|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128039": { - "content": "<|reserved_special_token_34|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128040": { - "content": "<|reserved_special_token_35|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128041": { - "content": "<|reserved_special_token_36|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128042": { - "content": "<|reserved_special_token_37|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128043": { - "content": "<|reserved_special_token_38|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128044": { - "content": "<|reserved_special_token_39|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128045": { - "content": "<|reserved_special_token_40|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128046": { - "content": "<|reserved_special_token_41|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128047": { - "content": "<|reserved_special_token_42|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128048": { - "content": "<|reserved_special_token_43|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128049": { - "content": "<|reserved_special_token_44|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128050": { - "content": "<|reserved_special_token_45|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128051": { - "content": "<|reserved_special_token_46|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128052": { - "content": "<|reserved_special_token_47|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128053": { - "content": "<|reserved_special_token_48|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128054": { - "content": "<|reserved_special_token_49|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128055": { - "content": "<|reserved_special_token_50|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128056": { - "content": "<|reserved_special_token_51|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128057": { - "content": "<|reserved_special_token_52|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128058": { - "content": "<|reserved_special_token_53|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128059": { - "content": "<|reserved_special_token_54|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128060": { - "content": "<|reserved_special_token_55|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128061": { - "content": "<|reserved_special_token_56|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128062": { - "content": "<|reserved_special_token_57|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128063": { - "content": "<|reserved_special_token_58|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128064": { - "content": "<|reserved_special_token_59|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128065": { - "content": "<|reserved_special_token_60|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128066": { - "content": "<|reserved_special_token_61|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128067": { - "content": "<|reserved_special_token_62|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128068": { - "content": "<|reserved_special_token_63|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128069": { - "content": "<|reserved_special_token_64|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128070": { - "content": "<|reserved_special_token_65|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128071": { - "content": "<|reserved_special_token_66|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128072": { - "content": "<|reserved_special_token_67|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128073": { - "content": "<|reserved_special_token_68|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128074": { - "content": "<|reserved_special_token_69|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128075": { - "content": "<|reserved_special_token_70|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128076": { - "content": "<|reserved_special_token_71|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128077": { - "content": "<|reserved_special_token_72|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128078": { - "content": "<|reserved_special_token_73|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128079": { - "content": "<|reserved_special_token_74|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128080": { - "content": "<|reserved_special_token_75|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128081": { - "content": "<|reserved_special_token_76|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128082": { - "content": "<|reserved_special_token_77|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128083": { - "content": "<|reserved_special_token_78|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128084": { - "content": "<|reserved_special_token_79|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128085": { - "content": "<|reserved_special_token_80|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128086": { - "content": "<|reserved_special_token_81|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128087": { - "content": "<|reserved_special_token_82|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128088": { - "content": "<|reserved_special_token_83|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128089": { - "content": "<|reserved_special_token_84|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128090": { - "content": "<|reserved_special_token_85|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128091": { - "content": "<|reserved_special_token_86|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128092": { - "content": "<|reserved_special_token_87|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128093": { - "content": "<|reserved_special_token_88|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128094": { - "content": "<|reserved_special_token_89|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128095": { - "content": "<|reserved_special_token_90|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128096": { - "content": "<|reserved_special_token_91|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128097": { - "content": "<|reserved_special_token_92|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128098": { - "content": "<|reserved_special_token_93|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128099": { - "content": "<|reserved_special_token_94|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128100": { - "content": "<|reserved_special_token_95|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128101": { - "content": "<|reserved_special_token_96|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128102": { - "content": "<|reserved_special_token_97|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128103": { - "content": "<|reserved_special_token_98|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128104": { - "content": "<|reserved_special_token_99|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128105": { - "content": "<|reserved_special_token_100|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128106": { - "content": "<|reserved_special_token_101|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128107": { - "content": "<|reserved_special_token_102|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128108": { - "content": "<|reserved_special_token_103|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128109": { - "content": "<|reserved_special_token_104|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128110": { - "content": "<|reserved_special_token_105|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128111": { - "content": "<|reserved_special_token_106|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128112": { - "content": "<|reserved_special_token_107|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128113": { - "content": "<|reserved_special_token_108|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128114": { - "content": "<|reserved_special_token_109|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128115": { - "content": "<|reserved_special_token_110|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128116": { - "content": "<|reserved_special_token_111|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128117": { - "content": "<|reserved_special_token_112|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128118": { - "content": "<|reserved_special_token_113|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128119": { - "content": "<|reserved_special_token_114|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128120": { - "content": "<|reserved_special_token_115|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128121": { - "content": "<|reserved_special_token_116|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128122": { - "content": "<|reserved_special_token_117|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128123": { - "content": "<|reserved_special_token_118|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128124": { - "content": "<|reserved_special_token_119|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128125": { - "content": "<|reserved_special_token_120|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128126": { - "content": "<|reserved_special_token_121|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128127": { - "content": "<|reserved_special_token_122|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128128": { - "content": "<|reserved_special_token_123|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128129": { - "content": "<|reserved_special_token_124|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128130": { - "content": "<|reserved_special_token_125|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128131": { - "content": "<|reserved_special_token_126|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128132": { - "content": "<|reserved_special_token_127|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128133": { - "content": "<|reserved_special_token_128|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128134": { - "content": "<|reserved_special_token_129|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128135": { - "content": "<|reserved_special_token_130|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128136": { - "content": "<|reserved_special_token_131|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128137": { - "content": "<|reserved_special_token_132|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128138": { - "content": "<|reserved_special_token_133|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128139": { - "content": "<|reserved_special_token_134|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128140": { - "content": "<|reserved_special_token_135|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128141": { - "content": "<|reserved_special_token_136|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128142": { - "content": "<|reserved_special_token_137|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128143": { - "content": "<|reserved_special_token_138|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128144": { - "content": "<|reserved_special_token_139|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128145": { - "content": "<|reserved_special_token_140|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128146": { - "content": "<|reserved_special_token_141|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128147": { - "content": "<|reserved_special_token_142|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128148": { - "content": "<|reserved_special_token_143|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128149": { - "content": "<|reserved_special_token_144|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128150": { - "content": "<|reserved_special_token_145|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128151": { - "content": "<|reserved_special_token_146|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128152": { - "content": "<|reserved_special_token_147|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128153": { - "content": "<|reserved_special_token_148|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128154": { - "content": "<|reserved_special_token_149|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128155": { - "content": "<|reserved_special_token_150|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128156": { - "content": "<|reserved_special_token_151|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128157": { - "content": "<|reserved_special_token_152|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128158": { - "content": "<|reserved_special_token_153|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128159": { - "content": "<|reserved_special_token_154|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128160": { - "content": "<|reserved_special_token_155|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128161": { - "content": "<|reserved_special_token_156|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128162": { - "content": "<|reserved_special_token_157|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128163": { - "content": "<|reserved_special_token_158|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128164": { - "content": "<|reserved_special_token_159|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128165": { - "content": "<|reserved_special_token_160|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128166": { - "content": "<|reserved_special_token_161|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128167": { - "content": "<|reserved_special_token_162|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128168": { - "content": "<|reserved_special_token_163|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128169": { - "content": "<|reserved_special_token_164|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128170": { - "content": "<|reserved_special_token_165|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128171": { - "content": "<|reserved_special_token_166|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128172": { - "content": "<|reserved_special_token_167|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128173": { - "content": "<|reserved_special_token_168|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128174": { - "content": "<|reserved_special_token_169|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128175": { - "content": "<|reserved_special_token_170|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128176": { - "content": "<|reserved_special_token_171|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128177": { - "content": "<|reserved_special_token_172|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128178": { - "content": "<|reserved_special_token_173|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128179": { - "content": "<|reserved_special_token_174|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128180": { - "content": "<|reserved_special_token_175|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128181": { - "content": "<|reserved_special_token_176|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128182": { - "content": "<|reserved_special_token_177|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128183": { - "content": "<|reserved_special_token_178|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128184": { - "content": "<|reserved_special_token_179|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128185": { - "content": "<|reserved_special_token_180|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128186": { - "content": "<|reserved_special_token_181|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128187": { - "content": "<|reserved_special_token_182|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128188": { - "content": "<|reserved_special_token_183|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128189": { - "content": "<|reserved_special_token_184|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128190": { - "content": "<|reserved_special_token_185|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128191": { - "content": "<|reserved_special_token_186|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128192": { - "content": "<|reserved_special_token_187|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128193": { - "content": "<|reserved_special_token_188|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128194": { - "content": "<|reserved_special_token_189|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128195": { - "content": "<|reserved_special_token_190|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128196": { - "content": "<|reserved_special_token_191|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128197": { - "content": "<|reserved_special_token_192|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128198": { - "content": "<|reserved_special_token_193|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128199": { - "content": "<|reserved_special_token_194|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128200": { - "content": "<|reserved_special_token_195|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128201": { - "content": "<|reserved_special_token_196|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128202": { - "content": "<|reserved_special_token_197|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128203": { - "content": "<|reserved_special_token_198|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128204": { - "content": "<|reserved_special_token_199|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128205": { - "content": "<|reserved_special_token_200|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128206": { - "content": "<|reserved_special_token_201|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128207": { - "content": "<|reserved_special_token_202|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128208": { - "content": "<|reserved_special_token_203|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128209": { - "content": "<|reserved_special_token_204|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128210": { - "content": "<|reserved_special_token_205|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128211": { - "content": "<|reserved_special_token_206|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128212": { - "content": "<|reserved_special_token_207|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128213": { - "content": "<|reserved_special_token_208|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128214": { - "content": "<|reserved_special_token_209|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128215": { - "content": "<|reserved_special_token_210|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128216": { - "content": "<|reserved_special_token_211|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128217": { - "content": "<|reserved_special_token_212|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128218": { - "content": "<|reserved_special_token_213|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128219": { - "content": "<|reserved_special_token_214|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128220": { - "content": "<|reserved_special_token_215|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128221": { - "content": "<|reserved_special_token_216|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128222": { - "content": "<|reserved_special_token_217|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128223": { - "content": "<|reserved_special_token_218|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128224": { - "content": "<|reserved_special_token_219|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128225": { - "content": "<|reserved_special_token_220|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128226": { - "content": "<|reserved_special_token_221|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128227": { - "content": "<|reserved_special_token_222|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128228": { - "content": "<|reserved_special_token_223|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128229": { - "content": "<|reserved_special_token_224|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128230": { - "content": "<|reserved_special_token_225|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128231": { - "content": "<|reserved_special_token_226|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128232": { - "content": "<|reserved_special_token_227|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128233": { - "content": "<|reserved_special_token_228|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128234": { - "content": "<|reserved_special_token_229|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128235": { - "content": "<|reserved_special_token_230|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128236": { - "content": "<|reserved_special_token_231|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128237": { - "content": "<|reserved_special_token_232|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128238": { - "content": "<|reserved_special_token_233|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128239": { - "content": "<|reserved_special_token_234|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128240": { - "content": "<|reserved_special_token_235|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128241": { - "content": "<|reserved_special_token_236|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128242": { - "content": "<|reserved_special_token_237|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128243": { - "content": "<|reserved_special_token_238|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128244": { - "content": "<|reserved_special_token_239|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128245": { - "content": "<|reserved_special_token_240|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128246": { - "content": "<|reserved_special_token_241|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128247": { - "content": "<|reserved_special_token_242|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128248": { - "content": "<|reserved_special_token_243|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128249": { - "content": "<|reserved_special_token_244|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128250": { - "content": "<|reserved_special_token_245|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128251": { - "content": "<|reserved_special_token_246|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128252": { - "content": "<|reserved_special_token_247|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128253": { - "content": "<|reserved_special_token_248|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128254": { - "content": "<|reserved_special_token_249|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128255": { - "content": "<|reserved_special_token_250|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128256": { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - } - }, - "additional_special_tokens": [ - "<|eom_id|>" - ], - "bos_token": "<|begin_of_text|>", - "clean_up_tokenization_spaces": true, - "eos_token": "<|eot_id|>", - "extra_special_tokens": {}, - "model_input_names": [ - "input_ids", - "attention_mask" - ], - "model_max_length": 1000000000000000019884624838656, - "pad_token": "<|eot_id|>", - "padding_side": "right", - "split_special_tokens": false, - "tokenizer_class": "PreTrainedTokenizerFast" -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-1630/trainer_state.json b/metallama3_8b/limo_filtered_correct/checkpoint-1630/trainer_state.json deleted file mode 100644 index f96c6fb4a539dfd6db9a11fb9bc5f825b781de24..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-1630/trainer_state.json +++ /dev/null @@ -1,11444 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 10.0, - "eval_steps": 500, - "global_step": 1630, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.006134969325153374, - "grad_norm": 5.908512115478516, - "learning_rate": 5e-06, - "loss": 0.9606, - "step": 1 - }, - { - "epoch": 0.012269938650306749, - "grad_norm": 4.304474353790283, - "learning_rate": 4.999995356617983e-06, - "loss": 0.8609, - "step": 2 - }, - { - "epoch": 0.018404907975460124, - "grad_norm": 5.63697624206543, - "learning_rate": 4.999981426489179e-06, - "loss": 1.3543, - "step": 3 - }, - { - "epoch": 0.024539877300613498, - "grad_norm": 3.6674246788024902, - "learning_rate": 4.999958209665336e-06, - "loss": 0.787, - "step": 4 - }, - { - "epoch": 0.03067484662576687, - "grad_norm": 48.14854431152344, - "learning_rate": 4.999925706232695e-06, - "loss": 1.7786, - "step": 5 - }, - { - "epoch": 0.03680981595092025, - "grad_norm": 7.8689866065979, - "learning_rate": 4.999883916312e-06, - "loss": 1.2175, - "step": 6 - }, - { - "epoch": 0.04294478527607362, - "grad_norm": 5.119968891143799, - "learning_rate": 4.9998328400584864e-06, - "loss": 0.8998, - "step": 7 - }, - { - "epoch": 0.049079754601226995, - "grad_norm": 3.730757713317871, - "learning_rate": 4.999772477661888e-06, - "loss": 0.8419, - "step": 8 - }, - { - "epoch": 0.05521472392638037, - "grad_norm": 27.314565658569336, - "learning_rate": 4.999702829346432e-06, - "loss": 1.7948, - "step": 9 - }, - { - "epoch": 0.06134969325153374, - "grad_norm": 3.822697162628174, - "learning_rate": 4.999623895370843e-06, - "loss": 1.0461, - "step": 10 - }, - { - "epoch": 0.06748466257668712, - "grad_norm": 4.71220588684082, - "learning_rate": 4.999535676028338e-06, - "loss": 1.0, - "step": 11 - }, - { - "epoch": 0.0736196319018405, - "grad_norm": 3.2378087043762207, - "learning_rate": 4.999438171646624e-06, - "loss": 0.9475, - "step": 12 - }, - { - "epoch": 0.07975460122699386, - "grad_norm": 3.475543737411499, - "learning_rate": 4.999331382587901e-06, - "loss": 0.8654, - "step": 13 - }, - { - "epoch": 0.08588957055214724, - "grad_norm": 10.06365966796875, - "learning_rate": 4.999215309248861e-06, - "loss": 1.2042, - "step": 14 - }, - { - "epoch": 0.09202453987730061, - "grad_norm": 3.785153865814209, - "learning_rate": 4.999089952060681e-06, - "loss": 0.8846, - "step": 15 - }, - { - "epoch": 0.09815950920245399, - "grad_norm": 2.944488048553467, - "learning_rate": 4.998955311489025e-06, - "loss": 0.8805, - "step": 16 - }, - { - "epoch": 0.10429447852760736, - "grad_norm": 39.89304733276367, - "learning_rate": 4.998811388034046e-06, - "loss": 1.5882, - "step": 17 - }, - { - "epoch": 0.11042944785276074, - "grad_norm": 3.5883963108062744, - "learning_rate": 4.9986581822303746e-06, - "loss": 0.9222, - "step": 18 - }, - { - "epoch": 0.1165644171779141, - "grad_norm": 6.972247123718262, - "learning_rate": 4.998495694647127e-06, - "loss": 1.4088, - "step": 19 - }, - { - "epoch": 0.12269938650306748, - "grad_norm": 3.948991298675537, - "learning_rate": 4.998323925887895e-06, - "loss": 1.454, - "step": 20 - }, - { - "epoch": 0.12883435582822086, - "grad_norm": 3.8690035343170166, - "learning_rate": 4.998142876590749e-06, - "loss": 0.6335, - "step": 21 - }, - { - "epoch": 0.13496932515337423, - "grad_norm": 5.243765830993652, - "learning_rate": 4.997952547428236e-06, - "loss": 0.6725, - "step": 22 - }, - { - "epoch": 0.1411042944785276, - "grad_norm": 3.5994043350219727, - "learning_rate": 4.997752939107372e-06, - "loss": 0.7814, - "step": 23 - }, - { - "epoch": 0.147239263803681, - "grad_norm": 4.06965970993042, - "learning_rate": 4.997544052369642e-06, - "loss": 0.9683, - "step": 24 - }, - { - "epoch": 0.15337423312883436, - "grad_norm": 3.3247246742248535, - "learning_rate": 4.997325887990999e-06, - "loss": 0.9414, - "step": 25 - }, - { - "epoch": 0.15950920245398773, - "grad_norm": 5.811742782592773, - "learning_rate": 4.997098446781861e-06, - "loss": 0.8894, - "step": 26 - }, - { - "epoch": 0.1656441717791411, - "grad_norm": 2.661334753036499, - "learning_rate": 4.996861729587103e-06, - "loss": 0.7708, - "step": 27 - }, - { - "epoch": 0.17177914110429449, - "grad_norm": 2.863943576812744, - "learning_rate": 4.996615737286061e-06, - "loss": 0.6995, - "step": 28 - }, - { - "epoch": 0.17791411042944785, - "grad_norm": 20.376733779907227, - "learning_rate": 4.996360470792524e-06, - "loss": 1.2563, - "step": 29 - }, - { - "epoch": 0.18404907975460122, - "grad_norm": 3.62265682220459, - "learning_rate": 4.996095931054731e-06, - "loss": 0.7266, - "step": 30 - }, - { - "epoch": 0.1901840490797546, - "grad_norm": 3.915076732635498, - "learning_rate": 4.9958221190553705e-06, - "loss": 0.9227, - "step": 31 - }, - { - "epoch": 0.19631901840490798, - "grad_norm": 3.129855155944824, - "learning_rate": 4.995539035811572e-06, - "loss": 0.701, - "step": 32 - }, - { - "epoch": 0.20245398773006135, - "grad_norm": 2.7532224655151367, - "learning_rate": 4.9952466823749076e-06, - "loss": 0.6491, - "step": 33 - }, - { - "epoch": 0.2085889570552147, - "grad_norm": 2.8444128036499023, - "learning_rate": 4.9949450598313835e-06, - "loss": 0.8029, - "step": 34 - }, - { - "epoch": 0.2147239263803681, - "grad_norm": 2.57743239402771, - "learning_rate": 4.994634169301439e-06, - "loss": 0.8785, - "step": 35 - }, - { - "epoch": 0.22085889570552147, - "grad_norm": 3.280055284500122, - "learning_rate": 4.994314011939941e-06, - "loss": 1.034, - "step": 36 - }, - { - "epoch": 0.22699386503067484, - "grad_norm": 2.455838680267334, - "learning_rate": 4.99398458893618e-06, - "loss": 0.8557, - "step": 37 - }, - { - "epoch": 0.2331288343558282, - "grad_norm": 4.72681188583374, - "learning_rate": 4.993645901513865e-06, - "loss": 1.1904, - "step": 38 - }, - { - "epoch": 0.2392638036809816, - "grad_norm": 3.0585641860961914, - "learning_rate": 4.993297950931121e-06, - "loss": 0.7668, - "step": 39 - }, - { - "epoch": 0.24539877300613497, - "grad_norm": 2.4603540897369385, - "learning_rate": 4.9929407384804806e-06, - "loss": 0.8812, - "step": 40 - }, - { - "epoch": 0.25153374233128833, - "grad_norm": 2.9702436923980713, - "learning_rate": 4.992574265488883e-06, - "loss": 0.8878, - "step": 41 - }, - { - "epoch": 0.25766871165644173, - "grad_norm": 2.6973602771759033, - "learning_rate": 4.9921985333176694e-06, - "loss": 0.7251, - "step": 42 - }, - { - "epoch": 0.26380368098159507, - "grad_norm": 2.5542335510253906, - "learning_rate": 4.991813543362572e-06, - "loss": 0.6638, - "step": 43 - }, - { - "epoch": 0.26993865030674846, - "grad_norm": 3.7530782222747803, - "learning_rate": 4.991419297053716e-06, - "loss": 1.0725, - "step": 44 - }, - { - "epoch": 0.27607361963190186, - "grad_norm": 2.6483025550842285, - "learning_rate": 4.991015795855611e-06, - "loss": 0.7238, - "step": 45 - }, - { - "epoch": 0.2822085889570552, - "grad_norm": 3.434422492980957, - "learning_rate": 4.990603041267144e-06, - "loss": 0.9188, - "step": 46 - }, - { - "epoch": 0.2883435582822086, - "grad_norm": 2.914340019226074, - "learning_rate": 4.990181034821578e-06, - "loss": 0.6158, - "step": 47 - }, - { - "epoch": 0.294478527607362, - "grad_norm": 2.7211625576019287, - "learning_rate": 4.98974977808654e-06, - "loss": 0.7165, - "step": 48 - }, - { - "epoch": 0.3006134969325153, - "grad_norm": 2.8414249420166016, - "learning_rate": 4.989309272664026e-06, - "loss": 0.7277, - "step": 49 - }, - { - "epoch": 0.3067484662576687, - "grad_norm": 3.683204412460327, - "learning_rate": 4.988859520190381e-06, - "loss": 0.9793, - "step": 50 - }, - { - "epoch": 0.3128834355828221, - "grad_norm": 3.1732583045959473, - "learning_rate": 4.988400522336304e-06, - "loss": 0.8966, - "step": 51 - }, - { - "epoch": 0.31901840490797545, - "grad_norm": 2.7789194583892822, - "learning_rate": 4.9879322808068365e-06, - "loss": 0.8191, - "step": 52 - }, - { - "epoch": 0.32515337423312884, - "grad_norm": 2.754816770553589, - "learning_rate": 4.987454797341358e-06, - "loss": 0.6308, - "step": 53 - }, - { - "epoch": 0.3312883435582822, - "grad_norm": 2.730104684829712, - "learning_rate": 4.98696807371358e-06, - "loss": 0.8226, - "step": 54 - }, - { - "epoch": 0.3374233128834356, - "grad_norm": 3.2225449085235596, - "learning_rate": 4.986472111731536e-06, - "loss": 0.9184, - "step": 55 - }, - { - "epoch": 0.34355828220858897, - "grad_norm": 3.2684760093688965, - "learning_rate": 4.985966913237581e-06, - "loss": 0.6593, - "step": 56 - }, - { - "epoch": 0.3496932515337423, - "grad_norm": 2.43105411529541, - "learning_rate": 4.985452480108376e-06, - "loss": 0.6994, - "step": 57 - }, - { - "epoch": 0.3558282208588957, - "grad_norm": 7.366360664367676, - "learning_rate": 4.984928814254889e-06, - "loss": 1.1374, - "step": 58 - }, - { - "epoch": 0.3619631901840491, - "grad_norm": 2.81864333152771, - "learning_rate": 4.984395917622387e-06, - "loss": 0.8097, - "step": 59 - }, - { - "epoch": 0.36809815950920244, - "grad_norm": 3.1107730865478516, - "learning_rate": 4.9838537921904206e-06, - "loss": 0.8511, - "step": 60 - }, - { - "epoch": 0.37423312883435583, - "grad_norm": 2.460545301437378, - "learning_rate": 4.9833024399728295e-06, - "loss": 0.898, - "step": 61 - }, - { - "epoch": 0.3803680981595092, - "grad_norm": 2.921992778778076, - "learning_rate": 4.982741863017722e-06, - "loss": 0.6671, - "step": 62 - }, - { - "epoch": 0.38650306748466257, - "grad_norm": 3.3006443977355957, - "learning_rate": 4.982172063407479e-06, - "loss": 1.0559, - "step": 63 - }, - { - "epoch": 0.39263803680981596, - "grad_norm": 2.642587661743164, - "learning_rate": 4.9815930432587365e-06, - "loss": 0.6663, - "step": 64 - }, - { - "epoch": 0.3987730061349693, - "grad_norm": 2.905898094177246, - "learning_rate": 4.981004804722384e-06, - "loss": 0.6895, - "step": 65 - }, - { - "epoch": 0.4049079754601227, - "grad_norm": 2.9174182415008545, - "learning_rate": 4.980407349983556e-06, - "loss": 0.7982, - "step": 66 - }, - { - "epoch": 0.4110429447852761, - "grad_norm": 2.214322805404663, - "learning_rate": 4.979800681261619e-06, - "loss": 0.6808, - "step": 67 - }, - { - "epoch": 0.4171779141104294, - "grad_norm": 2.7152462005615234, - "learning_rate": 4.9791848008101705e-06, - "loss": 0.567, - "step": 68 - }, - { - "epoch": 0.4233128834355828, - "grad_norm": 2.5657734870910645, - "learning_rate": 4.978559710917024e-06, - "loss": 0.7745, - "step": 69 - }, - { - "epoch": 0.4294478527607362, - "grad_norm": 3.9103832244873047, - "learning_rate": 4.977925413904205e-06, - "loss": 0.9815, - "step": 70 - }, - { - "epoch": 0.43558282208588955, - "grad_norm": 4.610236644744873, - "learning_rate": 4.9772819121279395e-06, - "loss": 1.164, - "step": 71 - }, - { - "epoch": 0.44171779141104295, - "grad_norm": 3.01170015335083, - "learning_rate": 4.976629207978648e-06, - "loss": 0.7587, - "step": 72 - }, - { - "epoch": 0.44785276073619634, - "grad_norm": 3.175889253616333, - "learning_rate": 4.975967303880933e-06, - "loss": 0.58, - "step": 73 - }, - { - "epoch": 0.4539877300613497, - "grad_norm": 2.503741502761841, - "learning_rate": 4.975296202293575e-06, - "loss": 0.7253, - "step": 74 - }, - { - "epoch": 0.4601226993865031, - "grad_norm": 2.6778078079223633, - "learning_rate": 4.974615905709518e-06, - "loss": 0.7352, - "step": 75 - }, - { - "epoch": 0.4662576687116564, - "grad_norm": 5.950812816619873, - "learning_rate": 4.973926416655863e-06, - "loss": 1.0643, - "step": 76 - }, - { - "epoch": 0.4723926380368098, - "grad_norm": 3.0165305137634277, - "learning_rate": 4.973227737693858e-06, - "loss": 0.6699, - "step": 77 - }, - { - "epoch": 0.4785276073619632, - "grad_norm": 4.793259620666504, - "learning_rate": 4.972519871418894e-06, - "loss": 1.0315, - "step": 78 - }, - { - "epoch": 0.48466257668711654, - "grad_norm": 3.632815361022949, - "learning_rate": 4.971802820460481e-06, - "loss": 0.7003, - "step": 79 - }, - { - "epoch": 0.49079754601226994, - "grad_norm": 3.077507734298706, - "learning_rate": 4.971076587482254e-06, - "loss": 0.6776, - "step": 80 - }, - { - "epoch": 0.49693251533742333, - "grad_norm": 3.3886241912841797, - "learning_rate": 4.970341175181957e-06, - "loss": 0.7422, - "step": 81 - }, - { - "epoch": 0.5030674846625767, - "grad_norm": 2.71288800239563, - "learning_rate": 4.969596586291425e-06, - "loss": 0.7471, - "step": 82 - }, - { - "epoch": 0.50920245398773, - "grad_norm": 2.777920961380005, - "learning_rate": 4.968842823576592e-06, - "loss": 0.8111, - "step": 83 - }, - { - "epoch": 0.5153374233128835, - "grad_norm": 6.496985912322998, - "learning_rate": 4.968079889837461e-06, - "loss": 0.9965, - "step": 84 - }, - { - "epoch": 0.5214723926380368, - "grad_norm": 2.6163430213928223, - "learning_rate": 4.967307787908108e-06, - "loss": 0.6833, - "step": 85 - }, - { - "epoch": 0.5276073619631901, - "grad_norm": 3.244098663330078, - "learning_rate": 4.966526520656663e-06, - "loss": 0.8373, - "step": 86 - }, - { - "epoch": 0.5337423312883436, - "grad_norm": 2.9027860164642334, - "learning_rate": 4.965736090985305e-06, - "loss": 0.8529, - "step": 87 - }, - { - "epoch": 0.5398773006134969, - "grad_norm": 2.3786230087280273, - "learning_rate": 4.964936501830246e-06, - "loss": 0.6577, - "step": 88 - }, - { - "epoch": 0.5460122699386503, - "grad_norm": 7.3099045753479, - "learning_rate": 4.964127756161727e-06, - "loss": 1.1184, - "step": 89 - }, - { - "epoch": 0.5521472392638037, - "grad_norm": 3.068873167037964, - "learning_rate": 4.963309856983998e-06, - "loss": 0.7906, - "step": 90 - }, - { - "epoch": 0.558282208588957, - "grad_norm": 3.082547426223755, - "learning_rate": 4.9624828073353144e-06, - "loss": 0.8107, - "step": 91 - }, - { - "epoch": 0.5644171779141104, - "grad_norm": 2.4586973190307617, - "learning_rate": 4.961646610287922e-06, - "loss": 0.7421, - "step": 92 - }, - { - "epoch": 0.5705521472392638, - "grad_norm": 2.779277801513672, - "learning_rate": 4.960801268948047e-06, - "loss": 0.7134, - "step": 93 - }, - { - "epoch": 0.5766871165644172, - "grad_norm": 3.2255213260650635, - "learning_rate": 4.959946786455882e-06, - "loss": 0.5875, - "step": 94 - }, - { - "epoch": 0.5828220858895705, - "grad_norm": 2.783395528793335, - "learning_rate": 4.959083165985581e-06, - "loss": 0.6595, - "step": 95 - }, - { - "epoch": 0.588957055214724, - "grad_norm": 2.240114212036133, - "learning_rate": 4.958210410745237e-06, - "loss": 0.793, - "step": 96 - }, - { - "epoch": 0.5950920245398773, - "grad_norm": 2.9399421215057373, - "learning_rate": 4.957328523976879e-06, - "loss": 0.5896, - "step": 97 - }, - { - "epoch": 0.6012269938650306, - "grad_norm": 3.4449355602264404, - "learning_rate": 4.956437508956458e-06, - "loss": 0.8658, - "step": 98 - }, - { - "epoch": 0.6073619631901841, - "grad_norm": 4.273710250854492, - "learning_rate": 4.9555373689938325e-06, - "loss": 0.8316, - "step": 99 - }, - { - "epoch": 0.6134969325153374, - "grad_norm": 3.4222047328948975, - "learning_rate": 4.954628107432757e-06, - "loss": 1.0613, - "step": 100 - }, - { - "epoch": 0.6196319018404908, - "grad_norm": 2.5318963527679443, - "learning_rate": 4.95370972765087e-06, - "loss": 0.7194, - "step": 101 - }, - { - "epoch": 0.6257668711656442, - "grad_norm": 2.7852585315704346, - "learning_rate": 4.952782233059683e-06, - "loss": 0.5927, - "step": 102 - }, - { - "epoch": 0.6319018404907976, - "grad_norm": 2.6532323360443115, - "learning_rate": 4.951845627104565e-06, - "loss": 0.8505, - "step": 103 - }, - { - "epoch": 0.6380368098159509, - "grad_norm": 2.3213467597961426, - "learning_rate": 4.95089991326473e-06, - "loss": 0.8682, - "step": 104 - }, - { - "epoch": 0.6441717791411042, - "grad_norm": 2.607992649078369, - "learning_rate": 4.9499450950532305e-06, - "loss": 0.8735, - "step": 105 - }, - { - "epoch": 0.6503067484662577, - "grad_norm": 3.9820072650909424, - "learning_rate": 4.94898117601693e-06, - "loss": 1.0571, - "step": 106 - }, - { - "epoch": 0.656441717791411, - "grad_norm": 3.3878824710845947, - "learning_rate": 4.948008159736507e-06, - "loss": 0.7831, - "step": 107 - }, - { - "epoch": 0.6625766871165644, - "grad_norm": 2.6935670375823975, - "learning_rate": 4.94702604982643e-06, - "loss": 0.5968, - "step": 108 - }, - { - "epoch": 0.6687116564417178, - "grad_norm": 2.78190016746521, - "learning_rate": 4.9460348499349485e-06, - "loss": 0.7504, - "step": 109 - }, - { - "epoch": 0.6748466257668712, - "grad_norm": 2.973083972930908, - "learning_rate": 4.945034563744077e-06, - "loss": 0.6728, - "step": 110 - }, - { - "epoch": 0.6809815950920245, - "grad_norm": 2.631803512573242, - "learning_rate": 4.944025194969586e-06, - "loss": 0.609, - "step": 111 - }, - { - "epoch": 0.6871165644171779, - "grad_norm": 2.7443883419036865, - "learning_rate": 4.9430067473609825e-06, - "loss": 0.8713, - "step": 112 - }, - { - "epoch": 0.6932515337423313, - "grad_norm": 2.543769121170044, - "learning_rate": 4.941979224701499e-06, - "loss": 0.8035, - "step": 113 - }, - { - "epoch": 0.6993865030674846, - "grad_norm": 3.7799901962280273, - "learning_rate": 4.94094263080808e-06, - "loss": 0.9341, - "step": 114 - }, - { - "epoch": 0.7055214723926381, - "grad_norm": 3.1234734058380127, - "learning_rate": 4.939896969531367e-06, - "loss": 1.1066, - "step": 115 - }, - { - "epoch": 0.7116564417177914, - "grad_norm": 2.356036424636841, - "learning_rate": 4.938842244755683e-06, - "loss": 0.853, - "step": 116 - }, - { - "epoch": 0.7177914110429447, - "grad_norm": 3.6231274604797363, - "learning_rate": 4.937778460399022e-06, - "loss": 0.9116, - "step": 117 - }, - { - "epoch": 0.7239263803680982, - "grad_norm": 3.1277005672454834, - "learning_rate": 4.936705620413028e-06, - "loss": 0.5888, - "step": 118 - }, - { - "epoch": 0.7300613496932515, - "grad_norm": 2.7338361740112305, - "learning_rate": 4.935623728782986e-06, - "loss": 0.592, - "step": 119 - }, - { - "epoch": 0.7361963190184049, - "grad_norm": 2.748363733291626, - "learning_rate": 4.934532789527805e-06, - "loss": 0.8713, - "step": 120 - }, - { - "epoch": 0.7423312883435583, - "grad_norm": 4.460031986236572, - "learning_rate": 4.933432806700004e-06, - "loss": 0.6791, - "step": 121 - }, - { - "epoch": 0.7484662576687117, - "grad_norm": 2.392911911010742, - "learning_rate": 4.932323784385693e-06, - "loss": 0.7531, - "step": 122 - }, - { - "epoch": 0.754601226993865, - "grad_norm": 2.7804384231567383, - "learning_rate": 4.931205726704566e-06, - "loss": 0.7547, - "step": 123 - }, - { - "epoch": 0.7607361963190185, - "grad_norm": 2.7664780616760254, - "learning_rate": 4.930078637809878e-06, - "loss": 0.7849, - "step": 124 - }, - { - "epoch": 0.7668711656441718, - "grad_norm": 2.592808723449707, - "learning_rate": 4.928942521888431e-06, - "loss": 0.7015, - "step": 125 - }, - { - "epoch": 0.7730061349693251, - "grad_norm": 2.7080585956573486, - "learning_rate": 4.927797383160561e-06, - "loss": 1.0028, - "step": 126 - }, - { - "epoch": 0.7791411042944786, - "grad_norm": 2.7941503524780273, - "learning_rate": 4.926643225880123e-06, - "loss": 0.602, - "step": 127 - }, - { - "epoch": 0.7852760736196319, - "grad_norm": 3.2796623706817627, - "learning_rate": 4.925480054334471e-06, - "loss": 0.7473, - "step": 128 - }, - { - "epoch": 0.7914110429447853, - "grad_norm": 2.7623610496520996, - "learning_rate": 4.924307872844444e-06, - "loss": 1.0573, - "step": 129 - }, - { - "epoch": 0.7975460122699386, - "grad_norm": 2.6224453449249268, - "learning_rate": 4.923126685764351e-06, - "loss": 0.7399, - "step": 130 - }, - { - "epoch": 0.803680981595092, - "grad_norm": 17.736326217651367, - "learning_rate": 4.921936497481956e-06, - "loss": 0.9548, - "step": 131 - }, - { - "epoch": 0.8098159509202454, - "grad_norm": 2.504213333129883, - "learning_rate": 4.920737312418456e-06, - "loss": 0.6748, - "step": 132 - }, - { - "epoch": 0.8159509202453987, - "grad_norm": 3.617077350616455, - "learning_rate": 4.919529135028473e-06, - "loss": 0.8431, - "step": 133 - }, - { - "epoch": 0.8220858895705522, - "grad_norm": 2.6559832096099854, - "learning_rate": 4.918311969800027e-06, - "loss": 0.7243, - "step": 134 - }, - { - "epoch": 0.8282208588957055, - "grad_norm": 2.7539305686950684, - "learning_rate": 4.917085821254532e-06, - "loss": 0.7845, - "step": 135 - }, - { - "epoch": 0.8343558282208589, - "grad_norm": 3.3587615489959717, - "learning_rate": 4.915850693946766e-06, - "loss": 0.4891, - "step": 136 - }, - { - "epoch": 0.8404907975460123, - "grad_norm": 3.064354181289673, - "learning_rate": 4.914606592464865e-06, - "loss": 0.7917, - "step": 137 - }, - { - "epoch": 0.8466257668711656, - "grad_norm": 3.2505199909210205, - "learning_rate": 4.9133535214303e-06, - "loss": 0.9681, - "step": 138 - }, - { - "epoch": 0.852760736196319, - "grad_norm": 3.8027830123901367, - "learning_rate": 4.91209148549786e-06, - "loss": 0.9275, - "step": 139 - }, - { - "epoch": 0.8588957055214724, - "grad_norm": 2.4154372215270996, - "learning_rate": 4.910820489355637e-06, - "loss": 0.7259, - "step": 140 - }, - { - "epoch": 0.8650306748466258, - "grad_norm": 2.892462968826294, - "learning_rate": 4.909540537725007e-06, - "loss": 0.6061, - "step": 141 - }, - { - "epoch": 0.8711656441717791, - "grad_norm": 3.3398196697235107, - "learning_rate": 4.908251635360616e-06, - "loss": 1.0559, - "step": 142 - }, - { - "epoch": 0.8773006134969326, - "grad_norm": 3.022512197494507, - "learning_rate": 4.906953787050354e-06, - "loss": 0.7372, - "step": 143 - }, - { - "epoch": 0.8834355828220859, - "grad_norm": 2.658661365509033, - "learning_rate": 4.905646997615347e-06, - "loss": 0.6234, - "step": 144 - }, - { - "epoch": 0.8895705521472392, - "grad_norm": 3.454400062561035, - "learning_rate": 4.904331271909932e-06, - "loss": 0.8066, - "step": 145 - }, - { - "epoch": 0.8957055214723927, - "grad_norm": 3.1300277709960938, - "learning_rate": 4.903006614821645e-06, - "loss": 0.6861, - "step": 146 - }, - { - "epoch": 0.901840490797546, - "grad_norm": 2.362537145614624, - "learning_rate": 4.901673031271194e-06, - "loss": 0.6112, - "step": 147 - }, - { - "epoch": 0.9079754601226994, - "grad_norm": 3.375577688217163, - "learning_rate": 4.900330526212451e-06, - "loss": 0.6314, - "step": 148 - }, - { - "epoch": 0.9141104294478528, - "grad_norm": 2.955656051635742, - "learning_rate": 4.898979104632427e-06, - "loss": 0.889, - "step": 149 - }, - { - "epoch": 0.9202453987730062, - "grad_norm": 2.9285926818847656, - "learning_rate": 4.897618771551255e-06, - "loss": 0.6406, - "step": 150 - }, - { - "epoch": 0.9263803680981595, - "grad_norm": 2.131819725036621, - "learning_rate": 4.8962495320221714e-06, - "loss": 0.6368, - "step": 151 - }, - { - "epoch": 0.9325153374233128, - "grad_norm": 2.780649185180664, - "learning_rate": 4.8948713911315e-06, - "loss": 0.8642, - "step": 152 - }, - { - "epoch": 0.9386503067484663, - "grad_norm": 2.941500186920166, - "learning_rate": 4.8934843539986266e-06, - "loss": 0.714, - "step": 153 - }, - { - "epoch": 0.9447852760736196, - "grad_norm": 2.7729203701019287, - "learning_rate": 4.892088425775986e-06, - "loss": 0.8365, - "step": 154 - }, - { - "epoch": 0.950920245398773, - "grad_norm": 2.6887171268463135, - "learning_rate": 4.890683611649041e-06, - "loss": 0.7937, - "step": 155 - }, - { - "epoch": 0.9570552147239264, - "grad_norm": 3.7638463973999023, - "learning_rate": 4.8892699168362626e-06, - "loss": 0.7485, - "step": 156 - }, - { - "epoch": 0.9631901840490797, - "grad_norm": 2.8132755756378174, - "learning_rate": 4.887847346589111e-06, - "loss": 0.6467, - "step": 157 - }, - { - "epoch": 0.9693251533742331, - "grad_norm": 2.652247190475464, - "learning_rate": 4.886415906192015e-06, - "loss": 0.4651, - "step": 158 - }, - { - "epoch": 0.9754601226993865, - "grad_norm": 2.5854647159576416, - "learning_rate": 4.884975600962355e-06, - "loss": 0.8756, - "step": 159 - }, - { - "epoch": 0.9815950920245399, - "grad_norm": 3.1630544662475586, - "learning_rate": 4.883526436250441e-06, - "loss": 0.7339, - "step": 160 - }, - { - "epoch": 0.9877300613496932, - "grad_norm": 2.84452748298645, - "learning_rate": 4.8820684174394935e-06, - "loss": 0.7808, - "step": 161 - }, - { - "epoch": 0.9938650306748467, - "grad_norm": 3.604048490524292, - "learning_rate": 4.880601549945622e-06, - "loss": 0.96, - "step": 162 - }, - { - "epoch": 1.0, - "grad_norm": 2.302924871444702, - "learning_rate": 4.879125839217808e-06, - "loss": 0.8122, - "step": 163 - }, - { - "epoch": 1.0061349693251533, - "grad_norm": 3.1254405975341797, - "learning_rate": 4.8776412907378845e-06, - "loss": 0.7307, - "step": 164 - }, - { - "epoch": 1.0122699386503067, - "grad_norm": 2.745603322982788, - "learning_rate": 4.8761479100205085e-06, - "loss": 0.7554, - "step": 165 - }, - { - "epoch": 1.01840490797546, - "grad_norm": 2.494840145111084, - "learning_rate": 4.874645702613152e-06, - "loss": 0.4372, - "step": 166 - }, - { - "epoch": 1.0245398773006136, - "grad_norm": 2.3526735305786133, - "learning_rate": 4.873134674096072e-06, - "loss": 0.3597, - "step": 167 - }, - { - "epoch": 1.030674846625767, - "grad_norm": 2.945887804031372, - "learning_rate": 4.871614830082297e-06, - "loss": 0.5854, - "step": 168 - }, - { - "epoch": 1.0368098159509203, - "grad_norm": 3.5723934173583984, - "learning_rate": 4.870086176217597e-06, - "loss": 0.7978, - "step": 169 - }, - { - "epoch": 1.0429447852760736, - "grad_norm": 3.2997145652770996, - "learning_rate": 4.868548718180473e-06, - "loss": 0.5593, - "step": 170 - }, - { - "epoch": 1.049079754601227, - "grad_norm": 3.4120635986328125, - "learning_rate": 4.867002461682129e-06, - "loss": 0.4083, - "step": 171 - }, - { - "epoch": 1.0552147239263803, - "grad_norm": 2.697617292404175, - "learning_rate": 4.8654474124664505e-06, - "loss": 0.4752, - "step": 172 - }, - { - "epoch": 1.0613496932515338, - "grad_norm": 5.082247734069824, - "learning_rate": 4.863883576309991e-06, - "loss": 0.7435, - "step": 173 - }, - { - "epoch": 1.0674846625766872, - "grad_norm": 2.773864984512329, - "learning_rate": 4.8623109590219395e-06, - "loss": 0.4612, - "step": 174 - }, - { - "epoch": 1.0736196319018405, - "grad_norm": 3.429703712463379, - "learning_rate": 4.860729566444106e-06, - "loss": 0.4644, - "step": 175 - }, - { - "epoch": 1.0797546012269938, - "grad_norm": 2.997938394546509, - "learning_rate": 4.8591394044508985e-06, - "loss": 0.4852, - "step": 176 - }, - { - "epoch": 1.0858895705521472, - "grad_norm": 2.549513339996338, - "learning_rate": 4.857540478949302e-06, - "loss": 0.4574, - "step": 177 - }, - { - "epoch": 1.0920245398773005, - "grad_norm": 3.459400177001953, - "learning_rate": 4.855932795878852e-06, - "loss": 0.8095, - "step": 178 - }, - { - "epoch": 1.098159509202454, - "grad_norm": 2.8103644847869873, - "learning_rate": 4.854316361211619e-06, - "loss": 0.4578, - "step": 179 - }, - { - "epoch": 1.1042944785276074, - "grad_norm": 2.631221055984497, - "learning_rate": 4.852691180952183e-06, - "loss": 0.5473, - "step": 180 - }, - { - "epoch": 1.1104294478527608, - "grad_norm": 3.189946174621582, - "learning_rate": 4.851057261137608e-06, - "loss": 0.4313, - "step": 181 - }, - { - "epoch": 1.116564417177914, - "grad_norm": 2.891418933868408, - "learning_rate": 4.8494146078374274e-06, - "loss": 0.4197, - "step": 182 - }, - { - "epoch": 1.1226993865030674, - "grad_norm": 3.239637613296509, - "learning_rate": 4.847763227153612e-06, - "loss": 0.5865, - "step": 183 - }, - { - "epoch": 1.1288343558282208, - "grad_norm": 2.484644651412964, - "learning_rate": 4.846103125220557e-06, - "loss": 0.3866, - "step": 184 - }, - { - "epoch": 1.1349693251533743, - "grad_norm": 3.1045992374420166, - "learning_rate": 4.844434308205052e-06, - "loss": 0.5357, - "step": 185 - }, - { - "epoch": 1.1411042944785277, - "grad_norm": 2.648472309112549, - "learning_rate": 4.842756782306261e-06, - "loss": 0.4783, - "step": 186 - }, - { - "epoch": 1.147239263803681, - "grad_norm": 2.5685644149780273, - "learning_rate": 4.841070553755697e-06, - "loss": 0.3733, - "step": 187 - }, - { - "epoch": 1.1533742331288344, - "grad_norm": 3.7727200984954834, - "learning_rate": 4.839375628817205e-06, - "loss": 0.6039, - "step": 188 - }, - { - "epoch": 1.1595092024539877, - "grad_norm": 2.8237369060516357, - "learning_rate": 4.837672013786931e-06, - "loss": 0.5372, - "step": 189 - }, - { - "epoch": 1.165644171779141, - "grad_norm": 3.0312252044677734, - "learning_rate": 4.835959714993305e-06, - "loss": 0.5162, - "step": 190 - }, - { - "epoch": 1.1717791411042944, - "grad_norm": 2.821498394012451, - "learning_rate": 4.8342387387970105e-06, - "loss": 0.4537, - "step": 191 - }, - { - "epoch": 1.177914110429448, - "grad_norm": 2.7834129333496094, - "learning_rate": 4.832509091590968e-06, - "loss": 0.6165, - "step": 192 - }, - { - "epoch": 1.1840490797546013, - "grad_norm": 2.9274091720581055, - "learning_rate": 4.830770779800309e-06, - "loss": 0.7475, - "step": 193 - }, - { - "epoch": 1.1901840490797546, - "grad_norm": 2.813945770263672, - "learning_rate": 4.829023809882349e-06, - "loss": 0.4629, - "step": 194 - }, - { - "epoch": 1.196319018404908, - "grad_norm": 2.27876877784729, - "learning_rate": 4.827268188326567e-06, - "loss": 0.5208, - "step": 195 - }, - { - "epoch": 1.2024539877300613, - "grad_norm": 2.8444204330444336, - "learning_rate": 4.825503921654582e-06, - "loss": 0.6521, - "step": 196 - }, - { - "epoch": 1.2085889570552146, - "grad_norm": 3.3730578422546387, - "learning_rate": 4.823731016420122e-06, - "loss": 0.7491, - "step": 197 - }, - { - "epoch": 1.2147239263803682, - "grad_norm": 2.9717822074890137, - "learning_rate": 4.821949479209011e-06, - "loss": 0.3866, - "step": 198 - }, - { - "epoch": 1.2208588957055215, - "grad_norm": 2.6570653915405273, - "learning_rate": 4.820159316639133e-06, - "loss": 0.499, - "step": 199 - }, - { - "epoch": 1.2269938650306749, - "grad_norm": 2.819960117340088, - "learning_rate": 4.818360535360418e-06, - "loss": 0.556, - "step": 200 - }, - { - "epoch": 1.2331288343558282, - "grad_norm": 2.7912111282348633, - "learning_rate": 4.816553142054806e-06, - "loss": 0.3433, - "step": 201 - }, - { - "epoch": 1.2392638036809815, - "grad_norm": 2.6427981853485107, - "learning_rate": 4.814737143436232e-06, - "loss": 0.8808, - "step": 202 - }, - { - "epoch": 1.2453987730061349, - "grad_norm": 2.5917580127716064, - "learning_rate": 4.812912546250595e-06, - "loss": 0.5718, - "step": 203 - }, - { - "epoch": 1.2515337423312882, - "grad_norm": 3.770759344100952, - "learning_rate": 4.81107935727574e-06, - "loss": 0.9743, - "step": 204 - }, - { - "epoch": 1.2576687116564418, - "grad_norm": 2.558248996734619, - "learning_rate": 4.809237583321421e-06, - "loss": 0.2821, - "step": 205 - }, - { - "epoch": 1.2638036809815951, - "grad_norm": 2.692087173461914, - "learning_rate": 4.807387231229287e-06, - "loss": 0.7524, - "step": 206 - }, - { - "epoch": 1.2699386503067485, - "grad_norm": 2.661738157272339, - "learning_rate": 4.8055283078728525e-06, - "loss": 0.4304, - "step": 207 - }, - { - "epoch": 1.2760736196319018, - "grad_norm": 2.9232122898101807, - "learning_rate": 4.803660820157468e-06, - "loss": 0.6986, - "step": 208 - }, - { - "epoch": 1.2822085889570551, - "grad_norm": 2.665097951889038, - "learning_rate": 4.801784775020303e-06, - "loss": 0.7112, - "step": 209 - }, - { - "epoch": 1.2883435582822087, - "grad_norm": 2.4504497051239014, - "learning_rate": 4.799900179430312e-06, - "loss": 0.4125, - "step": 210 - }, - { - "epoch": 1.294478527607362, - "grad_norm": 3.076204538345337, - "learning_rate": 4.798007040388212e-06, - "loss": 0.7057, - "step": 211 - }, - { - "epoch": 1.3006134969325154, - "grad_norm": 2.406977653503418, - "learning_rate": 4.7961053649264585e-06, - "loss": 0.708, - "step": 212 - }, - { - "epoch": 1.3067484662576687, - "grad_norm": 2.6545324325561523, - "learning_rate": 4.794195160109215e-06, - "loss": 0.7608, - "step": 213 - }, - { - "epoch": 1.312883435582822, - "grad_norm": 4.3817033767700195, - "learning_rate": 4.7922764330323315e-06, - "loss": 0.4779, - "step": 214 - }, - { - "epoch": 1.3190184049079754, - "grad_norm": 3.534566879272461, - "learning_rate": 4.790349190823313e-06, - "loss": 0.5464, - "step": 215 - }, - { - "epoch": 1.3251533742331287, - "grad_norm": 3.0323140621185303, - "learning_rate": 4.788413440641297e-06, - "loss": 0.6198, - "step": 216 - }, - { - "epoch": 1.331288343558282, - "grad_norm": 2.612746238708496, - "learning_rate": 4.786469189677026e-06, - "loss": 0.6695, - "step": 217 - }, - { - "epoch": 1.3374233128834356, - "grad_norm": 3.0299434661865234, - "learning_rate": 4.784516445152821e-06, - "loss": 0.4902, - "step": 218 - }, - { - "epoch": 1.343558282208589, - "grad_norm": 3.4521942138671875, - "learning_rate": 4.78255521432255e-06, - "loss": 0.7411, - "step": 219 - }, - { - "epoch": 1.3496932515337423, - "grad_norm": 2.6712653636932373, - "learning_rate": 4.780585504471612e-06, - "loss": 0.8767, - "step": 220 - }, - { - "epoch": 1.3558282208588956, - "grad_norm": 2.5099475383758545, - "learning_rate": 4.778607322916896e-06, - "loss": 0.4266, - "step": 221 - }, - { - "epoch": 1.3619631901840492, - "grad_norm": 2.641799211502075, - "learning_rate": 4.776620677006766e-06, - "loss": 0.4982, - "step": 222 - }, - { - "epoch": 1.3680981595092025, - "grad_norm": 3.1119771003723145, - "learning_rate": 4.7746255741210256e-06, - "loss": 0.6012, - "step": 223 - }, - { - "epoch": 1.3742331288343559, - "grad_norm": 3.9957170486450195, - "learning_rate": 4.772622021670897e-06, - "loss": 0.7585, - "step": 224 - }, - { - "epoch": 1.3803680981595092, - "grad_norm": 3.1070823669433594, - "learning_rate": 4.770610027098983e-06, - "loss": 0.5266, - "step": 225 - }, - { - "epoch": 1.3865030674846626, - "grad_norm": 2.7630460262298584, - "learning_rate": 4.7685895978792564e-06, - "loss": 0.6261, - "step": 226 - }, - { - "epoch": 1.392638036809816, - "grad_norm": 2.6509556770324707, - "learning_rate": 4.766560741517014e-06, - "loss": 0.7081, - "step": 227 - }, - { - "epoch": 1.3987730061349692, - "grad_norm": 3.0212976932525635, - "learning_rate": 4.76452346554886e-06, - "loss": 0.5041, - "step": 228 - }, - { - "epoch": 1.4049079754601226, - "grad_norm": 3.0454728603363037, - "learning_rate": 4.762477777542676e-06, - "loss": 0.49, - "step": 229 - }, - { - "epoch": 1.4110429447852761, - "grad_norm": 3.4296791553497314, - "learning_rate": 4.7604236850975905e-06, - "loss": 0.7056, - "step": 230 - }, - { - "epoch": 1.4171779141104295, - "grad_norm": 4.1885600090026855, - "learning_rate": 4.7583611958439514e-06, - "loss": 0.7762, - "step": 231 - }, - { - "epoch": 1.4233128834355828, - "grad_norm": 3.065854072570801, - "learning_rate": 4.7562903174433e-06, - "loss": 0.5347, - "step": 232 - }, - { - "epoch": 1.4294478527607362, - "grad_norm": 2.793851852416992, - "learning_rate": 4.75421105758834e-06, - "loss": 0.503, - "step": 233 - }, - { - "epoch": 1.4355828220858895, - "grad_norm": 3.123730421066284, - "learning_rate": 4.752123424002908e-06, - "loss": 0.5081, - "step": 234 - }, - { - "epoch": 1.441717791411043, - "grad_norm": 3.230161666870117, - "learning_rate": 4.750027424441949e-06, - "loss": 0.7523, - "step": 235 - }, - { - "epoch": 1.4478527607361964, - "grad_norm": 2.4970247745513916, - "learning_rate": 4.747923066691487e-06, - "loss": 0.5575, - "step": 236 - }, - { - "epoch": 1.4539877300613497, - "grad_norm": 2.9880685806274414, - "learning_rate": 4.745810358568588e-06, - "loss": 0.7264, - "step": 237 - }, - { - "epoch": 1.460122699386503, - "grad_norm": 2.555328369140625, - "learning_rate": 4.743689307921342e-06, - "loss": 0.4545, - "step": 238 - }, - { - "epoch": 1.4662576687116564, - "grad_norm": 3.144932746887207, - "learning_rate": 4.741559922628828e-06, - "loss": 0.5429, - "step": 239 - }, - { - "epoch": 1.4723926380368098, - "grad_norm": 3.059807062149048, - "learning_rate": 4.739422210601085e-06, - "loss": 0.5086, - "step": 240 - }, - { - "epoch": 1.478527607361963, - "grad_norm": 3.374303102493286, - "learning_rate": 4.7372761797790836e-06, - "loss": 0.6109, - "step": 241 - }, - { - "epoch": 1.4846625766871164, - "grad_norm": 2.4506947994232178, - "learning_rate": 4.735121838134697e-06, - "loss": 0.4317, - "step": 242 - }, - { - "epoch": 1.49079754601227, - "grad_norm": 2.9039974212646484, - "learning_rate": 4.732959193670672e-06, - "loss": 0.6414, - "step": 243 - }, - { - "epoch": 1.4969325153374233, - "grad_norm": 2.9412453174591064, - "learning_rate": 4.730788254420593e-06, - "loss": 0.5166, - "step": 244 - }, - { - "epoch": 1.5030674846625767, - "grad_norm": 2.500716209411621, - "learning_rate": 4.728609028448862e-06, - "loss": 0.4982, - "step": 245 - }, - { - "epoch": 1.50920245398773, - "grad_norm": 2.4233803749084473, - "learning_rate": 4.726421523850662e-06, - "loss": 0.7552, - "step": 246 - }, - { - "epoch": 1.5153374233128836, - "grad_norm": 2.357003688812256, - "learning_rate": 4.7242257487519275e-06, - "loss": 0.4365, - "step": 247 - }, - { - "epoch": 1.521472392638037, - "grad_norm": 2.6406495571136475, - "learning_rate": 4.722021711309317e-06, - "loss": 0.6002, - "step": 248 - }, - { - "epoch": 1.5276073619631902, - "grad_norm": 2.736884832382202, - "learning_rate": 4.7198094197101826e-06, - "loss": 0.4993, - "step": 249 - }, - { - "epoch": 1.5337423312883436, - "grad_norm": 3.5238845348358154, - "learning_rate": 4.7175888821725335e-06, - "loss": 0.4637, - "step": 250 - }, - { - "epoch": 1.539877300613497, - "grad_norm": 3.3783695697784424, - "learning_rate": 4.715360106945015e-06, - "loss": 0.9711, - "step": 251 - }, - { - "epoch": 1.5460122699386503, - "grad_norm": 2.9685862064361572, - "learning_rate": 4.713123102306869e-06, - "loss": 0.5452, - "step": 252 - }, - { - "epoch": 1.5521472392638036, - "grad_norm": 3.143733263015747, - "learning_rate": 4.710877876567912e-06, - "loss": 0.5034, - "step": 253 - }, - { - "epoch": 1.558282208588957, - "grad_norm": 2.8005623817443848, - "learning_rate": 4.708624438068494e-06, - "loss": 0.4236, - "step": 254 - }, - { - "epoch": 1.5644171779141103, - "grad_norm": 2.66581130027771, - "learning_rate": 4.706362795179476e-06, - "loss": 0.6095, - "step": 255 - }, - { - "epoch": 1.5705521472392638, - "grad_norm": 4.598043441772461, - "learning_rate": 4.7040929563021975e-06, - "loss": 0.738, - "step": 256 - }, - { - "epoch": 1.5766871165644172, - "grad_norm": 3.5643506050109863, - "learning_rate": 4.70181492986844e-06, - "loss": 0.6726, - "step": 257 - }, - { - "epoch": 1.5828220858895705, - "grad_norm": 2.865339994430542, - "learning_rate": 4.699528724340401e-06, - "loss": 0.4862, - "step": 258 - }, - { - "epoch": 1.588957055214724, - "grad_norm": 2.95529842376709, - "learning_rate": 4.6972343482106615e-06, - "loss": 0.5003, - "step": 259 - }, - { - "epoch": 1.5950920245398774, - "grad_norm": 2.45206356048584, - "learning_rate": 4.6949318100021546e-06, - "loss": 0.6734, - "step": 260 - }, - { - "epoch": 1.6012269938650308, - "grad_norm": 2.6789939403533936, - "learning_rate": 4.6926211182681295e-06, - "loss": 0.5639, - "step": 261 - }, - { - "epoch": 1.607361963190184, - "grad_norm": 3.307732582092285, - "learning_rate": 4.690302281592128e-06, - "loss": 0.7032, - "step": 262 - }, - { - "epoch": 1.6134969325153374, - "grad_norm": 2.8950445652008057, - "learning_rate": 4.687975308587944e-06, - "loss": 0.4937, - "step": 263 - }, - { - "epoch": 1.6196319018404908, - "grad_norm": 2.969377040863037, - "learning_rate": 4.685640207899598e-06, - "loss": 0.5829, - "step": 264 - }, - { - "epoch": 1.6257668711656441, - "grad_norm": 3.106433391571045, - "learning_rate": 4.683296988201301e-06, - "loss": 0.3805, - "step": 265 - }, - { - "epoch": 1.6319018404907975, - "grad_norm": 3.5599050521850586, - "learning_rate": 4.680945658197425e-06, - "loss": 0.7939, - "step": 266 - }, - { - "epoch": 1.6380368098159508, - "grad_norm": 5.008603096008301, - "learning_rate": 4.6785862266224695e-06, - "loss": 0.7511, - "step": 267 - }, - { - "epoch": 1.6441717791411041, - "grad_norm": 3.1393773555755615, - "learning_rate": 4.676218702241026e-06, - "loss": 0.8984, - "step": 268 - }, - { - "epoch": 1.6503067484662577, - "grad_norm": 3.0241408348083496, - "learning_rate": 4.673843093847753e-06, - "loss": 0.5473, - "step": 269 - }, - { - "epoch": 1.656441717791411, - "grad_norm": 2.9029417037963867, - "learning_rate": 4.6714594102673355e-06, - "loss": 0.6626, - "step": 270 - }, - { - "epoch": 1.6625766871165644, - "grad_norm": 3.4709246158599854, - "learning_rate": 4.669067660354456e-06, - "loss": 0.5015, - "step": 271 - }, - { - "epoch": 1.668711656441718, - "grad_norm": 2.988635778427124, - "learning_rate": 4.666667852993761e-06, - "loss": 0.5384, - "step": 272 - }, - { - "epoch": 1.6748466257668713, - "grad_norm": 3.418140411376953, - "learning_rate": 4.664259997099829e-06, - "loss": 0.7491, - "step": 273 - }, - { - "epoch": 1.6809815950920246, - "grad_norm": 2.592416763305664, - "learning_rate": 4.661844101617135e-06, - "loss": 0.6451, - "step": 274 - }, - { - "epoch": 1.687116564417178, - "grad_norm": 3.1174306869506836, - "learning_rate": 4.6594201755200205e-06, - "loss": 0.6299, - "step": 275 - }, - { - "epoch": 1.6932515337423313, - "grad_norm": 2.6569998264312744, - "learning_rate": 4.656988227812658e-06, - "loss": 0.4477, - "step": 276 - }, - { - "epoch": 1.6993865030674846, - "grad_norm": 3.5733959674835205, - "learning_rate": 4.654548267529015e-06, - "loss": 0.5473, - "step": 277 - }, - { - "epoch": 1.705521472392638, - "grad_norm": 2.7240824699401855, - "learning_rate": 4.652100303732827e-06, - "loss": 0.496, - "step": 278 - }, - { - "epoch": 1.7116564417177913, - "grad_norm": 4.1965460777282715, - "learning_rate": 4.64964434551756e-06, - "loss": 0.932, - "step": 279 - }, - { - "epoch": 1.7177914110429446, - "grad_norm": 2.3237173557281494, - "learning_rate": 4.647180402006372e-06, - "loss": 0.4648, - "step": 280 - }, - { - "epoch": 1.7239263803680982, - "grad_norm": 3.395045042037964, - "learning_rate": 4.644708482352093e-06, - "loss": 0.7237, - "step": 281 - }, - { - "epoch": 1.7300613496932515, - "grad_norm": 3.238593816757202, - "learning_rate": 4.6422285957371735e-06, - "loss": 0.5531, - "step": 282 - }, - { - "epoch": 1.7361963190184049, - "grad_norm": 3.9651403427124023, - "learning_rate": 4.639740751373663e-06, - "loss": 0.6706, - "step": 283 - }, - { - "epoch": 1.7423312883435584, - "grad_norm": 3.0042061805725098, - "learning_rate": 4.63724495850317e-06, - "loss": 0.56, - "step": 284 - }, - { - "epoch": 1.7484662576687118, - "grad_norm": 3.094310760498047, - "learning_rate": 4.634741226396832e-06, - "loss": 0.6138, - "step": 285 - }, - { - "epoch": 1.7546012269938651, - "grad_norm": 2.838168144226074, - "learning_rate": 4.632229564355275e-06, - "loss": 0.4908, - "step": 286 - }, - { - "epoch": 1.7607361963190185, - "grad_norm": 3.3452796936035156, - "learning_rate": 4.629709981708586e-06, - "loss": 0.8181, - "step": 287 - }, - { - "epoch": 1.7668711656441718, - "grad_norm": 2.6630783081054688, - "learning_rate": 4.6271824878162704e-06, - "loss": 0.5625, - "step": 288 - }, - { - "epoch": 1.7730061349693251, - "grad_norm": 2.583650588989258, - "learning_rate": 4.624647092067226e-06, - "loss": 0.3416, - "step": 289 - }, - { - "epoch": 1.7791411042944785, - "grad_norm": 2.73132586479187, - "learning_rate": 4.622103803879702e-06, - "loss": 0.3889, - "step": 290 - }, - { - "epoch": 1.7852760736196318, - "grad_norm": 4.1010260581970215, - "learning_rate": 4.619552632701263e-06, - "loss": 0.611, - "step": 291 - }, - { - "epoch": 1.7914110429447851, - "grad_norm": 4.53068208694458, - "learning_rate": 4.61699358800876e-06, - "loss": 0.7219, - "step": 292 - }, - { - "epoch": 1.7975460122699385, - "grad_norm": 3.4877254962921143, - "learning_rate": 4.614426679308291e-06, - "loss": 0.6402, - "step": 293 - }, - { - "epoch": 1.803680981595092, - "grad_norm": 2.9445226192474365, - "learning_rate": 4.611851916135166e-06, - "loss": 0.509, - "step": 294 - }, - { - "epoch": 1.8098159509202454, - "grad_norm": 2.6622228622436523, - "learning_rate": 4.609269308053872e-06, - "loss": 0.6167, - "step": 295 - }, - { - "epoch": 1.8159509202453987, - "grad_norm": 3.131530523300171, - "learning_rate": 4.606678864658039e-06, - "loss": 0.8039, - "step": 296 - }, - { - "epoch": 1.8220858895705523, - "grad_norm": 3.212188482284546, - "learning_rate": 4.604080595570399e-06, - "loss": 0.5754, - "step": 297 - }, - { - "epoch": 1.8282208588957056, - "grad_norm": 3.522850275039673, - "learning_rate": 4.601474510442759e-06, - "loss": 0.4432, - "step": 298 - }, - { - "epoch": 1.834355828220859, - "grad_norm": 2.5877151489257812, - "learning_rate": 4.598860618955957e-06, - "loss": 0.6541, - "step": 299 - }, - { - "epoch": 1.8404907975460123, - "grad_norm": 2.803833484649658, - "learning_rate": 4.596238930819832e-06, - "loss": 0.5824, - "step": 300 - }, - { - "epoch": 1.8466257668711656, - "grad_norm": 2.7125494480133057, - "learning_rate": 4.5936094557731815e-06, - "loss": 0.6976, - "step": 301 - }, - { - "epoch": 1.852760736196319, - "grad_norm": 3.6549370288848877, - "learning_rate": 4.590972203583732e-06, - "loss": 0.7105, - "step": 302 - }, - { - "epoch": 1.8588957055214723, - "grad_norm": 3.3241944313049316, - "learning_rate": 4.588327184048099e-06, - "loss": 0.7446, - "step": 303 - }, - { - "epoch": 1.8650306748466257, - "grad_norm": 2.8388822078704834, - "learning_rate": 4.585674406991752e-06, - "loss": 0.4926, - "step": 304 - }, - { - "epoch": 1.871165644171779, - "grad_norm": 2.9760420322418213, - "learning_rate": 4.5830138822689755e-06, - "loss": 0.7368, - "step": 305 - }, - { - "epoch": 1.8773006134969326, - "grad_norm": 2.5437633991241455, - "learning_rate": 4.5803456197628374e-06, - "loss": 0.4678, - "step": 306 - }, - { - "epoch": 1.883435582822086, - "grad_norm": 3.0044775009155273, - "learning_rate": 4.577669629385145e-06, - "loss": 0.4241, - "step": 307 - }, - { - "epoch": 1.8895705521472392, - "grad_norm": 2.6150901317596436, - "learning_rate": 4.574985921076418e-06, - "loss": 0.5327, - "step": 308 - }, - { - "epoch": 1.8957055214723928, - "grad_norm": 2.4425182342529297, - "learning_rate": 4.572294504805841e-06, - "loss": 0.7504, - "step": 309 - }, - { - "epoch": 1.9018404907975461, - "grad_norm": 2.9920194149017334, - "learning_rate": 4.569595390571232e-06, - "loss": 0.5194, - "step": 310 - }, - { - "epoch": 1.9079754601226995, - "grad_norm": 2.701087713241577, - "learning_rate": 4.566888588399007e-06, - "loss": 0.6862, - "step": 311 - }, - { - "epoch": 1.9141104294478528, - "grad_norm": 7.628893852233887, - "learning_rate": 4.564174108344139e-06, - "loss": 0.6867, - "step": 312 - }, - { - "epoch": 1.9202453987730062, - "grad_norm": 2.712947130203247, - "learning_rate": 4.561451960490123e-06, - "loss": 0.6942, - "step": 313 - }, - { - "epoch": 1.9263803680981595, - "grad_norm": 3.0063202381134033, - "learning_rate": 4.558722154948937e-06, - "loss": 0.6346, - "step": 314 - }, - { - "epoch": 1.9325153374233128, - "grad_norm": 2.957218647003174, - "learning_rate": 4.5559847018610034e-06, - "loss": 0.464, - "step": 315 - }, - { - "epoch": 1.9386503067484662, - "grad_norm": 3.322282552719116, - "learning_rate": 4.553239611395156e-06, - "loss": 0.6334, - "step": 316 - }, - { - "epoch": 1.9447852760736195, - "grad_norm": 3.0638647079467773, - "learning_rate": 4.550486893748596e-06, - "loss": 0.4227, - "step": 317 - }, - { - "epoch": 1.9509202453987728, - "grad_norm": 3.079087257385254, - "learning_rate": 4.547726559146862e-06, - "loss": 0.3719, - "step": 318 - }, - { - "epoch": 1.9570552147239264, - "grad_norm": 2.409914255142212, - "learning_rate": 4.544958617843782e-06, - "loss": 0.3331, - "step": 319 - }, - { - "epoch": 1.9631901840490797, - "grad_norm": 3.3441262245178223, - "learning_rate": 4.542183080121444e-06, - "loss": 0.6931, - "step": 320 - }, - { - "epoch": 1.969325153374233, - "grad_norm": 2.6624436378479004, - "learning_rate": 4.539399956290152e-06, - "loss": 0.6578, - "step": 321 - }, - { - "epoch": 1.9754601226993866, - "grad_norm": 3.463789224624634, - "learning_rate": 4.536609256688396e-06, - "loss": 0.5748, - "step": 322 - }, - { - "epoch": 1.98159509202454, - "grad_norm": 3.6827807426452637, - "learning_rate": 4.533810991682799e-06, - "loss": 0.5249, - "step": 323 - }, - { - "epoch": 1.9877300613496933, - "grad_norm": 4.125547409057617, - "learning_rate": 4.531005171668093e-06, - "loss": 0.3065, - "step": 324 - }, - { - "epoch": 1.9938650306748467, - "grad_norm": 2.935978412628174, - "learning_rate": 4.528191807067074e-06, - "loss": 0.5523, - "step": 325 - }, - { - "epoch": 2.0, - "grad_norm": 2.654388427734375, - "learning_rate": 4.525370908330564e-06, - "loss": 0.4157, - "step": 326 - }, - { - "epoch": 2.0061349693251533, - "grad_norm": 3.213925838470459, - "learning_rate": 4.522542485937369e-06, - "loss": 0.4243, - "step": 327 - }, - { - "epoch": 2.0122699386503067, - "grad_norm": 3.5483286380767822, - "learning_rate": 4.519706550394248e-06, - "loss": 0.4137, - "step": 328 - }, - { - "epoch": 2.01840490797546, - "grad_norm": 3.32084059715271, - "learning_rate": 4.516863112235864e-06, - "loss": 0.5389, - "step": 329 - }, - { - "epoch": 2.0245398773006134, - "grad_norm": 3.427666425704956, - "learning_rate": 4.514012182024756e-06, - "loss": 0.285, - "step": 330 - }, - { - "epoch": 2.0306748466257667, - "grad_norm": 3.3269975185394287, - "learning_rate": 4.511153770351288e-06, - "loss": 0.4877, - "step": 331 - }, - { - "epoch": 2.03680981595092, - "grad_norm": 5.258850574493408, - "learning_rate": 4.508287887833619e-06, - "loss": 0.5168, - "step": 332 - }, - { - "epoch": 2.042944785276074, - "grad_norm": 4.316092491149902, - "learning_rate": 4.505414545117658e-06, - "loss": 0.4791, - "step": 333 - }, - { - "epoch": 2.049079754601227, - "grad_norm": 3.952056884765625, - "learning_rate": 4.502533752877028e-06, - "loss": 0.3014, - "step": 334 - }, - { - "epoch": 2.0552147239263805, - "grad_norm": 4.0617194175720215, - "learning_rate": 4.499645521813024e-06, - "loss": 0.4313, - "step": 335 - }, - { - "epoch": 2.061349693251534, - "grad_norm": 3.7869274616241455, - "learning_rate": 4.496749862654574e-06, - "loss": 0.4807, - "step": 336 - }, - { - "epoch": 2.067484662576687, - "grad_norm": 3.8181991577148438, - "learning_rate": 4.4938467861582e-06, - "loss": 0.4002, - "step": 337 - }, - { - "epoch": 2.0736196319018405, - "grad_norm": 3.8289854526519775, - "learning_rate": 4.490936303107975e-06, - "loss": 0.618, - "step": 338 - }, - { - "epoch": 2.079754601226994, - "grad_norm": 3.121443271636963, - "learning_rate": 4.488018424315488e-06, - "loss": 0.4203, - "step": 339 - }, - { - "epoch": 2.085889570552147, - "grad_norm": 3.141782283782959, - "learning_rate": 4.4850931606198e-06, - "loss": 0.3618, - "step": 340 - }, - { - "epoch": 2.0920245398773005, - "grad_norm": 3.1279287338256836, - "learning_rate": 4.482160522887404e-06, - "loss": 0.4571, - "step": 341 - }, - { - "epoch": 2.098159509202454, - "grad_norm": 3.2418482303619385, - "learning_rate": 4.479220522012185e-06, - "loss": 0.2674, - "step": 342 - }, - { - "epoch": 2.104294478527607, - "grad_norm": 10.230683326721191, - "learning_rate": 4.476273168915382e-06, - "loss": 0.5479, - "step": 343 - }, - { - "epoch": 2.1104294478527605, - "grad_norm": 3.588361978530884, - "learning_rate": 4.473318474545544e-06, - "loss": 0.3654, - "step": 344 - }, - { - "epoch": 2.116564417177914, - "grad_norm": 3.0913164615631104, - "learning_rate": 4.470356449878489e-06, - "loss": 0.2704, - "step": 345 - }, - { - "epoch": 2.1226993865030677, - "grad_norm": 3.972447633743286, - "learning_rate": 4.467387105917269e-06, - "loss": 0.3029, - "step": 346 - }, - { - "epoch": 2.128834355828221, - "grad_norm": 3.7174713611602783, - "learning_rate": 4.464410453692122e-06, - "loss": 0.6536, - "step": 347 - }, - { - "epoch": 2.1349693251533743, - "grad_norm": 3.9333994388580322, - "learning_rate": 4.461426504260434e-06, - "loss": 0.3806, - "step": 348 - }, - { - "epoch": 2.1411042944785277, - "grad_norm": 4.752816200256348, - "learning_rate": 4.458435268706699e-06, - "loss": 0.4019, - "step": 349 - }, - { - "epoch": 2.147239263803681, - "grad_norm": 2.505603790283203, - "learning_rate": 4.455436758142477e-06, - "loss": 0.2348, - "step": 350 - }, - { - "epoch": 2.1533742331288344, - "grad_norm": 3.3050570487976074, - "learning_rate": 4.452430983706351e-06, - "loss": 0.505, - "step": 351 - }, - { - "epoch": 2.1595092024539877, - "grad_norm": 5.387442588806152, - "learning_rate": 4.44941795656389e-06, - "loss": 0.399, - "step": 352 - }, - { - "epoch": 2.165644171779141, - "grad_norm": 3.4759480953216553, - "learning_rate": 4.446397687907601e-06, - "loss": 0.5664, - "step": 353 - }, - { - "epoch": 2.1717791411042944, - "grad_norm": 2.949445962905884, - "learning_rate": 4.4433701889568935e-06, - "loss": 0.2128, - "step": 354 - }, - { - "epoch": 2.1779141104294477, - "grad_norm": 3.2884252071380615, - "learning_rate": 4.440335470958035e-06, - "loss": 0.3138, - "step": 355 - }, - { - "epoch": 2.184049079754601, - "grad_norm": 3.1605632305145264, - "learning_rate": 4.437293545184111e-06, - "loss": 0.349, - "step": 356 - }, - { - "epoch": 2.190184049079755, - "grad_norm": 2.9996821880340576, - "learning_rate": 4.434244422934976e-06, - "loss": 0.343, - "step": 357 - }, - { - "epoch": 2.196319018404908, - "grad_norm": 3.6373324394226074, - "learning_rate": 4.431188115537226e-06, - "loss": 0.5656, - "step": 358 - }, - { - "epoch": 2.2024539877300615, - "grad_norm": 4.667621612548828, - "learning_rate": 4.428124634344141e-06, - "loss": 0.2335, - "step": 359 - }, - { - "epoch": 2.208588957055215, - "grad_norm": 3.815484046936035, - "learning_rate": 4.425053990735653e-06, - "loss": 0.2173, - "step": 360 - }, - { - "epoch": 2.214723926380368, - "grad_norm": 4.689478874206543, - "learning_rate": 4.421976196118297e-06, - "loss": 0.5071, - "step": 361 - }, - { - "epoch": 2.2208588957055215, - "grad_norm": 4.016942024230957, - "learning_rate": 4.4188912619251765e-06, - "loss": 0.384, - "step": 362 - }, - { - "epoch": 2.226993865030675, - "grad_norm": 3.5336828231811523, - "learning_rate": 4.415799199615912e-06, - "loss": 0.3133, - "step": 363 - }, - { - "epoch": 2.233128834355828, - "grad_norm": 2.9195592403411865, - "learning_rate": 4.4127000206766055e-06, - "loss": 0.3847, - "step": 364 - }, - { - "epoch": 2.2392638036809815, - "grad_norm": 2.6843531131744385, - "learning_rate": 4.409593736619795e-06, - "loss": 0.3539, - "step": 365 - }, - { - "epoch": 2.245398773006135, - "grad_norm": 2.8692703247070312, - "learning_rate": 4.40648035898441e-06, - "loss": 0.3664, - "step": 366 - }, - { - "epoch": 2.2515337423312882, - "grad_norm": 2.820422649383545, - "learning_rate": 4.403359899335732e-06, - "loss": 0.4606, - "step": 367 - }, - { - "epoch": 2.2576687116564416, - "grad_norm": 3.8641669750213623, - "learning_rate": 4.400232369265351e-06, - "loss": 0.2931, - "step": 368 - }, - { - "epoch": 2.263803680981595, - "grad_norm": 2.75347638130188, - "learning_rate": 4.39709778039112e-06, - "loss": 0.3393, - "step": 369 - }, - { - "epoch": 2.2699386503067487, - "grad_norm": 15.150428771972656, - "learning_rate": 4.393956144357113e-06, - "loss": 0.65, - "step": 370 - }, - { - "epoch": 2.276073619631902, - "grad_norm": 2.4876065254211426, - "learning_rate": 4.390807472833585e-06, - "loss": 0.372, - "step": 371 - }, - { - "epoch": 2.2822085889570554, - "grad_norm": 2.7328054904937744, - "learning_rate": 4.3876517775169216e-06, - "loss": 0.2802, - "step": 372 - }, - { - "epoch": 2.2883435582822087, - "grad_norm": 2.903221368789673, - "learning_rate": 4.384489070129604e-06, - "loss": 0.1964, - "step": 373 - }, - { - "epoch": 2.294478527607362, - "grad_norm": 3.9368724822998047, - "learning_rate": 4.381319362420158e-06, - "loss": 0.4272, - "step": 374 - }, - { - "epoch": 2.3006134969325154, - "grad_norm": 5.431981086730957, - "learning_rate": 4.378142666163114e-06, - "loss": 0.4513, - "step": 375 - }, - { - "epoch": 2.3067484662576687, - "grad_norm": 3.661733627319336, - "learning_rate": 4.374958993158965e-06, - "loss": 0.6087, - "step": 376 - }, - { - "epoch": 2.312883435582822, - "grad_norm": 3.004450559616089, - "learning_rate": 4.371768355234116e-06, - "loss": 0.2206, - "step": 377 - }, - { - "epoch": 2.3190184049079754, - "grad_norm": 4.3785576820373535, - "learning_rate": 4.368570764240852e-06, - "loss": 0.6055, - "step": 378 - }, - { - "epoch": 2.3251533742331287, - "grad_norm": 3.4699394702911377, - "learning_rate": 4.365366232057279e-06, - "loss": 0.6286, - "step": 379 - }, - { - "epoch": 2.331288343558282, - "grad_norm": 2.6862998008728027, - "learning_rate": 4.3621547705872915e-06, - "loss": 0.2622, - "step": 380 - }, - { - "epoch": 2.3374233128834354, - "grad_norm": 3.056382179260254, - "learning_rate": 4.358936391760524e-06, - "loss": 0.3439, - "step": 381 - }, - { - "epoch": 2.3435582822085887, - "grad_norm": 2.6211307048797607, - "learning_rate": 4.355711107532305e-06, - "loss": 0.3677, - "step": 382 - }, - { - "epoch": 2.3496932515337425, - "grad_norm": 2.682060956954956, - "learning_rate": 4.3524789298836175e-06, - "loss": 0.3068, - "step": 383 - }, - { - "epoch": 2.355828220858896, - "grad_norm": 3.482539415359497, - "learning_rate": 4.349239870821049e-06, - "loss": 0.3737, - "step": 384 - }, - { - "epoch": 2.361963190184049, - "grad_norm": 2.8645472526550293, - "learning_rate": 4.345993942376752e-06, - "loss": 0.2837, - "step": 385 - }, - { - "epoch": 2.3680981595092025, - "grad_norm": 3.6142354011535645, - "learning_rate": 4.342741156608392e-06, - "loss": 0.6162, - "step": 386 - }, - { - "epoch": 2.374233128834356, - "grad_norm": 3.0748162269592285, - "learning_rate": 4.3394815255991135e-06, - "loss": 0.2986, - "step": 387 - }, - { - "epoch": 2.3803680981595092, - "grad_norm": 5.090906620025635, - "learning_rate": 4.336215061457485e-06, - "loss": 0.5383, - "step": 388 - }, - { - "epoch": 2.3865030674846626, - "grad_norm": 3.9235823154449463, - "learning_rate": 4.332941776317458e-06, - "loss": 0.4179, - "step": 389 - }, - { - "epoch": 2.392638036809816, - "grad_norm": 3.482926368713379, - "learning_rate": 4.329661682338325e-06, - "loss": 0.3938, - "step": 390 - }, - { - "epoch": 2.3987730061349692, - "grad_norm": 4.274583339691162, - "learning_rate": 4.32637479170467e-06, - "loss": 0.3349, - "step": 391 - }, - { - "epoch": 2.4049079754601226, - "grad_norm": 3.326012372970581, - "learning_rate": 4.323081116626322e-06, - "loss": 0.3336, - "step": 392 - }, - { - "epoch": 2.411042944785276, - "grad_norm": 3.174591541290283, - "learning_rate": 4.319780669338316e-06, - "loss": 0.2983, - "step": 393 - }, - { - "epoch": 2.4171779141104293, - "grad_norm": 3.9073634147644043, - "learning_rate": 4.31647346210084e-06, - "loss": 0.8401, - "step": 394 - }, - { - "epoch": 2.4233128834355826, - "grad_norm": 3.4787721633911133, - "learning_rate": 4.313159507199197e-06, - "loss": 0.2583, - "step": 395 - }, - { - "epoch": 2.4294478527607364, - "grad_norm": 3.19903564453125, - "learning_rate": 4.309838816943755e-06, - "loss": 0.2861, - "step": 396 - }, - { - "epoch": 2.4355828220858897, - "grad_norm": 3.184246778488159, - "learning_rate": 4.306511403669897e-06, - "loss": 0.2956, - "step": 397 - }, - { - "epoch": 2.441717791411043, - "grad_norm": 3.8991878032684326, - "learning_rate": 4.303177279737988e-06, - "loss": 0.5378, - "step": 398 - }, - { - "epoch": 2.4478527607361964, - "grad_norm": 3.411949872970581, - "learning_rate": 4.299836457533313e-06, - "loss": 0.3423, - "step": 399 - }, - { - "epoch": 2.4539877300613497, - "grad_norm": 3.445502996444702, - "learning_rate": 4.296488949466046e-06, - "loss": 0.5608, - "step": 400 - }, - { - "epoch": 2.460122699386503, - "grad_norm": 3.066798210144043, - "learning_rate": 4.293134767971193e-06, - "loss": 0.3214, - "step": 401 - }, - { - "epoch": 2.4662576687116564, - "grad_norm": 3.0581583976745605, - "learning_rate": 4.28977392550855e-06, - "loss": 0.5117, - "step": 402 - }, - { - "epoch": 2.4723926380368098, - "grad_norm": 4.207413673400879, - "learning_rate": 4.286406434562659e-06, - "loss": 0.2666, - "step": 403 - }, - { - "epoch": 2.478527607361963, - "grad_norm": 2.9934990406036377, - "learning_rate": 4.283032307642756e-06, - "loss": 0.2878, - "step": 404 - }, - { - "epoch": 2.4846625766871164, - "grad_norm": 3.800593614578247, - "learning_rate": 4.2796515572827305e-06, - "loss": 0.2619, - "step": 405 - }, - { - "epoch": 2.4907975460122698, - "grad_norm": 3.2029523849487305, - "learning_rate": 4.276264196041074e-06, - "loss": 0.1735, - "step": 406 - }, - { - "epoch": 2.4969325153374236, - "grad_norm": 3.515634059906006, - "learning_rate": 4.2728702365008356e-06, - "loss": 0.4741, - "step": 407 - }, - { - "epoch": 2.5030674846625764, - "grad_norm": 3.8354873657226562, - "learning_rate": 4.269469691269577e-06, - "loss": 0.3713, - "step": 408 - }, - { - "epoch": 2.5092024539877302, - "grad_norm": 3.902904510498047, - "learning_rate": 4.266062572979323e-06, - "loss": 0.5189, - "step": 409 - }, - { - "epoch": 2.5153374233128836, - "grad_norm": 3.3276097774505615, - "learning_rate": 4.262648894286515e-06, - "loss": 0.2461, - "step": 410 - }, - { - "epoch": 2.521472392638037, - "grad_norm": 2.9457011222839355, - "learning_rate": 4.259228667871963e-06, - "loss": 0.3013, - "step": 411 - }, - { - "epoch": 2.5276073619631902, - "grad_norm": 2.8941617012023926, - "learning_rate": 4.255801906440803e-06, - "loss": 0.2784, - "step": 412 - }, - { - "epoch": 2.5337423312883436, - "grad_norm": 2.949399471282959, - "learning_rate": 4.252368622722443e-06, - "loss": 0.457, - "step": 413 - }, - { - "epoch": 2.539877300613497, - "grad_norm": 3.342108726501465, - "learning_rate": 4.248928829470522e-06, - "loss": 0.487, - "step": 414 - }, - { - "epoch": 2.5460122699386503, - "grad_norm": 3.9556386470794678, - "learning_rate": 4.245482539462861e-06, - "loss": 0.6118, - "step": 415 - }, - { - "epoch": 2.5521472392638036, - "grad_norm": 3.6936280727386475, - "learning_rate": 4.242029765501411e-06, - "loss": 0.6131, - "step": 416 - }, - { - "epoch": 2.558282208588957, - "grad_norm": 2.79897403717041, - "learning_rate": 4.2385705204122104e-06, - "loss": 0.4209, - "step": 417 - }, - { - "epoch": 2.5644171779141103, - "grad_norm": 4.093318462371826, - "learning_rate": 4.235104817045338e-06, - "loss": 0.5375, - "step": 418 - }, - { - "epoch": 2.5705521472392636, - "grad_norm": 3.138263463973999, - "learning_rate": 4.231632668274861e-06, - "loss": 0.4682, - "step": 419 - }, - { - "epoch": 2.5766871165644174, - "grad_norm": 3.1465651988983154, - "learning_rate": 4.22815408699879e-06, - "loss": 0.2522, - "step": 420 - }, - { - "epoch": 2.5828220858895703, - "grad_norm": 3.5166101455688477, - "learning_rate": 4.22466908613903e-06, - "loss": 0.4776, - "step": 421 - }, - { - "epoch": 2.588957055214724, - "grad_norm": 2.8498189449310303, - "learning_rate": 4.221177678641333e-06, - "loss": 0.3067, - "step": 422 - }, - { - "epoch": 2.5950920245398774, - "grad_norm": 2.8046035766601562, - "learning_rate": 4.217679877475251e-06, - "loss": 0.2402, - "step": 423 - }, - { - "epoch": 2.6012269938650308, - "grad_norm": 4.204788684844971, - "learning_rate": 4.214175695634084e-06, - "loss": 0.2608, - "step": 424 - }, - { - "epoch": 2.607361963190184, - "grad_norm": 2.5569400787353516, - "learning_rate": 4.210665146134838e-06, - "loss": 0.2801, - "step": 425 - }, - { - "epoch": 2.6134969325153374, - "grad_norm": 3.5359091758728027, - "learning_rate": 4.20714824201817e-06, - "loss": 0.2027, - "step": 426 - }, - { - "epoch": 2.6196319018404908, - "grad_norm": 3.5132668018341064, - "learning_rate": 4.203624996348343e-06, - "loss": 0.4253, - "step": 427 - }, - { - "epoch": 2.625766871165644, - "grad_norm": 3.5076472759246826, - "learning_rate": 4.200095422213177e-06, - "loss": 0.3014, - "step": 428 - }, - { - "epoch": 2.6319018404907975, - "grad_norm": 3.6501238346099854, - "learning_rate": 4.196559532724004e-06, - "loss": 0.6526, - "step": 429 - }, - { - "epoch": 2.638036809815951, - "grad_norm": 2.849924325942993, - "learning_rate": 4.193017341015608e-06, - "loss": 0.4487, - "step": 430 - }, - { - "epoch": 2.644171779141104, - "grad_norm": 3.2228448390960693, - "learning_rate": 4.189468860246192e-06, - "loss": 0.5386, - "step": 431 - }, - { - "epoch": 2.6503067484662575, - "grad_norm": 2.532102108001709, - "learning_rate": 4.185914103597316e-06, - "loss": 0.3034, - "step": 432 - }, - { - "epoch": 2.6564417177914113, - "grad_norm": 2.862720251083374, - "learning_rate": 4.182353084273855e-06, - "loss": 0.5862, - "step": 433 - }, - { - "epoch": 2.662576687116564, - "grad_norm": 3.4617464542388916, - "learning_rate": 4.178785815503946e-06, - "loss": 0.3954, - "step": 434 - }, - { - "epoch": 2.668711656441718, - "grad_norm": 2.627758741378784, - "learning_rate": 4.1752123105389444e-06, - "loss": 0.4367, - "step": 435 - }, - { - "epoch": 2.6748466257668713, - "grad_norm": 3.2868380546569824, - "learning_rate": 4.171632582653368e-06, - "loss": 0.2997, - "step": 436 - }, - { - "epoch": 2.6809815950920246, - "grad_norm": 3.4260897636413574, - "learning_rate": 4.168046645144851e-06, - "loss": 0.3354, - "step": 437 - }, - { - "epoch": 2.687116564417178, - "grad_norm": 3.1415748596191406, - "learning_rate": 4.164454511334098e-06, - "loss": 0.5538, - "step": 438 - }, - { - "epoch": 2.6932515337423313, - "grad_norm": 3.3700919151306152, - "learning_rate": 4.160856194564828e-06, - "loss": 0.5731, - "step": 439 - }, - { - "epoch": 2.6993865030674846, - "grad_norm": 3.146968364715576, - "learning_rate": 4.157251708203728e-06, - "loss": 0.4429, - "step": 440 - }, - { - "epoch": 2.705521472392638, - "grad_norm": 3.7495830059051514, - "learning_rate": 4.153641065640402e-06, - "loss": 0.6361, - "step": 441 - }, - { - "epoch": 2.7116564417177913, - "grad_norm": 3.426499128341675, - "learning_rate": 4.150024280287327e-06, - "loss": 0.2418, - "step": 442 - }, - { - "epoch": 2.7177914110429446, - "grad_norm": 3.213719606399536, - "learning_rate": 4.146401365579795e-06, - "loss": 0.2549, - "step": 443 - }, - { - "epoch": 2.7239263803680984, - "grad_norm": 3.457742929458618, - "learning_rate": 4.142772334975868e-06, - "loss": 0.3822, - "step": 444 - }, - { - "epoch": 2.7300613496932513, - "grad_norm": 3.130410671234131, - "learning_rate": 4.139137201956324e-06, - "loss": 0.3107, - "step": 445 - }, - { - "epoch": 2.736196319018405, - "grad_norm": 2.7337112426757812, - "learning_rate": 4.1354959800246155e-06, - "loss": 0.2829, - "step": 446 - }, - { - "epoch": 2.7423312883435584, - "grad_norm": 3.427006483078003, - "learning_rate": 4.131848682706807e-06, - "loss": 0.3045, - "step": 447 - }, - { - "epoch": 2.7484662576687118, - "grad_norm": 3.3742318153381348, - "learning_rate": 4.128195323551536e-06, - "loss": 0.316, - "step": 448 - }, - { - "epoch": 2.754601226993865, - "grad_norm": 3.086738109588623, - "learning_rate": 4.1245359161299555e-06, - "loss": 0.5278, - "step": 449 - }, - { - "epoch": 2.7607361963190185, - "grad_norm": 3.4609954357147217, - "learning_rate": 4.120870474035687e-06, - "loss": 0.447, - "step": 450 - }, - { - "epoch": 2.766871165644172, - "grad_norm": 3.552663803100586, - "learning_rate": 4.1171990108847705e-06, - "loss": 0.6127, - "step": 451 - }, - { - "epoch": 2.773006134969325, - "grad_norm": 4.413427352905273, - "learning_rate": 4.113521540315609e-06, - "loss": 0.3304, - "step": 452 - }, - { - "epoch": 2.7791411042944785, - "grad_norm": 3.3408143520355225, - "learning_rate": 4.109838075988922e-06, - "loss": 0.5871, - "step": 453 - }, - { - "epoch": 2.785276073619632, - "grad_norm": 3.0659773349761963, - "learning_rate": 4.106148631587697e-06, - "loss": 0.3578, - "step": 454 - }, - { - "epoch": 2.791411042944785, - "grad_norm": 3.2854816913604736, - "learning_rate": 4.102453220817134e-06, - "loss": 0.4685, - "step": 455 - }, - { - "epoch": 2.7975460122699385, - "grad_norm": 3.4940855503082275, - "learning_rate": 4.098751857404595e-06, - "loss": 0.2818, - "step": 456 - }, - { - "epoch": 2.8036809815950923, - "grad_norm": 2.4630730152130127, - "learning_rate": 4.0950445550995566e-06, - "loss": 0.3497, - "step": 457 - }, - { - "epoch": 2.809815950920245, - "grad_norm": 3.3870959281921387, - "learning_rate": 4.091331327673554e-06, - "loss": 0.4954, - "step": 458 - }, - { - "epoch": 2.815950920245399, - "grad_norm": 2.3676836490631104, - "learning_rate": 4.087612188920135e-06, - "loss": 0.3884, - "step": 459 - }, - { - "epoch": 2.8220858895705523, - "grad_norm": 3.2477807998657227, - "learning_rate": 4.083887152654804e-06, - "loss": 0.375, - "step": 460 - }, - { - "epoch": 2.8282208588957056, - "grad_norm": 3.295673131942749, - "learning_rate": 4.080156232714976e-06, - "loss": 0.3272, - "step": 461 - }, - { - "epoch": 2.834355828220859, - "grad_norm": 2.800847291946411, - "learning_rate": 4.07641944295992e-06, - "loss": 0.2936, - "step": 462 - }, - { - "epoch": 2.8404907975460123, - "grad_norm": 3.443336009979248, - "learning_rate": 4.072676797270708e-06, - "loss": 0.2363, - "step": 463 - }, - { - "epoch": 2.8466257668711656, - "grad_norm": 3.1334242820739746, - "learning_rate": 4.0689283095501684e-06, - "loss": 0.4827, - "step": 464 - }, - { - "epoch": 2.852760736196319, - "grad_norm": 3.950672149658203, - "learning_rate": 4.06517399372283e-06, - "loss": 0.3163, - "step": 465 - }, - { - "epoch": 2.8588957055214723, - "grad_norm": 4.243579387664795, - "learning_rate": 4.061413863734869e-06, - "loss": 0.2827, - "step": 466 - }, - { - "epoch": 2.8650306748466257, - "grad_norm": 4.076017379760742, - "learning_rate": 4.057647933554063e-06, - "loss": 0.3466, - "step": 467 - }, - { - "epoch": 2.871165644171779, - "grad_norm": 2.846989631652832, - "learning_rate": 4.053876217169734e-06, - "loss": 0.4632, - "step": 468 - }, - { - "epoch": 2.8773006134969323, - "grad_norm": 2.74981689453125, - "learning_rate": 4.050098728592698e-06, - "loss": 0.2001, - "step": 469 - }, - { - "epoch": 2.883435582822086, - "grad_norm": 3.062068462371826, - "learning_rate": 4.046315481855211e-06, - "loss": 0.5425, - "step": 470 - }, - { - "epoch": 2.889570552147239, - "grad_norm": 2.8630964756011963, - "learning_rate": 4.0425264910109245e-06, - "loss": 0.424, - "step": 471 - }, - { - "epoch": 2.895705521472393, - "grad_norm": 3.537442922592163, - "learning_rate": 4.03873177013482e-06, - "loss": 0.2443, - "step": 472 - }, - { - "epoch": 2.901840490797546, - "grad_norm": 3.128535270690918, - "learning_rate": 4.034931333323173e-06, - "loss": 0.3734, - "step": 473 - }, - { - "epoch": 2.9079754601226995, - "grad_norm": 3.021897792816162, - "learning_rate": 4.031125194693484e-06, - "loss": 0.3762, - "step": 474 - }, - { - "epoch": 2.914110429447853, - "grad_norm": 3.0943546295166016, - "learning_rate": 4.0273133683844375e-06, - "loss": 0.3721, - "step": 475 - }, - { - "epoch": 2.920245398773006, - "grad_norm": 3.443448305130005, - "learning_rate": 4.023495868555848e-06, - "loss": 0.2868, - "step": 476 - }, - { - "epoch": 2.9263803680981595, - "grad_norm": 2.865227222442627, - "learning_rate": 4.0196727093886024e-06, - "loss": 0.5086, - "step": 477 - }, - { - "epoch": 2.932515337423313, - "grad_norm": 3.1272058486938477, - "learning_rate": 4.015843905084612e-06, - "loss": 0.4616, - "step": 478 - }, - { - "epoch": 2.938650306748466, - "grad_norm": 3.0584447383880615, - "learning_rate": 4.012009469866756e-06, - "loss": 0.403, - "step": 479 - }, - { - "epoch": 2.9447852760736195, - "grad_norm": 4.42616081237793, - "learning_rate": 4.008169417978836e-06, - "loss": 0.5801, - "step": 480 - }, - { - "epoch": 2.950920245398773, - "grad_norm": 2.8444535732269287, - "learning_rate": 4.004323763685511e-06, - "loss": 0.5808, - "step": 481 - }, - { - "epoch": 2.957055214723926, - "grad_norm": 2.591719627380371, - "learning_rate": 4.0004725212722565e-06, - "loss": 0.2584, - "step": 482 - }, - { - "epoch": 2.96319018404908, - "grad_norm": 2.5496113300323486, - "learning_rate": 3.996615705045302e-06, - "loss": 0.462, - "step": 483 - }, - { - "epoch": 2.969325153374233, - "grad_norm": 2.9932925701141357, - "learning_rate": 3.992753329331588e-06, - "loss": 0.3502, - "step": 484 - }, - { - "epoch": 2.9754601226993866, - "grad_norm": 3.136871337890625, - "learning_rate": 3.9888854084786995e-06, - "loss": 0.5989, - "step": 485 - }, - { - "epoch": 2.98159509202454, - "grad_norm": 3.6654274463653564, - "learning_rate": 3.985011956854826e-06, - "loss": 0.6772, - "step": 486 - }, - { - "epoch": 2.9877300613496933, - "grad_norm": 2.5398948192596436, - "learning_rate": 3.9811329888487004e-06, - "loss": 0.4192, - "step": 487 - }, - { - "epoch": 2.9938650306748467, - "grad_norm": 4.89943790435791, - "learning_rate": 3.977248518869545e-06, - "loss": 0.4031, - "step": 488 - }, - { - "epoch": 3.0, - "grad_norm": 3.4729995727539062, - "learning_rate": 3.973358561347024e-06, - "loss": 0.7764, - "step": 489 - }, - { - "epoch": 3.0061349693251533, - "grad_norm": 5.331607818603516, - "learning_rate": 3.969463130731183e-06, - "loss": 0.3267, - "step": 490 - }, - { - "epoch": 3.0122699386503067, - "grad_norm": 3.453650712966919, - "learning_rate": 3.965562241492401e-06, - "loss": 0.2719, - "step": 491 - }, - { - "epoch": 3.01840490797546, - "grad_norm": 3.232313632965088, - "learning_rate": 3.9616559081213335e-06, - "loss": 0.1825, - "step": 492 - }, - { - "epoch": 3.0245398773006134, - "grad_norm": 3.4860260486602783, - "learning_rate": 3.957744145128858e-06, - "loss": 0.1854, - "step": 493 - }, - { - "epoch": 3.0306748466257667, - "grad_norm": 3.4357805252075195, - "learning_rate": 3.953826967046021e-06, - "loss": 0.2224, - "step": 494 - }, - { - "epoch": 3.03680981595092, - "grad_norm": 4.557503700256348, - "learning_rate": 3.9499043884239894e-06, - "loss": 0.349, - "step": 495 - }, - { - "epoch": 3.042944785276074, - "grad_norm": 4.685214042663574, - "learning_rate": 3.945976423833987e-06, - "loss": 0.175, - "step": 496 - }, - { - "epoch": 3.049079754601227, - "grad_norm": 3.7430171966552734, - "learning_rate": 3.942043087867244e-06, - "loss": 0.2773, - "step": 497 - }, - { - "epoch": 3.0552147239263805, - "grad_norm": 3.756450653076172, - "learning_rate": 3.938104395134947e-06, - "loss": 0.4445, - "step": 498 - }, - { - "epoch": 3.061349693251534, - "grad_norm": 4.049175262451172, - "learning_rate": 3.9341603602681805e-06, - "loss": 0.3046, - "step": 499 - }, - { - "epoch": 3.067484662576687, - "grad_norm": 3.7689461708068848, - "learning_rate": 3.930210997917871e-06, - "loss": 0.2544, - "step": 500 - }, - { - "epoch": 3.0736196319018405, - "grad_norm": 4.027602195739746, - "learning_rate": 3.92625632275474e-06, - "loss": 0.3154, - "step": 501 - }, - { - "epoch": 3.079754601226994, - "grad_norm": 2.8449292182922363, - "learning_rate": 3.922296349469239e-06, - "loss": 0.2804, - "step": 502 - }, - { - "epoch": 3.085889570552147, - "grad_norm": 2.9555234909057617, - "learning_rate": 3.918331092771505e-06, - "loss": 0.2393, - "step": 503 - }, - { - "epoch": 3.0920245398773005, - "grad_norm": 2.621042013168335, - "learning_rate": 3.914360567391296e-06, - "loss": 0.1403, - "step": 504 - }, - { - "epoch": 3.098159509202454, - "grad_norm": 3.2348620891571045, - "learning_rate": 3.910384788077949e-06, - "loss": 0.1537, - "step": 505 - }, - { - "epoch": 3.104294478527607, - "grad_norm": 3.030179977416992, - "learning_rate": 3.906403769600311e-06, - "loss": 0.2921, - "step": 506 - }, - { - "epoch": 3.1104294478527605, - "grad_norm": 3.146428346633911, - "learning_rate": 3.902417526746694e-06, - "loss": 0.2036, - "step": 507 - }, - { - "epoch": 3.116564417177914, - "grad_norm": 3.6201512813568115, - "learning_rate": 3.898426074324818e-06, - "loss": 0.2655, - "step": 508 - }, - { - "epoch": 3.1226993865030677, - "grad_norm": 3.7674012184143066, - "learning_rate": 3.8944294271617524e-06, - "loss": 0.3938, - "step": 509 - }, - { - "epoch": 3.128834355828221, - "grad_norm": 4.54722785949707, - "learning_rate": 3.890427600103865e-06, - "loss": 0.3051, - "step": 510 - }, - { - "epoch": 3.1349693251533743, - "grad_norm": 4.228236675262451, - "learning_rate": 3.886420608016767e-06, - "loss": 0.3719, - "step": 511 - }, - { - "epoch": 3.1411042944785277, - "grad_norm": 4.355110168457031, - "learning_rate": 3.882408465785252e-06, - "loss": 0.1863, - "step": 512 - }, - { - "epoch": 3.147239263803681, - "grad_norm": 3.451460838317871, - "learning_rate": 3.878391188313249e-06, - "loss": 0.1479, - "step": 513 - }, - { - "epoch": 3.1533742331288344, - "grad_norm": 4.395524501800537, - "learning_rate": 3.87436879052376e-06, - "loss": 0.238, - "step": 514 - }, - { - "epoch": 3.1595092024539877, - "grad_norm": 2.940717935562134, - "learning_rate": 3.870341287358809e-06, - "loss": 0.2069, - "step": 515 - }, - { - "epoch": 3.165644171779141, - "grad_norm": 2.5817320346832275, - "learning_rate": 3.8663086937793845e-06, - "loss": 0.1189, - "step": 516 - }, - { - "epoch": 3.1717791411042944, - "grad_norm": 3.9863343238830566, - "learning_rate": 3.862271024765385e-06, - "loss": 0.3434, - "step": 517 - }, - { - "epoch": 3.1779141104294477, - "grad_norm": 3.609004259109497, - "learning_rate": 3.8582282953155626e-06, - "loss": 0.1602, - "step": 518 - }, - { - "epoch": 3.184049079754601, - "grad_norm": 3.207533121109009, - "learning_rate": 3.854180520447465e-06, - "loss": 0.3452, - "step": 519 - }, - { - "epoch": 3.190184049079755, - "grad_norm": 3.593388795852661, - "learning_rate": 3.850127715197387e-06, - "loss": 0.2832, - "step": 520 - }, - { - "epoch": 3.196319018404908, - "grad_norm": 3.409064531326294, - "learning_rate": 3.846069894620306e-06, - "loss": 0.1481, - "step": 521 - }, - { - "epoch": 3.2024539877300615, - "grad_norm": 3.461498737335205, - "learning_rate": 3.84200707378983e-06, - "loss": 0.1283, - "step": 522 - }, - { - "epoch": 3.208588957055215, - "grad_norm": 3.708467483520508, - "learning_rate": 3.8379392677981434e-06, - "loss": 0.2468, - "step": 523 - }, - { - "epoch": 3.214723926380368, - "grad_norm": 2.802381753921509, - "learning_rate": 3.833866491755947e-06, - "loss": 0.2685, - "step": 524 - }, - { - "epoch": 3.2208588957055215, - "grad_norm": 3.0787744522094727, - "learning_rate": 3.8297887607924044e-06, - "loss": 0.2595, - "step": 525 - }, - { - "epoch": 3.226993865030675, - "grad_norm": 3.3952548503875732, - "learning_rate": 3.825706090055088e-06, - "loss": 0.4099, - "step": 526 - }, - { - "epoch": 3.233128834355828, - "grad_norm": 3.3497085571289062, - "learning_rate": 3.821618494709916e-06, - "loss": 0.287, - "step": 527 - }, - { - "epoch": 3.2392638036809815, - "grad_norm": 4.050611972808838, - "learning_rate": 3.817525989941102e-06, - "loss": 0.2369, - "step": 528 - }, - { - "epoch": 3.245398773006135, - "grad_norm": 2.87642240524292, - "learning_rate": 3.8134285909510972e-06, - "loss": 0.2751, - "step": 529 - }, - { - "epoch": 3.2515337423312882, - "grad_norm": 3.821941614151001, - "learning_rate": 3.8093263129605305e-06, - "loss": 0.2363, - "step": 530 - }, - { - "epoch": 3.2576687116564416, - "grad_norm": 2.8066117763519287, - "learning_rate": 3.80521917120816e-06, - "loss": 0.094, - "step": 531 - }, - { - "epoch": 3.263803680981595, - "grad_norm": 3.849768877029419, - "learning_rate": 3.801107180950806e-06, - "loss": 0.4117, - "step": 532 - }, - { - "epoch": 3.2699386503067487, - "grad_norm": 2.4161250591278076, - "learning_rate": 3.7969903574633028e-06, - "loss": 0.1183, - "step": 533 - }, - { - "epoch": 3.276073619631902, - "grad_norm": 3.6743111610412598, - "learning_rate": 3.792868716038437e-06, - "loss": 0.2296, - "step": 534 - }, - { - "epoch": 3.2822085889570554, - "grad_norm": 4.378123760223389, - "learning_rate": 3.7887422719868937e-06, - "loss": 0.2678, - "step": 535 - }, - { - "epoch": 3.2883435582822087, - "grad_norm": 4.816481590270996, - "learning_rate": 3.784611040637198e-06, - "loss": 0.4887, - "step": 536 - }, - { - "epoch": 3.294478527607362, - "grad_norm": 3.5712430477142334, - "learning_rate": 3.7804750373356576e-06, - "loss": 0.3827, - "step": 537 - }, - { - "epoch": 3.3006134969325154, - "grad_norm": 3.6877355575561523, - "learning_rate": 3.776334277446307e-06, - "loss": 0.3233, - "step": 538 - }, - { - "epoch": 3.3067484662576687, - "grad_norm": 3.442706346511841, - "learning_rate": 3.7721887763508512e-06, - "loss": 0.1256, - "step": 539 - }, - { - "epoch": 3.312883435582822, - "grad_norm": 3.9265615940093994, - "learning_rate": 3.7680385494486053e-06, - "loss": 0.3845, - "step": 540 - }, - { - "epoch": 3.3190184049079754, - "grad_norm": 3.5030126571655273, - "learning_rate": 3.7638836121564414e-06, - "loss": 0.2905, - "step": 541 - }, - { - "epoch": 3.3251533742331287, - "grad_norm": 3.6685378551483154, - "learning_rate": 3.7597239799087283e-06, - "loss": 0.3561, - "step": 542 - }, - { - "epoch": 3.331288343558282, - "grad_norm": 3.8484046459198, - "learning_rate": 3.7555596681572736e-06, - "loss": 0.1157, - "step": 543 - }, - { - "epoch": 3.3374233128834354, - "grad_norm": 3.7977402210235596, - "learning_rate": 3.751390692371272e-06, - "loss": 0.3049, - "step": 544 - }, - { - "epoch": 3.3435582822085887, - "grad_norm": 3.4409852027893066, - "learning_rate": 3.7472170680372398e-06, - "loss": 0.1626, - "step": 545 - }, - { - "epoch": 3.3496932515337425, - "grad_norm": 3.801541328430176, - "learning_rate": 3.7430388106589632e-06, - "loss": 0.2414, - "step": 546 - }, - { - "epoch": 3.355828220858896, - "grad_norm": 4.025203704833984, - "learning_rate": 3.738855935757438e-06, - "loss": 0.3441, - "step": 547 - }, - { - "epoch": 3.361963190184049, - "grad_norm": 4.242798805236816, - "learning_rate": 3.7346684588708135e-06, - "loss": 0.5244, - "step": 548 - }, - { - "epoch": 3.3680981595092025, - "grad_norm": 3.0516819953918457, - "learning_rate": 3.7304763955543332e-06, - "loss": 0.1984, - "step": 549 - }, - { - "epoch": 3.374233128834356, - "grad_norm": 3.894667625427246, - "learning_rate": 3.726279761380279e-06, - "loss": 0.2715, - "step": 550 - }, - { - "epoch": 3.3803680981595092, - "grad_norm": 3.171208143234253, - "learning_rate": 3.72207857193791e-06, - "loss": 0.1537, - "step": 551 - }, - { - "epoch": 3.3865030674846626, - "grad_norm": 4.344860553741455, - "learning_rate": 3.7178728428334092e-06, - "loss": 0.2388, - "step": 552 - }, - { - "epoch": 3.392638036809816, - "grad_norm": 2.766317367553711, - "learning_rate": 3.7136625896898226e-06, - "loss": 0.1726, - "step": 553 - }, - { - "epoch": 3.3987730061349692, - "grad_norm": 3.550662040710449, - "learning_rate": 3.7094478281470003e-06, - "loss": 0.2942, - "step": 554 - }, - { - "epoch": 3.4049079754601226, - "grad_norm": 3.4576945304870605, - "learning_rate": 3.7052285738615412e-06, - "loss": 0.1665, - "step": 555 - }, - { - "epoch": 3.411042944785276, - "grad_norm": 4.026793003082275, - "learning_rate": 3.7010048425067317e-06, - "loss": 0.3954, - "step": 556 - }, - { - "epoch": 3.4171779141104293, - "grad_norm": 4.600133419036865, - "learning_rate": 3.696776649772492e-06, - "loss": 0.3207, - "step": 557 - }, - { - "epoch": 3.4233128834355826, - "grad_norm": 4.747331142425537, - "learning_rate": 3.692544011365312e-06, - "loss": 0.1325, - "step": 558 - }, - { - "epoch": 3.4294478527607364, - "grad_norm": 3.781464099884033, - "learning_rate": 3.6883069430081986e-06, - "loss": 0.1644, - "step": 559 - }, - { - "epoch": 3.4355828220858897, - "grad_norm": 2.905986785888672, - "learning_rate": 3.6840654604406135e-06, - "loss": 0.2469, - "step": 560 - }, - { - "epoch": 3.441717791411043, - "grad_norm": 2.3747711181640625, - "learning_rate": 3.679819579418414e-06, - "loss": 0.1146, - "step": 561 - }, - { - "epoch": 3.4478527607361964, - "grad_norm": 3.2683632373809814, - "learning_rate": 3.6755693157137995e-06, - "loss": 0.3236, - "step": 562 - }, - { - "epoch": 3.4539877300613497, - "grad_norm": 3.7750496864318848, - "learning_rate": 3.6713146851152487e-06, - "loss": 0.399, - "step": 563 - }, - { - "epoch": 3.460122699386503, - "grad_norm": 3.3912384510040283, - "learning_rate": 3.667055703427461e-06, - "loss": 0.1259, - "step": 564 - }, - { - "epoch": 3.4662576687116564, - "grad_norm": 3.0224430561065674, - "learning_rate": 3.6627923864713e-06, - "loss": 0.1835, - "step": 565 - }, - { - "epoch": 3.4723926380368098, - "grad_norm": 3.642258405685425, - "learning_rate": 3.658524750083733e-06, - "loss": 0.2763, - "step": 566 - }, - { - "epoch": 3.478527607361963, - "grad_norm": 3.409890651702881, - "learning_rate": 3.654252810117773e-06, - "loss": 0.2496, - "step": 567 - }, - { - "epoch": 3.4846625766871164, - "grad_norm": 3.0416476726531982, - "learning_rate": 3.6499765824424195e-06, - "loss": 0.1287, - "step": 568 - }, - { - "epoch": 3.4907975460122698, - "grad_norm": 3.1963987350463867, - "learning_rate": 3.6456960829425987e-06, - "loss": 0.1747, - "step": 569 - }, - { - "epoch": 3.4969325153374236, - "grad_norm": 3.198448657989502, - "learning_rate": 3.641411327519107e-06, - "loss": 0.1913, - "step": 570 - }, - { - "epoch": 3.5030674846625764, - "grad_norm": 3.7023441791534424, - "learning_rate": 3.6371223320885492e-06, - "loss": 0.3224, - "step": 571 - }, - { - "epoch": 3.5092024539877302, - "grad_norm": 4.54288387298584, - "learning_rate": 3.6328291125832803e-06, - "loss": 0.2364, - "step": 572 - }, - { - "epoch": 3.5153374233128836, - "grad_norm": 3.5064890384674072, - "learning_rate": 3.628531684951347e-06, - "loss": 0.2552, - "step": 573 - }, - { - "epoch": 3.521472392638037, - "grad_norm": 3.987583875656128, - "learning_rate": 3.6242300651564276e-06, - "loss": 0.3232, - "step": 574 - }, - { - "epoch": 3.5276073619631902, - "grad_norm": 3.179642915725708, - "learning_rate": 3.6199242691777745e-06, - "loss": 0.32, - "step": 575 - }, - { - "epoch": 3.5337423312883436, - "grad_norm": 3.3078157901763916, - "learning_rate": 3.6156143130101516e-06, - "loss": 0.2922, - "step": 576 - }, - { - "epoch": 3.539877300613497, - "grad_norm": 3.1628613471984863, - "learning_rate": 3.6113002126637765e-06, - "loss": 0.2005, - "step": 577 - }, - { - "epoch": 3.5460122699386503, - "grad_norm": 3.4515540599823, - "learning_rate": 3.606981984164263e-06, - "loss": 0.2138, - "step": 578 - }, - { - "epoch": 3.5521472392638036, - "grad_norm": 5.132473945617676, - "learning_rate": 3.6026596435525578e-06, - "loss": 0.4382, - "step": 579 - }, - { - "epoch": 3.558282208588957, - "grad_norm": 3.397614002227783, - "learning_rate": 3.5983332068848855e-06, - "loss": 0.3326, - "step": 580 - }, - { - "epoch": 3.5644171779141103, - "grad_norm": 4.79497766494751, - "learning_rate": 3.5940026902326825e-06, - "loss": 0.4748, - "step": 581 - }, - { - "epoch": 3.5705521472392636, - "grad_norm": 3.7675018310546875, - "learning_rate": 3.5896681096825446e-06, - "loss": 0.2692, - "step": 582 - }, - { - "epoch": 3.5766871165644174, - "grad_norm": 3.0637521743774414, - "learning_rate": 3.5853294813361614e-06, - "loss": 0.3658, - "step": 583 - }, - { - "epoch": 3.5828220858895703, - "grad_norm": 2.8949790000915527, - "learning_rate": 3.5809868213102623e-06, - "loss": 0.1661, - "step": 584 - }, - { - "epoch": 3.588957055214724, - "grad_norm": 3.163419246673584, - "learning_rate": 3.5766401457365485e-06, - "loss": 0.1233, - "step": 585 - }, - { - "epoch": 3.5950920245398774, - "grad_norm": 3.1787965297698975, - "learning_rate": 3.5722894707616417e-06, - "loss": 0.278, - "step": 586 - }, - { - "epoch": 3.6012269938650308, - "grad_norm": 2.9397857189178467, - "learning_rate": 3.5679348125470175e-06, - "loss": 0.1541, - "step": 587 - }, - { - "epoch": 3.607361963190184, - "grad_norm": 3.2690396308898926, - "learning_rate": 3.56357618726895e-06, - "loss": 0.1575, - "step": 588 - }, - { - "epoch": 3.6134969325153374, - "grad_norm": 5.444014072418213, - "learning_rate": 3.5592136111184483e-06, - "loss": 0.8079, - "step": 589 - }, - { - "epoch": 3.6196319018404908, - "grad_norm": 3.1688313484191895, - "learning_rate": 3.554847100301199e-06, - "loss": 0.341, - "step": 590 - }, - { - "epoch": 3.625766871165644, - "grad_norm": 2.469212532043457, - "learning_rate": 3.550476671037505e-06, - "loss": 0.1625, - "step": 591 - }, - { - "epoch": 3.6319018404907975, - "grad_norm": 3.3956527709960938, - "learning_rate": 3.546102339562223e-06, - "loss": 0.199, - "step": 592 - }, - { - "epoch": 3.638036809815951, - "grad_norm": 2.7287702560424805, - "learning_rate": 3.5417241221247078e-06, - "loss": 0.1493, - "step": 593 - }, - { - "epoch": 3.644171779141104, - "grad_norm": 3.5046865940093994, - "learning_rate": 3.5373420349887477e-06, - "loss": 0.2765, - "step": 594 - }, - { - "epoch": 3.6503067484662575, - "grad_norm": 3.121476650238037, - "learning_rate": 3.5329560944325065e-06, - "loss": 0.2833, - "step": 595 - }, - { - "epoch": 3.6564417177914113, - "grad_norm": 3.276463270187378, - "learning_rate": 3.528566316748462e-06, - "loss": 0.1237, - "step": 596 - }, - { - "epoch": 3.662576687116564, - "grad_norm": 3.382840633392334, - "learning_rate": 3.524172718243347e-06, - "loss": 0.1599, - "step": 597 - }, - { - "epoch": 3.668711656441718, - "grad_norm": 4.801311492919922, - "learning_rate": 3.5197753152380854e-06, - "loss": 0.2997, - "step": 598 - }, - { - "epoch": 3.6748466257668713, - "grad_norm": 4.117336273193359, - "learning_rate": 3.515374124067736e-06, - "loss": 0.2021, - "step": 599 - }, - { - "epoch": 3.6809815950920246, - "grad_norm": 3.611438035964966, - "learning_rate": 3.5109691610814263e-06, - "loss": 0.1726, - "step": 600 - }, - { - "epoch": 3.687116564417178, - "grad_norm": 4.5179972648620605, - "learning_rate": 3.5065604426422995e-06, - "loss": 0.1377, - "step": 601 - }, - { - "epoch": 3.6932515337423313, - "grad_norm": 3.561061382293701, - "learning_rate": 3.502147985127445e-06, - "loss": 0.1497, - "step": 602 - }, - { - "epoch": 3.6993865030674846, - "grad_norm": 3.3497917652130127, - "learning_rate": 3.4977318049278443e-06, - "loss": 0.1589, - "step": 603 - }, - { - "epoch": 3.705521472392638, - "grad_norm": 3.2725470066070557, - "learning_rate": 3.4933119184483065e-06, - "loss": 0.1364, - "step": 604 - }, - { - "epoch": 3.7116564417177913, - "grad_norm": 3.228956460952759, - "learning_rate": 3.4888883421074076e-06, - "loss": 0.177, - "step": 605 - }, - { - "epoch": 3.7177914110429446, - "grad_norm": 3.7648911476135254, - "learning_rate": 3.484461092337434e-06, - "loss": 0.122, - "step": 606 - }, - { - "epoch": 3.7239263803680984, - "grad_norm": 3.5322585105895996, - "learning_rate": 3.4800301855843137e-06, - "loss": 0.2664, - "step": 607 - }, - { - "epoch": 3.7300613496932513, - "grad_norm": 2.951073169708252, - "learning_rate": 3.4755956383075613e-06, - "loss": 0.12, - "step": 608 - }, - { - "epoch": 3.736196319018405, - "grad_norm": 3.0577664375305176, - "learning_rate": 3.471157466980214e-06, - "loss": 0.3926, - "step": 609 - }, - { - "epoch": 3.7423312883435584, - "grad_norm": 4.089846134185791, - "learning_rate": 3.466715688088772e-06, - "loss": 0.6233, - "step": 610 - }, - { - "epoch": 3.7484662576687118, - "grad_norm": 3.081340789794922, - "learning_rate": 3.462270318133136e-06, - "loss": 0.2456, - "step": 611 - }, - { - "epoch": 3.754601226993865, - "grad_norm": 3.034712553024292, - "learning_rate": 3.4578213736265474e-06, - "loss": 0.2683, - "step": 612 - }, - { - "epoch": 3.7607361963190185, - "grad_norm": 3.459815740585327, - "learning_rate": 3.4533688710955255e-06, - "loss": 0.3796, - "step": 613 - }, - { - "epoch": 3.766871165644172, - "grad_norm": 3.523737907409668, - "learning_rate": 3.448912827079805e-06, - "loss": 0.3326, - "step": 614 - }, - { - "epoch": 3.773006134969325, - "grad_norm": 3.333219289779663, - "learning_rate": 3.4444532581322793e-06, - "loss": 0.206, - "step": 615 - }, - { - "epoch": 3.7791411042944785, - "grad_norm": 3.582387685775757, - "learning_rate": 3.4399901808189327e-06, - "loss": 0.244, - "step": 616 - }, - { - "epoch": 3.785276073619632, - "grad_norm": 3.4887266159057617, - "learning_rate": 3.435523611718785e-06, - "loss": 0.1796, - "step": 617 - }, - { - "epoch": 3.791411042944785, - "grad_norm": 4.89408016204834, - "learning_rate": 3.4310535674238242e-06, - "loss": 0.188, - "step": 618 - }, - { - "epoch": 3.7975460122699385, - "grad_norm": 4.338910102844238, - "learning_rate": 3.42658006453895e-06, - "loss": 0.3039, - "step": 619 - }, - { - "epoch": 3.8036809815950923, - "grad_norm": 4.107708930969238, - "learning_rate": 3.4221031196819083e-06, - "loss": 0.3383, - "step": 620 - }, - { - "epoch": 3.809815950920245, - "grad_norm": 3.698777675628662, - "learning_rate": 3.4176227494832305e-06, - "loss": 0.1721, - "step": 621 - }, - { - "epoch": 3.815950920245399, - "grad_norm": 2.6659226417541504, - "learning_rate": 3.413138970586174e-06, - "loss": 0.2211, - "step": 622 - }, - { - "epoch": 3.8220858895705523, - "grad_norm": 3.2398436069488525, - "learning_rate": 3.4086517996466574e-06, - "loss": 0.1871, - "step": 623 - }, - { - "epoch": 3.8282208588957056, - "grad_norm": 4.9128804206848145, - "learning_rate": 3.404161253333199e-06, - "loss": 0.3874, - "step": 624 - }, - { - "epoch": 3.834355828220859, - "grad_norm": 3.508789300918579, - "learning_rate": 3.3996673483268573e-06, - "loss": 0.1739, - "step": 625 - }, - { - "epoch": 3.8404907975460123, - "grad_norm": 3.3016927242279053, - "learning_rate": 3.3951701013211665e-06, - "loss": 0.274, - "step": 626 - }, - { - "epoch": 3.8466257668711656, - "grad_norm": 3.8941333293914795, - "learning_rate": 3.3906695290220736e-06, - "loss": 0.3568, - "step": 627 - }, - { - "epoch": 3.852760736196319, - "grad_norm": 3.512354850769043, - "learning_rate": 3.3861656481478816e-06, - "loss": 0.157, - "step": 628 - }, - { - "epoch": 3.8588957055214723, - "grad_norm": 3.482649326324463, - "learning_rate": 3.3816584754291814e-06, - "loss": 0.1218, - "step": 629 - }, - { - "epoch": 3.8650306748466257, - "grad_norm": 3.1490275859832764, - "learning_rate": 3.377148027608793e-06, - "loss": 0.2234, - "step": 630 - }, - { - "epoch": 3.871165644171779, - "grad_norm": 3.2172653675079346, - "learning_rate": 3.3726343214417023e-06, - "loss": 0.3329, - "step": 631 - }, - { - "epoch": 3.8773006134969323, - "grad_norm": 4.167707443237305, - "learning_rate": 3.3681173736949984e-06, - "loss": 0.1384, - "step": 632 - }, - { - "epoch": 3.883435582822086, - "grad_norm": 3.4743919372558594, - "learning_rate": 3.3635972011478134e-06, - "loss": 0.3807, - "step": 633 - }, - { - "epoch": 3.889570552147239, - "grad_norm": 3.6892173290252686, - "learning_rate": 3.3590738205912566e-06, - "loss": 0.194, - "step": 634 - }, - { - "epoch": 3.895705521472393, - "grad_norm": 3.262967824935913, - "learning_rate": 3.354547248828356e-06, - "loss": 0.202, - "step": 635 - }, - { - "epoch": 3.901840490797546, - "grad_norm": 3.8871562480926514, - "learning_rate": 3.3500175026739916e-06, - "loss": 0.2471, - "step": 636 - }, - { - "epoch": 3.9079754601226995, - "grad_norm": 3.5097084045410156, - "learning_rate": 3.3454845989548385e-06, - "loss": 0.1112, - "step": 637 - }, - { - "epoch": 3.914110429447853, - "grad_norm": 4.163944721221924, - "learning_rate": 3.3409485545092995e-06, - "loss": 0.3368, - "step": 638 - }, - { - "epoch": 3.920245398773006, - "grad_norm": 3.6405045986175537, - "learning_rate": 3.336409386187444e-06, - "loss": 0.1863, - "step": 639 - }, - { - "epoch": 3.9263803680981595, - "grad_norm": 3.2477526664733887, - "learning_rate": 3.331867110850946e-06, - "loss": 0.1491, - "step": 640 - }, - { - "epoch": 3.932515337423313, - "grad_norm": 3.933753490447998, - "learning_rate": 3.327321745373021e-06, - "loss": 0.2484, - "step": 641 - }, - { - "epoch": 3.938650306748466, - "grad_norm": 3.2475059032440186, - "learning_rate": 3.322773306638364e-06, - "loss": 0.2126, - "step": 642 - }, - { - "epoch": 3.9447852760736195, - "grad_norm": 2.628467321395874, - "learning_rate": 3.318221811543086e-06, - "loss": 0.1649, - "step": 643 - }, - { - "epoch": 3.950920245398773, - "grad_norm": 3.2612411975860596, - "learning_rate": 3.313667276994651e-06, - "loss": 0.1442, - "step": 644 - }, - { - "epoch": 3.957055214723926, - "grad_norm": 3.8058395385742188, - "learning_rate": 3.309109719911814e-06, - "loss": 0.359, - "step": 645 - }, - { - "epoch": 3.96319018404908, - "grad_norm": 3.3450071811676025, - "learning_rate": 3.304549157224558e-06, - "loss": 0.4042, - "step": 646 - }, - { - "epoch": 3.969325153374233, - "grad_norm": 3.079601287841797, - "learning_rate": 3.299985605874031e-06, - "loss": 0.1699, - "step": 647 - }, - { - "epoch": 3.9754601226993866, - "grad_norm": 3.8963980674743652, - "learning_rate": 3.295419082812483e-06, - "loss": 0.1888, - "step": 648 - }, - { - "epoch": 3.98159509202454, - "grad_norm": 3.307405948638916, - "learning_rate": 3.2908496050032024e-06, - "loss": 0.2824, - "step": 649 - }, - { - "epoch": 3.9877300613496933, - "grad_norm": 3.227478265762329, - "learning_rate": 3.2862771894204544e-06, - "loss": 0.3038, - "step": 650 - }, - { - "epoch": 3.9938650306748467, - "grad_norm": 4.046506881713867, - "learning_rate": 3.2817018530494164e-06, - "loss": 0.3266, - "step": 651 - }, - { - "epoch": 4.0, - "grad_norm": 7.775874614715576, - "learning_rate": 3.277123612886116e-06, - "loss": 0.2998, - "step": 652 - }, - { - "epoch": 4.006134969325154, - "grad_norm": 3.146462917327881, - "learning_rate": 3.272542485937369e-06, - "loss": 0.2764, - "step": 653 - }, - { - "epoch": 4.012269938650307, - "grad_norm": 3.0539863109588623, - "learning_rate": 3.2679584892207118e-06, - "loss": 0.1157, - "step": 654 - }, - { - "epoch": 4.0184049079754605, - "grad_norm": 3.634021520614624, - "learning_rate": 3.263371639764343e-06, - "loss": 0.0707, - "step": 655 - }, - { - "epoch": 4.024539877300613, - "grad_norm": 3.3474650382995605, - "learning_rate": 3.2587819546070596e-06, - "loss": 0.1067, - "step": 656 - }, - { - "epoch": 4.030674846625767, - "grad_norm": 4.409244537353516, - "learning_rate": 3.254189450798189e-06, - "loss": 0.0564, - "step": 657 - }, - { - "epoch": 4.03680981595092, - "grad_norm": 3.0446252822875977, - "learning_rate": 3.2495941453975312e-06, - "loss": 0.0535, - "step": 658 - }, - { - "epoch": 4.042944785276074, - "grad_norm": 4.014753818511963, - "learning_rate": 3.2449960554752935e-06, - "loss": 0.1245, - "step": 659 - }, - { - "epoch": 4.049079754601227, - "grad_norm": 3.188062906265259, - "learning_rate": 3.240395198112026e-06, - "loss": 0.0626, - "step": 660 - }, - { - "epoch": 4.0552147239263805, - "grad_norm": 3.006086826324463, - "learning_rate": 3.2357915903985605e-06, - "loss": 0.1198, - "step": 661 - }, - { - "epoch": 4.061349693251533, - "grad_norm": 2.8865551948547363, - "learning_rate": 3.2311852494359423e-06, - "loss": 0.0454, - "step": 662 - }, - { - "epoch": 4.067484662576687, - "grad_norm": 4.2888007164001465, - "learning_rate": 3.226576192335373e-06, - "loss": 0.2064, - "step": 663 - }, - { - "epoch": 4.07361963190184, - "grad_norm": 3.1414525508880615, - "learning_rate": 3.2219644362181436e-06, - "loss": 0.2183, - "step": 664 - }, - { - "epoch": 4.079754601226994, - "grad_norm": 2.556277275085449, - "learning_rate": 3.21734999821557e-06, - "loss": 0.0516, - "step": 665 - }, - { - "epoch": 4.085889570552148, - "grad_norm": 2.698118209838867, - "learning_rate": 3.2127328954689307e-06, - "loss": 0.0613, - "step": 666 - }, - { - "epoch": 4.0920245398773005, - "grad_norm": 2.869919538497925, - "learning_rate": 3.2081131451294025e-06, - "loss": 0.0583, - "step": 667 - }, - { - "epoch": 4.098159509202454, - "grad_norm": 3.8786919116973877, - "learning_rate": 3.2034907643579988e-06, - "loss": 0.0766, - "step": 668 - }, - { - "epoch": 4.104294478527607, - "grad_norm": 4.224637031555176, - "learning_rate": 3.1988657703255043e-06, - "loss": 0.1099, - "step": 669 - }, - { - "epoch": 4.110429447852761, - "grad_norm": 4.671669006347656, - "learning_rate": 3.194238180212409e-06, - "loss": 0.1663, - "step": 670 - }, - { - "epoch": 4.116564417177914, - "grad_norm": 3.2484257221221924, - "learning_rate": 3.1896080112088477e-06, - "loss": 0.0587, - "step": 671 - }, - { - "epoch": 4.122699386503068, - "grad_norm": 2.4808075428009033, - "learning_rate": 3.184975280514536e-06, - "loss": 0.0579, - "step": 672 - }, - { - "epoch": 4.128834355828221, - "grad_norm": 3.7106919288635254, - "learning_rate": 3.1803400053387044e-06, - "loss": 0.1083, - "step": 673 - }, - { - "epoch": 4.134969325153374, - "grad_norm": 3.008970260620117, - "learning_rate": 3.175702202900036e-06, - "loss": 0.1355, - "step": 674 - }, - { - "epoch": 4.141104294478527, - "grad_norm": 3.2640793323516846, - "learning_rate": 3.1710618904266006e-06, - "loss": 0.092, - "step": 675 - }, - { - "epoch": 4.147239263803681, - "grad_norm": 3.08042049407959, - "learning_rate": 3.166419085155793e-06, - "loss": 0.0563, - "step": 676 - }, - { - "epoch": 4.153374233128835, - "grad_norm": 2.993530511856079, - "learning_rate": 3.1617738043342695e-06, - "loss": 0.1773, - "step": 677 - }, - { - "epoch": 4.159509202453988, - "grad_norm": 2.6218204498291016, - "learning_rate": 3.157126065217879e-06, - "loss": 0.0489, - "step": 678 - }, - { - "epoch": 4.1656441717791415, - "grad_norm": 4.3173723220825195, - "learning_rate": 3.152475885071606e-06, - "loss": 0.1333, - "step": 679 - }, - { - "epoch": 4.171779141104294, - "grad_norm": 3.659149408340454, - "learning_rate": 3.147823281169498e-06, - "loss": 0.1501, - "step": 680 - }, - { - "epoch": 4.177914110429448, - "grad_norm": 3.0953338146209717, - "learning_rate": 3.143168270794612e-06, - "loss": 0.1067, - "step": 681 - }, - { - "epoch": 4.184049079754601, - "grad_norm": 3.5693907737731934, - "learning_rate": 3.1385108712389394e-06, - "loss": 0.2499, - "step": 682 - }, - { - "epoch": 4.190184049079755, - "grad_norm": 3.3022868633270264, - "learning_rate": 3.1338510998033484e-06, - "loss": 0.1748, - "step": 683 - }, - { - "epoch": 4.196319018404908, - "grad_norm": 3.7468113899230957, - "learning_rate": 3.129188973797519e-06, - "loss": 0.201, - "step": 684 - }, - { - "epoch": 4.2024539877300615, - "grad_norm": 2.8381078243255615, - "learning_rate": 3.124524510539875e-06, - "loss": 0.0735, - "step": 685 - }, - { - "epoch": 4.208588957055214, - "grad_norm": 2.84706974029541, - "learning_rate": 3.119857727357527e-06, - "loss": 0.1806, - "step": 686 - }, - { - "epoch": 4.214723926380368, - "grad_norm": 3.8130292892456055, - "learning_rate": 3.1151886415861993e-06, - "loss": 0.1811, - "step": 687 - }, - { - "epoch": 4.220858895705521, - "grad_norm": 3.528895378112793, - "learning_rate": 3.1105172705701708e-06, - "loss": 0.1634, - "step": 688 - }, - { - "epoch": 4.226993865030675, - "grad_norm": 5.028727054595947, - "learning_rate": 3.1058436316622103e-06, - "loss": 0.1625, - "step": 689 - }, - { - "epoch": 4.233128834355828, - "grad_norm": 4.606889247894287, - "learning_rate": 3.1011677422235093e-06, - "loss": 0.1791, - "step": 690 - }, - { - "epoch": 4.2392638036809815, - "grad_norm": 3.3620636463165283, - "learning_rate": 3.0964896196236217e-06, - "loss": 0.2233, - "step": 691 - }, - { - "epoch": 4.245398773006135, - "grad_norm": 3.7845852375030518, - "learning_rate": 3.0918092812403954e-06, - "loss": 0.1142, - "step": 692 - }, - { - "epoch": 4.251533742331288, - "grad_norm": 3.1204118728637695, - "learning_rate": 3.0871267444599098e-06, - "loss": 0.096, - "step": 693 - }, - { - "epoch": 4.257668711656442, - "grad_norm": 3.686067819595337, - "learning_rate": 3.0824420266764093e-06, - "loss": 0.2749, - "step": 694 - }, - { - "epoch": 4.263803680981595, - "grad_norm": 3.1680829524993896, - "learning_rate": 3.077755145292243e-06, - "loss": 0.2504, - "step": 695 - }, - { - "epoch": 4.269938650306749, - "grad_norm": 3.3179469108581543, - "learning_rate": 3.0730661177177957e-06, - "loss": 0.1324, - "step": 696 - }, - { - "epoch": 4.276073619631902, - "grad_norm": 3.1186370849609375, - "learning_rate": 3.0683749613714238e-06, - "loss": 0.0691, - "step": 697 - }, - { - "epoch": 4.282208588957055, - "grad_norm": 3.086834192276001, - "learning_rate": 3.063681693679391e-06, - "loss": 0.1026, - "step": 698 - }, - { - "epoch": 4.288343558282208, - "grad_norm": 4.629584312438965, - "learning_rate": 3.0589863320758063e-06, - "loss": 0.2646, - "step": 699 - }, - { - "epoch": 4.294478527607362, - "grad_norm": 3.9641213417053223, - "learning_rate": 3.0542888940025562e-06, - "loss": 0.1711, - "step": 700 - }, - { - "epoch": 4.300613496932515, - "grad_norm": 3.75014328956604, - "learning_rate": 3.0495893969092395e-06, - "loss": 0.0589, - "step": 701 - }, - { - "epoch": 4.306748466257669, - "grad_norm": 3.603290319442749, - "learning_rate": 3.044887858253105e-06, - "loss": 0.2244, - "step": 702 - }, - { - "epoch": 4.3128834355828225, - "grad_norm": 3.79404616355896, - "learning_rate": 3.040184295498984e-06, - "loss": 0.1506, - "step": 703 - }, - { - "epoch": 4.319018404907975, - "grad_norm": 3.0890021324157715, - "learning_rate": 3.035478726119228e-06, - "loss": 0.2343, - "step": 704 - }, - { - "epoch": 4.325153374233129, - "grad_norm": 3.6688191890716553, - "learning_rate": 3.0307711675936426e-06, - "loss": 0.0518, - "step": 705 - }, - { - "epoch": 4.331288343558282, - "grad_norm": 5.1836700439453125, - "learning_rate": 3.0260616374094208e-06, - "loss": 0.2363, - "step": 706 - }, - { - "epoch": 4.337423312883436, - "grad_norm": 2.7123284339904785, - "learning_rate": 3.0213501530610807e-06, - "loss": 0.0848, - "step": 707 - }, - { - "epoch": 4.343558282208589, - "grad_norm": 3.5661890506744385, - "learning_rate": 3.0166367320504005e-06, - "loss": 0.149, - "step": 708 - }, - { - "epoch": 4.3496932515337425, - "grad_norm": 3.6454737186431885, - "learning_rate": 3.0119213918863515e-06, - "loss": 0.1133, - "step": 709 - }, - { - "epoch": 4.355828220858895, - "grad_norm": 3.7534968852996826, - "learning_rate": 3.0072041500850343e-06, - "loss": 0.1358, - "step": 710 - }, - { - "epoch": 4.361963190184049, - "grad_norm": 3.40387225151062, - "learning_rate": 3.0024850241696128e-06, - "loss": 0.0706, - "step": 711 - }, - { - "epoch": 4.368098159509202, - "grad_norm": 3.250471591949463, - "learning_rate": 2.9977640316702512e-06, - "loss": 0.1977, - "step": 712 - }, - { - "epoch": 4.374233128834356, - "grad_norm": 3.417781352996826, - "learning_rate": 2.993041190124047e-06, - "loss": 0.2622, - "step": 713 - }, - { - "epoch": 4.38036809815951, - "grad_norm": 2.628434181213379, - "learning_rate": 2.9883165170749657e-06, - "loss": 0.1487, - "step": 714 - }, - { - "epoch": 4.386503067484663, - "grad_norm": 3.240264892578125, - "learning_rate": 2.9835900300737763e-06, - "loss": 0.0822, - "step": 715 - }, - { - "epoch": 4.392638036809816, - "grad_norm": 6.575517177581787, - "learning_rate": 2.9788617466779884e-06, - "loss": 0.3668, - "step": 716 - }, - { - "epoch": 4.398773006134969, - "grad_norm": 4.699089050292969, - "learning_rate": 2.974131684451781e-06, - "loss": 0.2432, - "step": 717 - }, - { - "epoch": 4.404907975460123, - "grad_norm": 2.9815752506256104, - "learning_rate": 2.9693998609659443e-06, - "loss": 0.0689, - "step": 718 - }, - { - "epoch": 4.411042944785276, - "grad_norm": 4.192755222320557, - "learning_rate": 2.9646662937978082e-06, - "loss": 0.1897, - "step": 719 - }, - { - "epoch": 4.41717791411043, - "grad_norm": 2.9729068279266357, - "learning_rate": 2.9599310005311824e-06, - "loss": 0.0457, - "step": 720 - }, - { - "epoch": 4.423312883435583, - "grad_norm": 4.234438896179199, - "learning_rate": 2.9551939987562866e-06, - "loss": 0.2307, - "step": 721 - }, - { - "epoch": 4.429447852760736, - "grad_norm": 3.3982434272766113, - "learning_rate": 2.950455306069688e-06, - "loss": 0.0637, - "step": 722 - }, - { - "epoch": 4.435582822085889, - "grad_norm": 4.539764404296875, - "learning_rate": 2.9457149400742357e-06, - "loss": 0.1924, - "step": 723 - }, - { - "epoch": 4.441717791411043, - "grad_norm": 4.039684772491455, - "learning_rate": 2.940972918378993e-06, - "loss": 0.1275, - "step": 724 - }, - { - "epoch": 4.447852760736196, - "grad_norm": 4.340360641479492, - "learning_rate": 2.936229258599174e-06, - "loss": 0.123, - "step": 725 - }, - { - "epoch": 4.45398773006135, - "grad_norm": 2.8720109462738037, - "learning_rate": 2.93148397835608e-06, - "loss": 0.0555, - "step": 726 - }, - { - "epoch": 4.460122699386503, - "grad_norm": 4.227811336517334, - "learning_rate": 2.926737095277029e-06, - "loss": 0.0991, - "step": 727 - }, - { - "epoch": 4.466257668711656, - "grad_norm": 2.8079142570495605, - "learning_rate": 2.921988626995295e-06, - "loss": 0.0628, - "step": 728 - }, - { - "epoch": 4.47239263803681, - "grad_norm": 4.195122241973877, - "learning_rate": 2.9172385911500385e-06, - "loss": 0.2333, - "step": 729 - }, - { - "epoch": 4.478527607361963, - "grad_norm": 3.223794460296631, - "learning_rate": 2.9124870053862447e-06, - "loss": 0.1317, - "step": 730 - }, - { - "epoch": 4.484662576687117, - "grad_norm": 3.5533759593963623, - "learning_rate": 2.907733887354657e-06, - "loss": 0.2285, - "step": 731 - }, - { - "epoch": 4.49079754601227, - "grad_norm": 3.535673141479492, - "learning_rate": 2.9029792547117088e-06, - "loss": 0.096, - "step": 732 - }, - { - "epoch": 4.4969325153374236, - "grad_norm": 4.031703948974609, - "learning_rate": 2.898223125119461e-06, - "loss": 0.1505, - "step": 733 - }, - { - "epoch": 4.5030674846625764, - "grad_norm": 2.823413610458374, - "learning_rate": 2.893465516245534e-06, - "loss": 0.0327, - "step": 734 - }, - { - "epoch": 4.50920245398773, - "grad_norm": 3.516738176345825, - "learning_rate": 2.8887064457630453e-06, - "loss": 0.0743, - "step": 735 - }, - { - "epoch": 4.515337423312883, - "grad_norm": 3.5523500442504883, - "learning_rate": 2.8839459313505407e-06, - "loss": 0.1768, - "step": 736 - }, - { - "epoch": 4.521472392638037, - "grad_norm": 3.2433223724365234, - "learning_rate": 2.879183990691929e-06, - "loss": 0.1598, - "step": 737 - }, - { - "epoch": 4.52760736196319, - "grad_norm": 3.0156848430633545, - "learning_rate": 2.8744206414764185e-06, - "loss": 0.0829, - "step": 738 - }, - { - "epoch": 4.533742331288344, - "grad_norm": 4.359529495239258, - "learning_rate": 2.8696559013984488e-06, - "loss": 0.1169, - "step": 739 - }, - { - "epoch": 4.539877300613497, - "grad_norm": 2.3862433433532715, - "learning_rate": 2.8648897881576274e-06, - "loss": 0.0962, - "step": 740 - }, - { - "epoch": 4.54601226993865, - "grad_norm": 2.7100136280059814, - "learning_rate": 2.8601223194586613e-06, - "loss": 0.1204, - "step": 741 - }, - { - "epoch": 4.552147239263804, - "grad_norm": 3.8116140365600586, - "learning_rate": 2.8553535130112935e-06, - "loss": 0.0685, - "step": 742 - }, - { - "epoch": 4.558282208588957, - "grad_norm": 2.9640142917633057, - "learning_rate": 2.850583386530235e-06, - "loss": 0.0692, - "step": 743 - }, - { - "epoch": 4.564417177914111, - "grad_norm": 3.264592170715332, - "learning_rate": 2.8458119577351035e-06, - "loss": 0.2128, - "step": 744 - }, - { - "epoch": 4.570552147239264, - "grad_norm": 3.230497360229492, - "learning_rate": 2.841039244350351e-06, - "loss": 0.2409, - "step": 745 - }, - { - "epoch": 4.576687116564417, - "grad_norm": 4.41513204574585, - "learning_rate": 2.8362652641052024e-06, - "loss": 0.1878, - "step": 746 - }, - { - "epoch": 4.58282208588957, - "grad_norm": 3.047248601913452, - "learning_rate": 2.83149003473359e-06, - "loss": 0.1303, - "step": 747 - }, - { - "epoch": 4.588957055214724, - "grad_norm": 2.399754047393799, - "learning_rate": 2.8267135739740836e-06, - "loss": 0.0577, - "step": 748 - }, - { - "epoch": 4.595092024539877, - "grad_norm": 4.608038425445557, - "learning_rate": 2.8219358995698307e-06, - "loss": 0.2329, - "step": 749 - }, - { - "epoch": 4.601226993865031, - "grad_norm": 3.537644147872925, - "learning_rate": 2.8171570292684846e-06, - "loss": 0.1329, - "step": 750 - }, - { - "epoch": 4.6073619631901845, - "grad_norm": 2.8099827766418457, - "learning_rate": 2.8123769808221407e-06, - "loss": 0.1512, - "step": 751 - }, - { - "epoch": 4.613496932515337, - "grad_norm": 3.3169758319854736, - "learning_rate": 2.8075957719872724e-06, - "loss": 0.1267, - "step": 752 - }, - { - "epoch": 4.61963190184049, - "grad_norm": 3.578435182571411, - "learning_rate": 2.8028134205246633e-06, - "loss": 0.147, - "step": 753 - }, - { - "epoch": 4.625766871165644, - "grad_norm": 3.544437885284424, - "learning_rate": 2.7980299441993415e-06, - "loss": 0.0947, - "step": 754 - }, - { - "epoch": 4.631901840490798, - "grad_norm": 3.798776388168335, - "learning_rate": 2.793245360780512e-06, - "loss": 0.1498, - "step": 755 - }, - { - "epoch": 4.638036809815951, - "grad_norm": 3.634991407394409, - "learning_rate": 2.788459688041495e-06, - "loss": 0.2504, - "step": 756 - }, - { - "epoch": 4.644171779141105, - "grad_norm": 20.123680114746094, - "learning_rate": 2.783672943759655e-06, - "loss": 0.2091, - "step": 757 - }, - { - "epoch": 4.6503067484662575, - "grad_norm": 3.9357221126556396, - "learning_rate": 2.778885145716339e-06, - "loss": 0.2045, - "step": 758 - }, - { - "epoch": 4.656441717791411, - "grad_norm": 3.3035309314727783, - "learning_rate": 2.7740963116968063e-06, - "loss": 0.1416, - "step": 759 - }, - { - "epoch": 4.662576687116564, - "grad_norm": 3.096985101699829, - "learning_rate": 2.7693064594901646e-06, - "loss": 0.0455, - "step": 760 - }, - { - "epoch": 4.668711656441718, - "grad_norm": 2.9855458736419678, - "learning_rate": 2.7645156068893075e-06, - "loss": 0.1496, - "step": 761 - }, - { - "epoch": 4.674846625766871, - "grad_norm": 3.9140093326568604, - "learning_rate": 2.759723771690839e-06, - "loss": 0.2061, - "step": 762 - }, - { - "epoch": 4.680981595092025, - "grad_norm": 3.590569496154785, - "learning_rate": 2.754930971695019e-06, - "loss": 0.1017, - "step": 763 - }, - { - "epoch": 4.6871165644171775, - "grad_norm": 3.527254581451416, - "learning_rate": 2.750137224705687e-06, - "loss": 0.1979, - "step": 764 - }, - { - "epoch": 4.693251533742331, - "grad_norm": 4.198459148406982, - "learning_rate": 2.745342548530202e-06, - "loss": 0.1667, - "step": 765 - }, - { - "epoch": 4.699386503067485, - "grad_norm": 2.0246167182922363, - "learning_rate": 2.7405469609793746e-06, - "loss": 0.0346, - "step": 766 - }, - { - "epoch": 4.705521472392638, - "grad_norm": 3.2045300006866455, - "learning_rate": 2.7357504798674004e-06, - "loss": 0.0596, - "step": 767 - }, - { - "epoch": 4.711656441717792, - "grad_norm": 2.736985921859741, - "learning_rate": 2.730953123011796e-06, - "loss": 0.0384, - "step": 768 - }, - { - "epoch": 4.717791411042945, - "grad_norm": 3.0621395111083984, - "learning_rate": 2.726154908233328e-06, - "loss": 0.0558, - "step": 769 - }, - { - "epoch": 4.723926380368098, - "grad_norm": 3.2280497550964355, - "learning_rate": 2.721355853355953e-06, - "loss": 0.2272, - "step": 770 - }, - { - "epoch": 4.730061349693251, - "grad_norm": 3.342226028442383, - "learning_rate": 2.716555976206748e-06, - "loss": 0.074, - "step": 771 - }, - { - "epoch": 4.736196319018405, - "grad_norm": 4.328624248504639, - "learning_rate": 2.7117552946158415e-06, - "loss": 0.1034, - "step": 772 - }, - { - "epoch": 4.742331288343558, - "grad_norm": 2.980215311050415, - "learning_rate": 2.706953826416353e-06, - "loss": 0.1199, - "step": 773 - }, - { - "epoch": 4.748466257668712, - "grad_norm": 2.622478485107422, - "learning_rate": 2.702151589444324e-06, - "loss": 0.0467, - "step": 774 - }, - { - "epoch": 4.754601226993865, - "grad_norm": 2.9958693981170654, - "learning_rate": 2.6973486015386507e-06, - "loss": 0.143, - "step": 775 - }, - { - "epoch": 4.7607361963190185, - "grad_norm": 4.548511505126953, - "learning_rate": 2.6925448805410197e-06, - "loss": 0.3594, - "step": 776 - }, - { - "epoch": 4.766871165644172, - "grad_norm": 3.3429481983184814, - "learning_rate": 2.6877404442958393e-06, - "loss": 0.1397, - "step": 777 - }, - { - "epoch": 4.773006134969325, - "grad_norm": 2.5820136070251465, - "learning_rate": 2.682935310650177e-06, - "loss": 0.054, - "step": 778 - }, - { - "epoch": 4.779141104294479, - "grad_norm": 4.047626495361328, - "learning_rate": 2.6781294974536886e-06, - "loss": 0.1284, - "step": 779 - }, - { - "epoch": 4.785276073619632, - "grad_norm": 3.0227510929107666, - "learning_rate": 2.673323022558557e-06, - "loss": 0.1441, - "step": 780 - }, - { - "epoch": 4.791411042944786, - "grad_norm": 4.731313705444336, - "learning_rate": 2.6685159038194202e-06, - "loss": 0.2859, - "step": 781 - }, - { - "epoch": 4.7975460122699385, - "grad_norm": 3.880655288696289, - "learning_rate": 2.6637081590933096e-06, - "loss": 0.1524, - "step": 782 - }, - { - "epoch": 4.803680981595092, - "grad_norm": 2.375474452972412, - "learning_rate": 2.6588998062395803e-06, - "loss": 0.0338, - "step": 783 - }, - { - "epoch": 4.809815950920245, - "grad_norm": 3.3587446212768555, - "learning_rate": 2.6540908631198498e-06, - "loss": 0.0755, - "step": 784 - }, - { - "epoch": 4.815950920245399, - "grad_norm": 2.767686367034912, - "learning_rate": 2.6492813475979243e-06, - "loss": 0.0631, - "step": 785 - }, - { - "epoch": 4.822085889570552, - "grad_norm": 3.88670015335083, - "learning_rate": 2.6444712775397397e-06, - "loss": 0.0853, - "step": 786 - }, - { - "epoch": 4.828220858895706, - "grad_norm": 3.543276309967041, - "learning_rate": 2.639660670813288e-06, - "loss": 0.1895, - "step": 787 - }, - { - "epoch": 4.8343558282208585, - "grad_norm": 3.659323215484619, - "learning_rate": 2.6348495452885598e-06, - "loss": 0.1745, - "step": 788 - }, - { - "epoch": 4.840490797546012, - "grad_norm": 3.0955021381378174, - "learning_rate": 2.630037918837468e-06, - "loss": 0.0846, - "step": 789 - }, - { - "epoch": 4.846625766871165, - "grad_norm": 3.4473249912261963, - "learning_rate": 2.6252258093337892e-06, - "loss": 0.0808, - "step": 790 - }, - { - "epoch": 4.852760736196319, - "grad_norm": 3.937120199203491, - "learning_rate": 2.6204132346530936e-06, - "loss": 0.2054, - "step": 791 - }, - { - "epoch": 4.858895705521473, - "grad_norm": 4.052806854248047, - "learning_rate": 2.6156002126726788e-06, - "loss": 0.1679, - "step": 792 - }, - { - "epoch": 4.865030674846626, - "grad_norm": 2.6694889068603516, - "learning_rate": 2.6107867612715043e-06, - "loss": 0.0534, - "step": 793 - }, - { - "epoch": 4.871165644171779, - "grad_norm": 3.594649076461792, - "learning_rate": 2.6059728983301267e-06, - "loss": 0.0899, - "step": 794 - }, - { - "epoch": 4.877300613496932, - "grad_norm": 2.7796030044555664, - "learning_rate": 2.601158641730629e-06, - "loss": 0.0596, - "step": 795 - }, - { - "epoch": 4.883435582822086, - "grad_norm": 4.618961334228516, - "learning_rate": 2.5963440093565567e-06, - "loss": 0.3858, - "step": 796 - }, - { - "epoch": 4.889570552147239, - "grad_norm": 3.0783939361572266, - "learning_rate": 2.5915290190928518e-06, - "loss": 0.12, - "step": 797 - }, - { - "epoch": 4.895705521472393, - "grad_norm": 4.078456878662109, - "learning_rate": 2.586713688825786e-06, - "loss": 0.1278, - "step": 798 - }, - { - "epoch": 4.901840490797546, - "grad_norm": 2.9439120292663574, - "learning_rate": 2.5818980364428935e-06, - "loss": 0.0847, - "step": 799 - }, - { - "epoch": 4.9079754601226995, - "grad_norm": 5.140681743621826, - "learning_rate": 2.5770820798329055e-06, - "loss": 0.1718, - "step": 800 - }, - { - "epoch": 4.914110429447852, - "grad_norm": 3.450190305709839, - "learning_rate": 2.572265836885682e-06, - "loss": 0.0895, - "step": 801 - }, - { - "epoch": 4.920245398773006, - "grad_norm": 3.1145224571228027, - "learning_rate": 2.567449325492149e-06, - "loss": 0.0652, - "step": 802 - }, - { - "epoch": 4.92638036809816, - "grad_norm": 2.851768732070923, - "learning_rate": 2.5626325635442283e-06, - "loss": 0.0877, - "step": 803 - }, - { - "epoch": 4.932515337423313, - "grad_norm": 3.3392980098724365, - "learning_rate": 2.5578155689347716e-06, - "loss": 0.2028, - "step": 804 - }, - { - "epoch": 4.938650306748467, - "grad_norm": 3.012439250946045, - "learning_rate": 2.5529983595574964e-06, - "loss": 0.031, - "step": 805 - }, - { - "epoch": 4.9447852760736195, - "grad_norm": 2.7732717990875244, - "learning_rate": 2.548180953306918e-06, - "loss": 0.0415, - "step": 806 - }, - { - "epoch": 4.950920245398773, - "grad_norm": 3.0423903465270996, - "learning_rate": 2.5433633680782817e-06, - "loss": 0.1188, - "step": 807 - }, - { - "epoch": 4.957055214723926, - "grad_norm": 5.056387901306152, - "learning_rate": 2.538545621767498e-06, - "loss": 0.1703, - "step": 808 - }, - { - "epoch": 4.96319018404908, - "grad_norm": 4.052585124969482, - "learning_rate": 2.533727732271077e-06, - "loss": 0.1455, - "step": 809 - }, - { - "epoch": 4.969325153374233, - "grad_norm": 3.4507904052734375, - "learning_rate": 2.5289097174860593e-06, - "loss": 0.0617, - "step": 810 - }, - { - "epoch": 4.975460122699387, - "grad_norm": 2.908266305923462, - "learning_rate": 2.524091595309952e-06, - "loss": 0.1173, - "step": 811 - }, - { - "epoch": 4.9815950920245395, - "grad_norm": 2.5857458114624023, - "learning_rate": 2.519273383640661e-06, - "loss": 0.0538, - "step": 812 - }, - { - "epoch": 4.987730061349693, - "grad_norm": 3.3518428802490234, - "learning_rate": 2.5144551003764227e-06, - "loss": 0.211, - "step": 813 - }, - { - "epoch": 4.993865030674847, - "grad_norm": 3.137981653213501, - "learning_rate": 2.509636763415742e-06, - "loss": 0.0944, - "step": 814 - }, - { - "epoch": 5.0, - "grad_norm": 2.8854241371154785, - "learning_rate": 2.5048183906573227e-06, - "loss": 0.098, - "step": 815 - }, - { - "epoch": 5.006134969325154, - "grad_norm": 3.508527994155884, - "learning_rate": 2.5e-06, - "loss": 0.1102, - "step": 816 - }, - { - "epoch": 5.012269938650307, - "grad_norm": 2.448152542114258, - "learning_rate": 2.495181609342678e-06, - "loss": 0.0712, - "step": 817 - }, - { - "epoch": 5.0184049079754605, - "grad_norm": 3.105818748474121, - "learning_rate": 2.4903632365842587e-06, - "loss": 0.0414, - "step": 818 - }, - { - "epoch": 5.024539877300613, - "grad_norm": 3.8048601150512695, - "learning_rate": 2.4855448996235777e-06, - "loss": 0.0894, - "step": 819 - }, - { - "epoch": 5.030674846625767, - "grad_norm": 3.259834051132202, - "learning_rate": 2.48072661635934e-06, - "loss": 0.0796, - "step": 820 - }, - { - "epoch": 5.03680981595092, - "grad_norm": 2.822364568710327, - "learning_rate": 2.475908404690049e-06, - "loss": 0.0349, - "step": 821 - }, - { - "epoch": 5.042944785276074, - "grad_norm": 4.78808069229126, - "learning_rate": 2.4710902825139415e-06, - "loss": 0.2529, - "step": 822 - }, - { - "epoch": 5.049079754601227, - "grad_norm": 3.5420572757720947, - "learning_rate": 2.466272267728924e-06, - "loss": 0.1405, - "step": 823 - }, - { - "epoch": 5.0552147239263805, - "grad_norm": 2.500713348388672, - "learning_rate": 2.461454378232503e-06, - "loss": 0.0408, - "step": 824 - }, - { - "epoch": 5.061349693251533, - "grad_norm": 3.266291618347168, - "learning_rate": 2.4566366319217196e-06, - "loss": 0.0338, - "step": 825 - }, - { - "epoch": 5.067484662576687, - "grad_norm": 4.071012020111084, - "learning_rate": 2.4518190466930837e-06, - "loss": 0.06, - "step": 826 - }, - { - "epoch": 5.07361963190184, - "grad_norm": 4.3747172355651855, - "learning_rate": 2.4470016404425045e-06, - "loss": 0.1184, - "step": 827 - }, - { - "epoch": 5.079754601226994, - "grad_norm": 3.92030668258667, - "learning_rate": 2.4421844310652296e-06, - "loss": 0.1369, - "step": 828 - }, - { - "epoch": 5.085889570552148, - "grad_norm": 3.3482303619384766, - "learning_rate": 2.437367436455773e-06, - "loss": 0.1166, - "step": 829 - }, - { - "epoch": 5.0920245398773005, - "grad_norm": 3.429368019104004, - "learning_rate": 2.4325506745078524e-06, - "loss": 0.1214, - "step": 830 - }, - { - "epoch": 5.098159509202454, - "grad_norm": 3.4915647506713867, - "learning_rate": 2.427734163114319e-06, - "loss": 0.0454, - "step": 831 - }, - { - "epoch": 5.104294478527607, - "grad_norm": 3.1721251010894775, - "learning_rate": 2.4229179201670954e-06, - "loss": 0.0431, - "step": 832 - }, - { - "epoch": 5.110429447852761, - "grad_norm": 2.552578926086426, - "learning_rate": 2.418101963557107e-06, - "loss": 0.0347, - "step": 833 - }, - { - "epoch": 5.116564417177914, - "grad_norm": 3.518169403076172, - "learning_rate": 2.413286311174214e-06, - "loss": 0.1555, - "step": 834 - }, - { - "epoch": 5.122699386503068, - "grad_norm": 2.4452908039093018, - "learning_rate": 2.4084709809071487e-06, - "loss": 0.035, - "step": 835 - }, - { - "epoch": 5.128834355828221, - "grad_norm": 3.5366528034210205, - "learning_rate": 2.403655990643444e-06, - "loss": 0.0798, - "step": 836 - }, - { - "epoch": 5.134969325153374, - "grad_norm": 2.300065040588379, - "learning_rate": 2.398841358269371e-06, - "loss": 0.0178, - "step": 837 - }, - { - "epoch": 5.141104294478527, - "grad_norm": 2.851393699645996, - "learning_rate": 2.3940271016698733e-06, - "loss": 0.0447, - "step": 838 - }, - { - "epoch": 5.147239263803681, - "grad_norm": 4.085958957672119, - "learning_rate": 2.3892132387284956e-06, - "loss": 0.1626, - "step": 839 - }, - { - "epoch": 5.153374233128835, - "grad_norm": 3.4240522384643555, - "learning_rate": 2.384399787327322e-06, - "loss": 0.0914, - "step": 840 - }, - { - "epoch": 5.159509202453988, - "grad_norm": 4.111586570739746, - "learning_rate": 2.3795867653469072e-06, - "loss": 0.0784, - "step": 841 - }, - { - "epoch": 5.1656441717791415, - "grad_norm": 2.3306312561035156, - "learning_rate": 2.374774190666211e-06, - "loss": 0.0216, - "step": 842 - }, - { - "epoch": 5.171779141104294, - "grad_norm": 2.5006275177001953, - "learning_rate": 2.3699620811625327e-06, - "loss": 0.0516, - "step": 843 - }, - { - "epoch": 5.177914110429448, - "grad_norm": 3.1680967807769775, - "learning_rate": 2.365150454711441e-06, - "loss": 0.0517, - "step": 844 - }, - { - "epoch": 5.184049079754601, - "grad_norm": 1.817044734954834, - "learning_rate": 2.3603393291867122e-06, - "loss": 0.0264, - "step": 845 - }, - { - "epoch": 5.190184049079755, - "grad_norm": 4.445211887359619, - "learning_rate": 2.355528722460261e-06, - "loss": 0.1079, - "step": 846 - }, - { - "epoch": 5.196319018404908, - "grad_norm": 2.918304681777954, - "learning_rate": 2.350718652402076e-06, - "loss": 0.0633, - "step": 847 - }, - { - "epoch": 5.2024539877300615, - "grad_norm": 3.6307432651519775, - "learning_rate": 2.345909136880151e-06, - "loss": 0.1013, - "step": 848 - }, - { - "epoch": 5.208588957055214, - "grad_norm": 3.5696842670440674, - "learning_rate": 2.34110019376042e-06, - "loss": 0.0199, - "step": 849 - }, - { - "epoch": 5.214723926380368, - "grad_norm": 2.2214856147766113, - "learning_rate": 2.336291840906691e-06, - "loss": 0.0288, - "step": 850 - }, - { - "epoch": 5.220858895705521, - "grad_norm": 2.5375778675079346, - "learning_rate": 2.3314840961805806e-06, - "loss": 0.0142, - "step": 851 - }, - { - "epoch": 5.226993865030675, - "grad_norm": 3.0093517303466797, - "learning_rate": 2.326676977441444e-06, - "loss": 0.0911, - "step": 852 - }, - { - "epoch": 5.233128834355828, - "grad_norm": 2.7067151069641113, - "learning_rate": 2.3218705025463118e-06, - "loss": 0.0315, - "step": 853 - }, - { - "epoch": 5.2392638036809815, - "grad_norm": 3.1892940998077393, - "learning_rate": 2.3170646893498237e-06, - "loss": 0.1344, - "step": 854 - }, - { - "epoch": 5.245398773006135, - "grad_norm": 2.8909313678741455, - "learning_rate": 2.312259555704161e-06, - "loss": 0.034, - "step": 855 - }, - { - "epoch": 5.251533742331288, - "grad_norm": 5.097650051116943, - "learning_rate": 2.3074551194589816e-06, - "loss": 0.1889, - "step": 856 - }, - { - "epoch": 5.257668711656442, - "grad_norm": 3.8511006832122803, - "learning_rate": 2.3026513984613506e-06, - "loss": 0.0794, - "step": 857 - }, - { - "epoch": 5.263803680981595, - "grad_norm": 2.2874133586883545, - "learning_rate": 2.297848410555677e-06, - "loss": 0.0238, - "step": 858 - }, - { - "epoch": 5.269938650306749, - "grad_norm": 3.504723310470581, - "learning_rate": 2.293046173583648e-06, - "loss": 0.0369, - "step": 859 - }, - { - "epoch": 5.276073619631902, - "grad_norm": 3.2108154296875, - "learning_rate": 2.28824470538416e-06, - "loss": 0.0677, - "step": 860 - }, - { - "epoch": 5.282208588957055, - "grad_norm": 2.2249386310577393, - "learning_rate": 2.2834440237932537e-06, - "loss": 0.0244, - "step": 861 - }, - { - "epoch": 5.288343558282208, - "grad_norm": 3.141784191131592, - "learning_rate": 2.2786441466440474e-06, - "loss": 0.0628, - "step": 862 - }, - { - "epoch": 5.294478527607362, - "grad_norm": 3.5597352981567383, - "learning_rate": 2.2738450917666727e-06, - "loss": 0.0914, - "step": 863 - }, - { - "epoch": 5.300613496932515, - "grad_norm": 2.991966962814331, - "learning_rate": 2.269046876988204e-06, - "loss": 0.0546, - "step": 864 - }, - { - "epoch": 5.306748466257669, - "grad_norm": 3.100776195526123, - "learning_rate": 2.2642495201325995e-06, - "loss": 0.0473, - "step": 865 - }, - { - "epoch": 5.3128834355828225, - "grad_norm": 2.541754722595215, - "learning_rate": 2.259453039020626e-06, - "loss": 0.0613, - "step": 866 - }, - { - "epoch": 5.319018404907975, - "grad_norm": 2.8117194175720215, - "learning_rate": 2.2546574514697985e-06, - "loss": 0.0533, - "step": 867 - }, - { - "epoch": 5.325153374233129, - "grad_norm": 2.5676379203796387, - "learning_rate": 2.249862775294313e-06, - "loss": 0.018, - "step": 868 - }, - { - "epoch": 5.331288343558282, - "grad_norm": 2.5297701358795166, - "learning_rate": 2.245069028304981e-06, - "loss": 0.0246, - "step": 869 - }, - { - "epoch": 5.337423312883436, - "grad_norm": 2.199498176574707, - "learning_rate": 2.240276228309161e-06, - "loss": 0.0551, - "step": 870 - }, - { - "epoch": 5.343558282208589, - "grad_norm": 2.5793557167053223, - "learning_rate": 2.2354843931106933e-06, - "loss": 0.0258, - "step": 871 - }, - { - "epoch": 5.3496932515337425, - "grad_norm": 3.352058172225952, - "learning_rate": 2.230693540509836e-06, - "loss": 0.0228, - "step": 872 - }, - { - "epoch": 5.355828220858895, - "grad_norm": 2.900599956512451, - "learning_rate": 2.225903688303195e-06, - "loss": 0.0586, - "step": 873 - }, - { - "epoch": 5.361963190184049, - "grad_norm": 3.3317267894744873, - "learning_rate": 2.221114854283662e-06, - "loss": 0.0733, - "step": 874 - }, - { - "epoch": 5.368098159509202, - "grad_norm": 2.79304575920105, - "learning_rate": 2.2163270562403453e-06, - "loss": 0.0251, - "step": 875 - }, - { - "epoch": 5.374233128834356, - "grad_norm": 3.8596227169036865, - "learning_rate": 2.211540311958506e-06, - "loss": 0.0957, - "step": 876 - }, - { - "epoch": 5.38036809815951, - "grad_norm": 2.7464358806610107, - "learning_rate": 2.2067546392194888e-06, - "loss": 0.0457, - "step": 877 - }, - { - "epoch": 5.386503067484663, - "grad_norm": 2.3359906673431396, - "learning_rate": 2.2019700558006598e-06, - "loss": 0.0218, - "step": 878 - }, - { - "epoch": 5.392638036809816, - "grad_norm": 3.2412452697753906, - "learning_rate": 2.197186579475337e-06, - "loss": 0.0494, - "step": 879 - }, - { - "epoch": 5.398773006134969, - "grad_norm": 3.930197238922119, - "learning_rate": 2.1924042280127284e-06, - "loss": 0.0803, - "step": 880 - }, - { - "epoch": 5.404907975460123, - "grad_norm": 2.5752930641174316, - "learning_rate": 2.1876230191778598e-06, - "loss": 0.0356, - "step": 881 - }, - { - "epoch": 5.411042944785276, - "grad_norm": 5.507393836975098, - "learning_rate": 2.182842970731516e-06, - "loss": 0.1245, - "step": 882 - }, - { - "epoch": 5.41717791411043, - "grad_norm": 2.416719436645508, - "learning_rate": 2.17806410043017e-06, - "loss": 0.0224, - "step": 883 - }, - { - "epoch": 5.423312883435583, - "grad_norm": 2.500429630279541, - "learning_rate": 2.173286426025917e-06, - "loss": 0.0499, - "step": 884 - }, - { - "epoch": 5.429447852760736, - "grad_norm": 2.8843860626220703, - "learning_rate": 2.168509965266411e-06, - "loss": 0.075, - "step": 885 - }, - { - "epoch": 5.435582822085889, - "grad_norm": 2.3187198638916016, - "learning_rate": 2.1637347358947984e-06, - "loss": 0.065, - "step": 886 - }, - { - "epoch": 5.441717791411043, - "grad_norm": 2.7135889530181885, - "learning_rate": 2.15896075564965e-06, - "loss": 0.0848, - "step": 887 - }, - { - "epoch": 5.447852760736196, - "grad_norm": 1.751846194267273, - "learning_rate": 2.1541880422648978e-06, - "loss": 0.0112, - "step": 888 - }, - { - "epoch": 5.45398773006135, - "grad_norm": 3.113271713256836, - "learning_rate": 2.1494166134697655e-06, - "loss": 0.077, - "step": 889 - }, - { - "epoch": 5.460122699386503, - "grad_norm": 2.711318016052246, - "learning_rate": 2.1446464869887077e-06, - "loss": 0.03, - "step": 890 - }, - { - "epoch": 5.466257668711656, - "grad_norm": 1.8012003898620605, - "learning_rate": 2.13987768054134e-06, - "loss": 0.0141, - "step": 891 - }, - { - "epoch": 5.47239263803681, - "grad_norm": 2.0968120098114014, - "learning_rate": 2.135110211842374e-06, - "loss": 0.0147, - "step": 892 - }, - { - "epoch": 5.478527607361963, - "grad_norm": 3.1689956188201904, - "learning_rate": 2.1303440986015525e-06, - "loss": 0.1123, - "step": 893 - }, - { - "epoch": 5.484662576687117, - "grad_norm": 4.512697219848633, - "learning_rate": 2.1255793585235827e-06, - "loss": 0.0359, - "step": 894 - }, - { - "epoch": 5.49079754601227, - "grad_norm": 3.5739688873291016, - "learning_rate": 2.120816009308071e-06, - "loss": 0.0635, - "step": 895 - }, - { - "epoch": 5.4969325153374236, - "grad_norm": 4.556554317474365, - "learning_rate": 2.1160540686494597e-06, - "loss": 0.1104, - "step": 896 - }, - { - "epoch": 5.5030674846625764, - "grad_norm": 2.2047064304351807, - "learning_rate": 2.1112935542369546e-06, - "loss": 0.0187, - "step": 897 - }, - { - "epoch": 5.50920245398773, - "grad_norm": 3.0289857387542725, - "learning_rate": 2.106534483754466e-06, - "loss": 0.0874, - "step": 898 - }, - { - "epoch": 5.515337423312883, - "grad_norm": 2.7090444564819336, - "learning_rate": 2.1017768748805396e-06, - "loss": 0.0301, - "step": 899 - }, - { - "epoch": 5.521472392638037, - "grad_norm": 3.0662643909454346, - "learning_rate": 2.0970207452882917e-06, - "loss": 0.1192, - "step": 900 - }, - { - "epoch": 5.52760736196319, - "grad_norm": 2.869401454925537, - "learning_rate": 2.0922661126453436e-06, - "loss": 0.0803, - "step": 901 - }, - { - "epoch": 5.533742331288344, - "grad_norm": 2.229947328567505, - "learning_rate": 2.0875129946137557e-06, - "loss": 0.0186, - "step": 902 - }, - { - "epoch": 5.539877300613497, - "grad_norm": 3.3460421562194824, - "learning_rate": 2.0827614088499624e-06, - "loss": 0.0499, - "step": 903 - }, - { - "epoch": 5.54601226993865, - "grad_norm": 1.9324007034301758, - "learning_rate": 2.0780113730047056e-06, - "loss": 0.0322, - "step": 904 - }, - { - "epoch": 5.552147239263804, - "grad_norm": 2.761482000350952, - "learning_rate": 2.0732629047229712e-06, - "loss": 0.0265, - "step": 905 - }, - { - "epoch": 5.558282208588957, - "grad_norm": 2.4173266887664795, - "learning_rate": 2.0685160216439205e-06, - "loss": 0.0229, - "step": 906 - }, - { - "epoch": 5.564417177914111, - "grad_norm": 2.503661632537842, - "learning_rate": 2.0637707414008267e-06, - "loss": 0.0266, - "step": 907 - }, - { - "epoch": 5.570552147239264, - "grad_norm": 2.312236785888672, - "learning_rate": 2.0590270816210077e-06, - "loss": 0.018, - "step": 908 - }, - { - "epoch": 5.576687116564417, - "grad_norm": 2.569575548171997, - "learning_rate": 2.0542850599257647e-06, - "loss": 0.0377, - "step": 909 - }, - { - "epoch": 5.58282208588957, - "grad_norm": 3.520341157913208, - "learning_rate": 2.0495446939303122e-06, - "loss": 0.1224, - "step": 910 - }, - { - "epoch": 5.588957055214724, - "grad_norm": 3.231363296508789, - "learning_rate": 2.044806001243714e-06, - "loss": 0.1457, - "step": 911 - }, - { - "epoch": 5.595092024539877, - "grad_norm": 3.3211300373077393, - "learning_rate": 2.040068999468818e-06, - "loss": 0.0429, - "step": 912 - }, - { - "epoch": 5.601226993865031, - "grad_norm": 3.3712961673736572, - "learning_rate": 2.035333706202192e-06, - "loss": 0.0634, - "step": 913 - }, - { - "epoch": 5.6073619631901845, - "grad_norm": 2.480177402496338, - "learning_rate": 2.0306001390340565e-06, - "loss": 0.0178, - "step": 914 - }, - { - "epoch": 5.613496932515337, - "grad_norm": 2.9777421951293945, - "learning_rate": 2.02586831554822e-06, - "loss": 0.037, - "step": 915 - }, - { - "epoch": 5.61963190184049, - "grad_norm": 2.9129085540771484, - "learning_rate": 2.021138253322012e-06, - "loss": 0.125, - "step": 916 - }, - { - "epoch": 5.625766871165644, - "grad_norm": 4.041767597198486, - "learning_rate": 2.016409969926224e-06, - "loss": 0.1897, - "step": 917 - }, - { - "epoch": 5.631901840490798, - "grad_norm": 4.088902950286865, - "learning_rate": 2.0116834829250355e-06, - "loss": 0.0546, - "step": 918 - }, - { - "epoch": 5.638036809815951, - "grad_norm": 3.8629167079925537, - "learning_rate": 2.0069588098759545e-06, - "loss": 0.0911, - "step": 919 - }, - { - "epoch": 5.644171779141105, - "grad_norm": 2.616830825805664, - "learning_rate": 2.00223596832975e-06, - "loss": 0.0527, - "step": 920 - }, - { - "epoch": 5.6503067484662575, - "grad_norm": 1.9370782375335693, - "learning_rate": 1.9975149758303885e-06, - "loss": 0.0384, - "step": 921 - }, - { - "epoch": 5.656441717791411, - "grad_norm": 3.7839455604553223, - "learning_rate": 1.992795849914967e-06, - "loss": 0.1033, - "step": 922 - }, - { - "epoch": 5.662576687116564, - "grad_norm": 3.870729923248291, - "learning_rate": 1.9880786081136498e-06, - "loss": 0.08, - "step": 923 - }, - { - "epoch": 5.668711656441718, - "grad_norm": 3.4394288063049316, - "learning_rate": 1.9833632679496008e-06, - "loss": 0.0819, - "step": 924 - }, - { - "epoch": 5.674846625766871, - "grad_norm": 3.1659159660339355, - "learning_rate": 1.97864984693892e-06, - "loss": 0.117, - "step": 925 - }, - { - "epoch": 5.680981595092025, - "grad_norm": 2.2375190258026123, - "learning_rate": 1.97393836259058e-06, - "loss": 0.0215, - "step": 926 - }, - { - "epoch": 5.6871165644171775, - "grad_norm": 3.9375314712524414, - "learning_rate": 1.969228832406358e-06, - "loss": 0.1422, - "step": 927 - }, - { - "epoch": 5.693251533742331, - "grad_norm": 3.1969058513641357, - "learning_rate": 1.964521273880772e-06, - "loss": 0.0538, - "step": 928 - }, - { - "epoch": 5.699386503067485, - "grad_norm": 3.5990066528320312, - "learning_rate": 1.9598157045010162e-06, - "loss": 0.114, - "step": 929 - }, - { - "epoch": 5.705521472392638, - "grad_norm": 3.1764235496520996, - "learning_rate": 1.9551121417468955e-06, - "loss": 0.053, - "step": 930 - }, - { - "epoch": 5.711656441717792, - "grad_norm": 4.1162309646606445, - "learning_rate": 1.9504106030907605e-06, - "loss": 0.0866, - "step": 931 - }, - { - "epoch": 5.717791411042945, - "grad_norm": 3.543071985244751, - "learning_rate": 1.945711105997444e-06, - "loss": 0.0908, - "step": 932 - }, - { - "epoch": 5.723926380368098, - "grad_norm": 4.136870384216309, - "learning_rate": 1.941013667924194e-06, - "loss": 0.0612, - "step": 933 - }, - { - "epoch": 5.730061349693251, - "grad_norm": 1.7658357620239258, - "learning_rate": 1.9363183063206097e-06, - "loss": 0.0283, - "step": 934 - }, - { - "epoch": 5.736196319018405, - "grad_norm": 3.9701411724090576, - "learning_rate": 1.931625038628577e-06, - "loss": 0.0948, - "step": 935 - }, - { - "epoch": 5.742331288343558, - "grad_norm": 3.0636157989501953, - "learning_rate": 1.9269338822822047e-06, - "loss": 0.0769, - "step": 936 - }, - { - "epoch": 5.748466257668712, - "grad_norm": 3.3671388626098633, - "learning_rate": 1.9222448547077573e-06, - "loss": 0.098, - "step": 937 - }, - { - "epoch": 5.754601226993865, - "grad_norm": 3.0725975036621094, - "learning_rate": 1.917557973323591e-06, - "loss": 0.0363, - "step": 938 - }, - { - "epoch": 5.7607361963190185, - "grad_norm": 2.5592041015625, - "learning_rate": 1.9128732555400915e-06, - "loss": 0.0205, - "step": 939 - }, - { - "epoch": 5.766871165644172, - "grad_norm": 2.835740804672241, - "learning_rate": 1.9081907187596054e-06, - "loss": 0.0548, - "step": 940 - }, - { - "epoch": 5.773006134969325, - "grad_norm": 3.3596746921539307, - "learning_rate": 1.9035103803763793e-06, - "loss": 0.0454, - "step": 941 - }, - { - "epoch": 5.779141104294479, - "grad_norm": 3.226579427719116, - "learning_rate": 1.8988322577764918e-06, - "loss": 0.0514, - "step": 942 - }, - { - "epoch": 5.785276073619632, - "grad_norm": 3.2044687271118164, - "learning_rate": 1.8941563683377905e-06, - "loss": 0.1361, - "step": 943 - }, - { - "epoch": 5.791411042944786, - "grad_norm": 1.8300527334213257, - "learning_rate": 1.8894827294298296e-06, - "loss": 0.0139, - "step": 944 - }, - { - "epoch": 5.7975460122699385, - "grad_norm": 2.503735303878784, - "learning_rate": 1.884811358413801e-06, - "loss": 0.0311, - "step": 945 - }, - { - "epoch": 5.803680981595092, - "grad_norm": 2.171309471130371, - "learning_rate": 1.8801422726424735e-06, - "loss": 0.0227, - "step": 946 - }, - { - "epoch": 5.809815950920245, - "grad_norm": 1.8116636276245117, - "learning_rate": 1.8754754894601252e-06, - "loss": 0.0157, - "step": 947 - }, - { - "epoch": 5.815950920245399, - "grad_norm": 3.1412570476531982, - "learning_rate": 1.870811026202482e-06, - "loss": 0.1093, - "step": 948 - }, - { - "epoch": 5.822085889570552, - "grad_norm": 2.3962290287017822, - "learning_rate": 1.8661489001966526e-06, - "loss": 0.021, - "step": 949 - }, - { - "epoch": 5.828220858895706, - "grad_norm": 4.169166564941406, - "learning_rate": 1.8614891287610621e-06, - "loss": 0.0663, - "step": 950 - }, - { - "epoch": 5.8343558282208585, - "grad_norm": 3.1181528568267822, - "learning_rate": 1.8568317292053894e-06, - "loss": 0.1008, - "step": 951 - }, - { - "epoch": 5.840490797546012, - "grad_norm": 3.5155029296875, - "learning_rate": 1.8521767188305023e-06, - "loss": 0.0451, - "step": 952 - }, - { - "epoch": 5.846625766871165, - "grad_norm": 2.975693702697754, - "learning_rate": 1.8475241149283957e-06, - "loss": 0.0561, - "step": 953 - }, - { - "epoch": 5.852760736196319, - "grad_norm": 2.1581289768218994, - "learning_rate": 1.842873934782122e-06, - "loss": 0.0265, - "step": 954 - }, - { - "epoch": 5.858895705521473, - "grad_norm": 2.6281228065490723, - "learning_rate": 1.8382261956657318e-06, - "loss": 0.1196, - "step": 955 - }, - { - "epoch": 5.865030674846626, - "grad_norm": 2.9569528102874756, - "learning_rate": 1.8335809148442074e-06, - "loss": 0.1356, - "step": 956 - }, - { - "epoch": 5.871165644171779, - "grad_norm": 2.450949192047119, - "learning_rate": 1.8289381095734005e-06, - "loss": 0.0444, - "step": 957 - }, - { - "epoch": 5.877300613496932, - "grad_norm": 2.1737027168273926, - "learning_rate": 1.8242977970999643e-06, - "loss": 0.0622, - "step": 958 - }, - { - "epoch": 5.883435582822086, - "grad_norm": 3.350647211074829, - "learning_rate": 1.8196599946612956e-06, - "loss": 0.0762, - "step": 959 - }, - { - "epoch": 5.889570552147239, - "grad_norm": 2.5031936168670654, - "learning_rate": 1.8150247194854642e-06, - "loss": 0.0207, - "step": 960 - }, - { - "epoch": 5.895705521472393, - "grad_norm": 3.7103707790374756, - "learning_rate": 1.8103919887911525e-06, - "loss": 0.1122, - "step": 961 - }, - { - "epoch": 5.901840490797546, - "grad_norm": 2.485322952270508, - "learning_rate": 1.8057618197875914e-06, - "loss": 0.0284, - "step": 962 - }, - { - "epoch": 5.9079754601226995, - "grad_norm": 1.903212547302246, - "learning_rate": 1.8011342296744961e-06, - "loss": 0.0239, - "step": 963 - }, - { - "epoch": 5.914110429447852, - "grad_norm": 3.015552520751953, - "learning_rate": 1.796509235642001e-06, - "loss": 0.0425, - "step": 964 - }, - { - "epoch": 5.920245398773006, - "grad_norm": 4.806198596954346, - "learning_rate": 1.7918868548705982e-06, - "loss": 0.2094, - "step": 965 - }, - { - "epoch": 5.92638036809816, - "grad_norm": 2.949596643447876, - "learning_rate": 1.7872671045310703e-06, - "loss": 0.0632, - "step": 966 - }, - { - "epoch": 5.932515337423313, - "grad_norm": 4.153099536895752, - "learning_rate": 1.782650001784431e-06, - "loss": 0.1411, - "step": 967 - }, - { - "epoch": 5.938650306748467, - "grad_norm": 3.4117565155029297, - "learning_rate": 1.7780355637818568e-06, - "loss": 0.0965, - "step": 968 - }, - { - "epoch": 5.9447852760736195, - "grad_norm": 2.533405303955078, - "learning_rate": 1.7734238076646277e-06, - "loss": 0.0568, - "step": 969 - }, - { - "epoch": 5.950920245398773, - "grad_norm": 2.3604726791381836, - "learning_rate": 1.7688147505640581e-06, - "loss": 0.0182, - "step": 970 - }, - { - "epoch": 5.957055214723926, - "grad_norm": 3.807424306869507, - "learning_rate": 1.7642084096014405e-06, - "loss": 0.0547, - "step": 971 - }, - { - "epoch": 5.96319018404908, - "grad_norm": 2.5735342502593994, - "learning_rate": 1.759604801887974e-06, - "loss": 0.0775, - "step": 972 - }, - { - "epoch": 5.969325153374233, - "grad_norm": 2.9217734336853027, - "learning_rate": 1.7550039445247069e-06, - "loss": 0.0541, - "step": 973 - }, - { - "epoch": 5.975460122699387, - "grad_norm": 2.793104410171509, - "learning_rate": 1.7504058546024694e-06, - "loss": 0.0257, - "step": 974 - }, - { - "epoch": 5.9815950920245395, - "grad_norm": 3.5610134601593018, - "learning_rate": 1.7458105492018114e-06, - "loss": 0.0767, - "step": 975 - }, - { - "epoch": 5.987730061349693, - "grad_norm": 2.0738015174865723, - "learning_rate": 1.7412180453929412e-06, - "loss": 0.025, - "step": 976 - }, - { - "epoch": 5.993865030674847, - "grad_norm": 2.1248421669006348, - "learning_rate": 1.736628360235657e-06, - "loss": 0.0183, - "step": 977 - }, - { - "epoch": 6.0, - "grad_norm": 2.901273727416992, - "learning_rate": 1.7320415107792893e-06, - "loss": 0.1369, - "step": 978 - }, - { - "epoch": 6.006134969325154, - "grad_norm": 3.815110683441162, - "learning_rate": 1.7274575140626318e-06, - "loss": 0.1011, - "step": 979 - }, - { - "epoch": 6.012269938650307, - "grad_norm": 2.421208381652832, - "learning_rate": 1.7228763871138845e-06, - "loss": 0.0105, - "step": 980 - }, - { - "epoch": 6.0184049079754605, - "grad_norm": 2.7103846073150635, - "learning_rate": 1.718298146950585e-06, - "loss": 0.0373, - "step": 981 - }, - { - "epoch": 6.024539877300613, - "grad_norm": 1.3751411437988281, - "learning_rate": 1.7137228105795473e-06, - "loss": 0.0072, - "step": 982 - }, - { - "epoch": 6.030674846625767, - "grad_norm": 1.5235071182250977, - "learning_rate": 1.7091503949967987e-06, - "loss": 0.0126, - "step": 983 - }, - { - "epoch": 6.03680981595092, - "grad_norm": 2.0652546882629395, - "learning_rate": 1.7045809171875183e-06, - "loss": 0.0198, - "step": 984 - }, - { - "epoch": 6.042944785276074, - "grad_norm": 2.010207176208496, - "learning_rate": 1.70001439412597e-06, - "loss": 0.0186, - "step": 985 - }, - { - "epoch": 6.049079754601227, - "grad_norm": 2.0444021224975586, - "learning_rate": 1.6954508427754435e-06, - "loss": 0.0197, - "step": 986 - }, - { - "epoch": 6.0552147239263805, - "grad_norm": 2.6540091037750244, - "learning_rate": 1.690890280088187e-06, - "loss": 0.0192, - "step": 987 - }, - { - "epoch": 6.061349693251533, - "grad_norm": 1.6479653120040894, - "learning_rate": 1.6863327230053506e-06, - "loss": 0.0105, - "step": 988 - }, - { - "epoch": 6.067484662576687, - "grad_norm": 2.4434754848480225, - "learning_rate": 1.6817781884569146e-06, - "loss": 0.0275, - "step": 989 - }, - { - "epoch": 6.07361963190184, - "grad_norm": 1.7472137212753296, - "learning_rate": 1.677226693361636e-06, - "loss": 0.0095, - "step": 990 - }, - { - "epoch": 6.079754601226994, - "grad_norm": 2.952821969985962, - "learning_rate": 1.6726782546269793e-06, - "loss": 0.0483, - "step": 991 - }, - { - "epoch": 6.085889570552148, - "grad_norm": 3.123959541320801, - "learning_rate": 1.6681328891490544e-06, - "loss": 0.0815, - "step": 992 - }, - { - "epoch": 6.0920245398773005, - "grad_norm": 2.9924800395965576, - "learning_rate": 1.663590613812556e-06, - "loss": 0.0216, - "step": 993 - }, - { - "epoch": 6.098159509202454, - "grad_norm": 2.417778730392456, - "learning_rate": 1.6590514454907007e-06, - "loss": 0.0243, - "step": 994 - }, - { - "epoch": 6.104294478527607, - "grad_norm": 2.0682942867279053, - "learning_rate": 1.6545154010451613e-06, - "loss": 0.0669, - "step": 995 - }, - { - "epoch": 6.110429447852761, - "grad_norm": 2.9801135063171387, - "learning_rate": 1.6499824973260086e-06, - "loss": 0.0309, - "step": 996 - }, - { - "epoch": 6.116564417177914, - "grad_norm": 1.5753487348556519, - "learning_rate": 1.645452751171645e-06, - "loss": 0.026, - "step": 997 - }, - { - "epoch": 6.122699386503068, - "grad_norm": 2.461124897003174, - "learning_rate": 1.6409261794087438e-06, - "loss": 0.0191, - "step": 998 - }, - { - "epoch": 6.128834355828221, - "grad_norm": 3.839308261871338, - "learning_rate": 1.6364027988521875e-06, - "loss": 0.045, - "step": 999 - }, - { - "epoch": 6.134969325153374, - "grad_norm": 2.9653189182281494, - "learning_rate": 1.6318826263050022e-06, - "loss": 0.0197, - "step": 1000 - }, - { - "epoch": 6.141104294478527, - "grad_norm": 1.1804074048995972, - "learning_rate": 1.6273656785582986e-06, - "loss": 0.0092, - "step": 1001 - }, - { - "epoch": 6.147239263803681, - "grad_norm": 1.9027175903320312, - "learning_rate": 1.6228519723912073e-06, - "loss": 0.0141, - "step": 1002 - }, - { - "epoch": 6.153374233128835, - "grad_norm": 1.831039309501648, - "learning_rate": 1.618341524570819e-06, - "loss": 0.0131, - "step": 1003 - }, - { - "epoch": 6.159509202453988, - "grad_norm": 2.547327756881714, - "learning_rate": 1.613834351852119e-06, - "loss": 0.0686, - "step": 1004 - }, - { - "epoch": 6.1656441717791415, - "grad_norm": 2.746947765350342, - "learning_rate": 1.6093304709779273e-06, - "loss": 0.036, - "step": 1005 - }, - { - "epoch": 6.171779141104294, - "grad_norm": 2.0104732513427734, - "learning_rate": 1.6048298986788345e-06, - "loss": 0.0216, - "step": 1006 - }, - { - "epoch": 6.177914110429448, - "grad_norm": 2.655977725982666, - "learning_rate": 1.6003326516731431e-06, - "loss": 0.024, - "step": 1007 - }, - { - "epoch": 6.184049079754601, - "grad_norm": 2.0733132362365723, - "learning_rate": 1.5958387466668015e-06, - "loss": 0.0133, - "step": 1008 - }, - { - "epoch": 6.190184049079755, - "grad_norm": 2.5398054122924805, - "learning_rate": 1.5913482003533437e-06, - "loss": 0.0331, - "step": 1009 - }, - { - "epoch": 6.196319018404908, - "grad_norm": 1.7983721494674683, - "learning_rate": 1.5868610294138264e-06, - "loss": 0.0111, - "step": 1010 - }, - { - "epoch": 6.2024539877300615, - "grad_norm": 1.7259647846221924, - "learning_rate": 1.58237725051677e-06, - "loss": 0.0112, - "step": 1011 - }, - { - "epoch": 6.208588957055214, - "grad_norm": 1.7722725868225098, - "learning_rate": 1.577896880318093e-06, - "loss": 0.0181, - "step": 1012 - }, - { - "epoch": 6.214723926380368, - "grad_norm": 3.633545398712158, - "learning_rate": 1.5734199354610513e-06, - "loss": 0.0135, - "step": 1013 - }, - { - "epoch": 6.220858895705521, - "grad_norm": 1.8951494693756104, - "learning_rate": 1.5689464325761764e-06, - "loss": 0.0163, - "step": 1014 - }, - { - "epoch": 6.226993865030675, - "grad_norm": 1.637170433998108, - "learning_rate": 1.564476388281216e-06, - "loss": 0.0068, - "step": 1015 - }, - { - "epoch": 6.233128834355828, - "grad_norm": 2.2963850498199463, - "learning_rate": 1.5600098191810682e-06, - "loss": 0.021, - "step": 1016 - }, - { - "epoch": 6.2392638036809815, - "grad_norm": 2.777996063232422, - "learning_rate": 1.555546741867722e-06, - "loss": 0.0349, - "step": 1017 - }, - { - "epoch": 6.245398773006135, - "grad_norm": 2.1580724716186523, - "learning_rate": 1.5510871729201953e-06, - "loss": 0.0626, - "step": 1018 - }, - { - "epoch": 6.251533742331288, - "grad_norm": 1.4158363342285156, - "learning_rate": 1.5466311289044755e-06, - "loss": 0.0082, - "step": 1019 - }, - { - "epoch": 6.257668711656442, - "grad_norm": 3.287564516067505, - "learning_rate": 1.5421786263734524e-06, - "loss": 0.0212, - "step": 1020 - }, - { - "epoch": 6.263803680981595, - "grad_norm": 2.4552016258239746, - "learning_rate": 1.5377296818668638e-06, - "loss": 0.0963, - "step": 1021 - }, - { - "epoch": 6.269938650306749, - "grad_norm": 1.877556562423706, - "learning_rate": 1.5332843119112285e-06, - "loss": 0.011, - "step": 1022 - }, - { - "epoch": 6.276073619631902, - "grad_norm": 3.720372438430786, - "learning_rate": 1.5288425330197864e-06, - "loss": 0.018, - "step": 1023 - }, - { - "epoch": 6.282208588957055, - "grad_norm": 1.9751925468444824, - "learning_rate": 1.5244043616924389e-06, - "loss": 0.0162, - "step": 1024 - }, - { - "epoch": 6.288343558282208, - "grad_norm": 2.5137453079223633, - "learning_rate": 1.5199698144156865e-06, - "loss": 0.0468, - "step": 1025 - }, - { - "epoch": 6.294478527607362, - "grad_norm": 2.111983299255371, - "learning_rate": 1.5155389076625663e-06, - "loss": 0.0064, - "step": 1026 - }, - { - "epoch": 6.300613496932515, - "grad_norm": 2.572223663330078, - "learning_rate": 1.5111116578925924e-06, - "loss": 0.035, - "step": 1027 - }, - { - "epoch": 6.306748466257669, - "grad_norm": 2.7881019115448, - "learning_rate": 1.5066880815516943e-06, - "loss": 0.0197, - "step": 1028 - }, - { - "epoch": 6.3128834355828225, - "grad_norm": 1.2287017107009888, - "learning_rate": 1.5022681950721565e-06, - "loss": 0.0059, - "step": 1029 - }, - { - "epoch": 6.319018404907975, - "grad_norm": 1.764028549194336, - "learning_rate": 1.4978520148725558e-06, - "loss": 0.006, - "step": 1030 - }, - { - "epoch": 6.325153374233129, - "grad_norm": 2.399787664413452, - "learning_rate": 1.4934395573577016e-06, - "loss": 0.0126, - "step": 1031 - }, - { - "epoch": 6.331288343558282, - "grad_norm": 1.9056172370910645, - "learning_rate": 1.4890308389185743e-06, - "loss": 0.0131, - "step": 1032 - }, - { - "epoch": 6.337423312883436, - "grad_norm": 1.7394744157791138, - "learning_rate": 1.484625875932265e-06, - "loss": 0.016, - "step": 1033 - }, - { - "epoch": 6.343558282208589, - "grad_norm": 4.352719306945801, - "learning_rate": 1.480224684761915e-06, - "loss": 0.1059, - "step": 1034 - }, - { - "epoch": 6.3496932515337425, - "grad_norm": 2.148385524749756, - "learning_rate": 1.4758272817566538e-06, - "loss": 0.0312, - "step": 1035 - }, - { - "epoch": 6.355828220858895, - "grad_norm": 2.483872175216675, - "learning_rate": 1.4714336832515386e-06, - "loss": 0.0215, - "step": 1036 - }, - { - "epoch": 6.361963190184049, - "grad_norm": 2.6151270866394043, - "learning_rate": 1.467043905567494e-06, - "loss": 0.0718, - "step": 1037 - }, - { - "epoch": 6.368098159509202, - "grad_norm": 2.554600954055786, - "learning_rate": 1.4626579650112533e-06, - "loss": 0.0166, - "step": 1038 - }, - { - "epoch": 6.374233128834356, - "grad_norm": 3.013974905014038, - "learning_rate": 1.4582758778752926e-06, - "loss": 0.0448, - "step": 1039 - }, - { - "epoch": 6.38036809815951, - "grad_norm": 2.1542789936065674, - "learning_rate": 1.4538976604377781e-06, - "loss": 0.0297, - "step": 1040 - }, - { - "epoch": 6.386503067484663, - "grad_norm": 3.4402377605438232, - "learning_rate": 1.449523328962496e-06, - "loss": 0.0409, - "step": 1041 - }, - { - "epoch": 6.392638036809816, - "grad_norm": 1.6200538873672485, - "learning_rate": 1.4451528996988018e-06, - "loss": 0.0127, - "step": 1042 - }, - { - "epoch": 6.398773006134969, - "grad_norm": 3.081733465194702, - "learning_rate": 1.4407863888815527e-06, - "loss": 0.0788, - "step": 1043 - }, - { - "epoch": 6.404907975460123, - "grad_norm": 1.9813143014907837, - "learning_rate": 1.436423812731051e-06, - "loss": 0.0082, - "step": 1044 - }, - { - "epoch": 6.411042944785276, - "grad_norm": 1.7354048490524292, - "learning_rate": 1.432065187452984e-06, - "loss": 0.0086, - "step": 1045 - }, - { - "epoch": 6.41717791411043, - "grad_norm": 1.8812576532363892, - "learning_rate": 1.4277105292383594e-06, - "loss": 0.04, - "step": 1046 - }, - { - "epoch": 6.423312883435583, - "grad_norm": 1.117837905883789, - "learning_rate": 1.4233598542634519e-06, - "loss": 0.0054, - "step": 1047 - }, - { - "epoch": 6.429447852760736, - "grad_norm": 1.9587867259979248, - "learning_rate": 1.4190131786897388e-06, - "loss": 0.0263, - "step": 1048 - }, - { - "epoch": 6.435582822085889, - "grad_norm": 1.2712376117706299, - "learning_rate": 1.4146705186638388e-06, - "loss": 0.0098, - "step": 1049 - }, - { - "epoch": 6.441717791411043, - "grad_norm": 2.6563849449157715, - "learning_rate": 1.410331890317457e-06, - "loss": 0.0322, - "step": 1050 - }, - { - "epoch": 6.447852760736196, - "grad_norm": 3.136518955230713, - "learning_rate": 1.4059973097673187e-06, - "loss": 0.0729, - "step": 1051 - }, - { - "epoch": 6.45398773006135, - "grad_norm": 1.3937572240829468, - "learning_rate": 1.4016667931151156e-06, - "loss": 0.0094, - "step": 1052 - }, - { - "epoch": 6.460122699386503, - "grad_norm": 1.7218928337097168, - "learning_rate": 1.3973403564474422e-06, - "loss": 0.0078, - "step": 1053 - }, - { - "epoch": 6.466257668711656, - "grad_norm": 2.35612416267395, - "learning_rate": 1.393018015835737e-06, - "loss": 0.0231, - "step": 1054 - }, - { - "epoch": 6.47239263803681, - "grad_norm": 1.96125066280365, - "learning_rate": 1.388699787336224e-06, - "loss": 0.0153, - "step": 1055 - }, - { - "epoch": 6.478527607361963, - "grad_norm": 2.1789233684539795, - "learning_rate": 1.3843856869898486e-06, - "loss": 0.0136, - "step": 1056 - }, - { - "epoch": 6.484662576687117, - "grad_norm": 3.1261701583862305, - "learning_rate": 1.3800757308222263e-06, - "loss": 0.0819, - "step": 1057 - }, - { - "epoch": 6.49079754601227, - "grad_norm": 2.93422794342041, - "learning_rate": 1.3757699348435726e-06, - "loss": 0.0658, - "step": 1058 - }, - { - "epoch": 6.4969325153374236, - "grad_norm": 2.1311776638031006, - "learning_rate": 1.3714683150486534e-06, - "loss": 0.0106, - "step": 1059 - }, - { - "epoch": 6.5030674846625764, - "grad_norm": 1.699877381324768, - "learning_rate": 1.3671708874167211e-06, - "loss": 0.0151, - "step": 1060 - }, - { - "epoch": 6.50920245398773, - "grad_norm": 1.7288825511932373, - "learning_rate": 1.3628776679114516e-06, - "loss": 0.0114, - "step": 1061 - }, - { - "epoch": 6.515337423312883, - "grad_norm": 1.8437966108322144, - "learning_rate": 1.3585886724808934e-06, - "loss": 0.0117, - "step": 1062 - }, - { - "epoch": 6.521472392638037, - "grad_norm": 3.073568344116211, - "learning_rate": 1.3543039170574022e-06, - "loss": 0.0381, - "step": 1063 - }, - { - "epoch": 6.52760736196319, - "grad_norm": 1.6069157123565674, - "learning_rate": 1.350023417557581e-06, - "loss": 0.0072, - "step": 1064 - }, - { - "epoch": 6.533742331288344, - "grad_norm": 2.48502779006958, - "learning_rate": 1.345747189882228e-06, - "loss": 0.0302, - "step": 1065 - }, - { - "epoch": 6.539877300613497, - "grad_norm": 1.6879143714904785, - "learning_rate": 1.3414752499162676e-06, - "loss": 0.0095, - "step": 1066 - }, - { - "epoch": 6.54601226993865, - "grad_norm": 2.2126848697662354, - "learning_rate": 1.3372076135287005e-06, - "loss": 0.067, - "step": 1067 - }, - { - "epoch": 6.552147239263804, - "grad_norm": 2.157269239425659, - "learning_rate": 1.33294429657254e-06, - "loss": 0.0203, - "step": 1068 - }, - { - "epoch": 6.558282208588957, - "grad_norm": 2.725158452987671, - "learning_rate": 1.3286853148847523e-06, - "loss": 0.0217, - "step": 1069 - }, - { - "epoch": 6.564417177914111, - "grad_norm": 2.478426456451416, - "learning_rate": 1.3244306842862007e-06, - "loss": 0.0223, - "step": 1070 - }, - { - "epoch": 6.570552147239264, - "grad_norm": 2.349463939666748, - "learning_rate": 1.3201804205815872e-06, - "loss": 0.027, - "step": 1071 - }, - { - "epoch": 6.576687116564417, - "grad_norm": 2.049593210220337, - "learning_rate": 1.3159345395593876e-06, - "loss": 0.0212, - "step": 1072 - }, - { - "epoch": 6.58282208588957, - "grad_norm": 2.3445141315460205, - "learning_rate": 1.3116930569918024e-06, - "loss": 0.0182, - "step": 1073 - }, - { - "epoch": 6.588957055214724, - "grad_norm": 3.756135940551758, - "learning_rate": 1.3074559886346886e-06, - "loss": 0.1187, - "step": 1074 - }, - { - "epoch": 6.595092024539877, - "grad_norm": 2.4747114181518555, - "learning_rate": 1.3032233502275089e-06, - "loss": 0.0103, - "step": 1075 - }, - { - "epoch": 6.601226993865031, - "grad_norm": 2.0029311180114746, - "learning_rate": 1.2989951574932693e-06, - "loss": 0.0115, - "step": 1076 - }, - { - "epoch": 6.6073619631901845, - "grad_norm": 2.007141351699829, - "learning_rate": 1.2947714261384602e-06, - "loss": 0.0155, - "step": 1077 - }, - { - "epoch": 6.613496932515337, - "grad_norm": 1.5075048208236694, - "learning_rate": 1.2905521718530012e-06, - "loss": 0.0125, - "step": 1078 - }, - { - "epoch": 6.61963190184049, - "grad_norm": 1.9235132932662964, - "learning_rate": 1.2863374103101784e-06, - "loss": 0.0181, - "step": 1079 - }, - { - "epoch": 6.625766871165644, - "grad_norm": 1.7235040664672852, - "learning_rate": 1.2821271571665912e-06, - "loss": 0.0102, - "step": 1080 - }, - { - "epoch": 6.631901840490798, - "grad_norm": 3.503974676132202, - "learning_rate": 1.277921428062091e-06, - "loss": 0.0969, - "step": 1081 - }, - { - "epoch": 6.638036809815951, - "grad_norm": 2.4633288383483887, - "learning_rate": 1.2737202386197222e-06, - "loss": 0.0383, - "step": 1082 - }, - { - "epoch": 6.644171779141105, - "grad_norm": 2.332341432571411, - "learning_rate": 1.2695236044456672e-06, - "loss": 0.0184, - "step": 1083 - }, - { - "epoch": 6.6503067484662575, - "grad_norm": 2.8279805183410645, - "learning_rate": 1.2653315411291867e-06, - "loss": 0.0327, - "step": 1084 - }, - { - "epoch": 6.656441717791411, - "grad_norm": 2.444810628890991, - "learning_rate": 1.2611440642425617e-06, - "loss": 0.0399, - "step": 1085 - }, - { - "epoch": 6.662576687116564, - "grad_norm": 2.9304957389831543, - "learning_rate": 1.2569611893410374e-06, - "loss": 0.0385, - "step": 1086 - }, - { - "epoch": 6.668711656441718, - "grad_norm": 2.1244678497314453, - "learning_rate": 1.2527829319627604e-06, - "loss": 0.0123, - "step": 1087 - }, - { - "epoch": 6.674846625766871, - "grad_norm": 2.129033327102661, - "learning_rate": 1.248609307628729e-06, - "loss": 0.0302, - "step": 1088 - }, - { - "epoch": 6.680981595092025, - "grad_norm": 5.788925647735596, - "learning_rate": 1.2444403318427268e-06, - "loss": 0.0296, - "step": 1089 - }, - { - "epoch": 6.6871165644171775, - "grad_norm": 5.127935886383057, - "learning_rate": 1.2402760200912725e-06, - "loss": 0.1532, - "step": 1090 - }, - { - "epoch": 6.693251533742331, - "grad_norm": 2.2610318660736084, - "learning_rate": 1.2361163878435594e-06, - "loss": 0.0126, - "step": 1091 - }, - { - "epoch": 6.699386503067485, - "grad_norm": 1.7913328409194946, - "learning_rate": 1.2319614505513953e-06, - "loss": 0.0086, - "step": 1092 - }, - { - "epoch": 6.705521472392638, - "grad_norm": 1.5961267948150635, - "learning_rate": 1.227811223649149e-06, - "loss": 0.0041, - "step": 1093 - }, - { - "epoch": 6.711656441717792, - "grad_norm": 1.441754937171936, - "learning_rate": 1.2236657225536938e-06, - "loss": 0.0103, - "step": 1094 - }, - { - "epoch": 6.717791411042945, - "grad_norm": 1.4393174648284912, - "learning_rate": 1.2195249626643432e-06, - "loss": 0.0063, - "step": 1095 - }, - { - "epoch": 6.723926380368098, - "grad_norm": 3.199451208114624, - "learning_rate": 1.2153889593628032e-06, - "loss": 0.0571, - "step": 1096 - }, - { - "epoch": 6.730061349693251, - "grad_norm": 2.1796770095825195, - "learning_rate": 1.211257728013107e-06, - "loss": 0.0269, - "step": 1097 - }, - { - "epoch": 6.736196319018405, - "grad_norm": 3.1798806190490723, - "learning_rate": 1.2071312839615634e-06, - "loss": 0.0396, - "step": 1098 - }, - { - "epoch": 6.742331288343558, - "grad_norm": 3.063633680343628, - "learning_rate": 1.2030096425366985e-06, - "loss": 0.0261, - "step": 1099 - }, - { - "epoch": 6.748466257668712, - "grad_norm": 1.860409140586853, - "learning_rate": 1.1988928190491948e-06, - "loss": 0.013, - "step": 1100 - }, - { - "epoch": 6.754601226993865, - "grad_norm": 1.9303224086761475, - "learning_rate": 1.1947808287918406e-06, - "loss": 0.0113, - "step": 1101 - }, - { - "epoch": 6.7607361963190185, - "grad_norm": 2.1432337760925293, - "learning_rate": 1.19067368703947e-06, - "loss": 0.0195, - "step": 1102 - }, - { - "epoch": 6.766871165644172, - "grad_norm": 1.8998470306396484, - "learning_rate": 1.1865714090489038e-06, - "loss": 0.0105, - "step": 1103 - }, - { - "epoch": 6.773006134969325, - "grad_norm": 2.3260247707366943, - "learning_rate": 1.1824740100588991e-06, - "loss": 0.0554, - "step": 1104 - }, - { - "epoch": 6.779141104294479, - "grad_norm": 1.9272006750106812, - "learning_rate": 1.1783815052900848e-06, - "loss": 0.0118, - "step": 1105 - }, - { - "epoch": 6.785276073619632, - "grad_norm": 3.1646785736083984, - "learning_rate": 1.1742939099449126e-06, - "loss": 0.0901, - "step": 1106 - }, - { - "epoch": 6.791411042944786, - "grad_norm": 3.357422351837158, - "learning_rate": 1.1702112392075966e-06, - "loss": 0.0833, - "step": 1107 - }, - { - "epoch": 6.7975460122699385, - "grad_norm": 1.4302526712417603, - "learning_rate": 1.1661335082440545e-06, - "loss": 0.0078, - "step": 1108 - }, - { - "epoch": 6.803680981595092, - "grad_norm": 1.3046417236328125, - "learning_rate": 1.1620607322018587e-06, - "loss": 0.0092, - "step": 1109 - }, - { - "epoch": 6.809815950920245, - "grad_norm": 2.084237813949585, - "learning_rate": 1.1579929262101712e-06, - "loss": 0.0283, - "step": 1110 - }, - { - "epoch": 6.815950920245399, - "grad_norm": 1.9403250217437744, - "learning_rate": 1.153930105379695e-06, - "loss": 0.0066, - "step": 1111 - }, - { - "epoch": 6.822085889570552, - "grad_norm": 2.282449722290039, - "learning_rate": 1.1498722848026142e-06, - "loss": 0.0402, - "step": 1112 - }, - { - "epoch": 6.828220858895706, - "grad_norm": 1.9357627630233765, - "learning_rate": 1.1458194795525354e-06, - "loss": 0.0101, - "step": 1113 - }, - { - "epoch": 6.8343558282208585, - "grad_norm": 2.0236339569091797, - "learning_rate": 1.1417717046844385e-06, - "loss": 0.0109, - "step": 1114 - }, - { - "epoch": 6.840490797546012, - "grad_norm": 2.386857032775879, - "learning_rate": 1.137728975234615e-06, - "loss": 0.0297, - "step": 1115 - }, - { - "epoch": 6.846625766871165, - "grad_norm": 2.2477970123291016, - "learning_rate": 1.1336913062206157e-06, - "loss": 0.0393, - "step": 1116 - }, - { - "epoch": 6.852760736196319, - "grad_norm": 2.7217776775360107, - "learning_rate": 1.129658712641192e-06, - "loss": 0.0269, - "step": 1117 - }, - { - "epoch": 6.858895705521473, - "grad_norm": 2.6717259883880615, - "learning_rate": 1.125631209476241e-06, - "loss": 0.0708, - "step": 1118 - }, - { - "epoch": 6.865030674846626, - "grad_norm": 2.951939344406128, - "learning_rate": 1.1216088116867524e-06, - "loss": 0.0835, - "step": 1119 - }, - { - "epoch": 6.871165644171779, - "grad_norm": 1.9705166816711426, - "learning_rate": 1.1175915342147486e-06, - "loss": 0.0107, - "step": 1120 - }, - { - "epoch": 6.877300613496932, - "grad_norm": 2.4005937576293945, - "learning_rate": 1.1135793919832336e-06, - "loss": 0.0139, - "step": 1121 - }, - { - "epoch": 6.883435582822086, - "grad_norm": 2.277463674545288, - "learning_rate": 1.1095723998961353e-06, - "loss": 0.0154, - "step": 1122 - }, - { - "epoch": 6.889570552147239, - "grad_norm": 1.5026034116744995, - "learning_rate": 1.1055705728382482e-06, - "loss": 0.0072, - "step": 1123 - }, - { - "epoch": 6.895705521472393, - "grad_norm": 1.9540379047393799, - "learning_rate": 1.1015739256751826e-06, - "loss": 0.0202, - "step": 1124 - }, - { - "epoch": 6.901840490797546, - "grad_norm": 2.3090603351593018, - "learning_rate": 1.0975824732533066e-06, - "loss": 0.0559, - "step": 1125 - }, - { - "epoch": 6.9079754601226995, - "grad_norm": 2.100283622741699, - "learning_rate": 1.09359623039969e-06, - "loss": 0.0385, - "step": 1126 - }, - { - "epoch": 6.914110429447852, - "grad_norm": 2.4120566844940186, - "learning_rate": 1.0896152119220525e-06, - "loss": 0.0535, - "step": 1127 - }, - { - "epoch": 6.920245398773006, - "grad_norm": 2.003495454788208, - "learning_rate": 1.0856394326087045e-06, - "loss": 0.0104, - "step": 1128 - }, - { - "epoch": 6.92638036809816, - "grad_norm": 1.6565535068511963, - "learning_rate": 1.0816689072284962e-06, - "loss": 0.0121, - "step": 1129 - }, - { - "epoch": 6.932515337423313, - "grad_norm": 1.6503472328186035, - "learning_rate": 1.0777036505307616e-06, - "loss": 0.0056, - "step": 1130 - }, - { - "epoch": 6.938650306748467, - "grad_norm": 2.600112199783325, - "learning_rate": 1.0737436772452602e-06, - "loss": 0.0198, - "step": 1131 - }, - { - "epoch": 6.9447852760736195, - "grad_norm": 1.6668883562088013, - "learning_rate": 1.0697890020821292e-06, - "loss": 0.0077, - "step": 1132 - }, - { - "epoch": 6.950920245398773, - "grad_norm": 2.729172706604004, - "learning_rate": 1.0658396397318203e-06, - "loss": 0.0329, - "step": 1133 - }, - { - "epoch": 6.957055214723926, - "grad_norm": 1.5219136476516724, - "learning_rate": 1.061895604865053e-06, - "loss": 0.0113, - "step": 1134 - }, - { - "epoch": 6.96319018404908, - "grad_norm": 3.8395588397979736, - "learning_rate": 1.057956912132757e-06, - "loss": 0.0376, - "step": 1135 - }, - { - "epoch": 6.969325153374233, - "grad_norm": 2.4347221851348877, - "learning_rate": 1.054023576166014e-06, - "loss": 0.0517, - "step": 1136 - }, - { - "epoch": 6.975460122699387, - "grad_norm": 3.079165458679199, - "learning_rate": 1.0500956115760105e-06, - "loss": 0.0373, - "step": 1137 - }, - { - "epoch": 6.9815950920245395, - "grad_norm": 1.9391908645629883, - "learning_rate": 1.0461730329539794e-06, - "loss": 0.019, - "step": 1138 - }, - { - "epoch": 6.987730061349693, - "grad_norm": 1.8693119287490845, - "learning_rate": 1.0422558548711434e-06, - "loss": 0.0073, - "step": 1139 - }, - { - "epoch": 6.993865030674847, - "grad_norm": 3.0920307636260986, - "learning_rate": 1.0383440918786684e-06, - "loss": 0.0099, - "step": 1140 - }, - { - "epoch": 7.0, - "grad_norm": 3.184906244277954, - "learning_rate": 1.0344377585076e-06, - "loss": 0.0218, - "step": 1141 - }, - { - "epoch": 7.006134969325154, - "grad_norm": 0.7609673142433167, - "learning_rate": 1.0305368692688175e-06, - "loss": 0.0024, - "step": 1142 - }, - { - "epoch": 7.012269938650307, - "grad_norm": 1.1493247747421265, - "learning_rate": 1.0266414386529775e-06, - "loss": 0.0059, - "step": 1143 - }, - { - "epoch": 7.0184049079754605, - "grad_norm": 3.534796953201294, - "learning_rate": 1.0227514811304556e-06, - "loss": 0.0843, - "step": 1144 - }, - { - "epoch": 7.024539877300613, - "grad_norm": 1.1876507997512817, - "learning_rate": 1.0188670111513002e-06, - "loss": 0.0098, - "step": 1145 - }, - { - "epoch": 7.030674846625767, - "grad_norm": 1.2825753688812256, - "learning_rate": 1.0149880431451736e-06, - "loss": 0.0042, - "step": 1146 - }, - { - "epoch": 7.03680981595092, - "grad_norm": 0.6842563152313232, - "learning_rate": 1.0111145915213e-06, - "loss": 0.003, - "step": 1147 - }, - { - "epoch": 7.042944785276074, - "grad_norm": 0.6310113072395325, - "learning_rate": 1.0072466706684127e-06, - "loss": 0.0027, - "step": 1148 - }, - { - "epoch": 7.049079754601227, - "grad_norm": 1.484761357307434, - "learning_rate": 1.0033842949546974e-06, - "loss": 0.0105, - "step": 1149 - }, - { - "epoch": 7.0552147239263805, - "grad_norm": 1.9790291786193848, - "learning_rate": 9.995274787277445e-07, - "loss": 0.0233, - "step": 1150 - }, - { - "epoch": 7.061349693251533, - "grad_norm": 1.1398522853851318, - "learning_rate": 9.956762363144892e-07, - "loss": 0.0031, - "step": 1151 - }, - { - "epoch": 7.067484662576687, - "grad_norm": 1.0574359893798828, - "learning_rate": 9.918305820211643e-07, - "loss": 0.0047, - "step": 1152 - }, - { - "epoch": 7.07361963190184, - "grad_norm": 2.463972330093384, - "learning_rate": 9.879905301332439e-07, - "loss": 0.0334, - "step": 1153 - }, - { - "epoch": 7.079754601226994, - "grad_norm": 1.4698575735092163, - "learning_rate": 9.84156094915389e-07, - "loss": 0.0191, - "step": 1154 - }, - { - "epoch": 7.085889570552148, - "grad_norm": 1.2635239362716675, - "learning_rate": 9.803272906113978e-07, - "loss": 0.0045, - "step": 1155 - }, - { - "epoch": 7.0920245398773005, - "grad_norm": 1.7271842956542969, - "learning_rate": 9.765041314441529e-07, - "loss": 0.0042, - "step": 1156 - }, - { - "epoch": 7.098159509202454, - "grad_norm": 1.5738918781280518, - "learning_rate": 9.72686631615563e-07, - "loss": 0.0066, - "step": 1157 - }, - { - "epoch": 7.104294478527607, - "grad_norm": 1.3097981214523315, - "learning_rate": 9.688748053065179e-07, - "loss": 0.0058, - "step": 1158 - }, - { - "epoch": 7.110429447852761, - "grad_norm": 2.076064348220825, - "learning_rate": 9.65068666676828e-07, - "loss": 0.0067, - "step": 1159 - }, - { - "epoch": 7.116564417177914, - "grad_norm": 1.1589064598083496, - "learning_rate": 9.612682298651792e-07, - "loss": 0.0052, - "step": 1160 - }, - { - "epoch": 7.122699386503068, - "grad_norm": 1.6450324058532715, - "learning_rate": 9.574735089890765e-07, - "loss": 0.0035, - "step": 1161 - }, - { - "epoch": 7.128834355828221, - "grad_norm": 1.6968387365341187, - "learning_rate": 9.53684518144789e-07, - "loss": 0.0126, - "step": 1162 - }, - { - "epoch": 7.134969325153374, - "grad_norm": 1.9047832489013672, - "learning_rate": 9.499012714073036e-07, - "loss": 0.0345, - "step": 1163 - }, - { - "epoch": 7.141104294478527, - "grad_norm": 1.7587796449661255, - "learning_rate": 9.461237828302666e-07, - "loss": 0.0144, - "step": 1164 - }, - { - "epoch": 7.147239263803681, - "grad_norm": 1.863775372505188, - "learning_rate": 9.423520664459374e-07, - "loss": 0.0135, - "step": 1165 - }, - { - "epoch": 7.153374233128835, - "grad_norm": 2.6580259799957275, - "learning_rate": 9.385861362651322e-07, - "loss": 0.0138, - "step": 1166 - }, - { - "epoch": 7.159509202453988, - "grad_norm": 2.086371421813965, - "learning_rate": 9.348260062771713e-07, - "loss": 0.0093, - "step": 1167 - }, - { - "epoch": 7.1656441717791415, - "grad_norm": 1.0806611776351929, - "learning_rate": 9.310716904498321e-07, - "loss": 0.003, - "step": 1168 - }, - { - "epoch": 7.171779141104294, - "grad_norm": 1.2487165927886963, - "learning_rate": 9.273232027292933e-07, - "loss": 0.0033, - "step": 1169 - }, - { - "epoch": 7.177914110429448, - "grad_norm": 1.0647703409194946, - "learning_rate": 9.235805570400813e-07, - "loss": 0.0024, - "step": 1170 - }, - { - "epoch": 7.184049079754601, - "grad_norm": 1.6039917469024658, - "learning_rate": 9.198437672850249e-07, - "loss": 0.0118, - "step": 1171 - }, - { - "epoch": 7.190184049079755, - "grad_norm": 2.199977159500122, - "learning_rate": 9.161128473451967e-07, - "loss": 0.0173, - "step": 1172 - }, - { - "epoch": 7.196319018404908, - "grad_norm": 2.51725697517395, - "learning_rate": 9.123878110798662e-07, - "loss": 0.0142, - "step": 1173 - }, - { - "epoch": 7.2024539877300615, - "grad_norm": 1.841742753982544, - "learning_rate": 9.086686723264474e-07, - "loss": 0.012, - "step": 1174 - }, - { - "epoch": 7.208588957055214, - "grad_norm": 1.212876319885254, - "learning_rate": 9.049554449004447e-07, - "loss": 0.0055, - "step": 1175 - }, - { - "epoch": 7.214723926380368, - "grad_norm": 1.3728275299072266, - "learning_rate": 9.012481425954053e-07, - "loss": 0.0043, - "step": 1176 - }, - { - "epoch": 7.220858895705521, - "grad_norm": 2.3055357933044434, - "learning_rate": 8.97546779182866e-07, - "loss": 0.0443, - "step": 1177 - }, - { - "epoch": 7.226993865030675, - "grad_norm": 2.017620801925659, - "learning_rate": 8.938513684123024e-07, - "loss": 0.0082, - "step": 1178 - }, - { - "epoch": 7.233128834355828, - "grad_norm": 1.5641282796859741, - "learning_rate": 8.901619240110781e-07, - "loss": 0.0071, - "step": 1179 - }, - { - "epoch": 7.2392638036809815, - "grad_norm": 1.3781960010528564, - "learning_rate": 8.864784596843917e-07, - "loss": 0.0056, - "step": 1180 - }, - { - "epoch": 7.245398773006135, - "grad_norm": 1.23178231716156, - "learning_rate": 8.828009891152301e-07, - "loss": 0.0076, - "step": 1181 - }, - { - "epoch": 7.251533742331288, - "grad_norm": 2.809582233428955, - "learning_rate": 8.791295259643126e-07, - "loss": 0.0141, - "step": 1182 - }, - { - "epoch": 7.257668711656442, - "grad_norm": 1.6520317792892456, - "learning_rate": 8.754640838700443e-07, - "loss": 0.01, - "step": 1183 - }, - { - "epoch": 7.263803680981595, - "grad_norm": 1.411852478981018, - "learning_rate": 8.718046764484648e-07, - "loss": 0.009, - "step": 1184 - }, - { - "epoch": 7.269938650306749, - "grad_norm": 2.9334425926208496, - "learning_rate": 8.681513172931935e-07, - "loss": 0.0291, - "step": 1185 - }, - { - "epoch": 7.276073619631902, - "grad_norm": 1.4273028373718262, - "learning_rate": 8.64504019975386e-07, - "loss": 0.0064, - "step": 1186 - }, - { - "epoch": 7.282208588957055, - "grad_norm": 1.9486448764801025, - "learning_rate": 8.608627980436765e-07, - "loss": 0.0135, - "step": 1187 - }, - { - "epoch": 7.288343558282208, - "grad_norm": 1.3740493059158325, - "learning_rate": 8.572276650241329e-07, - "loss": 0.0061, - "step": 1188 - }, - { - "epoch": 7.294478527607362, - "grad_norm": 1.3352797031402588, - "learning_rate": 8.535986344202057e-07, - "loss": 0.0051, - "step": 1189 - }, - { - "epoch": 7.300613496932515, - "grad_norm": 1.0336774587631226, - "learning_rate": 8.499757197126732e-07, - "loss": 0.0052, - "step": 1190 - }, - { - "epoch": 7.306748466257669, - "grad_norm": 1.1450837850570679, - "learning_rate": 8.463589343595976e-07, - "loss": 0.0111, - "step": 1191 - }, - { - "epoch": 7.3128834355828225, - "grad_norm": 2.504876136779785, - "learning_rate": 8.427482917962734e-07, - "loss": 0.0279, - "step": 1192 - }, - { - "epoch": 7.319018404907975, - "grad_norm": 1.569841980934143, - "learning_rate": 8.391438054351725e-07, - "loss": 0.0105, - "step": 1193 - }, - { - "epoch": 7.325153374233129, - "grad_norm": 1.218538761138916, - "learning_rate": 8.355454886659026e-07, - "loss": 0.0028, - "step": 1194 - }, - { - "epoch": 7.331288343558282, - "grad_norm": 2.084049940109253, - "learning_rate": 8.319533548551492e-07, - "loss": 0.0102, - "step": 1195 - }, - { - "epoch": 7.337423312883436, - "grad_norm": 2.326167345046997, - "learning_rate": 8.28367417346633e-07, - "loss": 0.0396, - "step": 1196 - }, - { - "epoch": 7.343558282208589, - "grad_norm": 1.2704310417175293, - "learning_rate": 8.247876894610568e-07, - "loss": 0.006, - "step": 1197 - }, - { - "epoch": 7.3496932515337425, - "grad_norm": 1.358012318611145, - "learning_rate": 8.212141844960544e-07, - "loss": 0.0075, - "step": 1198 - }, - { - "epoch": 7.355828220858895, - "grad_norm": 1.5145729780197144, - "learning_rate": 8.17646915726146e-07, - "loss": 0.0042, - "step": 1199 - }, - { - "epoch": 7.361963190184049, - "grad_norm": 1.203041911125183, - "learning_rate": 8.140858964026849e-07, - "loss": 0.0032, - "step": 1200 - }, - { - "epoch": 7.368098159509202, - "grad_norm": 3.031280279159546, - "learning_rate": 8.105311397538085e-07, - "loss": 0.032, - "step": 1201 - }, - { - "epoch": 7.374233128834356, - "grad_norm": 1.416698694229126, - "learning_rate": 8.069826589843929e-07, - "loss": 0.0185, - "step": 1202 - }, - { - "epoch": 7.38036809815951, - "grad_norm": 0.9656457901000977, - "learning_rate": 8.034404672759977e-07, - "loss": 0.0034, - "step": 1203 - }, - { - "epoch": 7.386503067484663, - "grad_norm": 1.7239291667938232, - "learning_rate": 7.99904577786823e-07, - "loss": 0.034, - "step": 1204 - }, - { - "epoch": 7.392638036809816, - "grad_norm": 1.1560636758804321, - "learning_rate": 7.963750036516585e-07, - "loss": 0.005, - "step": 1205 - }, - { - "epoch": 7.398773006134969, - "grad_norm": 1.057456374168396, - "learning_rate": 7.928517579818312e-07, - "loss": 0.0073, - "step": 1206 - }, - { - "epoch": 7.404907975460123, - "grad_norm": 1.4066674709320068, - "learning_rate": 7.893348538651635e-07, - "loss": 0.015, - "step": 1207 - }, - { - "epoch": 7.411042944785276, - "grad_norm": 1.1061445474624634, - "learning_rate": 7.858243043659161e-07, - "loss": 0.004, - "step": 1208 - }, - { - "epoch": 7.41717791411043, - "grad_norm": 0.9575282335281372, - "learning_rate": 7.823201225247496e-07, - "loss": 0.003, - "step": 1209 - }, - { - "epoch": 7.423312883435583, - "grad_norm": 1.3790507316589355, - "learning_rate": 7.788223213586677e-07, - "loss": 0.0096, - "step": 1210 - }, - { - "epoch": 7.429447852760736, - "grad_norm": 1.1366883516311646, - "learning_rate": 7.753309138609705e-07, - "loss": 0.006, - "step": 1211 - }, - { - "epoch": 7.435582822085889, - "grad_norm": 2.2659928798675537, - "learning_rate": 7.71845913001211e-07, - "loss": 0.0074, - "step": 1212 - }, - { - "epoch": 7.441717791411043, - "grad_norm": 1.2541831731796265, - "learning_rate": 7.683673317251392e-07, - "loss": 0.0051, - "step": 1213 - }, - { - "epoch": 7.447852760736196, - "grad_norm": 1.5959513187408447, - "learning_rate": 7.648951829546619e-07, - "loss": 0.0271, - "step": 1214 - }, - { - "epoch": 7.45398773006135, - "grad_norm": 1.368452548980713, - "learning_rate": 7.6142947958779e-07, - "loss": 0.0155, - "step": 1215 - }, - { - "epoch": 7.460122699386503, - "grad_norm": 1.1851825714111328, - "learning_rate": 7.579702344985899e-07, - "loss": 0.0032, - "step": 1216 - }, - { - "epoch": 7.466257668711656, - "grad_norm": 1.419812560081482, - "learning_rate": 7.545174605371403e-07, - "loss": 0.0037, - "step": 1217 - }, - { - "epoch": 7.47239263803681, - "grad_norm": 1.0817372798919678, - "learning_rate": 7.510711705294782e-07, - "loss": 0.0064, - "step": 1218 - }, - { - "epoch": 7.478527607361963, - "grad_norm": 1.0459797382354736, - "learning_rate": 7.476313772775578e-07, - "loss": 0.0055, - "step": 1219 - }, - { - "epoch": 7.484662576687117, - "grad_norm": 1.4481663703918457, - "learning_rate": 7.441980935591986e-07, - "loss": 0.0049, - "step": 1220 - }, - { - "epoch": 7.49079754601227, - "grad_norm": 1.7337101697921753, - "learning_rate": 7.407713321280377e-07, - "loss": 0.0123, - "step": 1221 - }, - { - "epoch": 7.4969325153374236, - "grad_norm": 1.3378303050994873, - "learning_rate": 7.373511057134855e-07, - "loss": 0.0056, - "step": 1222 - }, - { - "epoch": 7.5030674846625764, - "grad_norm": 2.4353835582733154, - "learning_rate": 7.339374270206772e-07, - "loss": 0.0155, - "step": 1223 - }, - { - "epoch": 7.50920245398773, - "grad_norm": 2.2856571674346924, - "learning_rate": 7.305303087304227e-07, - "loss": 0.0303, - "step": 1224 - }, - { - "epoch": 7.515337423312883, - "grad_norm": 1.0627055168151855, - "learning_rate": 7.271297634991651e-07, - "loss": 0.0018, - "step": 1225 - }, - { - "epoch": 7.521472392638037, - "grad_norm": 1.2120238542556763, - "learning_rate": 7.237358039589271e-07, - "loss": 0.0064, - "step": 1226 - }, - { - "epoch": 7.52760736196319, - "grad_norm": 1.1861765384674072, - "learning_rate": 7.203484427172702e-07, - "loss": 0.0025, - "step": 1227 - }, - { - "epoch": 7.533742331288344, - "grad_norm": 1.6700332164764404, - "learning_rate": 7.169676923572447e-07, - "loss": 0.0067, - "step": 1228 - }, - { - "epoch": 7.539877300613497, - "grad_norm": 1.4527982473373413, - "learning_rate": 7.135935654373416e-07, - "loss": 0.0082, - "step": 1229 - }, - { - "epoch": 7.54601226993865, - "grad_norm": 1.1425046920776367, - "learning_rate": 7.102260744914499e-07, - "loss": 0.0042, - "step": 1230 - }, - { - "epoch": 7.552147239263804, - "grad_norm": 2.0762295722961426, - "learning_rate": 7.068652320288081e-07, - "loss": 0.0374, - "step": 1231 - }, - { - "epoch": 7.558282208588957, - "grad_norm": 1.2008321285247803, - "learning_rate": 7.035110505339546e-07, - "loss": 0.0022, - "step": 1232 - }, - { - "epoch": 7.564417177914111, - "grad_norm": 1.262100338935852, - "learning_rate": 7.001635424666878e-07, - "loss": 0.006, - "step": 1233 - }, - { - "epoch": 7.570552147239264, - "grad_norm": 1.8173811435699463, - "learning_rate": 6.968227202620137e-07, - "loss": 0.0137, - "step": 1234 - }, - { - "epoch": 7.576687116564417, - "grad_norm": 1.6977999210357666, - "learning_rate": 6.934885963301033e-07, - "loss": 0.0216, - "step": 1235 - }, - { - "epoch": 7.58282208588957, - "grad_norm": 0.7084318399429321, - "learning_rate": 6.901611830562469e-07, - "loss": 0.0027, - "step": 1236 - }, - { - "epoch": 7.588957055214724, - "grad_norm": 2.0332374572753906, - "learning_rate": 6.868404928008035e-07, - "loss": 0.0391, - "step": 1237 - }, - { - "epoch": 7.595092024539877, - "grad_norm": 1.235734224319458, - "learning_rate": 6.835265378991613e-07, - "loss": 0.0053, - "step": 1238 - }, - { - "epoch": 7.601226993865031, - "grad_norm": 2.687920331954956, - "learning_rate": 6.802193306616858e-07, - "loss": 0.0395, - "step": 1239 - }, - { - "epoch": 7.6073619631901845, - "grad_norm": 1.4211101531982422, - "learning_rate": 6.769188833736781e-07, - "loss": 0.0055, - "step": 1240 - }, - { - "epoch": 7.613496932515337, - "grad_norm": 2.4542644023895264, - "learning_rate": 6.736252082953307e-07, - "loss": 0.0072, - "step": 1241 - }, - { - "epoch": 7.61963190184049, - "grad_norm": 1.2946943044662476, - "learning_rate": 6.703383176616743e-07, - "loss": 0.0046, - "step": 1242 - }, - { - "epoch": 7.625766871165644, - "grad_norm": 3.8073277473449707, - "learning_rate": 6.670582236825421e-07, - "loss": 0.0742, - "step": 1243 - }, - { - "epoch": 7.631901840490798, - "grad_norm": 1.4291348457336426, - "learning_rate": 6.637849385425157e-07, - "loss": 0.0069, - "step": 1244 - }, - { - "epoch": 7.638036809815951, - "grad_norm": 1.1767655611038208, - "learning_rate": 6.605184744008866e-07, - "loss": 0.0031, - "step": 1245 - }, - { - "epoch": 7.644171779141105, - "grad_norm": 1.837077260017395, - "learning_rate": 6.572588433916082e-07, - "loss": 0.0316, - "step": 1246 - }, - { - "epoch": 7.6503067484662575, - "grad_norm": 1.9157041311264038, - "learning_rate": 6.540060576232488e-07, - "loss": 0.0472, - "step": 1247 - }, - { - "epoch": 7.656441717791411, - "grad_norm": 1.7347630262374878, - "learning_rate": 6.507601291789515e-07, - "loss": 0.0059, - "step": 1248 - }, - { - "epoch": 7.662576687116564, - "grad_norm": 0.9757588505744934, - "learning_rate": 6.475210701163828e-07, - "loss": 0.0023, - "step": 1249 - }, - { - "epoch": 7.668711656441718, - "grad_norm": 1.9460281133651733, - "learning_rate": 6.442888924676951e-07, - "loss": 0.0207, - "step": 1250 - }, - { - "epoch": 7.674846625766871, - "grad_norm": 0.7517938613891602, - "learning_rate": 6.410636082394772e-07, - "loss": 0.002, - "step": 1251 - }, - { - "epoch": 7.680981595092025, - "grad_norm": 1.0631566047668457, - "learning_rate": 6.378452294127091e-07, - "loss": 0.0038, - "step": 1252 - }, - { - "epoch": 7.6871165644171775, - "grad_norm": 0.9524463415145874, - "learning_rate": 6.346337679427214e-07, - "loss": 0.0024, - "step": 1253 - }, - { - "epoch": 7.693251533742331, - "grad_norm": 1.3653123378753662, - "learning_rate": 6.314292357591489e-07, - "loss": 0.0027, - "step": 1254 - }, - { - "epoch": 7.699386503067485, - "grad_norm": 1.2446377277374268, - "learning_rate": 6.282316447658837e-07, - "loss": 0.0048, - "step": 1255 - }, - { - "epoch": 7.705521472392638, - "grad_norm": 1.716244101524353, - "learning_rate": 6.250410068410367e-07, - "loss": 0.0064, - "step": 1256 - }, - { - "epoch": 7.711656441717792, - "grad_norm": 1.7151219844818115, - "learning_rate": 6.218573338368869e-07, - "loss": 0.0056, - "step": 1257 - }, - { - "epoch": 7.717791411042945, - "grad_norm": 1.8013248443603516, - "learning_rate": 6.186806375798429e-07, - "loss": 0.0073, - "step": 1258 - }, - { - "epoch": 7.723926380368098, - "grad_norm": 1.051620602607727, - "learning_rate": 6.155109298703968e-07, - "loss": 0.0043, - "step": 1259 - }, - { - "epoch": 7.730061349693251, - "grad_norm": 1.5731337070465088, - "learning_rate": 6.123482224830787e-07, - "loss": 0.0108, - "step": 1260 - }, - { - "epoch": 7.736196319018405, - "grad_norm": 2.232144832611084, - "learning_rate": 6.091925271664156e-07, - "loss": 0.0337, - "step": 1261 - }, - { - "epoch": 7.742331288343558, - "grad_norm": 1.072678565979004, - "learning_rate": 6.060438556428877e-07, - "loss": 0.0019, - "step": 1262 - }, - { - "epoch": 7.748466257668712, - "grad_norm": 2.3631110191345215, - "learning_rate": 6.02902219608881e-07, - "loss": 0.0089, - "step": 1263 - }, - { - "epoch": 7.754601226993865, - "grad_norm": 1.1171438694000244, - "learning_rate": 5.997676307346504e-07, - "loss": 0.0045, - "step": 1264 - }, - { - "epoch": 7.7607361963190185, - "grad_norm": 0.7839979529380798, - "learning_rate": 5.966401006642689e-07, - "loss": 0.0028, - "step": 1265 - }, - { - "epoch": 7.766871165644172, - "grad_norm": 1.5938968658447266, - "learning_rate": 5.93519641015591e-07, - "loss": 0.009, - "step": 1266 - }, - { - "epoch": 7.773006134969325, - "grad_norm": 1.2980104684829712, - "learning_rate": 5.904062633802066e-07, - "loss": 0.0168, - "step": 1267 - }, - { - "epoch": 7.779141104294479, - "grad_norm": 1.177626371383667, - "learning_rate": 5.872999793233952e-07, - "loss": 0.0029, - "step": 1268 - }, - { - "epoch": 7.785276073619632, - "grad_norm": 2.0138931274414062, - "learning_rate": 5.842008003840891e-07, - "loss": 0.015, - "step": 1269 - }, - { - "epoch": 7.791411042944786, - "grad_norm": 1.7204387187957764, - "learning_rate": 5.811087380748245e-07, - "loss": 0.011, - "step": 1270 - }, - { - "epoch": 7.7975460122699385, - "grad_norm": 1.506241798400879, - "learning_rate": 5.780238038817035e-07, - "loss": 0.0057, - "step": 1271 - }, - { - "epoch": 7.803680981595092, - "grad_norm": 2.0950393676757812, - "learning_rate": 5.74946009264348e-07, - "loss": 0.0131, - "step": 1272 - }, - { - "epoch": 7.809815950920245, - "grad_norm": 2.1451432704925537, - "learning_rate": 5.71875365655859e-07, - "loss": 0.0088, - "step": 1273 - }, - { - "epoch": 7.815950920245399, - "grad_norm": 0.9690236449241638, - "learning_rate": 5.688118844627746e-07, - "loss": 0.0033, - "step": 1274 - }, - { - "epoch": 7.822085889570552, - "grad_norm": 1.5690608024597168, - "learning_rate": 5.657555770650241e-07, - "loss": 0.0206, - "step": 1275 - }, - { - "epoch": 7.828220858895706, - "grad_norm": 1.8220988512039185, - "learning_rate": 5.627064548158903e-07, - "loss": 0.0096, - "step": 1276 - }, - { - "epoch": 7.8343558282208585, - "grad_norm": 2.3800559043884277, - "learning_rate": 5.596645290419653e-07, - "loss": 0.008, - "step": 1277 - }, - { - "epoch": 7.840490797546012, - "grad_norm": 0.7775714993476868, - "learning_rate": 5.566298110431068e-07, - "loss": 0.0016, - "step": 1278 - }, - { - "epoch": 7.846625766871165, - "grad_norm": 1.1196876764297485, - "learning_rate": 5.536023120924e-07, - "loss": 0.0033, - "step": 1279 - }, - { - "epoch": 7.852760736196319, - "grad_norm": 1.3722344636917114, - "learning_rate": 5.505820434361108e-07, - "loss": 0.0084, - "step": 1280 - }, - { - "epoch": 7.858895705521473, - "grad_norm": 1.2068676948547363, - "learning_rate": 5.47569016293649e-07, - "loss": 0.0049, - "step": 1281 - }, - { - "epoch": 7.865030674846626, - "grad_norm": 1.096085548400879, - "learning_rate": 5.445632418575239e-07, - "loss": 0.0019, - "step": 1282 - }, - { - "epoch": 7.871165644171779, - "grad_norm": 1.3178106546401978, - "learning_rate": 5.415647312933015e-07, - "loss": 0.0062, - "step": 1283 - }, - { - "epoch": 7.877300613496932, - "grad_norm": 1.2884724140167236, - "learning_rate": 5.385734957395664e-07, - "loss": 0.0081, - "step": 1284 - }, - { - "epoch": 7.883435582822086, - "grad_norm": 0.9866589307785034, - "learning_rate": 5.355895463078789e-07, - "loss": 0.0048, - "step": 1285 - }, - { - "epoch": 7.889570552147239, - "grad_norm": 1.5396437644958496, - "learning_rate": 5.326128940827313e-07, - "loss": 0.0088, - "step": 1286 - }, - { - "epoch": 7.895705521472393, - "grad_norm": 1.1183607578277588, - "learning_rate": 5.296435501215116e-07, - "loss": 0.0043, - "step": 1287 - }, - { - "epoch": 7.901840490797546, - "grad_norm": 1.5337073802947998, - "learning_rate": 5.266815254544572e-07, - "loss": 0.0099, - "step": 1288 - }, - { - "epoch": 7.9079754601226995, - "grad_norm": 1.8188867568969727, - "learning_rate": 5.237268310846183e-07, - "loss": 0.0086, - "step": 1289 - }, - { - "epoch": 7.914110429447852, - "grad_norm": 1.972072720527649, - "learning_rate": 5.207794779878156e-07, - "loss": 0.0442, - "step": 1290 - }, - { - "epoch": 7.920245398773006, - "grad_norm": 1.1226261854171753, - "learning_rate": 5.178394771125969e-07, - "loss": 0.0071, - "step": 1291 - }, - { - "epoch": 7.92638036809816, - "grad_norm": 1.5612869262695312, - "learning_rate": 5.149068393802009e-07, - "loss": 0.0192, - "step": 1292 - }, - { - "epoch": 7.932515337423313, - "grad_norm": 1.1532280445098877, - "learning_rate": 5.119815756845123e-07, - "loss": 0.0032, - "step": 1293 - }, - { - "epoch": 7.938650306748467, - "grad_norm": 1.8807255029678345, - "learning_rate": 5.090636968920252e-07, - "loss": 0.0139, - "step": 1294 - }, - { - "epoch": 7.9447852760736195, - "grad_norm": 1.3027002811431885, - "learning_rate": 5.061532138418013e-07, - "loss": 0.0071, - "step": 1295 - }, - { - "epoch": 7.950920245398773, - "grad_norm": 1.584154486656189, - "learning_rate": 5.032501373454266e-07, - "loss": 0.0056, - "step": 1296 - }, - { - "epoch": 7.957055214723926, - "grad_norm": 1.7631733417510986, - "learning_rate": 5.003544781869762e-07, - "loss": 0.0239, - "step": 1297 - }, - { - "epoch": 7.96319018404908, - "grad_norm": 1.9462637901306152, - "learning_rate": 4.974662471229727e-07, - "loss": 0.0336, - "step": 1298 - }, - { - "epoch": 7.969325153374233, - "grad_norm": 1.9697695970535278, - "learning_rate": 4.945854548823425e-07, - "loss": 0.0049, - "step": 1299 - }, - { - "epoch": 7.975460122699387, - "grad_norm": 1.066036581993103, - "learning_rate": 4.917121121663823e-07, - "loss": 0.0103, - "step": 1300 - }, - { - "epoch": 7.9815950920245395, - "grad_norm": 1.0865890979766846, - "learning_rate": 4.888462296487129e-07, - "loss": 0.0036, - "step": 1301 - }, - { - "epoch": 7.987730061349693, - "grad_norm": 1.7804820537567139, - "learning_rate": 4.859878179752448e-07, - "loss": 0.0119, - "step": 1302 - }, - { - "epoch": 7.993865030674847, - "grad_norm": 2.735875129699707, - "learning_rate": 4.83136887764136e-07, - "loss": 0.0365, - "step": 1303 - }, - { - "epoch": 8.0, - "grad_norm": 1.316243290901184, - "learning_rate": 4.802934496057527e-07, - "loss": 0.0046, - "step": 1304 - }, - { - "epoch": 8.006134969325153, - "grad_norm": 2.192969560623169, - "learning_rate": 4.774575140626317e-07, - "loss": 0.0235, - "step": 1305 - }, - { - "epoch": 8.012269938650308, - "grad_norm": 0.9257994890213013, - "learning_rate": 4.746290916694368e-07, - "loss": 0.0029, - "step": 1306 - }, - { - "epoch": 8.01840490797546, - "grad_norm": 0.6933830380439758, - "learning_rate": 4.71808192932926e-07, - "loss": 0.0019, - "step": 1307 - }, - { - "epoch": 8.024539877300613, - "grad_norm": 0.4838462173938751, - "learning_rate": 4.6899482833190765e-07, - "loss": 0.0024, - "step": 1308 - }, - { - "epoch": 8.030674846625766, - "grad_norm": 1.1725589036941528, - "learning_rate": 4.661890083172019e-07, - "loss": 0.0166, - "step": 1309 - }, - { - "epoch": 8.036809815950921, - "grad_norm": 0.7732264399528503, - "learning_rate": 4.633907433116053e-07, - "loss": 0.0047, - "step": 1310 - }, - { - "epoch": 8.042944785276074, - "grad_norm": 0.6369810700416565, - "learning_rate": 4.6060004370984763e-07, - "loss": 0.0013, - "step": 1311 - }, - { - "epoch": 8.049079754601227, - "grad_norm": 0.6437183618545532, - "learning_rate": 4.5781691987855676e-07, - "loss": 0.0016, - "step": 1312 - }, - { - "epoch": 8.05521472392638, - "grad_norm": 0.40145647525787354, - "learning_rate": 4.5504138215621915e-07, - "loss": 0.0026, - "step": 1313 - }, - { - "epoch": 8.061349693251534, - "grad_norm": 1.1000946760177612, - "learning_rate": 4.5227344085313873e-07, - "loss": 0.002, - "step": 1314 - }, - { - "epoch": 8.067484662576687, - "grad_norm": 1.4580782651901245, - "learning_rate": 4.495131062514038e-07, - "loss": 0.0299, - "step": 1315 - }, - { - "epoch": 8.07361963190184, - "grad_norm": 0.9026187062263489, - "learning_rate": 4.467603886048452e-07, - "loss": 0.003, - "step": 1316 - }, - { - "epoch": 8.079754601226995, - "grad_norm": 1.2969629764556885, - "learning_rate": 4.440152981389972e-07, - "loss": 0.0129, - "step": 1317 - }, - { - "epoch": 8.085889570552148, - "grad_norm": 0.837665319442749, - "learning_rate": 4.412778450510641e-07, - "loss": 0.0086, - "step": 1318 - }, - { - "epoch": 8.0920245398773, - "grad_norm": 0.3426748216152191, - "learning_rate": 4.3854803950987736e-07, - "loss": 0.002, - "step": 1319 - }, - { - "epoch": 8.098159509202453, - "grad_norm": 0.8508721590042114, - "learning_rate": 4.358258916558611e-07, - "loss": 0.0016, - "step": 1320 - }, - { - "epoch": 8.104294478527608, - "grad_norm": 1.2476134300231934, - "learning_rate": 4.331114116009938e-07, - "loss": 0.0156, - "step": 1321 - }, - { - "epoch": 8.110429447852761, - "grad_norm": 1.036689281463623, - "learning_rate": 4.3040460942876896e-07, - "loss": 0.0021, - "step": 1322 - }, - { - "epoch": 8.116564417177914, - "grad_norm": 0.7747099995613098, - "learning_rate": 4.277054951941609e-07, - "loss": 0.0021, - "step": 1323 - }, - { - "epoch": 8.122699386503067, - "grad_norm": 1.2793506383895874, - "learning_rate": 4.250140789235829e-07, - "loss": 0.0036, - "step": 1324 - }, - { - "epoch": 8.128834355828221, - "grad_norm": 1.5389785766601562, - "learning_rate": 4.223303706148549e-07, - "loss": 0.0031, - "step": 1325 - }, - { - "epoch": 8.134969325153374, - "grad_norm": 1.549869179725647, - "learning_rate": 4.196543802371641e-07, - "loss": 0.0102, - "step": 1326 - }, - { - "epoch": 8.141104294478527, - "grad_norm": 0.862311065196991, - "learning_rate": 4.1698611773102525e-07, - "loss": 0.0023, - "step": 1327 - }, - { - "epoch": 8.14723926380368, - "grad_norm": 1.0216046571731567, - "learning_rate": 4.14325593008249e-07, - "loss": 0.0074, - "step": 1328 - }, - { - "epoch": 8.153374233128835, - "grad_norm": 0.8307499289512634, - "learning_rate": 4.1167281595190206e-07, - "loss": 0.0017, - "step": 1329 - }, - { - "epoch": 8.159509202453988, - "grad_norm": 0.5344944596290588, - "learning_rate": 4.090277964162692e-07, - "loss": 0.0013, - "step": 1330 - }, - { - "epoch": 8.16564417177914, - "grad_norm": 0.8608856201171875, - "learning_rate": 4.063905442268201e-07, - "loss": 0.0014, - "step": 1331 - }, - { - "epoch": 8.171779141104295, - "grad_norm": 0.33019620180130005, - "learning_rate": 4.037610691801694e-07, - "loss": 0.0009, - "step": 1332 - }, - { - "epoch": 8.177914110429448, - "grad_norm": 0.6515982747077942, - "learning_rate": 4.011393810440431e-07, - "loss": 0.0022, - "step": 1333 - }, - { - "epoch": 8.184049079754601, - "grad_norm": 0.9144461750984192, - "learning_rate": 3.985254895572413e-07, - "loss": 0.0024, - "step": 1334 - }, - { - "epoch": 8.190184049079754, - "grad_norm": 0.4078105390071869, - "learning_rate": 3.959194044296011e-07, - "loss": 0.0011, - "step": 1335 - }, - { - "epoch": 8.196319018404909, - "grad_norm": 0.7559608817100525, - "learning_rate": 3.9332113534196194e-07, - "loss": 0.0028, - "step": 1336 - }, - { - "epoch": 8.202453987730062, - "grad_norm": 1.3025604486465454, - "learning_rate": 3.907306919461279e-07, - "loss": 0.0228, - "step": 1337 - }, - { - "epoch": 8.208588957055214, - "grad_norm": 0.6984004974365234, - "learning_rate": 3.8814808386483385e-07, - "loss": 0.0027, - "step": 1338 - }, - { - "epoch": 8.214723926380367, - "grad_norm": 1.161498785018921, - "learning_rate": 3.855733206917095e-07, - "loss": 0.0037, - "step": 1339 - }, - { - "epoch": 8.220858895705522, - "grad_norm": 0.5357164740562439, - "learning_rate": 3.8300641199124024e-07, - "loss": 0.0011, - "step": 1340 - }, - { - "epoch": 8.226993865030675, - "grad_norm": 0.8089649677276611, - "learning_rate": 3.80447367298738e-07, - "loss": 0.0008, - "step": 1341 - }, - { - "epoch": 8.233128834355828, - "grad_norm": 0.4289240539073944, - "learning_rate": 3.77896196120299e-07, - "loss": 0.0012, - "step": 1342 - }, - { - "epoch": 8.239263803680982, - "grad_norm": 0.8666973114013672, - "learning_rate": 3.7535290793277364e-07, - "loss": 0.0047, - "step": 1343 - }, - { - "epoch": 8.245398773006135, - "grad_norm": 0.6841573715209961, - "learning_rate": 3.7281751218372965e-07, - "loss": 0.0007, - "step": 1344 - }, - { - "epoch": 8.251533742331288, - "grad_norm": 0.5588045716285706, - "learning_rate": 3.7029001829141457e-07, - "loss": 0.0018, - "step": 1345 - }, - { - "epoch": 8.257668711656441, - "grad_norm": 1.7257133722305298, - "learning_rate": 3.677704356447254e-07, - "loss": 0.0213, - "step": 1346 - }, - { - "epoch": 8.263803680981596, - "grad_norm": 0.2352600246667862, - "learning_rate": 3.6525877360316875e-07, - "loss": 0.0009, - "step": 1347 - }, - { - "epoch": 8.269938650306749, - "grad_norm": 0.9622183442115784, - "learning_rate": 3.627550414968303e-07, - "loss": 0.0132, - "step": 1348 - }, - { - "epoch": 8.276073619631902, - "grad_norm": 0.5367354154586792, - "learning_rate": 3.6025924862633814e-07, - "loss": 0.0006, - "step": 1349 - }, - { - "epoch": 8.282208588957054, - "grad_norm": 1.5134315490722656, - "learning_rate": 3.577714042628272e-07, - "loss": 0.01, - "step": 1350 - }, - { - "epoch": 8.28834355828221, - "grad_norm": 1.5052622556686401, - "learning_rate": 3.5529151764790715e-07, - "loss": 0.0031, - "step": 1351 - }, - { - "epoch": 8.294478527607362, - "grad_norm": 0.8776562809944153, - "learning_rate": 3.5281959799362775e-07, - "loss": 0.0053, - "step": 1352 - }, - { - "epoch": 8.300613496932515, - "grad_norm": 0.7919799089431763, - "learning_rate": 3.503556544824413e-07, - "loss": 0.0021, - "step": 1353 - }, - { - "epoch": 8.30674846625767, - "grad_norm": 0.7141364216804504, - "learning_rate": 3.4789969626717377e-07, - "loss": 0.0019, - "step": 1354 - }, - { - "epoch": 8.312883435582823, - "grad_norm": 1.7783756256103516, - "learning_rate": 3.454517324709858e-07, - "loss": 0.0019, - "step": 1355 - }, - { - "epoch": 8.319018404907975, - "grad_norm": 0.9534929394721985, - "learning_rate": 3.43011772187343e-07, - "loss": 0.0011, - "step": 1356 - }, - { - "epoch": 8.325153374233128, - "grad_norm": 0.4383384585380554, - "learning_rate": 3.405798244799799e-07, - "loss": 0.0006, - "step": 1357 - }, - { - "epoch": 8.331288343558283, - "grad_norm": 0.8582566976547241, - "learning_rate": 3.3815589838286535e-07, - "loss": 0.002, - "step": 1358 - }, - { - "epoch": 8.337423312883436, - "grad_norm": 0.8288223743438721, - "learning_rate": 3.3574000290017174e-07, - "loss": 0.002, - "step": 1359 - }, - { - "epoch": 8.343558282208589, - "grad_norm": 1.2074549198150635, - "learning_rate": 3.3333214700623976e-07, - "loss": 0.0153, - "step": 1360 - }, - { - "epoch": 8.349693251533742, - "grad_norm": 0.5359098315238953, - "learning_rate": 3.3093233964554464e-07, - "loss": 0.0014, - "step": 1361 - }, - { - "epoch": 8.355828220858896, - "grad_norm": 1.6650397777557373, - "learning_rate": 3.2854058973266547e-07, - "loss": 0.0107, - "step": 1362 - }, - { - "epoch": 8.36196319018405, - "grad_norm": 1.1784273386001587, - "learning_rate": 3.261569061522474e-07, - "loss": 0.0197, - "step": 1363 - }, - { - "epoch": 8.368098159509202, - "grad_norm": 0.6566861271858215, - "learning_rate": 3.237812977589738e-07, - "loss": 0.0009, - "step": 1364 - }, - { - "epoch": 8.374233128834355, - "grad_norm": 0.9043551683425903, - "learning_rate": 3.2141377337753105e-07, - "loss": 0.0026, - "step": 1365 - }, - { - "epoch": 8.38036809815951, - "grad_norm": 2.205872058868408, - "learning_rate": 3.190543418025749e-07, - "loss": 0.0533, - "step": 1366 - }, - { - "epoch": 8.386503067484663, - "grad_norm": 0.2918683886528015, - "learning_rate": 3.167030117986994e-07, - "loss": 0.0007, - "step": 1367 - }, - { - "epoch": 8.392638036809815, - "grad_norm": 0.5370535850524902, - "learning_rate": 3.143597921004027e-07, - "loss": 0.001, - "step": 1368 - }, - { - "epoch": 8.39877300613497, - "grad_norm": 1.353083610534668, - "learning_rate": 3.120246914120564e-07, - "loss": 0.002, - "step": 1369 - }, - { - "epoch": 8.404907975460123, - "grad_norm": 0.644607424736023, - "learning_rate": 3.096977184078731e-07, - "loss": 0.0025, - "step": 1370 - }, - { - "epoch": 8.411042944785276, - "grad_norm": 0.7351365089416504, - "learning_rate": 3.0737888173187067e-07, - "loss": 0.0014, - "step": 1371 - }, - { - "epoch": 8.417177914110429, - "grad_norm": 1.161787748336792, - "learning_rate": 3.050681899978464e-07, - "loss": 0.0149, - "step": 1372 - }, - { - "epoch": 8.423312883435583, - "grad_norm": 1.7568200826644897, - "learning_rate": 3.0276565178933847e-07, - "loss": 0.0178, - "step": 1373 - }, - { - "epoch": 8.429447852760736, - "grad_norm": 0.73989337682724, - "learning_rate": 3.004712756595993e-07, - "loss": 0.0053, - "step": 1374 - }, - { - "epoch": 8.43558282208589, - "grad_norm": 1.8425425291061401, - "learning_rate": 2.9818507013156085e-07, - "loss": 0.0013, - "step": 1375 - }, - { - "epoch": 8.441717791411042, - "grad_norm": 0.6374561786651611, - "learning_rate": 2.9590704369780313e-07, - "loss": 0.0039, - "step": 1376 - }, - { - "epoch": 8.447852760736197, - "grad_norm": 0.708151638507843, - "learning_rate": 2.9363720482052436e-07, - "loss": 0.0025, - "step": 1377 - }, - { - "epoch": 8.45398773006135, - "grad_norm": 1.2846306562423706, - "learning_rate": 2.91375561931507e-07, - "loss": 0.0033, - "step": 1378 - }, - { - "epoch": 8.460122699386503, - "grad_norm": 0.347720742225647, - "learning_rate": 2.89122123432089e-07, - "loss": 0.0006, - "step": 1379 - }, - { - "epoch": 8.466257668711656, - "grad_norm": 0.9626922607421875, - "learning_rate": 2.868768976931313e-07, - "loss": 0.001, - "step": 1380 - }, - { - "epoch": 8.47239263803681, - "grad_norm": 0.26909729838371277, - "learning_rate": 2.8463989305498596e-07, - "loss": 0.0008, - "step": 1381 - }, - { - "epoch": 8.478527607361963, - "grad_norm": 0.8750791549682617, - "learning_rate": 2.824111178274669e-07, - "loss": 0.0025, - "step": 1382 - }, - { - "epoch": 8.484662576687116, - "grad_norm": 1.1124992370605469, - "learning_rate": 2.801905802898183e-07, - "loss": 0.0031, - "step": 1383 - }, - { - "epoch": 8.49079754601227, - "grad_norm": 0.4871549904346466, - "learning_rate": 2.779782886906829e-07, - "loss": 0.0013, - "step": 1384 - }, - { - "epoch": 8.496932515337424, - "grad_norm": 0.5207282900810242, - "learning_rate": 2.7577425124807324e-07, - "loss": 0.0013, - "step": 1385 - }, - { - "epoch": 8.503067484662576, - "grad_norm": 1.8369935750961304, - "learning_rate": 2.7357847614933876e-07, - "loss": 0.0031, - "step": 1386 - }, - { - "epoch": 8.50920245398773, - "grad_norm": 0.6390517354011536, - "learning_rate": 2.713909715511384e-07, - "loss": 0.0045, - "step": 1387 - }, - { - "epoch": 8.515337423312884, - "grad_norm": 0.8618245124816895, - "learning_rate": 2.692117455794077e-07, - "loss": 0.0017, - "step": 1388 - }, - { - "epoch": 8.521472392638037, - "grad_norm": 0.8506134152412415, - "learning_rate": 2.6704080632932895e-07, - "loss": 0.0014, - "step": 1389 - }, - { - "epoch": 8.52760736196319, - "grad_norm": 0.42547252774238586, - "learning_rate": 2.6487816186530263e-07, - "loss": 0.002, - "step": 1390 - }, - { - "epoch": 8.533742331288344, - "grad_norm": 0.6425843834877014, - "learning_rate": 2.6272382022091704e-07, - "loss": 0.0028, - "step": 1391 - }, - { - "epoch": 8.539877300613497, - "grad_norm": 0.8287162780761719, - "learning_rate": 2.6057778939891614e-07, - "loss": 0.011, - "step": 1392 - }, - { - "epoch": 8.54601226993865, - "grad_norm": 1.0402963161468506, - "learning_rate": 2.584400773711737e-07, - "loss": 0.0037, - "step": 1393 - }, - { - "epoch": 8.552147239263803, - "grad_norm": 0.9785431623458862, - "learning_rate": 2.5631069207865926e-07, - "loss": 0.0023, - "step": 1394 - }, - { - "epoch": 8.558282208588958, - "grad_norm": 1.2661131620407104, - "learning_rate": 2.541896414314132e-07, - "loss": 0.0053, - "step": 1395 - }, - { - "epoch": 8.56441717791411, - "grad_norm": 0.2662440240383148, - "learning_rate": 2.520769333085141e-07, - "loss": 0.0008, - "step": 1396 - }, - { - "epoch": 8.570552147239264, - "grad_norm": 0.628510594367981, - "learning_rate": 2.4997257555805064e-07, - "loss": 0.001, - "step": 1397 - }, - { - "epoch": 8.576687116564417, - "grad_norm": 1.08578622341156, - "learning_rate": 2.4787657599709276e-07, - "loss": 0.0041, - "step": 1398 - }, - { - "epoch": 8.582822085889571, - "grad_norm": 0.8213603496551514, - "learning_rate": 2.4578894241166135e-07, - "loss": 0.0029, - "step": 1399 - }, - { - "epoch": 8.588957055214724, - "grad_norm": 0.5261257886886597, - "learning_rate": 2.4370968255670093e-07, - "loss": 0.001, - "step": 1400 - }, - { - "epoch": 8.595092024539877, - "grad_norm": 0.18139345943927765, - "learning_rate": 2.4163880415604913e-07, - "loss": 0.0005, - "step": 1401 - }, - { - "epoch": 8.60122699386503, - "grad_norm": 0.8317165970802307, - "learning_rate": 2.395763149024102e-07, - "loss": 0.0034, - "step": 1402 - }, - { - "epoch": 8.607361963190185, - "grad_norm": 1.272074580192566, - "learning_rate": 2.3752222245732454e-07, - "loss": 0.0036, - "step": 1403 - }, - { - "epoch": 8.613496932515337, - "grad_norm": 0.5556488633155823, - "learning_rate": 2.3547653445114032e-07, - "loss": 0.0013, - "step": 1404 - }, - { - "epoch": 8.61963190184049, - "grad_norm": 0.6546408534049988, - "learning_rate": 2.334392584829867e-07, - "loss": 0.0008, - "step": 1405 - }, - { - "epoch": 8.625766871165645, - "grad_norm": 2.021836996078491, - "learning_rate": 2.3141040212074445e-07, - "loss": 0.0198, - "step": 1406 - }, - { - "epoch": 8.631901840490798, - "grad_norm": 0.6017210483551025, - "learning_rate": 2.293899729010171e-07, - "loss": 0.0033, - "step": 1407 - }, - { - "epoch": 8.63803680981595, - "grad_norm": 0.315134733915329, - "learning_rate": 2.2737797832910498e-07, - "loss": 0.0007, - "step": 1408 - }, - { - "epoch": 8.644171779141104, - "grad_norm": 0.7090817093849182, - "learning_rate": 2.2537442587897474e-07, - "loss": 0.0045, - "step": 1409 - }, - { - "epoch": 8.650306748466258, - "grad_norm": 0.26951614022254944, - "learning_rate": 2.2337932299323434e-07, - "loss": 0.001, - "step": 1410 - }, - { - "epoch": 8.656441717791411, - "grad_norm": 0.21670447289943695, - "learning_rate": 2.2139267708310457e-07, - "loss": 0.0005, - "step": 1411 - }, - { - "epoch": 8.662576687116564, - "grad_norm": 1.070379376411438, - "learning_rate": 2.194144955283886e-07, - "loss": 0.0022, - "step": 1412 - }, - { - "epoch": 8.668711656441717, - "grad_norm": 0.7644438147544861, - "learning_rate": 2.1744478567744947e-07, - "loss": 0.0023, - "step": 1413 - }, - { - "epoch": 8.674846625766872, - "grad_norm": 1.053305983543396, - "learning_rate": 2.154835548471798e-07, - "loss": 0.0027, - "step": 1414 - }, - { - "epoch": 8.680981595092025, - "grad_norm": 0.5719135403633118, - "learning_rate": 2.1353081032297356e-07, - "loss": 0.0015, - "step": 1415 - }, - { - "epoch": 8.687116564417177, - "grad_norm": 0.3360785245895386, - "learning_rate": 2.1158655935870325e-07, - "loss": 0.0025, - "step": 1416 - }, - { - "epoch": 8.69325153374233, - "grad_norm": 0.867242693901062, - "learning_rate": 2.0965080917668744e-07, - "loss": 0.002, - "step": 1417 - }, - { - "epoch": 8.699386503067485, - "grad_norm": 1.1389360427856445, - "learning_rate": 2.077235669676689e-07, - "loss": 0.0023, - "step": 1418 - }, - { - "epoch": 8.705521472392638, - "grad_norm": 0.31157732009887695, - "learning_rate": 2.0580483989078525e-07, - "loss": 0.0005, - "step": 1419 - }, - { - "epoch": 8.71165644171779, - "grad_norm": 1.328353762626648, - "learning_rate": 2.0389463507354211e-07, - "loss": 0.0122, - "step": 1420 - }, - { - "epoch": 8.717791411042946, - "grad_norm": 0.13456307351589203, - "learning_rate": 2.0199295961178893e-07, - "loss": 0.0005, - "step": 1421 - }, - { - "epoch": 8.723926380368098, - "grad_norm": 0.7963683605194092, - "learning_rate": 2.000998205696894e-07, - "loss": 0.004, - "step": 1422 - }, - { - "epoch": 8.730061349693251, - "grad_norm": 0.1814875602722168, - "learning_rate": 1.9821522497969813e-07, - "loss": 0.0004, - "step": 1423 - }, - { - "epoch": 8.736196319018404, - "grad_norm": 0.4806751012802124, - "learning_rate": 1.9633917984253294e-07, - "loss": 0.001, - "step": 1424 - }, - { - "epoch": 8.742331288343559, - "grad_norm": 0.6554126143455505, - "learning_rate": 1.944716921271489e-07, - "loss": 0.0019, - "step": 1425 - }, - { - "epoch": 8.748466257668712, - "grad_norm": 0.7839532494544983, - "learning_rate": 1.9261276877071354e-07, - "loss": 0.0055, - "step": 1426 - }, - { - "epoch": 8.754601226993865, - "grad_norm": 1.1153522729873657, - "learning_rate": 1.9076241667857988e-07, - "loss": 0.0048, - "step": 1427 - }, - { - "epoch": 8.76073619631902, - "grad_norm": 1.4735853672027588, - "learning_rate": 1.8892064272426042e-07, - "loss": 0.0079, - "step": 1428 - }, - { - "epoch": 8.766871165644172, - "grad_norm": 0.9770727157592773, - "learning_rate": 1.8708745374940469e-07, - "loss": 0.0013, - "step": 1429 - }, - { - "epoch": 8.773006134969325, - "grad_norm": 1.5710560083389282, - "learning_rate": 1.8526285656376873e-07, - "loss": 0.0046, - "step": 1430 - }, - { - "epoch": 8.779141104294478, - "grad_norm": 0.9026464819908142, - "learning_rate": 1.8344685794519507e-07, - "loss": 0.006, - "step": 1431 - }, - { - "epoch": 8.785276073619633, - "grad_norm": 1.2195831537246704, - "learning_rate": 1.8163946463958276e-07, - "loss": 0.0094, - "step": 1432 - }, - { - "epoch": 8.791411042944786, - "grad_norm": 0.31636637449264526, - "learning_rate": 1.7984068336086652e-07, - "loss": 0.0009, - "step": 1433 - }, - { - "epoch": 8.797546012269938, - "grad_norm": 0.5591960549354553, - "learning_rate": 1.780505207909894e-07, - "loss": 0.0014, - "step": 1434 - }, - { - "epoch": 8.803680981595091, - "grad_norm": 0.5905728340148926, - "learning_rate": 1.7626898357987782e-07, - "loss": 0.0013, - "step": 1435 - }, - { - "epoch": 8.809815950920246, - "grad_norm": 1.0983483791351318, - "learning_rate": 1.744960783454186e-07, - "loss": 0.0024, - "step": 1436 - }, - { - "epoch": 8.815950920245399, - "grad_norm": 0.7398350238800049, - "learning_rate": 1.727318116734328e-07, - "loss": 0.0015, - "step": 1437 - }, - { - "epoch": 8.822085889570552, - "grad_norm": 0.4621620774269104, - "learning_rate": 1.7097619011765127e-07, - "loss": 0.0017, - "step": 1438 - }, - { - "epoch": 8.828220858895705, - "grad_norm": 0.8077200055122375, - "learning_rate": 1.6922922019969145e-07, - "loss": 0.0009, - "step": 1439 - }, - { - "epoch": 8.83435582822086, - "grad_norm": 0.7134829163551331, - "learning_rate": 1.6749090840903233e-07, - "loss": 0.0013, - "step": 1440 - }, - { - "epoch": 8.840490797546012, - "grad_norm": 1.2837457656860352, - "learning_rate": 1.6576126120299046e-07, - "loss": 0.0029, - "step": 1441 - }, - { - "epoch": 8.846625766871165, - "grad_norm": 0.8713163137435913, - "learning_rate": 1.6404028500669633e-07, - "loss": 0.0034, - "step": 1442 - }, - { - "epoch": 8.85276073619632, - "grad_norm": 0.5622571706771851, - "learning_rate": 1.6232798621306918e-07, - "loss": 0.0022, - "step": 1443 - }, - { - "epoch": 8.858895705521473, - "grad_norm": 2.460902214050293, - "learning_rate": 1.606243711827951e-07, - "loss": 0.0329, - "step": 1444 - }, - { - "epoch": 8.865030674846626, - "grad_norm": 1.5952033996582031, - "learning_rate": 1.5892944624430334e-07, - "loss": 0.0092, - "step": 1445 - }, - { - "epoch": 8.871165644171779, - "grad_norm": 0.16087445616722107, - "learning_rate": 1.5724321769374023e-07, - "loss": 0.0005, - "step": 1446 - }, - { - "epoch": 8.877300613496933, - "grad_norm": 0.33085283637046814, - "learning_rate": 1.5556569179494857e-07, - "loss": 0.0005, - "step": 1447 - }, - { - "epoch": 8.883435582822086, - "grad_norm": 0.15866753458976746, - "learning_rate": 1.538968747794431e-07, - "loss": 0.0004, - "step": 1448 - }, - { - "epoch": 8.889570552147239, - "grad_norm": 1.0744353532791138, - "learning_rate": 1.5223677284638805e-07, - "loss": 0.0046, - "step": 1449 - }, - { - "epoch": 8.895705521472392, - "grad_norm": 0.8372928500175476, - "learning_rate": 1.5058539216257356e-07, - "loss": 0.0048, - "step": 1450 - }, - { - "epoch": 8.901840490797547, - "grad_norm": 1.0015332698822021, - "learning_rate": 1.4894273886239208e-07, - "loss": 0.0027, - "step": 1451 - }, - { - "epoch": 8.9079754601227, - "grad_norm": 1.1478570699691772, - "learning_rate": 1.473088190478178e-07, - "loss": 0.0134, - "step": 1452 - }, - { - "epoch": 8.914110429447852, - "grad_norm": 0.8685131669044495, - "learning_rate": 1.4568363878838087e-07, - "loss": 0.0024, - "step": 1453 - }, - { - "epoch": 8.920245398773005, - "grad_norm": 0.46051493287086487, - "learning_rate": 1.4406720412114828e-07, - "loss": 0.0019, - "step": 1454 - }, - { - "epoch": 8.92638036809816, - "grad_norm": 0.75945645570755, - "learning_rate": 1.4245952105069905e-07, - "loss": 0.0015, - "step": 1455 - }, - { - "epoch": 8.932515337423313, - "grad_norm": 1.2880934476852417, - "learning_rate": 1.4086059554910186e-07, - "loss": 0.0045, - "step": 1456 - }, - { - "epoch": 8.938650306748466, - "grad_norm": 0.2242523580789566, - "learning_rate": 1.3927043355589476e-07, - "loss": 0.0011, - "step": 1457 - }, - { - "epoch": 8.94478527607362, - "grad_norm": 1.0341970920562744, - "learning_rate": 1.3768904097806153e-07, - "loss": 0.0019, - "step": 1458 - }, - { - "epoch": 8.950920245398773, - "grad_norm": 0.8955618739128113, - "learning_rate": 1.361164236900092e-07, - "loss": 0.0027, - "step": 1459 - }, - { - "epoch": 8.957055214723926, - "grad_norm": 1.3581833839416504, - "learning_rate": 1.3455258753354932e-07, - "loss": 0.0048, - "step": 1460 - }, - { - "epoch": 8.963190184049079, - "grad_norm": 1.5094419717788696, - "learning_rate": 1.3299753831787193e-07, - "loss": 0.0011, - "step": 1461 - }, - { - "epoch": 8.969325153374234, - "grad_norm": 0.5978104472160339, - "learning_rate": 1.3145128181952737e-07, - "loss": 0.0018, - "step": 1462 - }, - { - "epoch": 8.975460122699387, - "grad_norm": 0.7072922587394714, - "learning_rate": 1.2991382378240325e-07, - "loss": 0.0032, - "step": 1463 - }, - { - "epoch": 8.98159509202454, - "grad_norm": 0.5541467666625977, - "learning_rate": 1.2838516991770355e-07, - "loss": 0.001, - "step": 1464 - }, - { - "epoch": 8.987730061349692, - "grad_norm": 0.6946907043457031, - "learning_rate": 1.2686532590392763e-07, - "loss": 0.0024, - "step": 1465 - }, - { - "epoch": 8.993865030674847, - "grad_norm": 0.3228455185890198, - "learning_rate": 1.2535429738684822e-07, - "loss": 0.0007, - "step": 1466 - }, - { - "epoch": 9.0, - "grad_norm": 2.4403252601623535, - "learning_rate": 1.238520899794915e-07, - "loss": 0.0245, - "step": 1467 - }, - { - "epoch": 9.006134969325153, - "grad_norm": 2.5279674530029297, - "learning_rate": 1.223587092621162e-07, - "loss": 0.0006, - "step": 1468 - }, - { - "epoch": 9.012269938650308, - "grad_norm": 0.08804622292518616, - "learning_rate": 1.2087416078219144e-07, - "loss": 0.0005, - "step": 1469 - }, - { - "epoch": 9.01840490797546, - "grad_norm": 0.11985688656568527, - "learning_rate": 1.1939845005437823e-07, - "loss": 0.0006, - "step": 1470 - }, - { - "epoch": 9.024539877300613, - "grad_norm": 0.08172235637903214, - "learning_rate": 1.1793158256050708e-07, - "loss": 0.0004, - "step": 1471 - }, - { - "epoch": 9.030674846625766, - "grad_norm": 0.14893503487110138, - "learning_rate": 1.1647356374955926e-07, - "loss": 0.0005, - "step": 1472 - }, - { - "epoch": 9.036809815950921, - "grad_norm": 0.1922188401222229, - "learning_rate": 1.1502439903764539e-07, - "loss": 0.0012, - "step": 1473 - }, - { - "epoch": 9.042944785276074, - "grad_norm": 0.2091587781906128, - "learning_rate": 1.1358409380798547e-07, - "loss": 0.0004, - "step": 1474 - }, - { - "epoch": 9.049079754601227, - "grad_norm": 0.3777543008327484, - "learning_rate": 1.1215265341089021e-07, - "loss": 0.0031, - "step": 1475 - }, - { - "epoch": 9.05521472392638, - "grad_norm": 0.12114719301462173, - "learning_rate": 1.1073008316373812e-07, - "loss": 0.0004, - "step": 1476 - }, - { - "epoch": 9.061349693251534, - "grad_norm": 0.7613732218742371, - "learning_rate": 1.093163883509596e-07, - "loss": 0.0056, - "step": 1477 - }, - { - "epoch": 9.067484662576687, - "grad_norm": 0.11271879076957703, - "learning_rate": 1.0791157422401499e-07, - "loss": 0.0006, - "step": 1478 - }, - { - "epoch": 9.07361963190184, - "grad_norm": 0.5275444984436035, - "learning_rate": 1.0651564600137443e-07, - "loss": 0.0013, - "step": 1479 - }, - { - "epoch": 9.079754601226995, - "grad_norm": 0.0763268992304802, - "learning_rate": 1.051286088685008e-07, - "loss": 0.0004, - "step": 1480 - }, - { - "epoch": 9.085889570552148, - "grad_norm": 0.5255539417266846, - "learning_rate": 1.0375046797782868e-07, - "loss": 0.0047, - "step": 1481 - }, - { - "epoch": 9.0920245398773, - "grad_norm": 0.0961274728178978, - "learning_rate": 1.0238122844874576e-07, - "loss": 0.0004, - "step": 1482 - }, - { - "epoch": 9.098159509202453, - "grad_norm": 0.18914999067783356, - "learning_rate": 1.0102089536757398e-07, - "loss": 0.0011, - "step": 1483 - }, - { - "epoch": 9.104294478527608, - "grad_norm": 0.14239318668842316, - "learning_rate": 9.966947378754949e-08, - "loss": 0.0011, - "step": 1484 - }, - { - "epoch": 9.110429447852761, - "grad_norm": 0.12115265429019928, - "learning_rate": 9.83269687288066e-08, - "loss": 0.0007, - "step": 1485 - }, - { - "epoch": 9.116564417177914, - "grad_norm": 0.12038591504096985, - "learning_rate": 9.699338517835611e-08, - "loss": 0.0005, - "step": 1486 - }, - { - "epoch": 9.122699386503067, - "grad_norm": 0.07863178849220276, - "learning_rate": 9.566872809006783e-08, - "loss": 0.0004, - "step": 1487 - }, - { - "epoch": 9.128834355828221, - "grad_norm": 0.19755667448043823, - "learning_rate": 9.435300238465339e-08, - "loss": 0.0007, - "step": 1488 - }, - { - "epoch": 9.134969325153374, - "grad_norm": 0.08695468306541443, - "learning_rate": 9.30462129496465e-08, - "loss": 0.0003, - "step": 1489 - }, - { - "epoch": 9.141104294478527, - "grad_norm": 0.22066617012023926, - "learning_rate": 9.174836463938464e-08, - "loss": 0.0011, - "step": 1490 - }, - { - "epoch": 9.14723926380368, - "grad_norm": 0.15969769656658173, - "learning_rate": 9.045946227499298e-08, - "loss": 0.0012, - "step": 1491 - }, - { - "epoch": 9.153374233128835, - "grad_norm": 0.31097984313964844, - "learning_rate": 8.917951064436382e-08, - "loss": 0.0015, - "step": 1492 - }, - { - "epoch": 9.159509202453988, - "grad_norm": 0.15080022811889648, - "learning_rate": 8.790851450214106e-08, - "loss": 0.0009, - "step": 1493 - }, - { - "epoch": 9.16564417177914, - "grad_norm": 0.11880502849817276, - "learning_rate": 8.664647856970076e-08, - "loss": 0.0007, - "step": 1494 - }, - { - "epoch": 9.171779141104295, - "grad_norm": 0.6681945323944092, - "learning_rate": 8.539340753513508e-08, - "loss": 0.0046, - "step": 1495 - }, - { - "epoch": 9.177914110429448, - "grad_norm": 1.5142796039581299, - "learning_rate": 8.414930605323445e-08, - "loss": 0.0442, - "step": 1496 - }, - { - "epoch": 9.184049079754601, - "grad_norm": 0.36349135637283325, - "learning_rate": 8.291417874546875e-08, - "loss": 0.0019, - "step": 1497 - }, - { - "epoch": 9.190184049079754, - "grad_norm": 0.5278675556182861, - "learning_rate": 8.168803019997312e-08, - "loss": 0.0009, - "step": 1498 - }, - { - "epoch": 9.196319018404909, - "grad_norm": 0.08181502670049667, - "learning_rate": 8.047086497152801e-08, - "loss": 0.0004, - "step": 1499 - }, - { - "epoch": 9.202453987730062, - "grad_norm": 0.22418726980686188, - "learning_rate": 7.926268758154416e-08, - "loss": 0.0014, - "step": 1500 - }, - { - "epoch": 9.208588957055214, - "grad_norm": 0.27877968549728394, - "learning_rate": 7.806350251804484e-08, - "loss": 0.001, - "step": 1501 - }, - { - "epoch": 9.214723926380367, - "grad_norm": 0.3604774475097656, - "learning_rate": 7.687331423564925e-08, - "loss": 0.0006, - "step": 1502 - }, - { - "epoch": 9.220858895705522, - "grad_norm": 0.09796755015850067, - "learning_rate": 7.569212715555663e-08, - "loss": 0.0005, - "step": 1503 - }, - { - "epoch": 9.226993865030675, - "grad_norm": 0.12454013526439667, - "learning_rate": 7.451994566552989e-08, - "loss": 0.0006, - "step": 1504 - }, - { - "epoch": 9.233128834355828, - "grad_norm": 0.13127478957176208, - "learning_rate": 7.335677411987734e-08, - "loss": 0.0006, - "step": 1505 - }, - { - "epoch": 9.239263803680982, - "grad_norm": 0.68902587890625, - "learning_rate": 7.220261683943935e-08, - "loss": 0.0037, - "step": 1506 - }, - { - "epoch": 9.245398773006135, - "grad_norm": 0.3021928071975708, - "learning_rate": 7.105747811156999e-08, - "loss": 0.001, - "step": 1507 - }, - { - "epoch": 9.251533742331288, - "grad_norm": 0.16254237294197083, - "learning_rate": 6.992136219012263e-08, - "loss": 0.0008, - "step": 1508 - }, - { - "epoch": 9.257668711656441, - "grad_norm": 0.22068247199058533, - "learning_rate": 6.879427329543414e-08, - "loss": 0.001, - "step": 1509 - }, - { - "epoch": 9.263803680981596, - "grad_norm": 0.20256245136260986, - "learning_rate": 6.76762156143071e-08, - "loss": 0.0014, - "step": 1510 - }, - { - "epoch": 9.269938650306749, - "grad_norm": 0.06691748648881912, - "learning_rate": 6.6567193299997e-08, - "loss": 0.0003, - "step": 1511 - }, - { - "epoch": 9.276073619631902, - "grad_norm": 0.12188060581684113, - "learning_rate": 6.546721047219568e-08, - "loss": 0.0003, - "step": 1512 - }, - { - "epoch": 9.282208588957054, - "grad_norm": 0.11017973721027374, - "learning_rate": 6.437627121701456e-08, - "loss": 0.0007, - "step": 1513 - }, - { - "epoch": 9.28834355828221, - "grad_norm": 0.08906184136867523, - "learning_rate": 6.329437958697282e-08, - "loss": 0.0005, - "step": 1514 - }, - { - "epoch": 9.294478527607362, - "grad_norm": 0.10575949400663376, - "learning_rate": 6.222153960097871e-08, - "loss": 0.0004, - "step": 1515 - }, - { - "epoch": 9.300613496932515, - "grad_norm": 0.07783909887075424, - "learning_rate": 6.115775524431711e-08, - "loss": 0.0004, - "step": 1516 - }, - { - "epoch": 9.30674846625767, - "grad_norm": 0.22752316296100616, - "learning_rate": 6.010303046863397e-08, - "loss": 0.0008, - "step": 1517 - }, - { - "epoch": 9.312883435582823, - "grad_norm": 0.4781089425086975, - "learning_rate": 5.905736919192107e-08, - "loss": 0.0044, - "step": 1518 - }, - { - "epoch": 9.319018404907975, - "grad_norm": 1.2014552354812622, - "learning_rate": 5.8020775298502085e-08, - "loss": 0.0016, - "step": 1519 - }, - { - "epoch": 9.325153374233128, - "grad_norm": 0.11146771907806396, - "learning_rate": 5.699325263901878e-08, - "loss": 0.0004, - "step": 1520 - }, - { - "epoch": 9.331288343558283, - "grad_norm": 0.21041418612003326, - "learning_rate": 5.597480503041486e-08, - "loss": 0.0016, - "step": 1521 - }, - { - "epoch": 9.337423312883436, - "grad_norm": 0.1907602846622467, - "learning_rate": 5.496543625592321e-08, - "loss": 0.0006, - "step": 1522 - }, - { - "epoch": 9.343558282208589, - "grad_norm": 0.7976323962211609, - "learning_rate": 5.396515006505204e-08, - "loss": 0.001, - "step": 1523 - }, - { - "epoch": 9.349693251533742, - "grad_norm": 0.10006821155548096, - "learning_rate": 5.297395017357015e-08, - "loss": 0.0004, - "step": 1524 - }, - { - "epoch": 9.355828220858896, - "grad_norm": 0.09137666970491409, - "learning_rate": 5.199184026349308e-08, - "loss": 0.0005, - "step": 1525 - }, - { - "epoch": 9.36196319018405, - "grad_norm": 0.5621616244316101, - "learning_rate": 5.1018823983070285e-08, - "loss": 0.0014, - "step": 1526 - }, - { - "epoch": 9.368098159509202, - "grad_norm": 0.12934303283691406, - "learning_rate": 5.005490494677051e-08, - "loss": 0.0009, - "step": 1527 - }, - { - "epoch": 9.374233128834355, - "grad_norm": 0.13988590240478516, - "learning_rate": 4.91000867352695e-08, - "loss": 0.0006, - "step": 1528 - }, - { - "epoch": 9.38036809815951, - "grad_norm": 0.19421879947185516, - "learning_rate": 4.815437289543562e-08, - "loss": 0.0006, - "step": 1529 - }, - { - "epoch": 9.386503067484663, - "grad_norm": 0.278499960899353, - "learning_rate": 4.7217766940317326e-08, - "loss": 0.0018, - "step": 1530 - }, - { - "epoch": 9.392638036809815, - "grad_norm": 0.12389005720615387, - "learning_rate": 4.629027234912986e-08, - "loss": 0.0007, - "step": 1531 - }, - { - "epoch": 9.39877300613497, - "grad_norm": 0.1303948014974594, - "learning_rate": 4.5371892567243336e-08, - "loss": 0.0004, - "step": 1532 - }, - { - "epoch": 9.404907975460123, - "grad_norm": 1.117344856262207, - "learning_rate": 4.4462631006167714e-08, - "loss": 0.0169, - "step": 1533 - }, - { - "epoch": 9.411042944785276, - "grad_norm": 0.1710042506456375, - "learning_rate": 4.356249104354199e-08, - "loss": 0.0005, - "step": 1534 - }, - { - "epoch": 9.417177914110429, - "grad_norm": 0.37792330980300903, - "learning_rate": 4.267147602312116e-08, - "loss": 0.0018, - "step": 1535 - }, - { - "epoch": 9.423312883435583, - "grad_norm": 0.42278361320495605, - "learning_rate": 4.178958925476401e-08, - "loss": 0.0028, - "step": 1536 - }, - { - "epoch": 9.429447852760736, - "grad_norm": 0.9310070872306824, - "learning_rate": 4.0916834014420036e-08, - "loss": 0.0124, - "step": 1537 - }, - { - "epoch": 9.43558282208589, - "grad_norm": 0.9287325739860535, - "learning_rate": 4.0053213544118116e-08, - "loss": 0.0131, - "step": 1538 - }, - { - "epoch": 9.441717791411042, - "grad_norm": 0.2695760130882263, - "learning_rate": 3.919873105195371e-08, - "loss": 0.0014, - "step": 1539 - }, - { - "epoch": 9.447852760736197, - "grad_norm": 0.2679222524166107, - "learning_rate": 3.8353389712078583e-08, - "loss": 0.0012, - "step": 1540 - }, - { - "epoch": 9.45398773006135, - "grad_norm": 0.7153877019882202, - "learning_rate": 3.7517192664685844e-08, - "loss": 0.0102, - "step": 1541 - }, - { - "epoch": 9.460122699386503, - "grad_norm": 0.19710485637187958, - "learning_rate": 3.6690143016002155e-08, - "loss": 0.0006, - "step": 1542 - }, - { - "epoch": 9.466257668711656, - "grad_norm": 0.4529936611652374, - "learning_rate": 3.587224383827331e-08, - "loss": 0.0035, - "step": 1543 - }, - { - "epoch": 9.47239263803681, - "grad_norm": 0.22579027712345123, - "learning_rate": 3.506349816975368e-08, - "loss": 0.0015, - "step": 1544 - }, - { - "epoch": 9.478527607361963, - "grad_norm": 0.08603110164403915, - "learning_rate": 3.426390901469595e-08, - "loss": 0.0004, - "step": 1545 - }, - { - "epoch": 9.484662576687116, - "grad_norm": 0.19130398333072662, - "learning_rate": 3.347347934333778e-08, - "loss": 0.0015, - "step": 1546 - }, - { - "epoch": 9.49079754601227, - "grad_norm": 0.8941642045974731, - "learning_rate": 3.2692212091893215e-08, - "loss": 0.012, - "step": 1547 - }, - { - "epoch": 9.496932515337424, - "grad_norm": 0.09985413402318954, - "learning_rate": 3.192011016253965e-08, - "loss": 0.0003, - "step": 1548 - }, - { - "epoch": 9.503067484662576, - "grad_norm": 0.15109865367412567, - "learning_rate": 3.115717642340893e-08, - "loss": 0.0004, - "step": 1549 - }, - { - "epoch": 9.50920245398773, - "grad_norm": 0.15993481874465942, - "learning_rate": 3.040341370857486e-08, - "loss": 0.0008, - "step": 1550 - }, - { - "epoch": 9.515337423312884, - "grad_norm": 0.13720917701721191, - "learning_rate": 2.9658824818044328e-08, - "loss": 0.0005, - "step": 1551 - }, - { - "epoch": 9.521472392638037, - "grad_norm": 0.06803212314844131, - "learning_rate": 2.8923412517745662e-08, - "loss": 0.0002, - "step": 1552 - }, - { - "epoch": 9.52760736196319, - "grad_norm": 0.08404265344142914, - "learning_rate": 2.819717953951917e-08, - "loss": 0.0004, - "step": 1553 - }, - { - "epoch": 9.533742331288344, - "grad_norm": 0.5321061015129089, - "learning_rate": 2.7480128581106602e-08, - "loss": 0.0065, - "step": 1554 - }, - { - "epoch": 9.539877300613497, - "grad_norm": 0.5247214436531067, - "learning_rate": 2.6772262306141438e-08, - "loss": 0.0018, - "step": 1555 - }, - { - "epoch": 9.54601226993865, - "grad_norm": 0.2725308835506439, - "learning_rate": 2.607358334413779e-08, - "loss": 0.0006, - "step": 1556 - }, - { - "epoch": 9.552147239263803, - "grad_norm": 0.5589315295219421, - "learning_rate": 2.5384094290482886e-08, - "loss": 0.0006, - "step": 1557 - }, - { - "epoch": 9.558282208588958, - "grad_norm": 0.6117374897003174, - "learning_rate": 2.4703797706425725e-08, - "loss": 0.0068, - "step": 1558 - }, - { - "epoch": 9.56441717791411, - "grad_norm": 0.3439452648162842, - "learning_rate": 2.4032696119067332e-08, - "loss": 0.0014, - "step": 1559 - }, - { - "epoch": 9.570552147239264, - "grad_norm": 0.1743037849664688, - "learning_rate": 2.337079202135273e-08, - "loss": 0.0011, - "step": 1560 - }, - { - "epoch": 9.576687116564417, - "grad_norm": 0.6570950746536255, - "learning_rate": 2.2718087872060925e-08, - "loss": 0.0025, - "step": 1561 - }, - { - "epoch": 9.582822085889571, - "grad_norm": 0.25107917189598083, - "learning_rate": 2.207458609579549e-08, - "loss": 0.0021, - "step": 1562 - }, - { - "epoch": 9.588957055214724, - "grad_norm": 0.13917990028858185, - "learning_rate": 2.144028908297624e-08, - "loss": 0.0007, - "step": 1563 - }, - { - "epoch": 9.595092024539877, - "grad_norm": 0.22606755793094635, - "learning_rate": 2.081519918982977e-08, - "loss": 0.0014, - "step": 1564 - }, - { - "epoch": 9.60122699386503, - "grad_norm": 0.4116940200328827, - "learning_rate": 2.019931873838088e-08, - "loss": 0.0008, - "step": 1565 - }, - { - "epoch": 9.607361963190185, - "grad_norm": 0.10428212583065033, - "learning_rate": 1.9592650016444503e-08, - "loss": 0.0005, - "step": 1566 - }, - { - "epoch": 9.613496932515337, - "grad_norm": 0.0740678682923317, - "learning_rate": 1.8995195277616284e-08, - "loss": 0.0004, - "step": 1567 - }, - { - "epoch": 9.61963190184049, - "grad_norm": 0.07690935581922531, - "learning_rate": 1.8406956741264247e-08, - "loss": 0.0004, - "step": 1568 - }, - { - "epoch": 9.625766871165645, - "grad_norm": 0.14602801203727722, - "learning_rate": 1.7827936592521856e-08, - "loss": 0.0014, - "step": 1569 - }, - { - "epoch": 9.631901840490798, - "grad_norm": 0.4051103889942169, - "learning_rate": 1.7258136982278296e-08, - "loss": 0.0009, - "step": 1570 - }, - { - "epoch": 9.63803680981595, - "grad_norm": 0.331938236951828, - "learning_rate": 1.6697560027171543e-08, - "loss": 0.0019, - "step": 1571 - }, - { - "epoch": 9.644171779141104, - "grad_norm": 0.6029168367385864, - "learning_rate": 1.6146207809579762e-08, - "loss": 0.0072, - "step": 1572 - }, - { - "epoch": 9.650306748466258, - "grad_norm": 0.2004910558462143, - "learning_rate": 1.5604082377614072e-08, - "loss": 0.001, - "step": 1573 - }, - { - "epoch": 9.656441717791411, - "grad_norm": 0.33825960755348206, - "learning_rate": 1.507118574511135e-08, - "loss": 0.0017, - "step": 1574 - }, - { - "epoch": 9.662576687116564, - "grad_norm": 0.7193265557289124, - "learning_rate": 1.454751989162506e-08, - "loss": 0.0106, - "step": 1575 - }, - { - "epoch": 9.668711656441717, - "grad_norm": 0.1846141815185547, - "learning_rate": 1.4033086762419989e-08, - "loss": 0.0004, - "step": 1576 - }, - { - "epoch": 9.674846625766872, - "grad_norm": 0.2692915201187134, - "learning_rate": 1.3527888268463907e-08, - "loss": 0.002, - "step": 1577 - }, - { - "epoch": 9.680981595092025, - "grad_norm": 0.19888080656528473, - "learning_rate": 1.303192628642036e-08, - "loss": 0.0007, - "step": 1578 - }, - { - "epoch": 9.687116564417177, - "grad_norm": 0.09299040585756302, - "learning_rate": 1.2545202658642008e-08, - "loss": 0.0004, - "step": 1579 - }, - { - "epoch": 9.69325153374233, - "grad_norm": 0.12221895903348923, - "learning_rate": 1.2067719193163962e-08, - "loss": 0.0005, - "step": 1580 - }, - { - "epoch": 9.699386503067485, - "grad_norm": 0.9425249099731445, - "learning_rate": 1.1599477663696845e-08, - "loss": 0.0062, - "step": 1581 - }, - { - "epoch": 9.705521472392638, - "grad_norm": 0.1449192315340042, - "learning_rate": 1.1140479809619576e-08, - "loss": 0.0005, - "step": 1582 - }, - { - "epoch": 9.71165644171779, - "grad_norm": 0.2106281816959381, - "learning_rate": 1.069072733597465e-08, - "loss": 0.0007, - "step": 1583 - }, - { - "epoch": 9.717791411042946, - "grad_norm": 0.06777317076921463, - "learning_rate": 1.025022191346009e-08, - "loss": 0.0003, - "step": 1584 - }, - { - "epoch": 9.723926380368098, - "grad_norm": 0.6169402003288269, - "learning_rate": 9.818965178423345e-09, - "loss": 0.0083, - "step": 1585 - }, - { - "epoch": 9.730061349693251, - "grad_norm": 0.18353499472141266, - "learning_rate": 9.396958732856843e-09, - "loss": 0.001, - "step": 1586 - }, - { - "epoch": 9.736196319018404, - "grad_norm": 0.18419475853443146, - "learning_rate": 8.984204144389941e-09, - "loss": 0.0006, - "step": 1587 - }, - { - "epoch": 9.742331288343559, - "grad_norm": 0.8530840277671814, - "learning_rate": 8.580702946284491e-09, - "loss": 0.0109, - "step": 1588 - }, - { - "epoch": 9.748466257668712, - "grad_norm": 0.6887766122817993, - "learning_rate": 8.186456637428453e-09, - "loss": 0.0016, - "step": 1589 - }, - { - "epoch": 9.754601226993865, - "grad_norm": 0.1355009824037552, - "learning_rate": 7.801466682331172e-09, - "loss": 0.0004, - "step": 1590 - }, - { - "epoch": 9.76073619631902, - "grad_norm": 1.123541235923767, - "learning_rate": 7.425734511117e-09, - "loss": 0.008, - "step": 1591 - }, - { - "epoch": 9.766871165644172, - "grad_norm": 0.6276746988296509, - "learning_rate": 7.059261519520022e-09, - "loss": 0.003, - "step": 1592 - }, - { - "epoch": 9.773006134969325, - "grad_norm": 0.5775916576385498, - "learning_rate": 6.702049068879613e-09, - "loss": 0.0009, - "step": 1593 - }, - { - "epoch": 9.779141104294478, - "grad_norm": 0.44135305285453796, - "learning_rate": 6.354098486135163e-09, - "loss": 0.0049, - "step": 1594 - }, - { - "epoch": 9.785276073619633, - "grad_norm": 0.06254208087921143, - "learning_rate": 6.015411063820253e-09, - "loss": 0.0003, - "step": 1595 - }, - { - "epoch": 9.791411042944786, - "grad_norm": 0.19917500019073486, - "learning_rate": 5.685988060059045e-09, - "loss": 0.001, - "step": 1596 - }, - { - "epoch": 9.797546012269938, - "grad_norm": 0.25946539640426636, - "learning_rate": 5.36583069856128e-09, - "loss": 0.0012, - "step": 1597 - }, - { - "epoch": 9.803680981595091, - "grad_norm": 0.11085817962884903, - "learning_rate": 5.054940168617018e-09, - "loss": 0.0005, - "step": 1598 - }, - { - "epoch": 9.809815950920246, - "grad_norm": 0.07764281332492828, - "learning_rate": 4.753317625093013e-09, - "loss": 0.0002, - "step": 1599 - }, - { - "epoch": 9.815950920245399, - "grad_norm": 0.13678377866744995, - "learning_rate": 4.4609641884285625e-09, - "loss": 0.0007, - "step": 1600 - }, - { - "epoch": 9.822085889570552, - "grad_norm": 0.07325509935617447, - "learning_rate": 4.17788094463023e-09, - "loss": 0.0004, - "step": 1601 - }, - { - "epoch": 9.828220858895705, - "grad_norm": 0.745182991027832, - "learning_rate": 3.904068945269346e-09, - "loss": 0.0006, - "step": 1602 - }, - { - "epoch": 9.83435582822086, - "grad_norm": 0.23189403116703033, - "learning_rate": 3.639529207476733e-09, - "loss": 0.0015, - "step": 1603 - }, - { - "epoch": 9.840490797546012, - "grad_norm": 0.06897032260894775, - "learning_rate": 3.384262713939379e-09, - "loss": 0.0004, - "step": 1604 - }, - { - "epoch": 9.846625766871165, - "grad_norm": 0.0821717157959938, - "learning_rate": 3.1382704128973818e-09, - "loss": 0.0004, - "step": 1605 - }, - { - "epoch": 9.85276073619632, - "grad_norm": 0.8984095454216003, - "learning_rate": 2.9015532181397854e-09, - "loss": 0.0007, - "step": 1606 - }, - { - "epoch": 9.858895705521473, - "grad_norm": 0.2612057626247406, - "learning_rate": 2.674112009000973e-09, - "loss": 0.0021, - "step": 1607 - }, - { - "epoch": 9.865030674846626, - "grad_norm": 0.10079237073659897, - "learning_rate": 2.4559476303584463e-09, - "loss": 0.0004, - "step": 1608 - }, - { - "epoch": 9.871165644171779, - "grad_norm": 0.15463407337665558, - "learning_rate": 2.2470608926283833e-09, - "loss": 0.0004, - "step": 1609 - }, - { - "epoch": 9.877300613496933, - "grad_norm": 0.3247759938240051, - "learning_rate": 2.0474525717639747e-09, - "loss": 0.0008, - "step": 1610 - }, - { - "epoch": 9.883435582822086, - "grad_norm": 0.5771990418434143, - "learning_rate": 1.857123409250705e-09, - "loss": 0.0035, - "step": 1611 - }, - { - "epoch": 9.889570552147239, - "grad_norm": 0.6151068210601807, - "learning_rate": 1.6760741121057966e-09, - "loss": 0.008, - "step": 1612 - }, - { - "epoch": 9.895705521472392, - "grad_norm": 0.6173699498176575, - "learning_rate": 1.504305352874047e-09, - "loss": 0.0009, - "step": 1613 - }, - { - "epoch": 9.901840490797547, - "grad_norm": 0.07602877169847488, - "learning_rate": 1.3418177696256086e-09, - "loss": 0.0003, - "step": 1614 - }, - { - "epoch": 9.9079754601227, - "grad_norm": 0.11126144230365753, - "learning_rate": 1.1886119659543227e-09, - "loss": 0.0005, - "step": 1615 - }, - { - "epoch": 9.914110429447852, - "grad_norm": 0.13721120357513428, - "learning_rate": 1.0446885109746673e-09, - "loss": 0.0008, - "step": 1616 - }, - { - "epoch": 9.920245398773005, - "grad_norm": 0.3714880049228668, - "learning_rate": 9.100479393195361e-10, - "loss": 0.0033, - "step": 1617 - }, - { - "epoch": 9.92638036809816, - "grad_norm": 0.18466363847255707, - "learning_rate": 7.846907511394052e-10, - "loss": 0.0009, - "step": 1618 - }, - { - "epoch": 9.932515337423313, - "grad_norm": 0.20958846807479858, - "learning_rate": 6.686174120990042e-10, - "loss": 0.0012, - "step": 1619 - }, - { - "epoch": 9.938650306748466, - "grad_norm": 0.6355168223381042, - "learning_rate": 5.618283533767588e-10, - "loss": 0.007, - "step": 1620 - }, - { - "epoch": 9.94478527607362, - "grad_norm": 0.10833138227462769, - "learning_rate": 4.6432397166285e-10, - "loss": 0.0004, - "step": 1621 - }, - { - "epoch": 9.950920245398773, - "grad_norm": 0.3573082685470581, - "learning_rate": 3.7610462915699255e-10, - "loss": 0.0015, - "step": 1622 - }, - { - "epoch": 9.957055214723926, - "grad_norm": 0.21547436714172363, - "learning_rate": 2.9717065356815733e-10, - "loss": 0.0007, - "step": 1623 - }, - { - "epoch": 9.963190184049079, - "grad_norm": 0.30022335052490234, - "learning_rate": 2.2752233811262901e-10, - "loss": 0.0025, - "step": 1624 - }, - { - "epoch": 9.969325153374234, - "grad_norm": 0.1985897272825241, - "learning_rate": 1.6715994151400572e-10, - "loss": 0.0015, - "step": 1625 - }, - { - "epoch": 9.975460122699387, - "grad_norm": 0.20799656212329865, - "learning_rate": 1.160836880001459e-10, - "loss": 0.001, - "step": 1626 - }, - { - "epoch": 9.98159509202454, - "grad_norm": 0.5943353176116943, - "learning_rate": 7.429376730483385e-11, - "loss": 0.0046, - "step": 1627 - }, - { - "epoch": 9.987730061349692, - "grad_norm": 0.1584414541721344, - "learning_rate": 4.179033466500393e-11, - "loss": 0.0006, - "step": 1628 - }, - { - "epoch": 9.993865030674847, - "grad_norm": 0.2899409830570221, - "learning_rate": 1.8573510821295882e-11, - "loss": 0.0016, - "step": 1629 - }, - { - "epoch": 10.0, - "grad_norm": 0.528587281703949, - "learning_rate": 4.643382017499587e-12, - "loss": 0.0036, - "step": 1630 - } - ], - "logging_steps": 1, - "max_steps": 1630, - "num_input_tokens_seen": 0, - "num_train_epochs": 10, - "save_steps": 206, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": true - }, - "attributes": {} - } - }, - "total_flos": 4.036761107572982e+17, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-326/chat_template.jinja b/metallama3_8b/limo_filtered_correct/checkpoint-326/chat_template.jinja deleted file mode 100644 index 39bd0c9f7fe30aea14eda194fee17703da4a4dbf..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-326/chat_template.jinja +++ /dev/null @@ -1,5 +0,0 @@ -{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> - -'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> - -' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-326/config.json b/metallama3_8b/limo_filtered_correct/checkpoint-326/config.json deleted file mode 100644 index ec5612543540085e09eed37e81b17ae51d1a6973..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-326/config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 128000, - "eos_token_id": 128009, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 8192, - "mlp_bias": false, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 500000.0, - "tie_word_embeddings": false, - "torch_dtype": "float32", - "transformers_version": "4.55.0", - "use_cache": false, - "vocab_size": 128256 -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-326/generation_config.json b/metallama3_8b/limo_filtered_correct/checkpoint-326/generation_config.json deleted file mode 100644 index f53ccb516e57388491adda6b9950bcfa872e93ae..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-326/generation_config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 128000, - "eos_token_id": 128009, - "transformers_version": "4.55.0", - "use_cache": false -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-326/model-00001-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-326/model-00001-of-00007.safetensors deleted file mode 100644 index dbed6b34b0efc0791a98c4a2bcde76ae9c6fe0f2..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-326/model-00001-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eb8be759a5d024a55de138419329f851b25ae75745b77a5991abf4b685495f20 -size 4886466168 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-326/model-00002-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-326/model-00002-of-00007.safetensors deleted file mode 100644 index d20904c2b2ba1ceafd4f782d0958662a03522bc5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-326/model-00002-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fba035b8729e376a5fde909c1365b6dfb91baa366ff2c09b3d8c35da6fbe4156 -size 4832007448 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-326/model-00003-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-326/model-00003-of-00007.safetensors deleted file mode 100644 index 11480b932a6be94cb4f66f503a4a8029a58a6ba4..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-326/model-00003-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bcd702e4ddc5cf536769defcf3f991ef5c0dd8532cfe09c0cdfdc3b0c04ecbe7 -size 4999813112 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-326/model-00004-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-326/model-00004-of-00007.safetensors deleted file mode 100644 index d45ff7a17403c48641e2536ec0e7269b0e62149a..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-326/model-00004-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:15b5790aa63cb5b020f59af9378273bc412489d287dc8c2b29fff0c4dbf8db17 -size 4999813128 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-326/model-00005-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-326/model-00005-of-00007.safetensors deleted file mode 100644 index eb288a8491721cac903015f81ee12551b03078b0..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-326/model-00005-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:aac55f05661a9200a69718bcf126874943570e9e3fd48bf148a51e7f641bba0b -size 4832007496 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-326/model-00006-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-326/model-00006-of-00007.safetensors deleted file mode 100644 index 82e8a4e1d4948a33a72a2fc15720949a92308b36..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-326/model-00006-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:acb4e0a957cadfca59595c93c24d587cfc64a76ce21fe8e7dd4252345069b603 -size 4999813120 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-326/model-00007-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-326/model-00007-of-00007.safetensors deleted file mode 100644 index b1a6c805effed14f8b637e8a1987fe781e5db0c9..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-326/model-00007-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f38ce7bcdda6e3eaf6b9f70748d7d21aa30720d75182148d188b62aadc8ebf2c -size 2571158184 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-326/model.safetensors.index.json b/metallama3_8b/limo_filtered_correct/checkpoint-326/model.safetensors.index.json deleted file mode 100644 index 30d31d54f352f0c71ad48745af612a088822fa48..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-326/model.safetensors.index.json +++ /dev/null @@ -1,299 +0,0 @@ -{ - "metadata": { - "total_parameters": 2007565312, - "total_size": 32121044992 - }, - "weight_map": { - "lm_head.weight": "model-00007-of-00007.safetensors", - "model.embed_tokens.weight": "model-00001-of-00007.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.norm.weight": "model-00007-of-00007.safetensors" - } -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-326/rng_state_0.pth b/metallama3_8b/limo_filtered_correct/checkpoint-326/rng_state_0.pth deleted file mode 100644 index 37ac50652a3badbfb1bdeaccb8b1934575b584eb..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-326/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bbe0d720c4c75a6a04213fa3b64bacbe794718a53e2b56ebb67a1a795014dfad -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-326/rng_state_1.pth b/metallama3_8b/limo_filtered_correct/checkpoint-326/rng_state_1.pth deleted file mode 100644 index 0bc3650851dae439677613c9e23a5528de47b679..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-326/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:72452d3138d0ca2ff89429e3294a834ae7a68e8596fc757735ca56ae52509d57 -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-326/rng_state_2.pth b/metallama3_8b/limo_filtered_correct/checkpoint-326/rng_state_2.pth deleted file mode 100644 index 0e00a6e8b4b743026f68d749a8cb3bdd4b746838..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-326/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f36e306fb8ebcf53a167bfd6c9af74db410a269ada1e619e3e816f5269543b9d -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-326/rng_state_3.pth b/metallama3_8b/limo_filtered_correct/checkpoint-326/rng_state_3.pth deleted file mode 100644 index 5354141d42e077c356f9ca8c6b12bd7e5e41f2af..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-326/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bb47ce0c6f815a6f8302b0e3819b4c2315ca71dae3138d97fdceb765cdd0a039 -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-326/scheduler.pt b/metallama3_8b/limo_filtered_correct/checkpoint-326/scheduler.pt deleted file mode 100644 index 842c54907ed65d6311ef768ed7169319422e867a..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-326/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f6e6e256548971c8d1b9c1dd347eb98f5264be7083cbb6edae059e6979b71e9b -size 1064 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-326/special_tokens_map.json b/metallama3_8b/limo_filtered_correct/checkpoint-326/special_tokens_map.json deleted file mode 100644 index 14daf4588e61b4e4983af0fccaba4d5500c0977c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-326/special_tokens_map.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "additional_special_tokens": [ - { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } - ], - "bos_token": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": "<|eot_id|>" -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-326/tokenizer.json b/metallama3_8b/limo_filtered_correct/checkpoint-326/tokenizer.json deleted file mode 100644 index 172311123ab62378f1f6d90f3068a676b7d939ed..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-326/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c1dcab308e7cf5970ea38815e0a62887d705c5b436f869ca27a5dcdd40c36a6 -size 17210148 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-326/tokenizer_config.json b/metallama3_8b/limo_filtered_correct/checkpoint-326/tokenizer_config.json deleted file mode 100644 index 6739fcd129e717b71b64001dcb25a03c143d66f5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-326/tokenizer_config.json +++ /dev/null @@ -1,2076 +0,0 @@ -{ - "added_tokens_decoder": { - "128000": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128001": { - "content": "<|end_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128002": { - "content": "<|reserved_special_token_0|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128003": { - "content": "<|reserved_special_token_1|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128004": { - "content": "<|reserved_special_token_2|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128005": { - "content": "<|reserved_special_token_3|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128006": { - "content": "<|start_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128007": { - "content": "<|end_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128008": { - "content": "<|reserved_special_token_4|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128009": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128010": { - "content": "<|reserved_special_token_5|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128011": { - "content": "<|reserved_special_token_6|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128012": { - "content": "<|reserved_special_token_7|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128013": { - "content": "<|reserved_special_token_8|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128014": { - "content": "<|reserved_special_token_9|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128015": { - "content": "<|reserved_special_token_10|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128016": { - "content": "<|reserved_special_token_11|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128017": { - "content": "<|reserved_special_token_12|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128018": { - "content": "<|reserved_special_token_13|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128019": { - "content": "<|reserved_special_token_14|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128020": { - "content": "<|reserved_special_token_15|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128021": { - "content": "<|reserved_special_token_16|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128022": { - "content": "<|reserved_special_token_17|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128023": { - "content": "<|reserved_special_token_18|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128024": { - "content": "<|reserved_special_token_19|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128025": { - "content": "<|reserved_special_token_20|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128026": { - "content": "<|reserved_special_token_21|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128027": { - "content": "<|reserved_special_token_22|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128028": { - "content": "<|reserved_special_token_23|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128029": { - "content": "<|reserved_special_token_24|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128030": { - "content": "<|reserved_special_token_25|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128031": { - "content": "<|reserved_special_token_26|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128032": { - "content": "<|reserved_special_token_27|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128033": { - "content": "<|reserved_special_token_28|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128034": { - "content": "<|reserved_special_token_29|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128035": { - "content": "<|reserved_special_token_30|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128036": { - "content": "<|reserved_special_token_31|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128037": { - "content": "<|reserved_special_token_32|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128038": { - "content": "<|reserved_special_token_33|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128039": { - "content": "<|reserved_special_token_34|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128040": { - "content": "<|reserved_special_token_35|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128041": { - "content": "<|reserved_special_token_36|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128042": { - "content": "<|reserved_special_token_37|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128043": { - "content": "<|reserved_special_token_38|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128044": { - "content": "<|reserved_special_token_39|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128045": { - "content": "<|reserved_special_token_40|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128046": { - "content": "<|reserved_special_token_41|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128047": { - "content": "<|reserved_special_token_42|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128048": { - "content": "<|reserved_special_token_43|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128049": { - "content": "<|reserved_special_token_44|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128050": { - "content": "<|reserved_special_token_45|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128051": { - "content": "<|reserved_special_token_46|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128052": { - "content": "<|reserved_special_token_47|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128053": { - "content": "<|reserved_special_token_48|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128054": { - "content": "<|reserved_special_token_49|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128055": { - "content": "<|reserved_special_token_50|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128056": { - "content": "<|reserved_special_token_51|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128057": { - "content": "<|reserved_special_token_52|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128058": { - "content": "<|reserved_special_token_53|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128059": { - "content": "<|reserved_special_token_54|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128060": { - "content": "<|reserved_special_token_55|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128061": { - "content": "<|reserved_special_token_56|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128062": { - "content": "<|reserved_special_token_57|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128063": { - "content": "<|reserved_special_token_58|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128064": { - "content": "<|reserved_special_token_59|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128065": { - "content": "<|reserved_special_token_60|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128066": { - "content": "<|reserved_special_token_61|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128067": { - "content": "<|reserved_special_token_62|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128068": { - "content": "<|reserved_special_token_63|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128069": { - "content": "<|reserved_special_token_64|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128070": { - "content": "<|reserved_special_token_65|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128071": { - "content": "<|reserved_special_token_66|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128072": { - "content": "<|reserved_special_token_67|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128073": { - "content": "<|reserved_special_token_68|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128074": { - "content": "<|reserved_special_token_69|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128075": { - "content": "<|reserved_special_token_70|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128076": { - "content": "<|reserved_special_token_71|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128077": { - "content": "<|reserved_special_token_72|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128078": { - "content": "<|reserved_special_token_73|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128079": { - "content": "<|reserved_special_token_74|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128080": { - "content": "<|reserved_special_token_75|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128081": { - "content": "<|reserved_special_token_76|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128082": { - "content": "<|reserved_special_token_77|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128083": { - "content": "<|reserved_special_token_78|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128084": { - "content": "<|reserved_special_token_79|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128085": { - "content": "<|reserved_special_token_80|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128086": { - "content": "<|reserved_special_token_81|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128087": { - "content": "<|reserved_special_token_82|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128088": { - "content": "<|reserved_special_token_83|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128089": { - "content": "<|reserved_special_token_84|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128090": { - "content": "<|reserved_special_token_85|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128091": { - "content": "<|reserved_special_token_86|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128092": { - "content": "<|reserved_special_token_87|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128093": { - "content": "<|reserved_special_token_88|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128094": { - "content": "<|reserved_special_token_89|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128095": { - "content": "<|reserved_special_token_90|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128096": { - "content": "<|reserved_special_token_91|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128097": { - "content": "<|reserved_special_token_92|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128098": { - "content": "<|reserved_special_token_93|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128099": { - "content": "<|reserved_special_token_94|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128100": { - "content": "<|reserved_special_token_95|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128101": { - "content": "<|reserved_special_token_96|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128102": { - "content": "<|reserved_special_token_97|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128103": { - "content": "<|reserved_special_token_98|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128104": { - "content": "<|reserved_special_token_99|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128105": { - "content": "<|reserved_special_token_100|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128106": { - "content": "<|reserved_special_token_101|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128107": { - "content": "<|reserved_special_token_102|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128108": { - "content": "<|reserved_special_token_103|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128109": { - "content": "<|reserved_special_token_104|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128110": { - "content": "<|reserved_special_token_105|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128111": { - "content": "<|reserved_special_token_106|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128112": { - "content": "<|reserved_special_token_107|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128113": { - "content": "<|reserved_special_token_108|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128114": { - "content": "<|reserved_special_token_109|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128115": { - "content": "<|reserved_special_token_110|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128116": { - "content": "<|reserved_special_token_111|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128117": { - "content": "<|reserved_special_token_112|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128118": { - "content": "<|reserved_special_token_113|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128119": { - "content": "<|reserved_special_token_114|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128120": { - "content": "<|reserved_special_token_115|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128121": { - "content": "<|reserved_special_token_116|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128122": { - "content": "<|reserved_special_token_117|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128123": { - "content": "<|reserved_special_token_118|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128124": { - "content": "<|reserved_special_token_119|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128125": { - "content": "<|reserved_special_token_120|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128126": { - "content": "<|reserved_special_token_121|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128127": { - "content": "<|reserved_special_token_122|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128128": { - "content": "<|reserved_special_token_123|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128129": { - "content": "<|reserved_special_token_124|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128130": { - "content": "<|reserved_special_token_125|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128131": { - "content": "<|reserved_special_token_126|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128132": { - "content": "<|reserved_special_token_127|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128133": { - "content": "<|reserved_special_token_128|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128134": { - "content": "<|reserved_special_token_129|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128135": { - "content": "<|reserved_special_token_130|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128136": { - "content": "<|reserved_special_token_131|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128137": { - "content": "<|reserved_special_token_132|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128138": { - "content": "<|reserved_special_token_133|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128139": { - "content": "<|reserved_special_token_134|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128140": { - "content": "<|reserved_special_token_135|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128141": { - "content": "<|reserved_special_token_136|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128142": { - "content": "<|reserved_special_token_137|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128143": { - "content": "<|reserved_special_token_138|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128144": { - "content": "<|reserved_special_token_139|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128145": { - "content": "<|reserved_special_token_140|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128146": { - "content": "<|reserved_special_token_141|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128147": { - "content": "<|reserved_special_token_142|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128148": { - "content": "<|reserved_special_token_143|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128149": { - "content": "<|reserved_special_token_144|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128150": { - "content": "<|reserved_special_token_145|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128151": { - "content": "<|reserved_special_token_146|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128152": { - "content": "<|reserved_special_token_147|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128153": { - "content": "<|reserved_special_token_148|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128154": { - "content": "<|reserved_special_token_149|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128155": { - "content": "<|reserved_special_token_150|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128156": { - "content": "<|reserved_special_token_151|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128157": { - "content": "<|reserved_special_token_152|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128158": { - "content": "<|reserved_special_token_153|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128159": { - "content": "<|reserved_special_token_154|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128160": { - "content": "<|reserved_special_token_155|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128161": { - "content": "<|reserved_special_token_156|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128162": { - "content": "<|reserved_special_token_157|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128163": { - "content": "<|reserved_special_token_158|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128164": { - "content": "<|reserved_special_token_159|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128165": { - "content": "<|reserved_special_token_160|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128166": { - "content": "<|reserved_special_token_161|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128167": { - "content": "<|reserved_special_token_162|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128168": { - "content": "<|reserved_special_token_163|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128169": { - "content": "<|reserved_special_token_164|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128170": { - "content": "<|reserved_special_token_165|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128171": { - "content": "<|reserved_special_token_166|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128172": { - "content": "<|reserved_special_token_167|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128173": { - "content": "<|reserved_special_token_168|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128174": { - "content": "<|reserved_special_token_169|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128175": { - "content": "<|reserved_special_token_170|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128176": { - "content": "<|reserved_special_token_171|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128177": { - "content": "<|reserved_special_token_172|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128178": { - "content": "<|reserved_special_token_173|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128179": { - "content": "<|reserved_special_token_174|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128180": { - "content": "<|reserved_special_token_175|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128181": { - "content": "<|reserved_special_token_176|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128182": { - "content": "<|reserved_special_token_177|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128183": { - "content": "<|reserved_special_token_178|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128184": { - "content": "<|reserved_special_token_179|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128185": { - "content": "<|reserved_special_token_180|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128186": { - "content": "<|reserved_special_token_181|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128187": { - "content": "<|reserved_special_token_182|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128188": { - "content": "<|reserved_special_token_183|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128189": { - "content": "<|reserved_special_token_184|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128190": { - "content": "<|reserved_special_token_185|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128191": { - "content": "<|reserved_special_token_186|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128192": { - "content": "<|reserved_special_token_187|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128193": { - "content": "<|reserved_special_token_188|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128194": { - "content": "<|reserved_special_token_189|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128195": { - "content": "<|reserved_special_token_190|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128196": { - "content": "<|reserved_special_token_191|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128197": { - "content": "<|reserved_special_token_192|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128198": { - "content": "<|reserved_special_token_193|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128199": { - "content": "<|reserved_special_token_194|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128200": { - "content": "<|reserved_special_token_195|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128201": { - "content": "<|reserved_special_token_196|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128202": { - "content": "<|reserved_special_token_197|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128203": { - "content": "<|reserved_special_token_198|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128204": { - "content": "<|reserved_special_token_199|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128205": { - "content": "<|reserved_special_token_200|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128206": { - "content": "<|reserved_special_token_201|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128207": { - "content": "<|reserved_special_token_202|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128208": { - "content": "<|reserved_special_token_203|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128209": { - "content": "<|reserved_special_token_204|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128210": { - "content": "<|reserved_special_token_205|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128211": { - "content": "<|reserved_special_token_206|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128212": { - "content": "<|reserved_special_token_207|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128213": { - "content": "<|reserved_special_token_208|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128214": { - "content": "<|reserved_special_token_209|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128215": { - "content": "<|reserved_special_token_210|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128216": { - "content": "<|reserved_special_token_211|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128217": { - "content": "<|reserved_special_token_212|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128218": { - "content": "<|reserved_special_token_213|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128219": { - "content": "<|reserved_special_token_214|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128220": { - "content": "<|reserved_special_token_215|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128221": { - "content": "<|reserved_special_token_216|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128222": { - "content": "<|reserved_special_token_217|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128223": { - "content": "<|reserved_special_token_218|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128224": { - "content": "<|reserved_special_token_219|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128225": { - "content": "<|reserved_special_token_220|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128226": { - "content": "<|reserved_special_token_221|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128227": { - "content": "<|reserved_special_token_222|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128228": { - "content": "<|reserved_special_token_223|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128229": { - "content": "<|reserved_special_token_224|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128230": { - "content": "<|reserved_special_token_225|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128231": { - "content": "<|reserved_special_token_226|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128232": { - "content": "<|reserved_special_token_227|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128233": { - "content": "<|reserved_special_token_228|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128234": { - "content": "<|reserved_special_token_229|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128235": { - "content": "<|reserved_special_token_230|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128236": { - "content": "<|reserved_special_token_231|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128237": { - "content": "<|reserved_special_token_232|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128238": { - "content": "<|reserved_special_token_233|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128239": { - "content": "<|reserved_special_token_234|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128240": { - "content": "<|reserved_special_token_235|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128241": { - "content": "<|reserved_special_token_236|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128242": { - "content": "<|reserved_special_token_237|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128243": { - "content": "<|reserved_special_token_238|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128244": { - "content": "<|reserved_special_token_239|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128245": { - "content": "<|reserved_special_token_240|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128246": { - "content": "<|reserved_special_token_241|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128247": { - "content": "<|reserved_special_token_242|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128248": { - "content": "<|reserved_special_token_243|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128249": { - "content": "<|reserved_special_token_244|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128250": { - "content": "<|reserved_special_token_245|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128251": { - "content": "<|reserved_special_token_246|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128252": { - "content": "<|reserved_special_token_247|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128253": { - "content": "<|reserved_special_token_248|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128254": { - "content": "<|reserved_special_token_249|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128255": { - "content": "<|reserved_special_token_250|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128256": { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - } - }, - "additional_special_tokens": [ - "<|eom_id|>" - ], - "bos_token": "<|begin_of_text|>", - "clean_up_tokenization_spaces": true, - "eos_token": "<|eot_id|>", - "extra_special_tokens": {}, - "model_input_names": [ - "input_ids", - "attention_mask" - ], - "model_max_length": 1000000000000000019884624838656, - "pad_token": "<|eot_id|>", - "padding_side": "right", - "split_special_tokens": false, - "tokenizer_class": "PreTrainedTokenizerFast" -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-326/trainer_state.json b/metallama3_8b/limo_filtered_correct/checkpoint-326/trainer_state.json deleted file mode 100644 index 416b883af00c613c7b5bd2aee5c64ef495b9d29a..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-326/trainer_state.json +++ /dev/null @@ -1,2316 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 2.0, - "eval_steps": 500, - "global_step": 326, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.006134969325153374, - "grad_norm": 5.908512115478516, - "learning_rate": 5e-06, - "loss": 0.9606, - "step": 1 - }, - { - "epoch": 0.012269938650306749, - "grad_norm": 4.304474353790283, - "learning_rate": 4.999995356617983e-06, - "loss": 0.8609, - "step": 2 - }, - { - "epoch": 0.018404907975460124, - "grad_norm": 5.63697624206543, - "learning_rate": 4.999981426489179e-06, - "loss": 1.3543, - "step": 3 - }, - { - "epoch": 0.024539877300613498, - "grad_norm": 3.6674246788024902, - "learning_rate": 4.999958209665336e-06, - "loss": 0.787, - "step": 4 - }, - { - "epoch": 0.03067484662576687, - "grad_norm": 48.14854431152344, - "learning_rate": 4.999925706232695e-06, - "loss": 1.7786, - "step": 5 - }, - { - "epoch": 0.03680981595092025, - "grad_norm": 7.8689866065979, - "learning_rate": 4.999883916312e-06, - "loss": 1.2175, - "step": 6 - }, - { - "epoch": 0.04294478527607362, - "grad_norm": 5.119968891143799, - "learning_rate": 4.9998328400584864e-06, - "loss": 0.8998, - "step": 7 - }, - { - "epoch": 0.049079754601226995, - "grad_norm": 3.730757713317871, - "learning_rate": 4.999772477661888e-06, - "loss": 0.8419, - "step": 8 - }, - { - "epoch": 0.05521472392638037, - "grad_norm": 27.314565658569336, - "learning_rate": 4.999702829346432e-06, - "loss": 1.7948, - "step": 9 - }, - { - "epoch": 0.06134969325153374, - "grad_norm": 3.822697162628174, - "learning_rate": 4.999623895370843e-06, - "loss": 1.0461, - "step": 10 - }, - { - "epoch": 0.06748466257668712, - "grad_norm": 4.71220588684082, - "learning_rate": 4.999535676028338e-06, - "loss": 1.0, - "step": 11 - }, - { - "epoch": 0.0736196319018405, - "grad_norm": 3.2378087043762207, - "learning_rate": 4.999438171646624e-06, - "loss": 0.9475, - "step": 12 - }, - { - "epoch": 0.07975460122699386, - "grad_norm": 3.475543737411499, - "learning_rate": 4.999331382587901e-06, - "loss": 0.8654, - "step": 13 - }, - { - "epoch": 0.08588957055214724, - "grad_norm": 10.06365966796875, - "learning_rate": 4.999215309248861e-06, - "loss": 1.2042, - "step": 14 - }, - { - "epoch": 0.09202453987730061, - "grad_norm": 3.785153865814209, - "learning_rate": 4.999089952060681e-06, - "loss": 0.8846, - "step": 15 - }, - { - "epoch": 0.09815950920245399, - "grad_norm": 2.944488048553467, - "learning_rate": 4.998955311489025e-06, - "loss": 0.8805, - "step": 16 - }, - { - "epoch": 0.10429447852760736, - "grad_norm": 39.89304733276367, - "learning_rate": 4.998811388034046e-06, - "loss": 1.5882, - "step": 17 - }, - { - "epoch": 0.11042944785276074, - "grad_norm": 3.5883963108062744, - "learning_rate": 4.9986581822303746e-06, - "loss": 0.9222, - "step": 18 - }, - { - "epoch": 0.1165644171779141, - "grad_norm": 6.972247123718262, - "learning_rate": 4.998495694647127e-06, - "loss": 1.4088, - "step": 19 - }, - { - "epoch": 0.12269938650306748, - "grad_norm": 3.948991298675537, - "learning_rate": 4.998323925887895e-06, - "loss": 1.454, - "step": 20 - }, - { - "epoch": 0.12883435582822086, - "grad_norm": 3.8690035343170166, - "learning_rate": 4.998142876590749e-06, - "loss": 0.6335, - "step": 21 - }, - { - "epoch": 0.13496932515337423, - "grad_norm": 5.243765830993652, - "learning_rate": 4.997952547428236e-06, - "loss": 0.6725, - "step": 22 - }, - { - "epoch": 0.1411042944785276, - "grad_norm": 3.5994043350219727, - "learning_rate": 4.997752939107372e-06, - "loss": 0.7814, - "step": 23 - }, - { - "epoch": 0.147239263803681, - "grad_norm": 4.06965970993042, - "learning_rate": 4.997544052369642e-06, - "loss": 0.9683, - "step": 24 - }, - { - "epoch": 0.15337423312883436, - "grad_norm": 3.3247246742248535, - "learning_rate": 4.997325887990999e-06, - "loss": 0.9414, - "step": 25 - }, - { - "epoch": 0.15950920245398773, - "grad_norm": 5.811742782592773, - "learning_rate": 4.997098446781861e-06, - "loss": 0.8894, - "step": 26 - }, - { - "epoch": 0.1656441717791411, - "grad_norm": 2.661334753036499, - "learning_rate": 4.996861729587103e-06, - "loss": 0.7708, - "step": 27 - }, - { - "epoch": 0.17177914110429449, - "grad_norm": 2.863943576812744, - "learning_rate": 4.996615737286061e-06, - "loss": 0.6995, - "step": 28 - }, - { - "epoch": 0.17791411042944785, - "grad_norm": 20.376733779907227, - "learning_rate": 4.996360470792524e-06, - "loss": 1.2563, - "step": 29 - }, - { - "epoch": 0.18404907975460122, - "grad_norm": 3.62265682220459, - "learning_rate": 4.996095931054731e-06, - "loss": 0.7266, - "step": 30 - }, - { - "epoch": 0.1901840490797546, - "grad_norm": 3.915076732635498, - "learning_rate": 4.9958221190553705e-06, - "loss": 0.9227, - "step": 31 - }, - { - "epoch": 0.19631901840490798, - "grad_norm": 3.129855155944824, - "learning_rate": 4.995539035811572e-06, - "loss": 0.701, - "step": 32 - }, - { - "epoch": 0.20245398773006135, - "grad_norm": 2.7532224655151367, - "learning_rate": 4.9952466823749076e-06, - "loss": 0.6491, - "step": 33 - }, - { - "epoch": 0.2085889570552147, - "grad_norm": 2.8444128036499023, - "learning_rate": 4.9949450598313835e-06, - "loss": 0.8029, - "step": 34 - }, - { - "epoch": 0.2147239263803681, - "grad_norm": 2.57743239402771, - "learning_rate": 4.994634169301439e-06, - "loss": 0.8785, - "step": 35 - }, - { - "epoch": 0.22085889570552147, - "grad_norm": 3.280055284500122, - "learning_rate": 4.994314011939941e-06, - "loss": 1.034, - "step": 36 - }, - { - "epoch": 0.22699386503067484, - "grad_norm": 2.455838680267334, - "learning_rate": 4.99398458893618e-06, - "loss": 0.8557, - "step": 37 - }, - { - "epoch": 0.2331288343558282, - "grad_norm": 4.72681188583374, - "learning_rate": 4.993645901513865e-06, - "loss": 1.1904, - "step": 38 - }, - { - "epoch": 0.2392638036809816, - "grad_norm": 3.0585641860961914, - "learning_rate": 4.993297950931121e-06, - "loss": 0.7668, - "step": 39 - }, - { - "epoch": 0.24539877300613497, - "grad_norm": 2.4603540897369385, - "learning_rate": 4.9929407384804806e-06, - "loss": 0.8812, - "step": 40 - }, - { - "epoch": 0.25153374233128833, - "grad_norm": 2.9702436923980713, - "learning_rate": 4.992574265488883e-06, - "loss": 0.8878, - "step": 41 - }, - { - "epoch": 0.25766871165644173, - "grad_norm": 2.6973602771759033, - "learning_rate": 4.9921985333176694e-06, - "loss": 0.7251, - "step": 42 - }, - { - "epoch": 0.26380368098159507, - "grad_norm": 2.5542335510253906, - "learning_rate": 4.991813543362572e-06, - "loss": 0.6638, - "step": 43 - }, - { - "epoch": 0.26993865030674846, - "grad_norm": 3.7530782222747803, - "learning_rate": 4.991419297053716e-06, - "loss": 1.0725, - "step": 44 - }, - { - "epoch": 0.27607361963190186, - "grad_norm": 2.6483025550842285, - "learning_rate": 4.991015795855611e-06, - "loss": 0.7238, - "step": 45 - }, - { - "epoch": 0.2822085889570552, - "grad_norm": 3.434422492980957, - "learning_rate": 4.990603041267144e-06, - "loss": 0.9188, - "step": 46 - }, - { - "epoch": 0.2883435582822086, - "grad_norm": 2.914340019226074, - "learning_rate": 4.990181034821578e-06, - "loss": 0.6158, - "step": 47 - }, - { - "epoch": 0.294478527607362, - "grad_norm": 2.7211625576019287, - "learning_rate": 4.98974977808654e-06, - "loss": 0.7165, - "step": 48 - }, - { - "epoch": 0.3006134969325153, - "grad_norm": 2.8414249420166016, - "learning_rate": 4.989309272664026e-06, - "loss": 0.7277, - "step": 49 - }, - { - "epoch": 0.3067484662576687, - "grad_norm": 3.683204412460327, - "learning_rate": 4.988859520190381e-06, - "loss": 0.9793, - "step": 50 - }, - { - "epoch": 0.3128834355828221, - "grad_norm": 3.1732583045959473, - "learning_rate": 4.988400522336304e-06, - "loss": 0.8966, - "step": 51 - }, - { - "epoch": 0.31901840490797545, - "grad_norm": 2.7789194583892822, - "learning_rate": 4.9879322808068365e-06, - "loss": 0.8191, - "step": 52 - }, - { - "epoch": 0.32515337423312884, - "grad_norm": 2.754816770553589, - "learning_rate": 4.987454797341358e-06, - "loss": 0.6308, - "step": 53 - }, - { - "epoch": 0.3312883435582822, - "grad_norm": 2.730104684829712, - "learning_rate": 4.98696807371358e-06, - "loss": 0.8226, - "step": 54 - }, - { - "epoch": 0.3374233128834356, - "grad_norm": 3.2225449085235596, - "learning_rate": 4.986472111731536e-06, - "loss": 0.9184, - "step": 55 - }, - { - "epoch": 0.34355828220858897, - "grad_norm": 3.2684760093688965, - "learning_rate": 4.985966913237581e-06, - "loss": 0.6593, - "step": 56 - }, - { - "epoch": 0.3496932515337423, - "grad_norm": 2.43105411529541, - "learning_rate": 4.985452480108376e-06, - "loss": 0.6994, - "step": 57 - }, - { - "epoch": 0.3558282208588957, - "grad_norm": 7.366360664367676, - "learning_rate": 4.984928814254889e-06, - "loss": 1.1374, - "step": 58 - }, - { - "epoch": 0.3619631901840491, - "grad_norm": 2.81864333152771, - "learning_rate": 4.984395917622387e-06, - "loss": 0.8097, - "step": 59 - }, - { - "epoch": 0.36809815950920244, - "grad_norm": 3.1107730865478516, - "learning_rate": 4.9838537921904206e-06, - "loss": 0.8511, - "step": 60 - }, - { - "epoch": 0.37423312883435583, - "grad_norm": 2.460545301437378, - "learning_rate": 4.9833024399728295e-06, - "loss": 0.898, - "step": 61 - }, - { - "epoch": 0.3803680981595092, - "grad_norm": 2.921992778778076, - "learning_rate": 4.982741863017722e-06, - "loss": 0.6671, - "step": 62 - }, - { - "epoch": 0.38650306748466257, - "grad_norm": 3.3006443977355957, - "learning_rate": 4.982172063407479e-06, - "loss": 1.0559, - "step": 63 - }, - { - "epoch": 0.39263803680981596, - "grad_norm": 2.642587661743164, - "learning_rate": 4.9815930432587365e-06, - "loss": 0.6663, - "step": 64 - }, - { - "epoch": 0.3987730061349693, - "grad_norm": 2.905898094177246, - "learning_rate": 4.981004804722384e-06, - "loss": 0.6895, - "step": 65 - }, - { - "epoch": 0.4049079754601227, - "grad_norm": 2.9174182415008545, - "learning_rate": 4.980407349983556e-06, - "loss": 0.7982, - "step": 66 - }, - { - "epoch": 0.4110429447852761, - "grad_norm": 2.214322805404663, - "learning_rate": 4.979800681261619e-06, - "loss": 0.6808, - "step": 67 - }, - { - "epoch": 0.4171779141104294, - "grad_norm": 2.7152462005615234, - "learning_rate": 4.9791848008101705e-06, - "loss": 0.567, - "step": 68 - }, - { - "epoch": 0.4233128834355828, - "grad_norm": 2.5657734870910645, - "learning_rate": 4.978559710917024e-06, - "loss": 0.7745, - "step": 69 - }, - { - "epoch": 0.4294478527607362, - "grad_norm": 3.9103832244873047, - "learning_rate": 4.977925413904205e-06, - "loss": 0.9815, - "step": 70 - }, - { - "epoch": 0.43558282208588955, - "grad_norm": 4.610236644744873, - "learning_rate": 4.9772819121279395e-06, - "loss": 1.164, - "step": 71 - }, - { - "epoch": 0.44171779141104295, - "grad_norm": 3.01170015335083, - "learning_rate": 4.976629207978648e-06, - "loss": 0.7587, - "step": 72 - }, - { - "epoch": 0.44785276073619634, - "grad_norm": 3.175889253616333, - "learning_rate": 4.975967303880933e-06, - "loss": 0.58, - "step": 73 - }, - { - "epoch": 0.4539877300613497, - "grad_norm": 2.503741502761841, - "learning_rate": 4.975296202293575e-06, - "loss": 0.7253, - "step": 74 - }, - { - "epoch": 0.4601226993865031, - "grad_norm": 2.6778078079223633, - "learning_rate": 4.974615905709518e-06, - "loss": 0.7352, - "step": 75 - }, - { - "epoch": 0.4662576687116564, - "grad_norm": 5.950812816619873, - "learning_rate": 4.973926416655863e-06, - "loss": 1.0643, - "step": 76 - }, - { - "epoch": 0.4723926380368098, - "grad_norm": 3.0165305137634277, - "learning_rate": 4.973227737693858e-06, - "loss": 0.6699, - "step": 77 - }, - { - "epoch": 0.4785276073619632, - "grad_norm": 4.793259620666504, - "learning_rate": 4.972519871418894e-06, - "loss": 1.0315, - "step": 78 - }, - { - "epoch": 0.48466257668711654, - "grad_norm": 3.632815361022949, - "learning_rate": 4.971802820460481e-06, - "loss": 0.7003, - "step": 79 - }, - { - "epoch": 0.49079754601226994, - "grad_norm": 3.077507734298706, - "learning_rate": 4.971076587482254e-06, - "loss": 0.6776, - "step": 80 - }, - { - "epoch": 0.49693251533742333, - "grad_norm": 3.3886241912841797, - "learning_rate": 4.970341175181957e-06, - "loss": 0.7422, - "step": 81 - }, - { - "epoch": 0.5030674846625767, - "grad_norm": 2.71288800239563, - "learning_rate": 4.969596586291425e-06, - "loss": 0.7471, - "step": 82 - }, - { - "epoch": 0.50920245398773, - "grad_norm": 2.777920961380005, - "learning_rate": 4.968842823576592e-06, - "loss": 0.8111, - "step": 83 - }, - { - "epoch": 0.5153374233128835, - "grad_norm": 6.496985912322998, - "learning_rate": 4.968079889837461e-06, - "loss": 0.9965, - "step": 84 - }, - { - "epoch": 0.5214723926380368, - "grad_norm": 2.6163430213928223, - "learning_rate": 4.967307787908108e-06, - "loss": 0.6833, - "step": 85 - }, - { - "epoch": 0.5276073619631901, - "grad_norm": 3.244098663330078, - "learning_rate": 4.966526520656663e-06, - "loss": 0.8373, - "step": 86 - }, - { - "epoch": 0.5337423312883436, - "grad_norm": 2.9027860164642334, - "learning_rate": 4.965736090985305e-06, - "loss": 0.8529, - "step": 87 - }, - { - "epoch": 0.5398773006134969, - "grad_norm": 2.3786230087280273, - "learning_rate": 4.964936501830246e-06, - "loss": 0.6577, - "step": 88 - }, - { - "epoch": 0.5460122699386503, - "grad_norm": 7.3099045753479, - "learning_rate": 4.964127756161727e-06, - "loss": 1.1184, - "step": 89 - }, - { - "epoch": 0.5521472392638037, - "grad_norm": 3.068873167037964, - "learning_rate": 4.963309856983998e-06, - "loss": 0.7906, - "step": 90 - }, - { - "epoch": 0.558282208588957, - "grad_norm": 3.082547426223755, - "learning_rate": 4.9624828073353144e-06, - "loss": 0.8107, - "step": 91 - }, - { - "epoch": 0.5644171779141104, - "grad_norm": 2.4586973190307617, - "learning_rate": 4.961646610287922e-06, - "loss": 0.7421, - "step": 92 - }, - { - "epoch": 0.5705521472392638, - "grad_norm": 2.779277801513672, - "learning_rate": 4.960801268948047e-06, - "loss": 0.7134, - "step": 93 - }, - { - "epoch": 0.5766871165644172, - "grad_norm": 3.2255213260650635, - "learning_rate": 4.959946786455882e-06, - "loss": 0.5875, - "step": 94 - }, - { - "epoch": 0.5828220858895705, - "grad_norm": 2.783395528793335, - "learning_rate": 4.959083165985581e-06, - "loss": 0.6595, - "step": 95 - }, - { - "epoch": 0.588957055214724, - "grad_norm": 2.240114212036133, - "learning_rate": 4.958210410745237e-06, - "loss": 0.793, - "step": 96 - }, - { - "epoch": 0.5950920245398773, - "grad_norm": 2.9399421215057373, - "learning_rate": 4.957328523976879e-06, - "loss": 0.5896, - "step": 97 - }, - { - "epoch": 0.6012269938650306, - "grad_norm": 3.4449355602264404, - "learning_rate": 4.956437508956458e-06, - "loss": 0.8658, - "step": 98 - }, - { - "epoch": 0.6073619631901841, - "grad_norm": 4.273710250854492, - "learning_rate": 4.9555373689938325e-06, - "loss": 0.8316, - "step": 99 - }, - { - "epoch": 0.6134969325153374, - "grad_norm": 3.4222047328948975, - "learning_rate": 4.954628107432757e-06, - "loss": 1.0613, - "step": 100 - }, - { - "epoch": 0.6196319018404908, - "grad_norm": 2.5318963527679443, - "learning_rate": 4.95370972765087e-06, - "loss": 0.7194, - "step": 101 - }, - { - "epoch": 0.6257668711656442, - "grad_norm": 2.7852585315704346, - "learning_rate": 4.952782233059683e-06, - "loss": 0.5927, - "step": 102 - }, - { - "epoch": 0.6319018404907976, - "grad_norm": 2.6532323360443115, - "learning_rate": 4.951845627104565e-06, - "loss": 0.8505, - "step": 103 - }, - { - "epoch": 0.6380368098159509, - "grad_norm": 2.3213467597961426, - "learning_rate": 4.95089991326473e-06, - "loss": 0.8682, - "step": 104 - }, - { - "epoch": 0.6441717791411042, - "grad_norm": 2.607992649078369, - "learning_rate": 4.9499450950532305e-06, - "loss": 0.8735, - "step": 105 - }, - { - "epoch": 0.6503067484662577, - "grad_norm": 3.9820072650909424, - "learning_rate": 4.94898117601693e-06, - "loss": 1.0571, - "step": 106 - }, - { - "epoch": 0.656441717791411, - "grad_norm": 3.3878824710845947, - "learning_rate": 4.948008159736507e-06, - "loss": 0.7831, - "step": 107 - }, - { - "epoch": 0.6625766871165644, - "grad_norm": 2.6935670375823975, - "learning_rate": 4.94702604982643e-06, - "loss": 0.5968, - "step": 108 - }, - { - "epoch": 0.6687116564417178, - "grad_norm": 2.78190016746521, - "learning_rate": 4.9460348499349485e-06, - "loss": 0.7504, - "step": 109 - }, - { - "epoch": 0.6748466257668712, - "grad_norm": 2.973083972930908, - "learning_rate": 4.945034563744077e-06, - "loss": 0.6728, - "step": 110 - }, - { - "epoch": 0.6809815950920245, - "grad_norm": 2.631803512573242, - "learning_rate": 4.944025194969586e-06, - "loss": 0.609, - "step": 111 - }, - { - "epoch": 0.6871165644171779, - "grad_norm": 2.7443883419036865, - "learning_rate": 4.9430067473609825e-06, - "loss": 0.8713, - "step": 112 - }, - { - "epoch": 0.6932515337423313, - "grad_norm": 2.543769121170044, - "learning_rate": 4.941979224701499e-06, - "loss": 0.8035, - "step": 113 - }, - { - "epoch": 0.6993865030674846, - "grad_norm": 3.7799901962280273, - "learning_rate": 4.94094263080808e-06, - "loss": 0.9341, - "step": 114 - }, - { - "epoch": 0.7055214723926381, - "grad_norm": 3.1234734058380127, - "learning_rate": 4.939896969531367e-06, - "loss": 1.1066, - "step": 115 - }, - { - "epoch": 0.7116564417177914, - "grad_norm": 2.356036424636841, - "learning_rate": 4.938842244755683e-06, - "loss": 0.853, - "step": 116 - }, - { - "epoch": 0.7177914110429447, - "grad_norm": 3.6231274604797363, - "learning_rate": 4.937778460399022e-06, - "loss": 0.9116, - "step": 117 - }, - { - "epoch": 0.7239263803680982, - "grad_norm": 3.1277005672454834, - "learning_rate": 4.936705620413028e-06, - "loss": 0.5888, - "step": 118 - }, - { - "epoch": 0.7300613496932515, - "grad_norm": 2.7338361740112305, - "learning_rate": 4.935623728782986e-06, - "loss": 0.592, - "step": 119 - }, - { - "epoch": 0.7361963190184049, - "grad_norm": 2.748363733291626, - "learning_rate": 4.934532789527805e-06, - "loss": 0.8713, - "step": 120 - }, - { - "epoch": 0.7423312883435583, - "grad_norm": 4.460031986236572, - "learning_rate": 4.933432806700004e-06, - "loss": 0.6791, - "step": 121 - }, - { - "epoch": 0.7484662576687117, - "grad_norm": 2.392911911010742, - "learning_rate": 4.932323784385693e-06, - "loss": 0.7531, - "step": 122 - }, - { - "epoch": 0.754601226993865, - "grad_norm": 2.7804384231567383, - "learning_rate": 4.931205726704566e-06, - "loss": 0.7547, - "step": 123 - }, - { - "epoch": 0.7607361963190185, - "grad_norm": 2.7664780616760254, - "learning_rate": 4.930078637809878e-06, - "loss": 0.7849, - "step": 124 - }, - { - "epoch": 0.7668711656441718, - "grad_norm": 2.592808723449707, - "learning_rate": 4.928942521888431e-06, - "loss": 0.7015, - "step": 125 - }, - { - "epoch": 0.7730061349693251, - "grad_norm": 2.7080585956573486, - "learning_rate": 4.927797383160561e-06, - "loss": 1.0028, - "step": 126 - }, - { - "epoch": 0.7791411042944786, - "grad_norm": 2.7941503524780273, - "learning_rate": 4.926643225880123e-06, - "loss": 0.602, - "step": 127 - }, - { - "epoch": 0.7852760736196319, - "grad_norm": 3.2796623706817627, - "learning_rate": 4.925480054334471e-06, - "loss": 0.7473, - "step": 128 - }, - { - "epoch": 0.7914110429447853, - "grad_norm": 2.7623610496520996, - "learning_rate": 4.924307872844444e-06, - "loss": 1.0573, - "step": 129 - }, - { - "epoch": 0.7975460122699386, - "grad_norm": 2.6224453449249268, - "learning_rate": 4.923126685764351e-06, - "loss": 0.7399, - "step": 130 - }, - { - "epoch": 0.803680981595092, - "grad_norm": 17.736326217651367, - "learning_rate": 4.921936497481956e-06, - "loss": 0.9548, - "step": 131 - }, - { - "epoch": 0.8098159509202454, - "grad_norm": 2.504213333129883, - "learning_rate": 4.920737312418456e-06, - "loss": 0.6748, - "step": 132 - }, - { - "epoch": 0.8159509202453987, - "grad_norm": 3.617077350616455, - "learning_rate": 4.919529135028473e-06, - "loss": 0.8431, - "step": 133 - }, - { - "epoch": 0.8220858895705522, - "grad_norm": 2.6559832096099854, - "learning_rate": 4.918311969800027e-06, - "loss": 0.7243, - "step": 134 - }, - { - "epoch": 0.8282208588957055, - "grad_norm": 2.7539305686950684, - "learning_rate": 4.917085821254532e-06, - "loss": 0.7845, - "step": 135 - }, - { - "epoch": 0.8343558282208589, - "grad_norm": 3.3587615489959717, - "learning_rate": 4.915850693946766e-06, - "loss": 0.4891, - "step": 136 - }, - { - "epoch": 0.8404907975460123, - "grad_norm": 3.064354181289673, - "learning_rate": 4.914606592464865e-06, - "loss": 0.7917, - "step": 137 - }, - { - "epoch": 0.8466257668711656, - "grad_norm": 3.2505199909210205, - "learning_rate": 4.9133535214303e-06, - "loss": 0.9681, - "step": 138 - }, - { - "epoch": 0.852760736196319, - "grad_norm": 3.8027830123901367, - "learning_rate": 4.91209148549786e-06, - "loss": 0.9275, - "step": 139 - }, - { - "epoch": 0.8588957055214724, - "grad_norm": 2.4154372215270996, - "learning_rate": 4.910820489355637e-06, - "loss": 0.7259, - "step": 140 - }, - { - "epoch": 0.8650306748466258, - "grad_norm": 2.892462968826294, - "learning_rate": 4.909540537725007e-06, - "loss": 0.6061, - "step": 141 - }, - { - "epoch": 0.8711656441717791, - "grad_norm": 3.3398196697235107, - "learning_rate": 4.908251635360616e-06, - "loss": 1.0559, - "step": 142 - }, - { - "epoch": 0.8773006134969326, - "grad_norm": 3.022512197494507, - "learning_rate": 4.906953787050354e-06, - "loss": 0.7372, - "step": 143 - }, - { - "epoch": 0.8834355828220859, - "grad_norm": 2.658661365509033, - "learning_rate": 4.905646997615347e-06, - "loss": 0.6234, - "step": 144 - }, - { - "epoch": 0.8895705521472392, - "grad_norm": 3.454400062561035, - "learning_rate": 4.904331271909932e-06, - "loss": 0.8066, - "step": 145 - }, - { - "epoch": 0.8957055214723927, - "grad_norm": 3.1300277709960938, - "learning_rate": 4.903006614821645e-06, - "loss": 0.6861, - "step": 146 - }, - { - "epoch": 0.901840490797546, - "grad_norm": 2.362537145614624, - "learning_rate": 4.901673031271194e-06, - "loss": 0.6112, - "step": 147 - }, - { - "epoch": 0.9079754601226994, - "grad_norm": 3.375577688217163, - "learning_rate": 4.900330526212451e-06, - "loss": 0.6314, - "step": 148 - }, - { - "epoch": 0.9141104294478528, - "grad_norm": 2.955656051635742, - "learning_rate": 4.898979104632427e-06, - "loss": 0.889, - "step": 149 - }, - { - "epoch": 0.9202453987730062, - "grad_norm": 2.9285926818847656, - "learning_rate": 4.897618771551255e-06, - "loss": 0.6406, - "step": 150 - }, - { - "epoch": 0.9263803680981595, - "grad_norm": 2.131819725036621, - "learning_rate": 4.8962495320221714e-06, - "loss": 0.6368, - "step": 151 - }, - { - "epoch": 0.9325153374233128, - "grad_norm": 2.780649185180664, - "learning_rate": 4.8948713911315e-06, - "loss": 0.8642, - "step": 152 - }, - { - "epoch": 0.9386503067484663, - "grad_norm": 2.941500186920166, - "learning_rate": 4.8934843539986266e-06, - "loss": 0.714, - "step": 153 - }, - { - "epoch": 0.9447852760736196, - "grad_norm": 2.7729203701019287, - "learning_rate": 4.892088425775986e-06, - "loss": 0.8365, - "step": 154 - }, - { - "epoch": 0.950920245398773, - "grad_norm": 2.6887171268463135, - "learning_rate": 4.890683611649041e-06, - "loss": 0.7937, - "step": 155 - }, - { - "epoch": 0.9570552147239264, - "grad_norm": 3.7638463973999023, - "learning_rate": 4.8892699168362626e-06, - "loss": 0.7485, - "step": 156 - }, - { - "epoch": 0.9631901840490797, - "grad_norm": 2.8132755756378174, - "learning_rate": 4.887847346589111e-06, - "loss": 0.6467, - "step": 157 - }, - { - "epoch": 0.9693251533742331, - "grad_norm": 2.652247190475464, - "learning_rate": 4.886415906192015e-06, - "loss": 0.4651, - "step": 158 - }, - { - "epoch": 0.9754601226993865, - "grad_norm": 2.5854647159576416, - "learning_rate": 4.884975600962355e-06, - "loss": 0.8756, - "step": 159 - }, - { - "epoch": 0.9815950920245399, - "grad_norm": 3.1630544662475586, - "learning_rate": 4.883526436250441e-06, - "loss": 0.7339, - "step": 160 - }, - { - "epoch": 0.9877300613496932, - "grad_norm": 2.84452748298645, - "learning_rate": 4.8820684174394935e-06, - "loss": 0.7808, - "step": 161 - }, - { - "epoch": 0.9938650306748467, - "grad_norm": 3.604048490524292, - "learning_rate": 4.880601549945622e-06, - "loss": 0.96, - "step": 162 - }, - { - "epoch": 1.0, - "grad_norm": 2.302924871444702, - "learning_rate": 4.879125839217808e-06, - "loss": 0.8122, - "step": 163 - }, - { - "epoch": 1.0061349693251533, - "grad_norm": 3.1254405975341797, - "learning_rate": 4.8776412907378845e-06, - "loss": 0.7307, - "step": 164 - }, - { - "epoch": 1.0122699386503067, - "grad_norm": 2.745603322982788, - "learning_rate": 4.8761479100205085e-06, - "loss": 0.7554, - "step": 165 - }, - { - "epoch": 1.01840490797546, - "grad_norm": 2.494840145111084, - "learning_rate": 4.874645702613152e-06, - "loss": 0.4372, - "step": 166 - }, - { - "epoch": 1.0245398773006136, - "grad_norm": 2.3526735305786133, - "learning_rate": 4.873134674096072e-06, - "loss": 0.3597, - "step": 167 - }, - { - "epoch": 1.030674846625767, - "grad_norm": 2.945887804031372, - "learning_rate": 4.871614830082297e-06, - "loss": 0.5854, - "step": 168 - }, - { - "epoch": 1.0368098159509203, - "grad_norm": 3.5723934173583984, - "learning_rate": 4.870086176217597e-06, - "loss": 0.7978, - "step": 169 - }, - { - "epoch": 1.0429447852760736, - "grad_norm": 3.2997145652770996, - "learning_rate": 4.868548718180473e-06, - "loss": 0.5593, - "step": 170 - }, - { - "epoch": 1.049079754601227, - "grad_norm": 3.4120635986328125, - "learning_rate": 4.867002461682129e-06, - "loss": 0.4083, - "step": 171 - }, - { - "epoch": 1.0552147239263803, - "grad_norm": 2.697617292404175, - "learning_rate": 4.8654474124664505e-06, - "loss": 0.4752, - "step": 172 - }, - { - "epoch": 1.0613496932515338, - "grad_norm": 5.082247734069824, - "learning_rate": 4.863883576309991e-06, - "loss": 0.7435, - "step": 173 - }, - { - "epoch": 1.0674846625766872, - "grad_norm": 2.773864984512329, - "learning_rate": 4.8623109590219395e-06, - "loss": 0.4612, - "step": 174 - }, - { - "epoch": 1.0736196319018405, - "grad_norm": 3.429703712463379, - "learning_rate": 4.860729566444106e-06, - "loss": 0.4644, - "step": 175 - }, - { - "epoch": 1.0797546012269938, - "grad_norm": 2.997938394546509, - "learning_rate": 4.8591394044508985e-06, - "loss": 0.4852, - "step": 176 - }, - { - "epoch": 1.0858895705521472, - "grad_norm": 2.549513339996338, - "learning_rate": 4.857540478949302e-06, - "loss": 0.4574, - "step": 177 - }, - { - "epoch": 1.0920245398773005, - "grad_norm": 3.459400177001953, - "learning_rate": 4.855932795878852e-06, - "loss": 0.8095, - "step": 178 - }, - { - "epoch": 1.098159509202454, - "grad_norm": 2.8103644847869873, - "learning_rate": 4.854316361211619e-06, - "loss": 0.4578, - "step": 179 - }, - { - "epoch": 1.1042944785276074, - "grad_norm": 2.631221055984497, - "learning_rate": 4.852691180952183e-06, - "loss": 0.5473, - "step": 180 - }, - { - "epoch": 1.1104294478527608, - "grad_norm": 3.189946174621582, - "learning_rate": 4.851057261137608e-06, - "loss": 0.4313, - "step": 181 - }, - { - "epoch": 1.116564417177914, - "grad_norm": 2.891418933868408, - "learning_rate": 4.8494146078374274e-06, - "loss": 0.4197, - "step": 182 - }, - { - "epoch": 1.1226993865030674, - "grad_norm": 3.239637613296509, - "learning_rate": 4.847763227153612e-06, - "loss": 0.5865, - "step": 183 - }, - { - "epoch": 1.1288343558282208, - "grad_norm": 2.484644651412964, - "learning_rate": 4.846103125220557e-06, - "loss": 0.3866, - "step": 184 - }, - { - "epoch": 1.1349693251533743, - "grad_norm": 3.1045992374420166, - "learning_rate": 4.844434308205052e-06, - "loss": 0.5357, - "step": 185 - }, - { - "epoch": 1.1411042944785277, - "grad_norm": 2.648472309112549, - "learning_rate": 4.842756782306261e-06, - "loss": 0.4783, - "step": 186 - }, - { - "epoch": 1.147239263803681, - "grad_norm": 2.5685644149780273, - "learning_rate": 4.841070553755697e-06, - "loss": 0.3733, - "step": 187 - }, - { - "epoch": 1.1533742331288344, - "grad_norm": 3.7727200984954834, - "learning_rate": 4.839375628817205e-06, - "loss": 0.6039, - "step": 188 - }, - { - "epoch": 1.1595092024539877, - "grad_norm": 2.8237369060516357, - "learning_rate": 4.837672013786931e-06, - "loss": 0.5372, - "step": 189 - }, - { - "epoch": 1.165644171779141, - "grad_norm": 3.0312252044677734, - "learning_rate": 4.835959714993305e-06, - "loss": 0.5162, - "step": 190 - }, - { - "epoch": 1.1717791411042944, - "grad_norm": 2.821498394012451, - "learning_rate": 4.8342387387970105e-06, - "loss": 0.4537, - "step": 191 - }, - { - "epoch": 1.177914110429448, - "grad_norm": 2.7834129333496094, - "learning_rate": 4.832509091590968e-06, - "loss": 0.6165, - "step": 192 - }, - { - "epoch": 1.1840490797546013, - "grad_norm": 2.9274091720581055, - "learning_rate": 4.830770779800309e-06, - "loss": 0.7475, - "step": 193 - }, - { - "epoch": 1.1901840490797546, - "grad_norm": 2.813945770263672, - "learning_rate": 4.829023809882349e-06, - "loss": 0.4629, - "step": 194 - }, - { - "epoch": 1.196319018404908, - "grad_norm": 2.27876877784729, - "learning_rate": 4.827268188326567e-06, - "loss": 0.5208, - "step": 195 - }, - { - "epoch": 1.2024539877300613, - "grad_norm": 2.8444204330444336, - "learning_rate": 4.825503921654582e-06, - "loss": 0.6521, - "step": 196 - }, - { - "epoch": 1.2085889570552146, - "grad_norm": 3.3730578422546387, - "learning_rate": 4.823731016420122e-06, - "loss": 0.7491, - "step": 197 - }, - { - "epoch": 1.2147239263803682, - "grad_norm": 2.9717822074890137, - "learning_rate": 4.821949479209011e-06, - "loss": 0.3866, - "step": 198 - }, - { - "epoch": 1.2208588957055215, - "grad_norm": 2.6570653915405273, - "learning_rate": 4.820159316639133e-06, - "loss": 0.499, - "step": 199 - }, - { - "epoch": 1.2269938650306749, - "grad_norm": 2.819960117340088, - "learning_rate": 4.818360535360418e-06, - "loss": 0.556, - "step": 200 - }, - { - "epoch": 1.2331288343558282, - "grad_norm": 2.7912111282348633, - "learning_rate": 4.816553142054806e-06, - "loss": 0.3433, - "step": 201 - }, - { - "epoch": 1.2392638036809815, - "grad_norm": 2.6427981853485107, - "learning_rate": 4.814737143436232e-06, - "loss": 0.8808, - "step": 202 - }, - { - "epoch": 1.2453987730061349, - "grad_norm": 2.5917580127716064, - "learning_rate": 4.812912546250595e-06, - "loss": 0.5718, - "step": 203 - }, - { - "epoch": 1.2515337423312882, - "grad_norm": 3.770759344100952, - "learning_rate": 4.81107935727574e-06, - "loss": 0.9743, - "step": 204 - }, - { - "epoch": 1.2576687116564418, - "grad_norm": 2.558248996734619, - "learning_rate": 4.809237583321421e-06, - "loss": 0.2821, - "step": 205 - }, - { - "epoch": 1.2638036809815951, - "grad_norm": 2.692087173461914, - "learning_rate": 4.807387231229287e-06, - "loss": 0.7524, - "step": 206 - }, - { - "epoch": 1.2699386503067485, - "grad_norm": 2.661738157272339, - "learning_rate": 4.8055283078728525e-06, - "loss": 0.4304, - "step": 207 - }, - { - "epoch": 1.2760736196319018, - "grad_norm": 2.9232122898101807, - "learning_rate": 4.803660820157468e-06, - "loss": 0.6986, - "step": 208 - }, - { - "epoch": 1.2822085889570551, - "grad_norm": 2.665097951889038, - "learning_rate": 4.801784775020303e-06, - "loss": 0.7112, - "step": 209 - }, - { - "epoch": 1.2883435582822087, - "grad_norm": 2.4504497051239014, - "learning_rate": 4.799900179430312e-06, - "loss": 0.4125, - "step": 210 - }, - { - "epoch": 1.294478527607362, - "grad_norm": 3.076204538345337, - "learning_rate": 4.798007040388212e-06, - "loss": 0.7057, - "step": 211 - }, - { - "epoch": 1.3006134969325154, - "grad_norm": 2.406977653503418, - "learning_rate": 4.7961053649264585e-06, - "loss": 0.708, - "step": 212 - }, - { - "epoch": 1.3067484662576687, - "grad_norm": 2.6545324325561523, - "learning_rate": 4.794195160109215e-06, - "loss": 0.7608, - "step": 213 - }, - { - "epoch": 1.312883435582822, - "grad_norm": 4.3817033767700195, - "learning_rate": 4.7922764330323315e-06, - "loss": 0.4779, - "step": 214 - }, - { - "epoch": 1.3190184049079754, - "grad_norm": 3.534566879272461, - "learning_rate": 4.790349190823313e-06, - "loss": 0.5464, - "step": 215 - }, - { - "epoch": 1.3251533742331287, - "grad_norm": 3.0323140621185303, - "learning_rate": 4.788413440641297e-06, - "loss": 0.6198, - "step": 216 - }, - { - "epoch": 1.331288343558282, - "grad_norm": 2.612746238708496, - "learning_rate": 4.786469189677026e-06, - "loss": 0.6695, - "step": 217 - }, - { - "epoch": 1.3374233128834356, - "grad_norm": 3.0299434661865234, - "learning_rate": 4.784516445152821e-06, - "loss": 0.4902, - "step": 218 - }, - { - "epoch": 1.343558282208589, - "grad_norm": 3.4521942138671875, - "learning_rate": 4.78255521432255e-06, - "loss": 0.7411, - "step": 219 - }, - { - "epoch": 1.3496932515337423, - "grad_norm": 2.6712653636932373, - "learning_rate": 4.780585504471612e-06, - "loss": 0.8767, - "step": 220 - }, - { - "epoch": 1.3558282208588956, - "grad_norm": 2.5099475383758545, - "learning_rate": 4.778607322916896e-06, - "loss": 0.4266, - "step": 221 - }, - { - "epoch": 1.3619631901840492, - "grad_norm": 2.641799211502075, - "learning_rate": 4.776620677006766e-06, - "loss": 0.4982, - "step": 222 - }, - { - "epoch": 1.3680981595092025, - "grad_norm": 3.1119771003723145, - "learning_rate": 4.7746255741210256e-06, - "loss": 0.6012, - "step": 223 - }, - { - "epoch": 1.3742331288343559, - "grad_norm": 3.9957170486450195, - "learning_rate": 4.772622021670897e-06, - "loss": 0.7585, - "step": 224 - }, - { - "epoch": 1.3803680981595092, - "grad_norm": 3.1070823669433594, - "learning_rate": 4.770610027098983e-06, - "loss": 0.5266, - "step": 225 - }, - { - "epoch": 1.3865030674846626, - "grad_norm": 2.7630460262298584, - "learning_rate": 4.7685895978792564e-06, - "loss": 0.6261, - "step": 226 - }, - { - "epoch": 1.392638036809816, - "grad_norm": 2.6509556770324707, - "learning_rate": 4.766560741517014e-06, - "loss": 0.7081, - "step": 227 - }, - { - "epoch": 1.3987730061349692, - "grad_norm": 3.0212976932525635, - "learning_rate": 4.76452346554886e-06, - "loss": 0.5041, - "step": 228 - }, - { - "epoch": 1.4049079754601226, - "grad_norm": 3.0454728603363037, - "learning_rate": 4.762477777542676e-06, - "loss": 0.49, - "step": 229 - }, - { - "epoch": 1.4110429447852761, - "grad_norm": 3.4296791553497314, - "learning_rate": 4.7604236850975905e-06, - "loss": 0.7056, - "step": 230 - }, - { - "epoch": 1.4171779141104295, - "grad_norm": 4.1885600090026855, - "learning_rate": 4.7583611958439514e-06, - "loss": 0.7762, - "step": 231 - }, - { - "epoch": 1.4233128834355828, - "grad_norm": 3.065854072570801, - "learning_rate": 4.7562903174433e-06, - "loss": 0.5347, - "step": 232 - }, - { - "epoch": 1.4294478527607362, - "grad_norm": 2.793851852416992, - "learning_rate": 4.75421105758834e-06, - "loss": 0.503, - "step": 233 - }, - { - "epoch": 1.4355828220858895, - "grad_norm": 3.123730421066284, - "learning_rate": 4.752123424002908e-06, - "loss": 0.5081, - "step": 234 - }, - { - "epoch": 1.441717791411043, - "grad_norm": 3.230161666870117, - "learning_rate": 4.750027424441949e-06, - "loss": 0.7523, - "step": 235 - }, - { - "epoch": 1.4478527607361964, - "grad_norm": 2.4970247745513916, - "learning_rate": 4.747923066691487e-06, - "loss": 0.5575, - "step": 236 - }, - { - "epoch": 1.4539877300613497, - "grad_norm": 2.9880685806274414, - "learning_rate": 4.745810358568588e-06, - "loss": 0.7264, - "step": 237 - }, - { - "epoch": 1.460122699386503, - "grad_norm": 2.555328369140625, - "learning_rate": 4.743689307921342e-06, - "loss": 0.4545, - "step": 238 - }, - { - "epoch": 1.4662576687116564, - "grad_norm": 3.144932746887207, - "learning_rate": 4.741559922628828e-06, - "loss": 0.5429, - "step": 239 - }, - { - "epoch": 1.4723926380368098, - "grad_norm": 3.059807062149048, - "learning_rate": 4.739422210601085e-06, - "loss": 0.5086, - "step": 240 - }, - { - "epoch": 1.478527607361963, - "grad_norm": 3.374303102493286, - "learning_rate": 4.7372761797790836e-06, - "loss": 0.6109, - "step": 241 - }, - { - "epoch": 1.4846625766871164, - "grad_norm": 2.4506947994232178, - "learning_rate": 4.735121838134697e-06, - "loss": 0.4317, - "step": 242 - }, - { - "epoch": 1.49079754601227, - "grad_norm": 2.9039974212646484, - "learning_rate": 4.732959193670672e-06, - "loss": 0.6414, - "step": 243 - }, - { - "epoch": 1.4969325153374233, - "grad_norm": 2.9412453174591064, - "learning_rate": 4.730788254420593e-06, - "loss": 0.5166, - "step": 244 - }, - { - "epoch": 1.5030674846625767, - "grad_norm": 2.500716209411621, - "learning_rate": 4.728609028448862e-06, - "loss": 0.4982, - "step": 245 - }, - { - "epoch": 1.50920245398773, - "grad_norm": 2.4233803749084473, - "learning_rate": 4.726421523850662e-06, - "loss": 0.7552, - "step": 246 - }, - { - "epoch": 1.5153374233128836, - "grad_norm": 2.357003688812256, - "learning_rate": 4.7242257487519275e-06, - "loss": 0.4365, - "step": 247 - }, - { - "epoch": 1.521472392638037, - "grad_norm": 2.6406495571136475, - "learning_rate": 4.722021711309317e-06, - "loss": 0.6002, - "step": 248 - }, - { - "epoch": 1.5276073619631902, - "grad_norm": 2.736884832382202, - "learning_rate": 4.7198094197101826e-06, - "loss": 0.4993, - "step": 249 - }, - { - "epoch": 1.5337423312883436, - "grad_norm": 3.5238845348358154, - "learning_rate": 4.7175888821725335e-06, - "loss": 0.4637, - "step": 250 - }, - { - "epoch": 1.539877300613497, - "grad_norm": 3.3783695697784424, - "learning_rate": 4.715360106945015e-06, - "loss": 0.9711, - "step": 251 - }, - { - "epoch": 1.5460122699386503, - "grad_norm": 2.9685862064361572, - "learning_rate": 4.713123102306869e-06, - "loss": 0.5452, - "step": 252 - }, - { - "epoch": 1.5521472392638036, - "grad_norm": 3.143733263015747, - "learning_rate": 4.710877876567912e-06, - "loss": 0.5034, - "step": 253 - }, - { - "epoch": 1.558282208588957, - "grad_norm": 2.8005623817443848, - "learning_rate": 4.708624438068494e-06, - "loss": 0.4236, - "step": 254 - }, - { - "epoch": 1.5644171779141103, - "grad_norm": 2.66581130027771, - "learning_rate": 4.706362795179476e-06, - "loss": 0.6095, - "step": 255 - }, - { - "epoch": 1.5705521472392638, - "grad_norm": 4.598043441772461, - "learning_rate": 4.7040929563021975e-06, - "loss": 0.738, - "step": 256 - }, - { - "epoch": 1.5766871165644172, - "grad_norm": 3.5643506050109863, - "learning_rate": 4.70181492986844e-06, - "loss": 0.6726, - "step": 257 - }, - { - "epoch": 1.5828220858895705, - "grad_norm": 2.865339994430542, - "learning_rate": 4.699528724340401e-06, - "loss": 0.4862, - "step": 258 - }, - { - "epoch": 1.588957055214724, - "grad_norm": 2.95529842376709, - "learning_rate": 4.6972343482106615e-06, - "loss": 0.5003, - "step": 259 - }, - { - "epoch": 1.5950920245398774, - "grad_norm": 2.45206356048584, - "learning_rate": 4.6949318100021546e-06, - "loss": 0.6734, - "step": 260 - }, - { - "epoch": 1.6012269938650308, - "grad_norm": 2.6789939403533936, - "learning_rate": 4.6926211182681295e-06, - "loss": 0.5639, - "step": 261 - }, - { - "epoch": 1.607361963190184, - "grad_norm": 3.307732582092285, - "learning_rate": 4.690302281592128e-06, - "loss": 0.7032, - "step": 262 - }, - { - "epoch": 1.6134969325153374, - "grad_norm": 2.8950445652008057, - "learning_rate": 4.687975308587944e-06, - "loss": 0.4937, - "step": 263 - }, - { - "epoch": 1.6196319018404908, - "grad_norm": 2.969377040863037, - "learning_rate": 4.685640207899598e-06, - "loss": 0.5829, - "step": 264 - }, - { - "epoch": 1.6257668711656441, - "grad_norm": 3.106433391571045, - "learning_rate": 4.683296988201301e-06, - "loss": 0.3805, - "step": 265 - }, - { - "epoch": 1.6319018404907975, - "grad_norm": 3.5599050521850586, - "learning_rate": 4.680945658197425e-06, - "loss": 0.7939, - "step": 266 - }, - { - "epoch": 1.6380368098159508, - "grad_norm": 5.008603096008301, - "learning_rate": 4.6785862266224695e-06, - "loss": 0.7511, - "step": 267 - }, - { - "epoch": 1.6441717791411041, - "grad_norm": 3.1393773555755615, - "learning_rate": 4.676218702241026e-06, - "loss": 0.8984, - "step": 268 - }, - { - "epoch": 1.6503067484662577, - "grad_norm": 3.0241408348083496, - "learning_rate": 4.673843093847753e-06, - "loss": 0.5473, - "step": 269 - }, - { - "epoch": 1.656441717791411, - "grad_norm": 2.9029417037963867, - "learning_rate": 4.6714594102673355e-06, - "loss": 0.6626, - "step": 270 - }, - { - "epoch": 1.6625766871165644, - "grad_norm": 3.4709246158599854, - "learning_rate": 4.669067660354456e-06, - "loss": 0.5015, - "step": 271 - }, - { - "epoch": 1.668711656441718, - "grad_norm": 2.988635778427124, - "learning_rate": 4.666667852993761e-06, - "loss": 0.5384, - "step": 272 - }, - { - "epoch": 1.6748466257668713, - "grad_norm": 3.418140411376953, - "learning_rate": 4.664259997099829e-06, - "loss": 0.7491, - "step": 273 - }, - { - "epoch": 1.6809815950920246, - "grad_norm": 2.592416763305664, - "learning_rate": 4.661844101617135e-06, - "loss": 0.6451, - "step": 274 - }, - { - "epoch": 1.687116564417178, - "grad_norm": 3.1174306869506836, - "learning_rate": 4.6594201755200205e-06, - "loss": 0.6299, - "step": 275 - }, - { - "epoch": 1.6932515337423313, - "grad_norm": 2.6569998264312744, - "learning_rate": 4.656988227812658e-06, - "loss": 0.4477, - "step": 276 - }, - { - "epoch": 1.6993865030674846, - "grad_norm": 3.5733959674835205, - "learning_rate": 4.654548267529015e-06, - "loss": 0.5473, - "step": 277 - }, - { - "epoch": 1.705521472392638, - "grad_norm": 2.7240824699401855, - "learning_rate": 4.652100303732827e-06, - "loss": 0.496, - "step": 278 - }, - { - "epoch": 1.7116564417177913, - "grad_norm": 4.1965460777282715, - "learning_rate": 4.64964434551756e-06, - "loss": 0.932, - "step": 279 - }, - { - "epoch": 1.7177914110429446, - "grad_norm": 2.3237173557281494, - "learning_rate": 4.647180402006372e-06, - "loss": 0.4648, - "step": 280 - }, - { - "epoch": 1.7239263803680982, - "grad_norm": 3.395045042037964, - "learning_rate": 4.644708482352093e-06, - "loss": 0.7237, - "step": 281 - }, - { - "epoch": 1.7300613496932515, - "grad_norm": 3.238593816757202, - "learning_rate": 4.6422285957371735e-06, - "loss": 0.5531, - "step": 282 - }, - { - "epoch": 1.7361963190184049, - "grad_norm": 3.9651403427124023, - "learning_rate": 4.639740751373663e-06, - "loss": 0.6706, - "step": 283 - }, - { - "epoch": 1.7423312883435584, - "grad_norm": 3.0042061805725098, - "learning_rate": 4.63724495850317e-06, - "loss": 0.56, - "step": 284 - }, - { - "epoch": 1.7484662576687118, - "grad_norm": 3.094310760498047, - "learning_rate": 4.634741226396832e-06, - "loss": 0.6138, - "step": 285 - }, - { - "epoch": 1.7546012269938651, - "grad_norm": 2.838168144226074, - "learning_rate": 4.632229564355275e-06, - "loss": 0.4908, - "step": 286 - }, - { - "epoch": 1.7607361963190185, - "grad_norm": 3.3452796936035156, - "learning_rate": 4.629709981708586e-06, - "loss": 0.8181, - "step": 287 - }, - { - "epoch": 1.7668711656441718, - "grad_norm": 2.6630783081054688, - "learning_rate": 4.6271824878162704e-06, - "loss": 0.5625, - "step": 288 - }, - { - "epoch": 1.7730061349693251, - "grad_norm": 2.583650588989258, - "learning_rate": 4.624647092067226e-06, - "loss": 0.3416, - "step": 289 - }, - { - "epoch": 1.7791411042944785, - "grad_norm": 2.73132586479187, - "learning_rate": 4.622103803879702e-06, - "loss": 0.3889, - "step": 290 - }, - { - "epoch": 1.7852760736196318, - "grad_norm": 4.1010260581970215, - "learning_rate": 4.619552632701263e-06, - "loss": 0.611, - "step": 291 - }, - { - "epoch": 1.7914110429447851, - "grad_norm": 4.53068208694458, - "learning_rate": 4.61699358800876e-06, - "loss": 0.7219, - "step": 292 - }, - { - "epoch": 1.7975460122699385, - "grad_norm": 3.4877254962921143, - "learning_rate": 4.614426679308291e-06, - "loss": 0.6402, - "step": 293 - }, - { - "epoch": 1.803680981595092, - "grad_norm": 2.9445226192474365, - "learning_rate": 4.611851916135166e-06, - "loss": 0.509, - "step": 294 - }, - { - "epoch": 1.8098159509202454, - "grad_norm": 2.6622228622436523, - "learning_rate": 4.609269308053872e-06, - "loss": 0.6167, - "step": 295 - }, - { - "epoch": 1.8159509202453987, - "grad_norm": 3.131530523300171, - "learning_rate": 4.606678864658039e-06, - "loss": 0.8039, - "step": 296 - }, - { - "epoch": 1.8220858895705523, - "grad_norm": 3.212188482284546, - "learning_rate": 4.604080595570399e-06, - "loss": 0.5754, - "step": 297 - }, - { - "epoch": 1.8282208588957056, - "grad_norm": 3.522850275039673, - "learning_rate": 4.601474510442759e-06, - "loss": 0.4432, - "step": 298 - }, - { - "epoch": 1.834355828220859, - "grad_norm": 2.5877151489257812, - "learning_rate": 4.598860618955957e-06, - "loss": 0.6541, - "step": 299 - }, - { - "epoch": 1.8404907975460123, - "grad_norm": 2.803833484649658, - "learning_rate": 4.596238930819832e-06, - "loss": 0.5824, - "step": 300 - }, - { - "epoch": 1.8466257668711656, - "grad_norm": 2.7125494480133057, - "learning_rate": 4.5936094557731815e-06, - "loss": 0.6976, - "step": 301 - }, - { - "epoch": 1.852760736196319, - "grad_norm": 3.6549370288848877, - "learning_rate": 4.590972203583732e-06, - "loss": 0.7105, - "step": 302 - }, - { - "epoch": 1.8588957055214723, - "grad_norm": 3.3241944313049316, - "learning_rate": 4.588327184048099e-06, - "loss": 0.7446, - "step": 303 - }, - { - "epoch": 1.8650306748466257, - "grad_norm": 2.8388822078704834, - "learning_rate": 4.585674406991752e-06, - "loss": 0.4926, - "step": 304 - }, - { - "epoch": 1.871165644171779, - "grad_norm": 2.9760420322418213, - "learning_rate": 4.5830138822689755e-06, - "loss": 0.7368, - "step": 305 - }, - { - "epoch": 1.8773006134969326, - "grad_norm": 2.5437633991241455, - "learning_rate": 4.5803456197628374e-06, - "loss": 0.4678, - "step": 306 - }, - { - "epoch": 1.883435582822086, - "grad_norm": 3.0044775009155273, - "learning_rate": 4.577669629385145e-06, - "loss": 0.4241, - "step": 307 - }, - { - "epoch": 1.8895705521472392, - "grad_norm": 2.6150901317596436, - "learning_rate": 4.574985921076418e-06, - "loss": 0.5327, - "step": 308 - }, - { - "epoch": 1.8957055214723928, - "grad_norm": 2.4425182342529297, - "learning_rate": 4.572294504805841e-06, - "loss": 0.7504, - "step": 309 - }, - { - "epoch": 1.9018404907975461, - "grad_norm": 2.9920194149017334, - "learning_rate": 4.569595390571232e-06, - "loss": 0.5194, - "step": 310 - }, - { - "epoch": 1.9079754601226995, - "grad_norm": 2.701087713241577, - "learning_rate": 4.566888588399007e-06, - "loss": 0.6862, - "step": 311 - }, - { - "epoch": 1.9141104294478528, - "grad_norm": 7.628893852233887, - "learning_rate": 4.564174108344139e-06, - "loss": 0.6867, - "step": 312 - }, - { - "epoch": 1.9202453987730062, - "grad_norm": 2.712947130203247, - "learning_rate": 4.561451960490123e-06, - "loss": 0.6942, - "step": 313 - }, - { - "epoch": 1.9263803680981595, - "grad_norm": 3.0063202381134033, - "learning_rate": 4.558722154948937e-06, - "loss": 0.6346, - "step": 314 - }, - { - "epoch": 1.9325153374233128, - "grad_norm": 2.957218647003174, - "learning_rate": 4.5559847018610034e-06, - "loss": 0.464, - "step": 315 - }, - { - "epoch": 1.9386503067484662, - "grad_norm": 3.322282552719116, - "learning_rate": 4.553239611395156e-06, - "loss": 0.6334, - "step": 316 - }, - { - "epoch": 1.9447852760736195, - "grad_norm": 3.0638647079467773, - "learning_rate": 4.550486893748596e-06, - "loss": 0.4227, - "step": 317 - }, - { - "epoch": 1.9509202453987728, - "grad_norm": 3.079087257385254, - "learning_rate": 4.547726559146862e-06, - "loss": 0.3719, - "step": 318 - }, - { - "epoch": 1.9570552147239264, - "grad_norm": 2.409914255142212, - "learning_rate": 4.544958617843782e-06, - "loss": 0.3331, - "step": 319 - }, - { - "epoch": 1.9631901840490797, - "grad_norm": 3.3441262245178223, - "learning_rate": 4.542183080121444e-06, - "loss": 0.6931, - "step": 320 - }, - { - "epoch": 1.969325153374233, - "grad_norm": 2.6624436378479004, - "learning_rate": 4.539399956290152e-06, - "loss": 0.6578, - "step": 321 - }, - { - "epoch": 1.9754601226993866, - "grad_norm": 3.463789224624634, - "learning_rate": 4.536609256688396e-06, - "loss": 0.5748, - "step": 322 - }, - { - "epoch": 1.98159509202454, - "grad_norm": 3.6827807426452637, - "learning_rate": 4.533810991682799e-06, - "loss": 0.5249, - "step": 323 - }, - { - "epoch": 1.9877300613496933, - "grad_norm": 4.125547409057617, - "learning_rate": 4.531005171668093e-06, - "loss": 0.3065, - "step": 324 - }, - { - "epoch": 1.9938650306748467, - "grad_norm": 2.935978412628174, - "learning_rate": 4.528191807067074e-06, - "loss": 0.5523, - "step": 325 - }, - { - "epoch": 2.0, - "grad_norm": 2.654388427734375, - "learning_rate": 4.525370908330564e-06, - "loss": 0.4157, - "step": 326 - } - ], - "logging_steps": 1, - "max_steps": 1630, - "num_input_tokens_seen": 0, - "num_train_epochs": 10, - "save_steps": 206, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 8.070798073082675e+16, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-489/chat_template.jinja b/metallama3_8b/limo_filtered_correct/checkpoint-489/chat_template.jinja deleted file mode 100644 index 39bd0c9f7fe30aea14eda194fee17703da4a4dbf..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-489/chat_template.jinja +++ /dev/null @@ -1,5 +0,0 @@ -{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> - -'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> - -' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-489/config.json b/metallama3_8b/limo_filtered_correct/checkpoint-489/config.json deleted file mode 100644 index ec5612543540085e09eed37e81b17ae51d1a6973..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-489/config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 128000, - "eos_token_id": 128009, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 8192, - "mlp_bias": false, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 500000.0, - "tie_word_embeddings": false, - "torch_dtype": "float32", - "transformers_version": "4.55.0", - "use_cache": false, - "vocab_size": 128256 -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-489/generation_config.json b/metallama3_8b/limo_filtered_correct/checkpoint-489/generation_config.json deleted file mode 100644 index f53ccb516e57388491adda6b9950bcfa872e93ae..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-489/generation_config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 128000, - "eos_token_id": 128009, - "transformers_version": "4.55.0", - "use_cache": false -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-489/model-00001-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-489/model-00001-of-00007.safetensors deleted file mode 100644 index a1d666ef96cbfc7d0ffedb964bba9cd8997d5a5d..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-489/model-00001-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bd3aceac2cf15d821505cb8de9a33ce7dbe633b423d73eae376f523f7762498f -size 4886466168 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-489/model-00002-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-489/model-00002-of-00007.safetensors deleted file mode 100644 index 7b532a85b3d3bef57df80466a7dc27bd1010f1ee..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-489/model-00002-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c7bd798ad5a265051b3a107ad53cef51a62ea086295286d47b01b3d21ca4ef81 -size 4832007448 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-489/model-00003-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-489/model-00003-of-00007.safetensors deleted file mode 100644 index c4dcc44d3cc1d266705288d91318c7b4bdc77bbe..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-489/model-00003-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a44c0b800d55379484f640b6ddd40999a96b0fcdafaac6aa40c14e939027f336 -size 4999813112 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-489/model-00004-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-489/model-00004-of-00007.safetensors deleted file mode 100644 index 5020e3eef5dfcec05e2a2c57781a7191e6975a29..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-489/model-00004-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a7b616c13bf45345bee23fb3be3cabac34d4dd7da8449d749f674fa83237bf44 -size 4999813128 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-489/model-00005-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-489/model-00005-of-00007.safetensors deleted file mode 100644 index 8fe17d9428de9a416446cddbc1e05323a8e3ce25..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-489/model-00005-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b47795f2a1744c6d523aee47b78c974249f5b41144907fff135f5456410f6edb -size 4832007496 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-489/model-00006-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-489/model-00006-of-00007.safetensors deleted file mode 100644 index 0ea4ef551282803b4fdbde8658ba888075997a83..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-489/model-00006-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:07bd50e6a1c1ae6b221cd92ebd9e158215baf67aaa3f2dcbd8395f9c1274b307 -size 4999813120 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-489/model-00007-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-489/model-00007-of-00007.safetensors deleted file mode 100644 index 6adddbd0ec097951b40de69a6c3c73ee60f0269d..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-489/model-00007-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:95bf8b07e4a3008349a747770e592b09619fbfa26d1c8cc8812986ae2013bda6 -size 2571158184 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-489/model.safetensors.index.json b/metallama3_8b/limo_filtered_correct/checkpoint-489/model.safetensors.index.json deleted file mode 100644 index 30d31d54f352f0c71ad48745af612a088822fa48..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-489/model.safetensors.index.json +++ /dev/null @@ -1,299 +0,0 @@ -{ - "metadata": { - "total_parameters": 2007565312, - "total_size": 32121044992 - }, - "weight_map": { - "lm_head.weight": "model-00007-of-00007.safetensors", - "model.embed_tokens.weight": "model-00001-of-00007.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.norm.weight": "model-00007-of-00007.safetensors" - } -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-489/rng_state_0.pth b/metallama3_8b/limo_filtered_correct/checkpoint-489/rng_state_0.pth deleted file mode 100644 index 5a7c482c30381cd512ccc35fe322d8a34fbf5207..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-489/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:308f94f9a5c24e1bad5c393d56ae7af7782600f4e791d9c6ac35b22fff2105b6 -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-489/rng_state_1.pth b/metallama3_8b/limo_filtered_correct/checkpoint-489/rng_state_1.pth deleted file mode 100644 index 7b862c21b28bbd89ce6b4fb681d41be05f175599..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-489/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b056f3c23cb32dc77a2ec9e7651e0b64e4440e21f0fdf969b86bfc56a1cbdf06 -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-489/rng_state_2.pth b/metallama3_8b/limo_filtered_correct/checkpoint-489/rng_state_2.pth deleted file mode 100644 index d86ce886844e0298f058d67065e5eeb27ffe7e48..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-489/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f3f8a05714bc528f4885a2816181652f2303b3e8150f89b56aaee6bec56aa520 -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-489/rng_state_3.pth b/metallama3_8b/limo_filtered_correct/checkpoint-489/rng_state_3.pth deleted file mode 100644 index 10733f5da657367adf3f67760028644c0839660f..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-489/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4f755bd3c330281961e5c03af9d10ce8c1e1678619d384f6f1fd5fd7dce2ff50 -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-489/scheduler.pt b/metallama3_8b/limo_filtered_correct/checkpoint-489/scheduler.pt deleted file mode 100644 index db770aedd03f0bf0b279cc4d5af77c3aafd301a3..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-489/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d79dff6f1b534f18ce078f3e3f6201edec8d20bb774122a0da035d291585012c -size 1064 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-489/special_tokens_map.json b/metallama3_8b/limo_filtered_correct/checkpoint-489/special_tokens_map.json deleted file mode 100644 index 14daf4588e61b4e4983af0fccaba4d5500c0977c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-489/special_tokens_map.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "additional_special_tokens": [ - { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } - ], - "bos_token": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": "<|eot_id|>" -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-489/tokenizer.json b/metallama3_8b/limo_filtered_correct/checkpoint-489/tokenizer.json deleted file mode 100644 index 172311123ab62378f1f6d90f3068a676b7d939ed..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-489/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c1dcab308e7cf5970ea38815e0a62887d705c5b436f869ca27a5dcdd40c36a6 -size 17210148 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-489/tokenizer_config.json b/metallama3_8b/limo_filtered_correct/checkpoint-489/tokenizer_config.json deleted file mode 100644 index 6739fcd129e717b71b64001dcb25a03c143d66f5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-489/tokenizer_config.json +++ /dev/null @@ -1,2076 +0,0 @@ -{ - "added_tokens_decoder": { - "128000": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128001": { - "content": "<|end_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128002": { - "content": "<|reserved_special_token_0|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128003": { - "content": "<|reserved_special_token_1|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128004": { - "content": "<|reserved_special_token_2|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128005": { - "content": "<|reserved_special_token_3|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128006": { - "content": "<|start_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128007": { - "content": "<|end_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128008": { - "content": "<|reserved_special_token_4|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128009": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128010": { - "content": "<|reserved_special_token_5|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128011": { - "content": "<|reserved_special_token_6|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128012": { - "content": "<|reserved_special_token_7|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128013": { - "content": "<|reserved_special_token_8|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128014": { - "content": "<|reserved_special_token_9|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128015": { - "content": "<|reserved_special_token_10|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128016": { - "content": "<|reserved_special_token_11|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128017": { - "content": "<|reserved_special_token_12|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128018": { - "content": "<|reserved_special_token_13|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128019": { - "content": "<|reserved_special_token_14|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128020": { - "content": "<|reserved_special_token_15|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128021": { - "content": "<|reserved_special_token_16|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128022": { - "content": "<|reserved_special_token_17|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128023": { - "content": "<|reserved_special_token_18|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128024": { - "content": "<|reserved_special_token_19|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128025": { - "content": "<|reserved_special_token_20|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128026": { - "content": "<|reserved_special_token_21|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128027": { - "content": "<|reserved_special_token_22|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128028": { - "content": "<|reserved_special_token_23|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128029": { - "content": "<|reserved_special_token_24|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128030": { - "content": "<|reserved_special_token_25|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128031": { - "content": "<|reserved_special_token_26|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128032": { - "content": "<|reserved_special_token_27|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128033": { - "content": "<|reserved_special_token_28|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128034": { - "content": "<|reserved_special_token_29|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128035": { - "content": "<|reserved_special_token_30|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128036": { - "content": "<|reserved_special_token_31|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128037": { - "content": "<|reserved_special_token_32|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128038": { - "content": "<|reserved_special_token_33|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128039": { - "content": "<|reserved_special_token_34|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128040": { - "content": "<|reserved_special_token_35|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128041": { - "content": "<|reserved_special_token_36|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128042": { - "content": "<|reserved_special_token_37|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128043": { - "content": "<|reserved_special_token_38|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128044": { - "content": "<|reserved_special_token_39|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128045": { - "content": "<|reserved_special_token_40|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128046": { - "content": "<|reserved_special_token_41|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128047": { - "content": "<|reserved_special_token_42|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128048": { - "content": "<|reserved_special_token_43|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128049": { - "content": "<|reserved_special_token_44|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128050": { - "content": "<|reserved_special_token_45|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128051": { - "content": "<|reserved_special_token_46|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128052": { - "content": "<|reserved_special_token_47|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128053": { - "content": "<|reserved_special_token_48|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128054": { - "content": "<|reserved_special_token_49|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128055": { - "content": "<|reserved_special_token_50|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128056": { - "content": "<|reserved_special_token_51|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128057": { - "content": "<|reserved_special_token_52|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128058": { - "content": "<|reserved_special_token_53|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128059": { - "content": "<|reserved_special_token_54|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128060": { - "content": "<|reserved_special_token_55|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128061": { - "content": "<|reserved_special_token_56|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128062": { - "content": "<|reserved_special_token_57|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128063": { - "content": "<|reserved_special_token_58|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128064": { - "content": "<|reserved_special_token_59|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128065": { - "content": "<|reserved_special_token_60|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128066": { - "content": "<|reserved_special_token_61|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128067": { - "content": "<|reserved_special_token_62|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128068": { - "content": "<|reserved_special_token_63|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128069": { - "content": "<|reserved_special_token_64|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128070": { - "content": "<|reserved_special_token_65|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128071": { - "content": "<|reserved_special_token_66|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128072": { - "content": "<|reserved_special_token_67|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128073": { - "content": "<|reserved_special_token_68|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128074": { - "content": "<|reserved_special_token_69|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128075": { - "content": "<|reserved_special_token_70|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128076": { - "content": "<|reserved_special_token_71|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128077": { - "content": "<|reserved_special_token_72|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128078": { - "content": "<|reserved_special_token_73|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128079": { - "content": "<|reserved_special_token_74|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128080": { - "content": "<|reserved_special_token_75|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128081": { - "content": "<|reserved_special_token_76|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128082": { - "content": "<|reserved_special_token_77|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128083": { - "content": "<|reserved_special_token_78|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128084": { - "content": "<|reserved_special_token_79|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128085": { - "content": "<|reserved_special_token_80|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128086": { - "content": "<|reserved_special_token_81|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128087": { - "content": "<|reserved_special_token_82|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128088": { - "content": "<|reserved_special_token_83|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128089": { - "content": "<|reserved_special_token_84|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128090": { - "content": "<|reserved_special_token_85|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128091": { - "content": "<|reserved_special_token_86|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128092": { - "content": "<|reserved_special_token_87|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128093": { - "content": "<|reserved_special_token_88|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128094": { - "content": "<|reserved_special_token_89|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128095": { - "content": "<|reserved_special_token_90|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128096": { - "content": "<|reserved_special_token_91|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128097": { - "content": "<|reserved_special_token_92|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128098": { - "content": "<|reserved_special_token_93|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128099": { - "content": "<|reserved_special_token_94|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128100": { - "content": "<|reserved_special_token_95|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128101": { - "content": "<|reserved_special_token_96|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128102": { - "content": "<|reserved_special_token_97|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128103": { - "content": "<|reserved_special_token_98|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128104": { - "content": "<|reserved_special_token_99|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128105": { - "content": "<|reserved_special_token_100|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128106": { - "content": "<|reserved_special_token_101|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128107": { - "content": "<|reserved_special_token_102|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128108": { - "content": "<|reserved_special_token_103|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128109": { - "content": "<|reserved_special_token_104|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128110": { - "content": "<|reserved_special_token_105|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128111": { - "content": "<|reserved_special_token_106|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128112": { - "content": "<|reserved_special_token_107|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128113": { - "content": "<|reserved_special_token_108|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128114": { - "content": "<|reserved_special_token_109|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128115": { - "content": "<|reserved_special_token_110|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128116": { - "content": "<|reserved_special_token_111|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128117": { - "content": "<|reserved_special_token_112|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128118": { - "content": "<|reserved_special_token_113|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128119": { - "content": "<|reserved_special_token_114|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128120": { - "content": "<|reserved_special_token_115|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128121": { - "content": "<|reserved_special_token_116|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128122": { - "content": "<|reserved_special_token_117|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128123": { - "content": "<|reserved_special_token_118|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128124": { - "content": "<|reserved_special_token_119|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128125": { - "content": "<|reserved_special_token_120|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128126": { - "content": "<|reserved_special_token_121|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128127": { - "content": "<|reserved_special_token_122|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128128": { - "content": "<|reserved_special_token_123|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128129": { - "content": "<|reserved_special_token_124|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128130": { - "content": "<|reserved_special_token_125|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128131": { - "content": "<|reserved_special_token_126|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128132": { - "content": "<|reserved_special_token_127|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128133": { - "content": "<|reserved_special_token_128|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128134": { - "content": "<|reserved_special_token_129|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128135": { - "content": "<|reserved_special_token_130|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128136": { - "content": "<|reserved_special_token_131|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128137": { - "content": "<|reserved_special_token_132|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128138": { - "content": "<|reserved_special_token_133|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128139": { - "content": "<|reserved_special_token_134|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128140": { - "content": "<|reserved_special_token_135|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128141": { - "content": "<|reserved_special_token_136|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128142": { - "content": "<|reserved_special_token_137|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128143": { - "content": "<|reserved_special_token_138|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128144": { - "content": "<|reserved_special_token_139|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128145": { - "content": "<|reserved_special_token_140|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128146": { - "content": "<|reserved_special_token_141|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128147": { - "content": "<|reserved_special_token_142|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128148": { - "content": "<|reserved_special_token_143|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128149": { - "content": "<|reserved_special_token_144|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128150": { - "content": "<|reserved_special_token_145|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128151": { - "content": "<|reserved_special_token_146|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128152": { - "content": "<|reserved_special_token_147|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128153": { - "content": "<|reserved_special_token_148|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128154": { - "content": "<|reserved_special_token_149|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128155": { - "content": "<|reserved_special_token_150|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128156": { - "content": "<|reserved_special_token_151|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128157": { - "content": "<|reserved_special_token_152|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128158": { - "content": "<|reserved_special_token_153|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128159": { - "content": "<|reserved_special_token_154|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128160": { - "content": "<|reserved_special_token_155|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128161": { - "content": "<|reserved_special_token_156|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128162": { - "content": "<|reserved_special_token_157|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128163": { - "content": "<|reserved_special_token_158|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128164": { - "content": "<|reserved_special_token_159|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128165": { - "content": "<|reserved_special_token_160|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128166": { - "content": "<|reserved_special_token_161|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128167": { - "content": "<|reserved_special_token_162|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128168": { - "content": "<|reserved_special_token_163|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128169": { - "content": "<|reserved_special_token_164|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128170": { - "content": "<|reserved_special_token_165|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128171": { - "content": "<|reserved_special_token_166|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128172": { - "content": "<|reserved_special_token_167|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128173": { - "content": "<|reserved_special_token_168|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128174": { - "content": "<|reserved_special_token_169|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128175": { - "content": "<|reserved_special_token_170|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128176": { - "content": "<|reserved_special_token_171|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128177": { - "content": "<|reserved_special_token_172|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128178": { - "content": "<|reserved_special_token_173|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128179": { - "content": "<|reserved_special_token_174|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128180": { - "content": "<|reserved_special_token_175|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128181": { - "content": "<|reserved_special_token_176|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128182": { - "content": "<|reserved_special_token_177|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128183": { - "content": "<|reserved_special_token_178|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128184": { - "content": "<|reserved_special_token_179|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128185": { - "content": "<|reserved_special_token_180|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128186": { - "content": "<|reserved_special_token_181|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128187": { - "content": "<|reserved_special_token_182|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128188": { - "content": "<|reserved_special_token_183|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128189": { - "content": "<|reserved_special_token_184|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128190": { - "content": "<|reserved_special_token_185|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128191": { - "content": "<|reserved_special_token_186|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128192": { - "content": "<|reserved_special_token_187|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128193": { - "content": "<|reserved_special_token_188|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128194": { - "content": "<|reserved_special_token_189|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128195": { - "content": "<|reserved_special_token_190|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128196": { - "content": "<|reserved_special_token_191|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128197": { - "content": "<|reserved_special_token_192|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128198": { - "content": "<|reserved_special_token_193|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128199": { - "content": "<|reserved_special_token_194|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128200": { - "content": "<|reserved_special_token_195|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128201": { - "content": "<|reserved_special_token_196|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128202": { - "content": "<|reserved_special_token_197|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128203": { - "content": "<|reserved_special_token_198|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128204": { - "content": "<|reserved_special_token_199|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128205": { - "content": "<|reserved_special_token_200|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128206": { - "content": "<|reserved_special_token_201|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128207": { - "content": "<|reserved_special_token_202|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128208": { - "content": "<|reserved_special_token_203|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128209": { - "content": "<|reserved_special_token_204|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128210": { - "content": "<|reserved_special_token_205|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128211": { - "content": "<|reserved_special_token_206|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128212": { - "content": "<|reserved_special_token_207|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128213": { - "content": "<|reserved_special_token_208|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128214": { - "content": "<|reserved_special_token_209|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128215": { - "content": "<|reserved_special_token_210|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128216": { - "content": "<|reserved_special_token_211|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128217": { - "content": "<|reserved_special_token_212|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128218": { - "content": "<|reserved_special_token_213|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128219": { - "content": "<|reserved_special_token_214|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128220": { - "content": "<|reserved_special_token_215|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128221": { - "content": "<|reserved_special_token_216|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128222": { - "content": "<|reserved_special_token_217|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128223": { - "content": "<|reserved_special_token_218|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128224": { - "content": "<|reserved_special_token_219|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128225": { - "content": "<|reserved_special_token_220|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128226": { - "content": "<|reserved_special_token_221|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128227": { - "content": "<|reserved_special_token_222|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128228": { - "content": "<|reserved_special_token_223|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128229": { - "content": "<|reserved_special_token_224|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128230": { - "content": "<|reserved_special_token_225|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128231": { - "content": "<|reserved_special_token_226|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128232": { - "content": "<|reserved_special_token_227|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128233": { - "content": "<|reserved_special_token_228|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128234": { - "content": "<|reserved_special_token_229|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128235": { - "content": "<|reserved_special_token_230|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128236": { - "content": "<|reserved_special_token_231|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128237": { - "content": "<|reserved_special_token_232|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128238": { - "content": "<|reserved_special_token_233|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128239": { - "content": "<|reserved_special_token_234|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128240": { - "content": "<|reserved_special_token_235|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128241": { - "content": "<|reserved_special_token_236|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128242": { - "content": "<|reserved_special_token_237|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128243": { - "content": "<|reserved_special_token_238|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128244": { - "content": "<|reserved_special_token_239|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128245": { - "content": "<|reserved_special_token_240|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128246": { - "content": "<|reserved_special_token_241|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128247": { - "content": "<|reserved_special_token_242|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128248": { - "content": "<|reserved_special_token_243|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128249": { - "content": "<|reserved_special_token_244|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128250": { - "content": "<|reserved_special_token_245|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128251": { - "content": "<|reserved_special_token_246|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128252": { - "content": "<|reserved_special_token_247|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128253": { - "content": "<|reserved_special_token_248|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128254": { - "content": "<|reserved_special_token_249|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128255": { - "content": "<|reserved_special_token_250|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128256": { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - } - }, - "additional_special_tokens": [ - "<|eom_id|>" - ], - "bos_token": "<|begin_of_text|>", - "clean_up_tokenization_spaces": true, - "eos_token": "<|eot_id|>", - "extra_special_tokens": {}, - "model_input_names": [ - "input_ids", - "attention_mask" - ], - "model_max_length": 1000000000000000019884624838656, - "pad_token": "<|eot_id|>", - "padding_side": "right", - "split_special_tokens": false, - "tokenizer_class": "PreTrainedTokenizerFast" -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-489/trainer_state.json b/metallama3_8b/limo_filtered_correct/checkpoint-489/trainer_state.json deleted file mode 100644 index 56ba48611d00c8914e13c2391eae7ea287097af9..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-489/trainer_state.json +++ /dev/null @@ -1,3457 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 3.0, - "eval_steps": 500, - "global_step": 489, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.006134969325153374, - "grad_norm": 5.908512115478516, - "learning_rate": 5e-06, - "loss": 0.9606, - "step": 1 - }, - { - "epoch": 0.012269938650306749, - "grad_norm": 4.304474353790283, - "learning_rate": 4.999995356617983e-06, - "loss": 0.8609, - "step": 2 - }, - { - "epoch": 0.018404907975460124, - "grad_norm": 5.63697624206543, - "learning_rate": 4.999981426489179e-06, - "loss": 1.3543, - "step": 3 - }, - { - "epoch": 0.024539877300613498, - "grad_norm": 3.6674246788024902, - "learning_rate": 4.999958209665336e-06, - "loss": 0.787, - "step": 4 - }, - { - "epoch": 0.03067484662576687, - "grad_norm": 48.14854431152344, - "learning_rate": 4.999925706232695e-06, - "loss": 1.7786, - "step": 5 - }, - { - "epoch": 0.03680981595092025, - "grad_norm": 7.8689866065979, - "learning_rate": 4.999883916312e-06, - "loss": 1.2175, - "step": 6 - }, - { - "epoch": 0.04294478527607362, - "grad_norm": 5.119968891143799, - "learning_rate": 4.9998328400584864e-06, - "loss": 0.8998, - "step": 7 - }, - { - "epoch": 0.049079754601226995, - "grad_norm": 3.730757713317871, - "learning_rate": 4.999772477661888e-06, - "loss": 0.8419, - "step": 8 - }, - { - "epoch": 0.05521472392638037, - "grad_norm": 27.314565658569336, - "learning_rate": 4.999702829346432e-06, - "loss": 1.7948, - "step": 9 - }, - { - "epoch": 0.06134969325153374, - "grad_norm": 3.822697162628174, - "learning_rate": 4.999623895370843e-06, - "loss": 1.0461, - "step": 10 - }, - { - "epoch": 0.06748466257668712, - "grad_norm": 4.71220588684082, - "learning_rate": 4.999535676028338e-06, - "loss": 1.0, - "step": 11 - }, - { - "epoch": 0.0736196319018405, - "grad_norm": 3.2378087043762207, - "learning_rate": 4.999438171646624e-06, - "loss": 0.9475, - "step": 12 - }, - { - "epoch": 0.07975460122699386, - "grad_norm": 3.475543737411499, - "learning_rate": 4.999331382587901e-06, - "loss": 0.8654, - "step": 13 - }, - { - "epoch": 0.08588957055214724, - "grad_norm": 10.06365966796875, - "learning_rate": 4.999215309248861e-06, - "loss": 1.2042, - "step": 14 - }, - { - "epoch": 0.09202453987730061, - "grad_norm": 3.785153865814209, - "learning_rate": 4.999089952060681e-06, - "loss": 0.8846, - "step": 15 - }, - { - "epoch": 0.09815950920245399, - "grad_norm": 2.944488048553467, - "learning_rate": 4.998955311489025e-06, - "loss": 0.8805, - "step": 16 - }, - { - "epoch": 0.10429447852760736, - "grad_norm": 39.89304733276367, - "learning_rate": 4.998811388034046e-06, - "loss": 1.5882, - "step": 17 - }, - { - "epoch": 0.11042944785276074, - "grad_norm": 3.5883963108062744, - "learning_rate": 4.9986581822303746e-06, - "loss": 0.9222, - "step": 18 - }, - { - "epoch": 0.1165644171779141, - "grad_norm": 6.972247123718262, - "learning_rate": 4.998495694647127e-06, - "loss": 1.4088, - "step": 19 - }, - { - "epoch": 0.12269938650306748, - "grad_norm": 3.948991298675537, - "learning_rate": 4.998323925887895e-06, - "loss": 1.454, - "step": 20 - }, - { - "epoch": 0.12883435582822086, - "grad_norm": 3.8690035343170166, - "learning_rate": 4.998142876590749e-06, - "loss": 0.6335, - "step": 21 - }, - { - "epoch": 0.13496932515337423, - "grad_norm": 5.243765830993652, - "learning_rate": 4.997952547428236e-06, - "loss": 0.6725, - "step": 22 - }, - { - "epoch": 0.1411042944785276, - "grad_norm": 3.5994043350219727, - "learning_rate": 4.997752939107372e-06, - "loss": 0.7814, - "step": 23 - }, - { - "epoch": 0.147239263803681, - "grad_norm": 4.06965970993042, - "learning_rate": 4.997544052369642e-06, - "loss": 0.9683, - "step": 24 - }, - { - "epoch": 0.15337423312883436, - "grad_norm": 3.3247246742248535, - "learning_rate": 4.997325887990999e-06, - "loss": 0.9414, - "step": 25 - }, - { - "epoch": 0.15950920245398773, - "grad_norm": 5.811742782592773, - "learning_rate": 4.997098446781861e-06, - "loss": 0.8894, - "step": 26 - }, - { - "epoch": 0.1656441717791411, - "grad_norm": 2.661334753036499, - "learning_rate": 4.996861729587103e-06, - "loss": 0.7708, - "step": 27 - }, - { - "epoch": 0.17177914110429449, - "grad_norm": 2.863943576812744, - "learning_rate": 4.996615737286061e-06, - "loss": 0.6995, - "step": 28 - }, - { - "epoch": 0.17791411042944785, - "grad_norm": 20.376733779907227, - "learning_rate": 4.996360470792524e-06, - "loss": 1.2563, - "step": 29 - }, - { - "epoch": 0.18404907975460122, - "grad_norm": 3.62265682220459, - "learning_rate": 4.996095931054731e-06, - "loss": 0.7266, - "step": 30 - }, - { - "epoch": 0.1901840490797546, - "grad_norm": 3.915076732635498, - "learning_rate": 4.9958221190553705e-06, - "loss": 0.9227, - "step": 31 - }, - { - "epoch": 0.19631901840490798, - "grad_norm": 3.129855155944824, - "learning_rate": 4.995539035811572e-06, - "loss": 0.701, - "step": 32 - }, - { - "epoch": 0.20245398773006135, - "grad_norm": 2.7532224655151367, - "learning_rate": 4.9952466823749076e-06, - "loss": 0.6491, - "step": 33 - }, - { - "epoch": 0.2085889570552147, - "grad_norm": 2.8444128036499023, - "learning_rate": 4.9949450598313835e-06, - "loss": 0.8029, - "step": 34 - }, - { - "epoch": 0.2147239263803681, - "grad_norm": 2.57743239402771, - "learning_rate": 4.994634169301439e-06, - "loss": 0.8785, - "step": 35 - }, - { - "epoch": 0.22085889570552147, - "grad_norm": 3.280055284500122, - "learning_rate": 4.994314011939941e-06, - "loss": 1.034, - "step": 36 - }, - { - "epoch": 0.22699386503067484, - "grad_norm": 2.455838680267334, - "learning_rate": 4.99398458893618e-06, - "loss": 0.8557, - "step": 37 - }, - { - "epoch": 0.2331288343558282, - "grad_norm": 4.72681188583374, - "learning_rate": 4.993645901513865e-06, - "loss": 1.1904, - "step": 38 - }, - { - "epoch": 0.2392638036809816, - "grad_norm": 3.0585641860961914, - "learning_rate": 4.993297950931121e-06, - "loss": 0.7668, - "step": 39 - }, - { - "epoch": 0.24539877300613497, - "grad_norm": 2.4603540897369385, - "learning_rate": 4.9929407384804806e-06, - "loss": 0.8812, - "step": 40 - }, - { - "epoch": 0.25153374233128833, - "grad_norm": 2.9702436923980713, - "learning_rate": 4.992574265488883e-06, - "loss": 0.8878, - "step": 41 - }, - { - "epoch": 0.25766871165644173, - "grad_norm": 2.6973602771759033, - "learning_rate": 4.9921985333176694e-06, - "loss": 0.7251, - "step": 42 - }, - { - "epoch": 0.26380368098159507, - "grad_norm": 2.5542335510253906, - "learning_rate": 4.991813543362572e-06, - "loss": 0.6638, - "step": 43 - }, - { - "epoch": 0.26993865030674846, - "grad_norm": 3.7530782222747803, - "learning_rate": 4.991419297053716e-06, - "loss": 1.0725, - "step": 44 - }, - { - "epoch": 0.27607361963190186, - "grad_norm": 2.6483025550842285, - "learning_rate": 4.991015795855611e-06, - "loss": 0.7238, - "step": 45 - }, - { - "epoch": 0.2822085889570552, - "grad_norm": 3.434422492980957, - "learning_rate": 4.990603041267144e-06, - "loss": 0.9188, - "step": 46 - }, - { - "epoch": 0.2883435582822086, - "grad_norm": 2.914340019226074, - "learning_rate": 4.990181034821578e-06, - "loss": 0.6158, - "step": 47 - }, - { - "epoch": 0.294478527607362, - "grad_norm": 2.7211625576019287, - "learning_rate": 4.98974977808654e-06, - "loss": 0.7165, - "step": 48 - }, - { - "epoch": 0.3006134969325153, - "grad_norm": 2.8414249420166016, - "learning_rate": 4.989309272664026e-06, - "loss": 0.7277, - "step": 49 - }, - { - "epoch": 0.3067484662576687, - "grad_norm": 3.683204412460327, - "learning_rate": 4.988859520190381e-06, - "loss": 0.9793, - "step": 50 - }, - { - "epoch": 0.3128834355828221, - "grad_norm": 3.1732583045959473, - "learning_rate": 4.988400522336304e-06, - "loss": 0.8966, - "step": 51 - }, - { - "epoch": 0.31901840490797545, - "grad_norm": 2.7789194583892822, - "learning_rate": 4.9879322808068365e-06, - "loss": 0.8191, - "step": 52 - }, - { - "epoch": 0.32515337423312884, - "grad_norm": 2.754816770553589, - "learning_rate": 4.987454797341358e-06, - "loss": 0.6308, - "step": 53 - }, - { - "epoch": 0.3312883435582822, - "grad_norm": 2.730104684829712, - "learning_rate": 4.98696807371358e-06, - "loss": 0.8226, - "step": 54 - }, - { - "epoch": 0.3374233128834356, - "grad_norm": 3.2225449085235596, - "learning_rate": 4.986472111731536e-06, - "loss": 0.9184, - "step": 55 - }, - { - "epoch": 0.34355828220858897, - "grad_norm": 3.2684760093688965, - "learning_rate": 4.985966913237581e-06, - "loss": 0.6593, - "step": 56 - }, - { - "epoch": 0.3496932515337423, - "grad_norm": 2.43105411529541, - "learning_rate": 4.985452480108376e-06, - "loss": 0.6994, - "step": 57 - }, - { - "epoch": 0.3558282208588957, - "grad_norm": 7.366360664367676, - "learning_rate": 4.984928814254889e-06, - "loss": 1.1374, - "step": 58 - }, - { - "epoch": 0.3619631901840491, - "grad_norm": 2.81864333152771, - "learning_rate": 4.984395917622387e-06, - "loss": 0.8097, - "step": 59 - }, - { - "epoch": 0.36809815950920244, - "grad_norm": 3.1107730865478516, - "learning_rate": 4.9838537921904206e-06, - "loss": 0.8511, - "step": 60 - }, - { - "epoch": 0.37423312883435583, - "grad_norm": 2.460545301437378, - "learning_rate": 4.9833024399728295e-06, - "loss": 0.898, - "step": 61 - }, - { - "epoch": 0.3803680981595092, - "grad_norm": 2.921992778778076, - "learning_rate": 4.982741863017722e-06, - "loss": 0.6671, - "step": 62 - }, - { - "epoch": 0.38650306748466257, - "grad_norm": 3.3006443977355957, - "learning_rate": 4.982172063407479e-06, - "loss": 1.0559, - "step": 63 - }, - { - "epoch": 0.39263803680981596, - "grad_norm": 2.642587661743164, - "learning_rate": 4.9815930432587365e-06, - "loss": 0.6663, - "step": 64 - }, - { - "epoch": 0.3987730061349693, - "grad_norm": 2.905898094177246, - "learning_rate": 4.981004804722384e-06, - "loss": 0.6895, - "step": 65 - }, - { - "epoch": 0.4049079754601227, - "grad_norm": 2.9174182415008545, - "learning_rate": 4.980407349983556e-06, - "loss": 0.7982, - "step": 66 - }, - { - "epoch": 0.4110429447852761, - "grad_norm": 2.214322805404663, - "learning_rate": 4.979800681261619e-06, - "loss": 0.6808, - "step": 67 - }, - { - "epoch": 0.4171779141104294, - "grad_norm": 2.7152462005615234, - "learning_rate": 4.9791848008101705e-06, - "loss": 0.567, - "step": 68 - }, - { - "epoch": 0.4233128834355828, - "grad_norm": 2.5657734870910645, - "learning_rate": 4.978559710917024e-06, - "loss": 0.7745, - "step": 69 - }, - { - "epoch": 0.4294478527607362, - "grad_norm": 3.9103832244873047, - "learning_rate": 4.977925413904205e-06, - "loss": 0.9815, - "step": 70 - }, - { - "epoch": 0.43558282208588955, - "grad_norm": 4.610236644744873, - "learning_rate": 4.9772819121279395e-06, - "loss": 1.164, - "step": 71 - }, - { - "epoch": 0.44171779141104295, - "grad_norm": 3.01170015335083, - "learning_rate": 4.976629207978648e-06, - "loss": 0.7587, - "step": 72 - }, - { - "epoch": 0.44785276073619634, - "grad_norm": 3.175889253616333, - "learning_rate": 4.975967303880933e-06, - "loss": 0.58, - "step": 73 - }, - { - "epoch": 0.4539877300613497, - "grad_norm": 2.503741502761841, - "learning_rate": 4.975296202293575e-06, - "loss": 0.7253, - "step": 74 - }, - { - "epoch": 0.4601226993865031, - "grad_norm": 2.6778078079223633, - "learning_rate": 4.974615905709518e-06, - "loss": 0.7352, - "step": 75 - }, - { - "epoch": 0.4662576687116564, - "grad_norm": 5.950812816619873, - "learning_rate": 4.973926416655863e-06, - "loss": 1.0643, - "step": 76 - }, - { - "epoch": 0.4723926380368098, - "grad_norm": 3.0165305137634277, - "learning_rate": 4.973227737693858e-06, - "loss": 0.6699, - "step": 77 - }, - { - "epoch": 0.4785276073619632, - "grad_norm": 4.793259620666504, - "learning_rate": 4.972519871418894e-06, - "loss": 1.0315, - "step": 78 - }, - { - "epoch": 0.48466257668711654, - "grad_norm": 3.632815361022949, - "learning_rate": 4.971802820460481e-06, - "loss": 0.7003, - "step": 79 - }, - { - "epoch": 0.49079754601226994, - "grad_norm": 3.077507734298706, - "learning_rate": 4.971076587482254e-06, - "loss": 0.6776, - "step": 80 - }, - { - "epoch": 0.49693251533742333, - "grad_norm": 3.3886241912841797, - "learning_rate": 4.970341175181957e-06, - "loss": 0.7422, - "step": 81 - }, - { - "epoch": 0.5030674846625767, - "grad_norm": 2.71288800239563, - "learning_rate": 4.969596586291425e-06, - "loss": 0.7471, - "step": 82 - }, - { - "epoch": 0.50920245398773, - "grad_norm": 2.777920961380005, - "learning_rate": 4.968842823576592e-06, - "loss": 0.8111, - "step": 83 - }, - { - "epoch": 0.5153374233128835, - "grad_norm": 6.496985912322998, - "learning_rate": 4.968079889837461e-06, - "loss": 0.9965, - "step": 84 - }, - { - "epoch": 0.5214723926380368, - "grad_norm": 2.6163430213928223, - "learning_rate": 4.967307787908108e-06, - "loss": 0.6833, - "step": 85 - }, - { - "epoch": 0.5276073619631901, - "grad_norm": 3.244098663330078, - "learning_rate": 4.966526520656663e-06, - "loss": 0.8373, - "step": 86 - }, - { - "epoch": 0.5337423312883436, - "grad_norm": 2.9027860164642334, - "learning_rate": 4.965736090985305e-06, - "loss": 0.8529, - "step": 87 - }, - { - "epoch": 0.5398773006134969, - "grad_norm": 2.3786230087280273, - "learning_rate": 4.964936501830246e-06, - "loss": 0.6577, - "step": 88 - }, - { - "epoch": 0.5460122699386503, - "grad_norm": 7.3099045753479, - "learning_rate": 4.964127756161727e-06, - "loss": 1.1184, - "step": 89 - }, - { - "epoch": 0.5521472392638037, - "grad_norm": 3.068873167037964, - "learning_rate": 4.963309856983998e-06, - "loss": 0.7906, - "step": 90 - }, - { - "epoch": 0.558282208588957, - "grad_norm": 3.082547426223755, - "learning_rate": 4.9624828073353144e-06, - "loss": 0.8107, - "step": 91 - }, - { - "epoch": 0.5644171779141104, - "grad_norm": 2.4586973190307617, - "learning_rate": 4.961646610287922e-06, - "loss": 0.7421, - "step": 92 - }, - { - "epoch": 0.5705521472392638, - "grad_norm": 2.779277801513672, - "learning_rate": 4.960801268948047e-06, - "loss": 0.7134, - "step": 93 - }, - { - "epoch": 0.5766871165644172, - "grad_norm": 3.2255213260650635, - "learning_rate": 4.959946786455882e-06, - "loss": 0.5875, - "step": 94 - }, - { - "epoch": 0.5828220858895705, - "grad_norm": 2.783395528793335, - "learning_rate": 4.959083165985581e-06, - "loss": 0.6595, - "step": 95 - }, - { - "epoch": 0.588957055214724, - "grad_norm": 2.240114212036133, - "learning_rate": 4.958210410745237e-06, - "loss": 0.793, - "step": 96 - }, - { - "epoch": 0.5950920245398773, - "grad_norm": 2.9399421215057373, - "learning_rate": 4.957328523976879e-06, - "loss": 0.5896, - "step": 97 - }, - { - "epoch": 0.6012269938650306, - "grad_norm": 3.4449355602264404, - "learning_rate": 4.956437508956458e-06, - "loss": 0.8658, - "step": 98 - }, - { - "epoch": 0.6073619631901841, - "grad_norm": 4.273710250854492, - "learning_rate": 4.9555373689938325e-06, - "loss": 0.8316, - "step": 99 - }, - { - "epoch": 0.6134969325153374, - "grad_norm": 3.4222047328948975, - "learning_rate": 4.954628107432757e-06, - "loss": 1.0613, - "step": 100 - }, - { - "epoch": 0.6196319018404908, - "grad_norm": 2.5318963527679443, - "learning_rate": 4.95370972765087e-06, - "loss": 0.7194, - "step": 101 - }, - { - "epoch": 0.6257668711656442, - "grad_norm": 2.7852585315704346, - "learning_rate": 4.952782233059683e-06, - "loss": 0.5927, - "step": 102 - }, - { - "epoch": 0.6319018404907976, - "grad_norm": 2.6532323360443115, - "learning_rate": 4.951845627104565e-06, - "loss": 0.8505, - "step": 103 - }, - { - "epoch": 0.6380368098159509, - "grad_norm": 2.3213467597961426, - "learning_rate": 4.95089991326473e-06, - "loss": 0.8682, - "step": 104 - }, - { - "epoch": 0.6441717791411042, - "grad_norm": 2.607992649078369, - "learning_rate": 4.9499450950532305e-06, - "loss": 0.8735, - "step": 105 - }, - { - "epoch": 0.6503067484662577, - "grad_norm": 3.9820072650909424, - "learning_rate": 4.94898117601693e-06, - "loss": 1.0571, - "step": 106 - }, - { - "epoch": 0.656441717791411, - "grad_norm": 3.3878824710845947, - "learning_rate": 4.948008159736507e-06, - "loss": 0.7831, - "step": 107 - }, - { - "epoch": 0.6625766871165644, - "grad_norm": 2.6935670375823975, - "learning_rate": 4.94702604982643e-06, - "loss": 0.5968, - "step": 108 - }, - { - "epoch": 0.6687116564417178, - "grad_norm": 2.78190016746521, - "learning_rate": 4.9460348499349485e-06, - "loss": 0.7504, - "step": 109 - }, - { - "epoch": 0.6748466257668712, - "grad_norm": 2.973083972930908, - "learning_rate": 4.945034563744077e-06, - "loss": 0.6728, - "step": 110 - }, - { - "epoch": 0.6809815950920245, - "grad_norm": 2.631803512573242, - "learning_rate": 4.944025194969586e-06, - "loss": 0.609, - "step": 111 - }, - { - "epoch": 0.6871165644171779, - "grad_norm": 2.7443883419036865, - "learning_rate": 4.9430067473609825e-06, - "loss": 0.8713, - "step": 112 - }, - { - "epoch": 0.6932515337423313, - "grad_norm": 2.543769121170044, - "learning_rate": 4.941979224701499e-06, - "loss": 0.8035, - "step": 113 - }, - { - "epoch": 0.6993865030674846, - "grad_norm": 3.7799901962280273, - "learning_rate": 4.94094263080808e-06, - "loss": 0.9341, - "step": 114 - }, - { - "epoch": 0.7055214723926381, - "grad_norm": 3.1234734058380127, - "learning_rate": 4.939896969531367e-06, - "loss": 1.1066, - "step": 115 - }, - { - "epoch": 0.7116564417177914, - "grad_norm": 2.356036424636841, - "learning_rate": 4.938842244755683e-06, - "loss": 0.853, - "step": 116 - }, - { - "epoch": 0.7177914110429447, - "grad_norm": 3.6231274604797363, - "learning_rate": 4.937778460399022e-06, - "loss": 0.9116, - "step": 117 - }, - { - "epoch": 0.7239263803680982, - "grad_norm": 3.1277005672454834, - "learning_rate": 4.936705620413028e-06, - "loss": 0.5888, - "step": 118 - }, - { - "epoch": 0.7300613496932515, - "grad_norm": 2.7338361740112305, - "learning_rate": 4.935623728782986e-06, - "loss": 0.592, - "step": 119 - }, - { - "epoch": 0.7361963190184049, - "grad_norm": 2.748363733291626, - "learning_rate": 4.934532789527805e-06, - "loss": 0.8713, - "step": 120 - }, - { - "epoch": 0.7423312883435583, - "grad_norm": 4.460031986236572, - "learning_rate": 4.933432806700004e-06, - "loss": 0.6791, - "step": 121 - }, - { - "epoch": 0.7484662576687117, - "grad_norm": 2.392911911010742, - "learning_rate": 4.932323784385693e-06, - "loss": 0.7531, - "step": 122 - }, - { - "epoch": 0.754601226993865, - "grad_norm": 2.7804384231567383, - "learning_rate": 4.931205726704566e-06, - "loss": 0.7547, - "step": 123 - }, - { - "epoch": 0.7607361963190185, - "grad_norm": 2.7664780616760254, - "learning_rate": 4.930078637809878e-06, - "loss": 0.7849, - "step": 124 - }, - { - "epoch": 0.7668711656441718, - "grad_norm": 2.592808723449707, - "learning_rate": 4.928942521888431e-06, - "loss": 0.7015, - "step": 125 - }, - { - "epoch": 0.7730061349693251, - "grad_norm": 2.7080585956573486, - "learning_rate": 4.927797383160561e-06, - "loss": 1.0028, - "step": 126 - }, - { - "epoch": 0.7791411042944786, - "grad_norm": 2.7941503524780273, - "learning_rate": 4.926643225880123e-06, - "loss": 0.602, - "step": 127 - }, - { - "epoch": 0.7852760736196319, - "grad_norm": 3.2796623706817627, - "learning_rate": 4.925480054334471e-06, - "loss": 0.7473, - "step": 128 - }, - { - "epoch": 0.7914110429447853, - "grad_norm": 2.7623610496520996, - "learning_rate": 4.924307872844444e-06, - "loss": 1.0573, - "step": 129 - }, - { - "epoch": 0.7975460122699386, - "grad_norm": 2.6224453449249268, - "learning_rate": 4.923126685764351e-06, - "loss": 0.7399, - "step": 130 - }, - { - "epoch": 0.803680981595092, - "grad_norm": 17.736326217651367, - "learning_rate": 4.921936497481956e-06, - "loss": 0.9548, - "step": 131 - }, - { - "epoch": 0.8098159509202454, - "grad_norm": 2.504213333129883, - "learning_rate": 4.920737312418456e-06, - "loss": 0.6748, - "step": 132 - }, - { - "epoch": 0.8159509202453987, - "grad_norm": 3.617077350616455, - "learning_rate": 4.919529135028473e-06, - "loss": 0.8431, - "step": 133 - }, - { - "epoch": 0.8220858895705522, - "grad_norm": 2.6559832096099854, - "learning_rate": 4.918311969800027e-06, - "loss": 0.7243, - "step": 134 - }, - { - "epoch": 0.8282208588957055, - "grad_norm": 2.7539305686950684, - "learning_rate": 4.917085821254532e-06, - "loss": 0.7845, - "step": 135 - }, - { - "epoch": 0.8343558282208589, - "grad_norm": 3.3587615489959717, - "learning_rate": 4.915850693946766e-06, - "loss": 0.4891, - "step": 136 - }, - { - "epoch": 0.8404907975460123, - "grad_norm": 3.064354181289673, - "learning_rate": 4.914606592464865e-06, - "loss": 0.7917, - "step": 137 - }, - { - "epoch": 0.8466257668711656, - "grad_norm": 3.2505199909210205, - "learning_rate": 4.9133535214303e-06, - "loss": 0.9681, - "step": 138 - }, - { - "epoch": 0.852760736196319, - "grad_norm": 3.8027830123901367, - "learning_rate": 4.91209148549786e-06, - "loss": 0.9275, - "step": 139 - }, - { - "epoch": 0.8588957055214724, - "grad_norm": 2.4154372215270996, - "learning_rate": 4.910820489355637e-06, - "loss": 0.7259, - "step": 140 - }, - { - "epoch": 0.8650306748466258, - "grad_norm": 2.892462968826294, - "learning_rate": 4.909540537725007e-06, - "loss": 0.6061, - "step": 141 - }, - { - "epoch": 0.8711656441717791, - "grad_norm": 3.3398196697235107, - "learning_rate": 4.908251635360616e-06, - "loss": 1.0559, - "step": 142 - }, - { - "epoch": 0.8773006134969326, - "grad_norm": 3.022512197494507, - "learning_rate": 4.906953787050354e-06, - "loss": 0.7372, - "step": 143 - }, - { - "epoch": 0.8834355828220859, - "grad_norm": 2.658661365509033, - "learning_rate": 4.905646997615347e-06, - "loss": 0.6234, - "step": 144 - }, - { - "epoch": 0.8895705521472392, - "grad_norm": 3.454400062561035, - "learning_rate": 4.904331271909932e-06, - "loss": 0.8066, - "step": 145 - }, - { - "epoch": 0.8957055214723927, - "grad_norm": 3.1300277709960938, - "learning_rate": 4.903006614821645e-06, - "loss": 0.6861, - "step": 146 - }, - { - "epoch": 0.901840490797546, - "grad_norm": 2.362537145614624, - "learning_rate": 4.901673031271194e-06, - "loss": 0.6112, - "step": 147 - }, - { - "epoch": 0.9079754601226994, - "grad_norm": 3.375577688217163, - "learning_rate": 4.900330526212451e-06, - "loss": 0.6314, - "step": 148 - }, - { - "epoch": 0.9141104294478528, - "grad_norm": 2.955656051635742, - "learning_rate": 4.898979104632427e-06, - "loss": 0.889, - "step": 149 - }, - { - "epoch": 0.9202453987730062, - "grad_norm": 2.9285926818847656, - "learning_rate": 4.897618771551255e-06, - "loss": 0.6406, - "step": 150 - }, - { - "epoch": 0.9263803680981595, - "grad_norm": 2.131819725036621, - "learning_rate": 4.8962495320221714e-06, - "loss": 0.6368, - "step": 151 - }, - { - "epoch": 0.9325153374233128, - "grad_norm": 2.780649185180664, - "learning_rate": 4.8948713911315e-06, - "loss": 0.8642, - "step": 152 - }, - { - "epoch": 0.9386503067484663, - "grad_norm": 2.941500186920166, - "learning_rate": 4.8934843539986266e-06, - "loss": 0.714, - "step": 153 - }, - { - "epoch": 0.9447852760736196, - "grad_norm": 2.7729203701019287, - "learning_rate": 4.892088425775986e-06, - "loss": 0.8365, - "step": 154 - }, - { - "epoch": 0.950920245398773, - "grad_norm": 2.6887171268463135, - "learning_rate": 4.890683611649041e-06, - "loss": 0.7937, - "step": 155 - }, - { - "epoch": 0.9570552147239264, - "grad_norm": 3.7638463973999023, - "learning_rate": 4.8892699168362626e-06, - "loss": 0.7485, - "step": 156 - }, - { - "epoch": 0.9631901840490797, - "grad_norm": 2.8132755756378174, - "learning_rate": 4.887847346589111e-06, - "loss": 0.6467, - "step": 157 - }, - { - "epoch": 0.9693251533742331, - "grad_norm": 2.652247190475464, - "learning_rate": 4.886415906192015e-06, - "loss": 0.4651, - "step": 158 - }, - { - "epoch": 0.9754601226993865, - "grad_norm": 2.5854647159576416, - "learning_rate": 4.884975600962355e-06, - "loss": 0.8756, - "step": 159 - }, - { - "epoch": 0.9815950920245399, - "grad_norm": 3.1630544662475586, - "learning_rate": 4.883526436250441e-06, - "loss": 0.7339, - "step": 160 - }, - { - "epoch": 0.9877300613496932, - "grad_norm": 2.84452748298645, - "learning_rate": 4.8820684174394935e-06, - "loss": 0.7808, - "step": 161 - }, - { - "epoch": 0.9938650306748467, - "grad_norm": 3.604048490524292, - "learning_rate": 4.880601549945622e-06, - "loss": 0.96, - "step": 162 - }, - { - "epoch": 1.0, - "grad_norm": 2.302924871444702, - "learning_rate": 4.879125839217808e-06, - "loss": 0.8122, - "step": 163 - }, - { - "epoch": 1.0061349693251533, - "grad_norm": 3.1254405975341797, - "learning_rate": 4.8776412907378845e-06, - "loss": 0.7307, - "step": 164 - }, - { - "epoch": 1.0122699386503067, - "grad_norm": 2.745603322982788, - "learning_rate": 4.8761479100205085e-06, - "loss": 0.7554, - "step": 165 - }, - { - "epoch": 1.01840490797546, - "grad_norm": 2.494840145111084, - "learning_rate": 4.874645702613152e-06, - "loss": 0.4372, - "step": 166 - }, - { - "epoch": 1.0245398773006136, - "grad_norm": 2.3526735305786133, - "learning_rate": 4.873134674096072e-06, - "loss": 0.3597, - "step": 167 - }, - { - "epoch": 1.030674846625767, - "grad_norm": 2.945887804031372, - "learning_rate": 4.871614830082297e-06, - "loss": 0.5854, - "step": 168 - }, - { - "epoch": 1.0368098159509203, - "grad_norm": 3.5723934173583984, - "learning_rate": 4.870086176217597e-06, - "loss": 0.7978, - "step": 169 - }, - { - "epoch": 1.0429447852760736, - "grad_norm": 3.2997145652770996, - "learning_rate": 4.868548718180473e-06, - "loss": 0.5593, - "step": 170 - }, - { - "epoch": 1.049079754601227, - "grad_norm": 3.4120635986328125, - "learning_rate": 4.867002461682129e-06, - "loss": 0.4083, - "step": 171 - }, - { - "epoch": 1.0552147239263803, - "grad_norm": 2.697617292404175, - "learning_rate": 4.8654474124664505e-06, - "loss": 0.4752, - "step": 172 - }, - { - "epoch": 1.0613496932515338, - "grad_norm": 5.082247734069824, - "learning_rate": 4.863883576309991e-06, - "loss": 0.7435, - "step": 173 - }, - { - "epoch": 1.0674846625766872, - "grad_norm": 2.773864984512329, - "learning_rate": 4.8623109590219395e-06, - "loss": 0.4612, - "step": 174 - }, - { - "epoch": 1.0736196319018405, - "grad_norm": 3.429703712463379, - "learning_rate": 4.860729566444106e-06, - "loss": 0.4644, - "step": 175 - }, - { - "epoch": 1.0797546012269938, - "grad_norm": 2.997938394546509, - "learning_rate": 4.8591394044508985e-06, - "loss": 0.4852, - "step": 176 - }, - { - "epoch": 1.0858895705521472, - "grad_norm": 2.549513339996338, - "learning_rate": 4.857540478949302e-06, - "loss": 0.4574, - "step": 177 - }, - { - "epoch": 1.0920245398773005, - "grad_norm": 3.459400177001953, - "learning_rate": 4.855932795878852e-06, - "loss": 0.8095, - "step": 178 - }, - { - "epoch": 1.098159509202454, - "grad_norm": 2.8103644847869873, - "learning_rate": 4.854316361211619e-06, - "loss": 0.4578, - "step": 179 - }, - { - "epoch": 1.1042944785276074, - "grad_norm": 2.631221055984497, - "learning_rate": 4.852691180952183e-06, - "loss": 0.5473, - "step": 180 - }, - { - "epoch": 1.1104294478527608, - "grad_norm": 3.189946174621582, - "learning_rate": 4.851057261137608e-06, - "loss": 0.4313, - "step": 181 - }, - { - "epoch": 1.116564417177914, - "grad_norm": 2.891418933868408, - "learning_rate": 4.8494146078374274e-06, - "loss": 0.4197, - "step": 182 - }, - { - "epoch": 1.1226993865030674, - "grad_norm": 3.239637613296509, - "learning_rate": 4.847763227153612e-06, - "loss": 0.5865, - "step": 183 - }, - { - "epoch": 1.1288343558282208, - "grad_norm": 2.484644651412964, - "learning_rate": 4.846103125220557e-06, - "loss": 0.3866, - "step": 184 - }, - { - "epoch": 1.1349693251533743, - "grad_norm": 3.1045992374420166, - "learning_rate": 4.844434308205052e-06, - "loss": 0.5357, - "step": 185 - }, - { - "epoch": 1.1411042944785277, - "grad_norm": 2.648472309112549, - "learning_rate": 4.842756782306261e-06, - "loss": 0.4783, - "step": 186 - }, - { - "epoch": 1.147239263803681, - "grad_norm": 2.5685644149780273, - "learning_rate": 4.841070553755697e-06, - "loss": 0.3733, - "step": 187 - }, - { - "epoch": 1.1533742331288344, - "grad_norm": 3.7727200984954834, - "learning_rate": 4.839375628817205e-06, - "loss": 0.6039, - "step": 188 - }, - { - "epoch": 1.1595092024539877, - "grad_norm": 2.8237369060516357, - "learning_rate": 4.837672013786931e-06, - "loss": 0.5372, - "step": 189 - }, - { - "epoch": 1.165644171779141, - "grad_norm": 3.0312252044677734, - "learning_rate": 4.835959714993305e-06, - "loss": 0.5162, - "step": 190 - }, - { - "epoch": 1.1717791411042944, - "grad_norm": 2.821498394012451, - "learning_rate": 4.8342387387970105e-06, - "loss": 0.4537, - "step": 191 - }, - { - "epoch": 1.177914110429448, - "grad_norm": 2.7834129333496094, - "learning_rate": 4.832509091590968e-06, - "loss": 0.6165, - "step": 192 - }, - { - "epoch": 1.1840490797546013, - "grad_norm": 2.9274091720581055, - "learning_rate": 4.830770779800309e-06, - "loss": 0.7475, - "step": 193 - }, - { - "epoch": 1.1901840490797546, - "grad_norm": 2.813945770263672, - "learning_rate": 4.829023809882349e-06, - "loss": 0.4629, - "step": 194 - }, - { - "epoch": 1.196319018404908, - "grad_norm": 2.27876877784729, - "learning_rate": 4.827268188326567e-06, - "loss": 0.5208, - "step": 195 - }, - { - "epoch": 1.2024539877300613, - "grad_norm": 2.8444204330444336, - "learning_rate": 4.825503921654582e-06, - "loss": 0.6521, - "step": 196 - }, - { - "epoch": 1.2085889570552146, - "grad_norm": 3.3730578422546387, - "learning_rate": 4.823731016420122e-06, - "loss": 0.7491, - "step": 197 - }, - { - "epoch": 1.2147239263803682, - "grad_norm": 2.9717822074890137, - "learning_rate": 4.821949479209011e-06, - "loss": 0.3866, - "step": 198 - }, - { - "epoch": 1.2208588957055215, - "grad_norm": 2.6570653915405273, - "learning_rate": 4.820159316639133e-06, - "loss": 0.499, - "step": 199 - }, - { - "epoch": 1.2269938650306749, - "grad_norm": 2.819960117340088, - "learning_rate": 4.818360535360418e-06, - "loss": 0.556, - "step": 200 - }, - { - "epoch": 1.2331288343558282, - "grad_norm": 2.7912111282348633, - "learning_rate": 4.816553142054806e-06, - "loss": 0.3433, - "step": 201 - }, - { - "epoch": 1.2392638036809815, - "grad_norm": 2.6427981853485107, - "learning_rate": 4.814737143436232e-06, - "loss": 0.8808, - "step": 202 - }, - { - "epoch": 1.2453987730061349, - "grad_norm": 2.5917580127716064, - "learning_rate": 4.812912546250595e-06, - "loss": 0.5718, - "step": 203 - }, - { - "epoch": 1.2515337423312882, - "grad_norm": 3.770759344100952, - "learning_rate": 4.81107935727574e-06, - "loss": 0.9743, - "step": 204 - }, - { - "epoch": 1.2576687116564418, - "grad_norm": 2.558248996734619, - "learning_rate": 4.809237583321421e-06, - "loss": 0.2821, - "step": 205 - }, - { - "epoch": 1.2638036809815951, - "grad_norm": 2.692087173461914, - "learning_rate": 4.807387231229287e-06, - "loss": 0.7524, - "step": 206 - }, - { - "epoch": 1.2699386503067485, - "grad_norm": 2.661738157272339, - "learning_rate": 4.8055283078728525e-06, - "loss": 0.4304, - "step": 207 - }, - { - "epoch": 1.2760736196319018, - "grad_norm": 2.9232122898101807, - "learning_rate": 4.803660820157468e-06, - "loss": 0.6986, - "step": 208 - }, - { - "epoch": 1.2822085889570551, - "grad_norm": 2.665097951889038, - "learning_rate": 4.801784775020303e-06, - "loss": 0.7112, - "step": 209 - }, - { - "epoch": 1.2883435582822087, - "grad_norm": 2.4504497051239014, - "learning_rate": 4.799900179430312e-06, - "loss": 0.4125, - "step": 210 - }, - { - "epoch": 1.294478527607362, - "grad_norm": 3.076204538345337, - "learning_rate": 4.798007040388212e-06, - "loss": 0.7057, - "step": 211 - }, - { - "epoch": 1.3006134969325154, - "grad_norm": 2.406977653503418, - "learning_rate": 4.7961053649264585e-06, - "loss": 0.708, - "step": 212 - }, - { - "epoch": 1.3067484662576687, - "grad_norm": 2.6545324325561523, - "learning_rate": 4.794195160109215e-06, - "loss": 0.7608, - "step": 213 - }, - { - "epoch": 1.312883435582822, - "grad_norm": 4.3817033767700195, - "learning_rate": 4.7922764330323315e-06, - "loss": 0.4779, - "step": 214 - }, - { - "epoch": 1.3190184049079754, - "grad_norm": 3.534566879272461, - "learning_rate": 4.790349190823313e-06, - "loss": 0.5464, - "step": 215 - }, - { - "epoch": 1.3251533742331287, - "grad_norm": 3.0323140621185303, - "learning_rate": 4.788413440641297e-06, - "loss": 0.6198, - "step": 216 - }, - { - "epoch": 1.331288343558282, - "grad_norm": 2.612746238708496, - "learning_rate": 4.786469189677026e-06, - "loss": 0.6695, - "step": 217 - }, - { - "epoch": 1.3374233128834356, - "grad_norm": 3.0299434661865234, - "learning_rate": 4.784516445152821e-06, - "loss": 0.4902, - "step": 218 - }, - { - "epoch": 1.343558282208589, - "grad_norm": 3.4521942138671875, - "learning_rate": 4.78255521432255e-06, - "loss": 0.7411, - "step": 219 - }, - { - "epoch": 1.3496932515337423, - "grad_norm": 2.6712653636932373, - "learning_rate": 4.780585504471612e-06, - "loss": 0.8767, - "step": 220 - }, - { - "epoch": 1.3558282208588956, - "grad_norm": 2.5099475383758545, - "learning_rate": 4.778607322916896e-06, - "loss": 0.4266, - "step": 221 - }, - { - "epoch": 1.3619631901840492, - "grad_norm": 2.641799211502075, - "learning_rate": 4.776620677006766e-06, - "loss": 0.4982, - "step": 222 - }, - { - "epoch": 1.3680981595092025, - "grad_norm": 3.1119771003723145, - "learning_rate": 4.7746255741210256e-06, - "loss": 0.6012, - "step": 223 - }, - { - "epoch": 1.3742331288343559, - "grad_norm": 3.9957170486450195, - "learning_rate": 4.772622021670897e-06, - "loss": 0.7585, - "step": 224 - }, - { - "epoch": 1.3803680981595092, - "grad_norm": 3.1070823669433594, - "learning_rate": 4.770610027098983e-06, - "loss": 0.5266, - "step": 225 - }, - { - "epoch": 1.3865030674846626, - "grad_norm": 2.7630460262298584, - "learning_rate": 4.7685895978792564e-06, - "loss": 0.6261, - "step": 226 - }, - { - "epoch": 1.392638036809816, - "grad_norm": 2.6509556770324707, - "learning_rate": 4.766560741517014e-06, - "loss": 0.7081, - "step": 227 - }, - { - "epoch": 1.3987730061349692, - "grad_norm": 3.0212976932525635, - "learning_rate": 4.76452346554886e-06, - "loss": 0.5041, - "step": 228 - }, - { - "epoch": 1.4049079754601226, - "grad_norm": 3.0454728603363037, - "learning_rate": 4.762477777542676e-06, - "loss": 0.49, - "step": 229 - }, - { - "epoch": 1.4110429447852761, - "grad_norm": 3.4296791553497314, - "learning_rate": 4.7604236850975905e-06, - "loss": 0.7056, - "step": 230 - }, - { - "epoch": 1.4171779141104295, - "grad_norm": 4.1885600090026855, - "learning_rate": 4.7583611958439514e-06, - "loss": 0.7762, - "step": 231 - }, - { - "epoch": 1.4233128834355828, - "grad_norm": 3.065854072570801, - "learning_rate": 4.7562903174433e-06, - "loss": 0.5347, - "step": 232 - }, - { - "epoch": 1.4294478527607362, - "grad_norm": 2.793851852416992, - "learning_rate": 4.75421105758834e-06, - "loss": 0.503, - "step": 233 - }, - { - "epoch": 1.4355828220858895, - "grad_norm": 3.123730421066284, - "learning_rate": 4.752123424002908e-06, - "loss": 0.5081, - "step": 234 - }, - { - "epoch": 1.441717791411043, - "grad_norm": 3.230161666870117, - "learning_rate": 4.750027424441949e-06, - "loss": 0.7523, - "step": 235 - }, - { - "epoch": 1.4478527607361964, - "grad_norm": 2.4970247745513916, - "learning_rate": 4.747923066691487e-06, - "loss": 0.5575, - "step": 236 - }, - { - "epoch": 1.4539877300613497, - "grad_norm": 2.9880685806274414, - "learning_rate": 4.745810358568588e-06, - "loss": 0.7264, - "step": 237 - }, - { - "epoch": 1.460122699386503, - "grad_norm": 2.555328369140625, - "learning_rate": 4.743689307921342e-06, - "loss": 0.4545, - "step": 238 - }, - { - "epoch": 1.4662576687116564, - "grad_norm": 3.144932746887207, - "learning_rate": 4.741559922628828e-06, - "loss": 0.5429, - "step": 239 - }, - { - "epoch": 1.4723926380368098, - "grad_norm": 3.059807062149048, - "learning_rate": 4.739422210601085e-06, - "loss": 0.5086, - "step": 240 - }, - { - "epoch": 1.478527607361963, - "grad_norm": 3.374303102493286, - "learning_rate": 4.7372761797790836e-06, - "loss": 0.6109, - "step": 241 - }, - { - "epoch": 1.4846625766871164, - "grad_norm": 2.4506947994232178, - "learning_rate": 4.735121838134697e-06, - "loss": 0.4317, - "step": 242 - }, - { - "epoch": 1.49079754601227, - "grad_norm": 2.9039974212646484, - "learning_rate": 4.732959193670672e-06, - "loss": 0.6414, - "step": 243 - }, - { - "epoch": 1.4969325153374233, - "grad_norm": 2.9412453174591064, - "learning_rate": 4.730788254420593e-06, - "loss": 0.5166, - "step": 244 - }, - { - "epoch": 1.5030674846625767, - "grad_norm": 2.500716209411621, - "learning_rate": 4.728609028448862e-06, - "loss": 0.4982, - "step": 245 - }, - { - "epoch": 1.50920245398773, - "grad_norm": 2.4233803749084473, - "learning_rate": 4.726421523850662e-06, - "loss": 0.7552, - "step": 246 - }, - { - "epoch": 1.5153374233128836, - "grad_norm": 2.357003688812256, - "learning_rate": 4.7242257487519275e-06, - "loss": 0.4365, - "step": 247 - }, - { - "epoch": 1.521472392638037, - "grad_norm": 2.6406495571136475, - "learning_rate": 4.722021711309317e-06, - "loss": 0.6002, - "step": 248 - }, - { - "epoch": 1.5276073619631902, - "grad_norm": 2.736884832382202, - "learning_rate": 4.7198094197101826e-06, - "loss": 0.4993, - "step": 249 - }, - { - "epoch": 1.5337423312883436, - "grad_norm": 3.5238845348358154, - "learning_rate": 4.7175888821725335e-06, - "loss": 0.4637, - "step": 250 - }, - { - "epoch": 1.539877300613497, - "grad_norm": 3.3783695697784424, - "learning_rate": 4.715360106945015e-06, - "loss": 0.9711, - "step": 251 - }, - { - "epoch": 1.5460122699386503, - "grad_norm": 2.9685862064361572, - "learning_rate": 4.713123102306869e-06, - "loss": 0.5452, - "step": 252 - }, - { - "epoch": 1.5521472392638036, - "grad_norm": 3.143733263015747, - "learning_rate": 4.710877876567912e-06, - "loss": 0.5034, - "step": 253 - }, - { - "epoch": 1.558282208588957, - "grad_norm": 2.8005623817443848, - "learning_rate": 4.708624438068494e-06, - "loss": 0.4236, - "step": 254 - }, - { - "epoch": 1.5644171779141103, - "grad_norm": 2.66581130027771, - "learning_rate": 4.706362795179476e-06, - "loss": 0.6095, - "step": 255 - }, - { - "epoch": 1.5705521472392638, - "grad_norm": 4.598043441772461, - "learning_rate": 4.7040929563021975e-06, - "loss": 0.738, - "step": 256 - }, - { - "epoch": 1.5766871165644172, - "grad_norm": 3.5643506050109863, - "learning_rate": 4.70181492986844e-06, - "loss": 0.6726, - "step": 257 - }, - { - "epoch": 1.5828220858895705, - "grad_norm": 2.865339994430542, - "learning_rate": 4.699528724340401e-06, - "loss": 0.4862, - "step": 258 - }, - { - "epoch": 1.588957055214724, - "grad_norm": 2.95529842376709, - "learning_rate": 4.6972343482106615e-06, - "loss": 0.5003, - "step": 259 - }, - { - "epoch": 1.5950920245398774, - "grad_norm": 2.45206356048584, - "learning_rate": 4.6949318100021546e-06, - "loss": 0.6734, - "step": 260 - }, - { - "epoch": 1.6012269938650308, - "grad_norm": 2.6789939403533936, - "learning_rate": 4.6926211182681295e-06, - "loss": 0.5639, - "step": 261 - }, - { - "epoch": 1.607361963190184, - "grad_norm": 3.307732582092285, - "learning_rate": 4.690302281592128e-06, - "loss": 0.7032, - "step": 262 - }, - { - "epoch": 1.6134969325153374, - "grad_norm": 2.8950445652008057, - "learning_rate": 4.687975308587944e-06, - "loss": 0.4937, - "step": 263 - }, - { - "epoch": 1.6196319018404908, - "grad_norm": 2.969377040863037, - "learning_rate": 4.685640207899598e-06, - "loss": 0.5829, - "step": 264 - }, - { - "epoch": 1.6257668711656441, - "grad_norm": 3.106433391571045, - "learning_rate": 4.683296988201301e-06, - "loss": 0.3805, - "step": 265 - }, - { - "epoch": 1.6319018404907975, - "grad_norm": 3.5599050521850586, - "learning_rate": 4.680945658197425e-06, - "loss": 0.7939, - "step": 266 - }, - { - "epoch": 1.6380368098159508, - "grad_norm": 5.008603096008301, - "learning_rate": 4.6785862266224695e-06, - "loss": 0.7511, - "step": 267 - }, - { - "epoch": 1.6441717791411041, - "grad_norm": 3.1393773555755615, - "learning_rate": 4.676218702241026e-06, - "loss": 0.8984, - "step": 268 - }, - { - "epoch": 1.6503067484662577, - "grad_norm": 3.0241408348083496, - "learning_rate": 4.673843093847753e-06, - "loss": 0.5473, - "step": 269 - }, - { - "epoch": 1.656441717791411, - "grad_norm": 2.9029417037963867, - "learning_rate": 4.6714594102673355e-06, - "loss": 0.6626, - "step": 270 - }, - { - "epoch": 1.6625766871165644, - "grad_norm": 3.4709246158599854, - "learning_rate": 4.669067660354456e-06, - "loss": 0.5015, - "step": 271 - }, - { - "epoch": 1.668711656441718, - "grad_norm": 2.988635778427124, - "learning_rate": 4.666667852993761e-06, - "loss": 0.5384, - "step": 272 - }, - { - "epoch": 1.6748466257668713, - "grad_norm": 3.418140411376953, - "learning_rate": 4.664259997099829e-06, - "loss": 0.7491, - "step": 273 - }, - { - "epoch": 1.6809815950920246, - "grad_norm": 2.592416763305664, - "learning_rate": 4.661844101617135e-06, - "loss": 0.6451, - "step": 274 - }, - { - "epoch": 1.687116564417178, - "grad_norm": 3.1174306869506836, - "learning_rate": 4.6594201755200205e-06, - "loss": 0.6299, - "step": 275 - }, - { - "epoch": 1.6932515337423313, - "grad_norm": 2.6569998264312744, - "learning_rate": 4.656988227812658e-06, - "loss": 0.4477, - "step": 276 - }, - { - "epoch": 1.6993865030674846, - "grad_norm": 3.5733959674835205, - "learning_rate": 4.654548267529015e-06, - "loss": 0.5473, - "step": 277 - }, - { - "epoch": 1.705521472392638, - "grad_norm": 2.7240824699401855, - "learning_rate": 4.652100303732827e-06, - "loss": 0.496, - "step": 278 - }, - { - "epoch": 1.7116564417177913, - "grad_norm": 4.1965460777282715, - "learning_rate": 4.64964434551756e-06, - "loss": 0.932, - "step": 279 - }, - { - "epoch": 1.7177914110429446, - "grad_norm": 2.3237173557281494, - "learning_rate": 4.647180402006372e-06, - "loss": 0.4648, - "step": 280 - }, - { - "epoch": 1.7239263803680982, - "grad_norm": 3.395045042037964, - "learning_rate": 4.644708482352093e-06, - "loss": 0.7237, - "step": 281 - }, - { - "epoch": 1.7300613496932515, - "grad_norm": 3.238593816757202, - "learning_rate": 4.6422285957371735e-06, - "loss": 0.5531, - "step": 282 - }, - { - "epoch": 1.7361963190184049, - "grad_norm": 3.9651403427124023, - "learning_rate": 4.639740751373663e-06, - "loss": 0.6706, - "step": 283 - }, - { - "epoch": 1.7423312883435584, - "grad_norm": 3.0042061805725098, - "learning_rate": 4.63724495850317e-06, - "loss": 0.56, - "step": 284 - }, - { - "epoch": 1.7484662576687118, - "grad_norm": 3.094310760498047, - "learning_rate": 4.634741226396832e-06, - "loss": 0.6138, - "step": 285 - }, - { - "epoch": 1.7546012269938651, - "grad_norm": 2.838168144226074, - "learning_rate": 4.632229564355275e-06, - "loss": 0.4908, - "step": 286 - }, - { - "epoch": 1.7607361963190185, - "grad_norm": 3.3452796936035156, - "learning_rate": 4.629709981708586e-06, - "loss": 0.8181, - "step": 287 - }, - { - "epoch": 1.7668711656441718, - "grad_norm": 2.6630783081054688, - "learning_rate": 4.6271824878162704e-06, - "loss": 0.5625, - "step": 288 - }, - { - "epoch": 1.7730061349693251, - "grad_norm": 2.583650588989258, - "learning_rate": 4.624647092067226e-06, - "loss": 0.3416, - "step": 289 - }, - { - "epoch": 1.7791411042944785, - "grad_norm": 2.73132586479187, - "learning_rate": 4.622103803879702e-06, - "loss": 0.3889, - "step": 290 - }, - { - "epoch": 1.7852760736196318, - "grad_norm": 4.1010260581970215, - "learning_rate": 4.619552632701263e-06, - "loss": 0.611, - "step": 291 - }, - { - "epoch": 1.7914110429447851, - "grad_norm": 4.53068208694458, - "learning_rate": 4.61699358800876e-06, - "loss": 0.7219, - "step": 292 - }, - { - "epoch": 1.7975460122699385, - "grad_norm": 3.4877254962921143, - "learning_rate": 4.614426679308291e-06, - "loss": 0.6402, - "step": 293 - }, - { - "epoch": 1.803680981595092, - "grad_norm": 2.9445226192474365, - "learning_rate": 4.611851916135166e-06, - "loss": 0.509, - "step": 294 - }, - { - "epoch": 1.8098159509202454, - "grad_norm": 2.6622228622436523, - "learning_rate": 4.609269308053872e-06, - "loss": 0.6167, - "step": 295 - }, - { - "epoch": 1.8159509202453987, - "grad_norm": 3.131530523300171, - "learning_rate": 4.606678864658039e-06, - "loss": 0.8039, - "step": 296 - }, - { - "epoch": 1.8220858895705523, - "grad_norm": 3.212188482284546, - "learning_rate": 4.604080595570399e-06, - "loss": 0.5754, - "step": 297 - }, - { - "epoch": 1.8282208588957056, - "grad_norm": 3.522850275039673, - "learning_rate": 4.601474510442759e-06, - "loss": 0.4432, - "step": 298 - }, - { - "epoch": 1.834355828220859, - "grad_norm": 2.5877151489257812, - "learning_rate": 4.598860618955957e-06, - "loss": 0.6541, - "step": 299 - }, - { - "epoch": 1.8404907975460123, - "grad_norm": 2.803833484649658, - "learning_rate": 4.596238930819832e-06, - "loss": 0.5824, - "step": 300 - }, - { - "epoch": 1.8466257668711656, - "grad_norm": 2.7125494480133057, - "learning_rate": 4.5936094557731815e-06, - "loss": 0.6976, - "step": 301 - }, - { - "epoch": 1.852760736196319, - "grad_norm": 3.6549370288848877, - "learning_rate": 4.590972203583732e-06, - "loss": 0.7105, - "step": 302 - }, - { - "epoch": 1.8588957055214723, - "grad_norm": 3.3241944313049316, - "learning_rate": 4.588327184048099e-06, - "loss": 0.7446, - "step": 303 - }, - { - "epoch": 1.8650306748466257, - "grad_norm": 2.8388822078704834, - "learning_rate": 4.585674406991752e-06, - "loss": 0.4926, - "step": 304 - }, - { - "epoch": 1.871165644171779, - "grad_norm": 2.9760420322418213, - "learning_rate": 4.5830138822689755e-06, - "loss": 0.7368, - "step": 305 - }, - { - "epoch": 1.8773006134969326, - "grad_norm": 2.5437633991241455, - "learning_rate": 4.5803456197628374e-06, - "loss": 0.4678, - "step": 306 - }, - { - "epoch": 1.883435582822086, - "grad_norm": 3.0044775009155273, - "learning_rate": 4.577669629385145e-06, - "loss": 0.4241, - "step": 307 - }, - { - "epoch": 1.8895705521472392, - "grad_norm": 2.6150901317596436, - "learning_rate": 4.574985921076418e-06, - "loss": 0.5327, - "step": 308 - }, - { - "epoch": 1.8957055214723928, - "grad_norm": 2.4425182342529297, - "learning_rate": 4.572294504805841e-06, - "loss": 0.7504, - "step": 309 - }, - { - "epoch": 1.9018404907975461, - "grad_norm": 2.9920194149017334, - "learning_rate": 4.569595390571232e-06, - "loss": 0.5194, - "step": 310 - }, - { - "epoch": 1.9079754601226995, - "grad_norm": 2.701087713241577, - "learning_rate": 4.566888588399007e-06, - "loss": 0.6862, - "step": 311 - }, - { - "epoch": 1.9141104294478528, - "grad_norm": 7.628893852233887, - "learning_rate": 4.564174108344139e-06, - "loss": 0.6867, - "step": 312 - }, - { - "epoch": 1.9202453987730062, - "grad_norm": 2.712947130203247, - "learning_rate": 4.561451960490123e-06, - "loss": 0.6942, - "step": 313 - }, - { - "epoch": 1.9263803680981595, - "grad_norm": 3.0063202381134033, - "learning_rate": 4.558722154948937e-06, - "loss": 0.6346, - "step": 314 - }, - { - "epoch": 1.9325153374233128, - "grad_norm": 2.957218647003174, - "learning_rate": 4.5559847018610034e-06, - "loss": 0.464, - "step": 315 - }, - { - "epoch": 1.9386503067484662, - "grad_norm": 3.322282552719116, - "learning_rate": 4.553239611395156e-06, - "loss": 0.6334, - "step": 316 - }, - { - "epoch": 1.9447852760736195, - "grad_norm": 3.0638647079467773, - "learning_rate": 4.550486893748596e-06, - "loss": 0.4227, - "step": 317 - }, - { - "epoch": 1.9509202453987728, - "grad_norm": 3.079087257385254, - "learning_rate": 4.547726559146862e-06, - "loss": 0.3719, - "step": 318 - }, - { - "epoch": 1.9570552147239264, - "grad_norm": 2.409914255142212, - "learning_rate": 4.544958617843782e-06, - "loss": 0.3331, - "step": 319 - }, - { - "epoch": 1.9631901840490797, - "grad_norm": 3.3441262245178223, - "learning_rate": 4.542183080121444e-06, - "loss": 0.6931, - "step": 320 - }, - { - "epoch": 1.969325153374233, - "grad_norm": 2.6624436378479004, - "learning_rate": 4.539399956290152e-06, - "loss": 0.6578, - "step": 321 - }, - { - "epoch": 1.9754601226993866, - "grad_norm": 3.463789224624634, - "learning_rate": 4.536609256688396e-06, - "loss": 0.5748, - "step": 322 - }, - { - "epoch": 1.98159509202454, - "grad_norm": 3.6827807426452637, - "learning_rate": 4.533810991682799e-06, - "loss": 0.5249, - "step": 323 - }, - { - "epoch": 1.9877300613496933, - "grad_norm": 4.125547409057617, - "learning_rate": 4.531005171668093e-06, - "loss": 0.3065, - "step": 324 - }, - { - "epoch": 1.9938650306748467, - "grad_norm": 2.935978412628174, - "learning_rate": 4.528191807067074e-06, - "loss": 0.5523, - "step": 325 - }, - { - "epoch": 2.0, - "grad_norm": 2.654388427734375, - "learning_rate": 4.525370908330564e-06, - "loss": 0.4157, - "step": 326 - }, - { - "epoch": 2.0061349693251533, - "grad_norm": 3.213925838470459, - "learning_rate": 4.522542485937369e-06, - "loss": 0.4243, - "step": 327 - }, - { - "epoch": 2.0122699386503067, - "grad_norm": 3.5483286380767822, - "learning_rate": 4.519706550394248e-06, - "loss": 0.4137, - "step": 328 - }, - { - "epoch": 2.01840490797546, - "grad_norm": 3.32084059715271, - "learning_rate": 4.516863112235864e-06, - "loss": 0.5389, - "step": 329 - }, - { - "epoch": 2.0245398773006134, - "grad_norm": 3.427666425704956, - "learning_rate": 4.514012182024756e-06, - "loss": 0.285, - "step": 330 - }, - { - "epoch": 2.0306748466257667, - "grad_norm": 3.3269975185394287, - "learning_rate": 4.511153770351288e-06, - "loss": 0.4877, - "step": 331 - }, - { - "epoch": 2.03680981595092, - "grad_norm": 5.258850574493408, - "learning_rate": 4.508287887833619e-06, - "loss": 0.5168, - "step": 332 - }, - { - "epoch": 2.042944785276074, - "grad_norm": 4.316092491149902, - "learning_rate": 4.505414545117658e-06, - "loss": 0.4791, - "step": 333 - }, - { - "epoch": 2.049079754601227, - "grad_norm": 3.952056884765625, - "learning_rate": 4.502533752877028e-06, - "loss": 0.3014, - "step": 334 - }, - { - "epoch": 2.0552147239263805, - "grad_norm": 4.0617194175720215, - "learning_rate": 4.499645521813024e-06, - "loss": 0.4313, - "step": 335 - }, - { - "epoch": 2.061349693251534, - "grad_norm": 3.7869274616241455, - "learning_rate": 4.496749862654574e-06, - "loss": 0.4807, - "step": 336 - }, - { - "epoch": 2.067484662576687, - "grad_norm": 3.8181991577148438, - "learning_rate": 4.4938467861582e-06, - "loss": 0.4002, - "step": 337 - }, - { - "epoch": 2.0736196319018405, - "grad_norm": 3.8289854526519775, - "learning_rate": 4.490936303107975e-06, - "loss": 0.618, - "step": 338 - }, - { - "epoch": 2.079754601226994, - "grad_norm": 3.121443271636963, - "learning_rate": 4.488018424315488e-06, - "loss": 0.4203, - "step": 339 - }, - { - "epoch": 2.085889570552147, - "grad_norm": 3.141782283782959, - "learning_rate": 4.4850931606198e-06, - "loss": 0.3618, - "step": 340 - }, - { - "epoch": 2.0920245398773005, - "grad_norm": 3.1279287338256836, - "learning_rate": 4.482160522887404e-06, - "loss": 0.4571, - "step": 341 - }, - { - "epoch": 2.098159509202454, - "grad_norm": 3.2418482303619385, - "learning_rate": 4.479220522012185e-06, - "loss": 0.2674, - "step": 342 - }, - { - "epoch": 2.104294478527607, - "grad_norm": 10.230683326721191, - "learning_rate": 4.476273168915382e-06, - "loss": 0.5479, - "step": 343 - }, - { - "epoch": 2.1104294478527605, - "grad_norm": 3.588361978530884, - "learning_rate": 4.473318474545544e-06, - "loss": 0.3654, - "step": 344 - }, - { - "epoch": 2.116564417177914, - "grad_norm": 3.0913164615631104, - "learning_rate": 4.470356449878489e-06, - "loss": 0.2704, - "step": 345 - }, - { - "epoch": 2.1226993865030677, - "grad_norm": 3.972447633743286, - "learning_rate": 4.467387105917269e-06, - "loss": 0.3029, - "step": 346 - }, - { - "epoch": 2.128834355828221, - "grad_norm": 3.7174713611602783, - "learning_rate": 4.464410453692122e-06, - "loss": 0.6536, - "step": 347 - }, - { - "epoch": 2.1349693251533743, - "grad_norm": 3.9333994388580322, - "learning_rate": 4.461426504260434e-06, - "loss": 0.3806, - "step": 348 - }, - { - "epoch": 2.1411042944785277, - "grad_norm": 4.752816200256348, - "learning_rate": 4.458435268706699e-06, - "loss": 0.4019, - "step": 349 - }, - { - "epoch": 2.147239263803681, - "grad_norm": 2.505603790283203, - "learning_rate": 4.455436758142477e-06, - "loss": 0.2348, - "step": 350 - }, - { - "epoch": 2.1533742331288344, - "grad_norm": 3.3050570487976074, - "learning_rate": 4.452430983706351e-06, - "loss": 0.505, - "step": 351 - }, - { - "epoch": 2.1595092024539877, - "grad_norm": 5.387442588806152, - "learning_rate": 4.44941795656389e-06, - "loss": 0.399, - "step": 352 - }, - { - "epoch": 2.165644171779141, - "grad_norm": 3.4759480953216553, - "learning_rate": 4.446397687907601e-06, - "loss": 0.5664, - "step": 353 - }, - { - "epoch": 2.1717791411042944, - "grad_norm": 2.949445962905884, - "learning_rate": 4.4433701889568935e-06, - "loss": 0.2128, - "step": 354 - }, - { - "epoch": 2.1779141104294477, - "grad_norm": 3.2884252071380615, - "learning_rate": 4.440335470958035e-06, - "loss": 0.3138, - "step": 355 - }, - { - "epoch": 2.184049079754601, - "grad_norm": 3.1605632305145264, - "learning_rate": 4.437293545184111e-06, - "loss": 0.349, - "step": 356 - }, - { - "epoch": 2.190184049079755, - "grad_norm": 2.9996821880340576, - "learning_rate": 4.434244422934976e-06, - "loss": 0.343, - "step": 357 - }, - { - "epoch": 2.196319018404908, - "grad_norm": 3.6373324394226074, - "learning_rate": 4.431188115537226e-06, - "loss": 0.5656, - "step": 358 - }, - { - "epoch": 2.2024539877300615, - "grad_norm": 4.667621612548828, - "learning_rate": 4.428124634344141e-06, - "loss": 0.2335, - "step": 359 - }, - { - "epoch": 2.208588957055215, - "grad_norm": 3.815484046936035, - "learning_rate": 4.425053990735653e-06, - "loss": 0.2173, - "step": 360 - }, - { - "epoch": 2.214723926380368, - "grad_norm": 4.689478874206543, - "learning_rate": 4.421976196118297e-06, - "loss": 0.5071, - "step": 361 - }, - { - "epoch": 2.2208588957055215, - "grad_norm": 4.016942024230957, - "learning_rate": 4.4188912619251765e-06, - "loss": 0.384, - "step": 362 - }, - { - "epoch": 2.226993865030675, - "grad_norm": 3.5336828231811523, - "learning_rate": 4.415799199615912e-06, - "loss": 0.3133, - "step": 363 - }, - { - "epoch": 2.233128834355828, - "grad_norm": 2.9195592403411865, - "learning_rate": 4.4127000206766055e-06, - "loss": 0.3847, - "step": 364 - }, - { - "epoch": 2.2392638036809815, - "grad_norm": 2.6843531131744385, - "learning_rate": 4.409593736619795e-06, - "loss": 0.3539, - "step": 365 - }, - { - "epoch": 2.245398773006135, - "grad_norm": 2.8692703247070312, - "learning_rate": 4.40648035898441e-06, - "loss": 0.3664, - "step": 366 - }, - { - "epoch": 2.2515337423312882, - "grad_norm": 2.820422649383545, - "learning_rate": 4.403359899335732e-06, - "loss": 0.4606, - "step": 367 - }, - { - "epoch": 2.2576687116564416, - "grad_norm": 3.8641669750213623, - "learning_rate": 4.400232369265351e-06, - "loss": 0.2931, - "step": 368 - }, - { - "epoch": 2.263803680981595, - "grad_norm": 2.75347638130188, - "learning_rate": 4.39709778039112e-06, - "loss": 0.3393, - "step": 369 - }, - { - "epoch": 2.2699386503067487, - "grad_norm": 15.150428771972656, - "learning_rate": 4.393956144357113e-06, - "loss": 0.65, - "step": 370 - }, - { - "epoch": 2.276073619631902, - "grad_norm": 2.4876065254211426, - "learning_rate": 4.390807472833585e-06, - "loss": 0.372, - "step": 371 - }, - { - "epoch": 2.2822085889570554, - "grad_norm": 2.7328054904937744, - "learning_rate": 4.3876517775169216e-06, - "loss": 0.2802, - "step": 372 - }, - { - "epoch": 2.2883435582822087, - "grad_norm": 2.903221368789673, - "learning_rate": 4.384489070129604e-06, - "loss": 0.1964, - "step": 373 - }, - { - "epoch": 2.294478527607362, - "grad_norm": 3.9368724822998047, - "learning_rate": 4.381319362420158e-06, - "loss": 0.4272, - "step": 374 - }, - { - "epoch": 2.3006134969325154, - "grad_norm": 5.431981086730957, - "learning_rate": 4.378142666163114e-06, - "loss": 0.4513, - "step": 375 - }, - { - "epoch": 2.3067484662576687, - "grad_norm": 3.661733627319336, - "learning_rate": 4.374958993158965e-06, - "loss": 0.6087, - "step": 376 - }, - { - "epoch": 2.312883435582822, - "grad_norm": 3.004450559616089, - "learning_rate": 4.371768355234116e-06, - "loss": 0.2206, - "step": 377 - }, - { - "epoch": 2.3190184049079754, - "grad_norm": 4.3785576820373535, - "learning_rate": 4.368570764240852e-06, - "loss": 0.6055, - "step": 378 - }, - { - "epoch": 2.3251533742331287, - "grad_norm": 3.4699394702911377, - "learning_rate": 4.365366232057279e-06, - "loss": 0.6286, - "step": 379 - }, - { - "epoch": 2.331288343558282, - "grad_norm": 2.6862998008728027, - "learning_rate": 4.3621547705872915e-06, - "loss": 0.2622, - "step": 380 - }, - { - "epoch": 2.3374233128834354, - "grad_norm": 3.056382179260254, - "learning_rate": 4.358936391760524e-06, - "loss": 0.3439, - "step": 381 - }, - { - "epoch": 2.3435582822085887, - "grad_norm": 2.6211307048797607, - "learning_rate": 4.355711107532305e-06, - "loss": 0.3677, - "step": 382 - }, - { - "epoch": 2.3496932515337425, - "grad_norm": 2.682060956954956, - "learning_rate": 4.3524789298836175e-06, - "loss": 0.3068, - "step": 383 - }, - { - "epoch": 2.355828220858896, - "grad_norm": 3.482539415359497, - "learning_rate": 4.349239870821049e-06, - "loss": 0.3737, - "step": 384 - }, - { - "epoch": 2.361963190184049, - "grad_norm": 2.8645472526550293, - "learning_rate": 4.345993942376752e-06, - "loss": 0.2837, - "step": 385 - }, - { - "epoch": 2.3680981595092025, - "grad_norm": 3.6142354011535645, - "learning_rate": 4.342741156608392e-06, - "loss": 0.6162, - "step": 386 - }, - { - "epoch": 2.374233128834356, - "grad_norm": 3.0748162269592285, - "learning_rate": 4.3394815255991135e-06, - "loss": 0.2986, - "step": 387 - }, - { - "epoch": 2.3803680981595092, - "grad_norm": 5.090906620025635, - "learning_rate": 4.336215061457485e-06, - "loss": 0.5383, - "step": 388 - }, - { - "epoch": 2.3865030674846626, - "grad_norm": 3.9235823154449463, - "learning_rate": 4.332941776317458e-06, - "loss": 0.4179, - "step": 389 - }, - { - "epoch": 2.392638036809816, - "grad_norm": 3.482926368713379, - "learning_rate": 4.329661682338325e-06, - "loss": 0.3938, - "step": 390 - }, - { - "epoch": 2.3987730061349692, - "grad_norm": 4.274583339691162, - "learning_rate": 4.32637479170467e-06, - "loss": 0.3349, - "step": 391 - }, - { - "epoch": 2.4049079754601226, - "grad_norm": 3.326012372970581, - "learning_rate": 4.323081116626322e-06, - "loss": 0.3336, - "step": 392 - }, - { - "epoch": 2.411042944785276, - "grad_norm": 3.174591541290283, - "learning_rate": 4.319780669338316e-06, - "loss": 0.2983, - "step": 393 - }, - { - "epoch": 2.4171779141104293, - "grad_norm": 3.9073634147644043, - "learning_rate": 4.31647346210084e-06, - "loss": 0.8401, - "step": 394 - }, - { - "epoch": 2.4233128834355826, - "grad_norm": 3.4787721633911133, - "learning_rate": 4.313159507199197e-06, - "loss": 0.2583, - "step": 395 - }, - { - "epoch": 2.4294478527607364, - "grad_norm": 3.19903564453125, - "learning_rate": 4.309838816943755e-06, - "loss": 0.2861, - "step": 396 - }, - { - "epoch": 2.4355828220858897, - "grad_norm": 3.184246778488159, - "learning_rate": 4.306511403669897e-06, - "loss": 0.2956, - "step": 397 - }, - { - "epoch": 2.441717791411043, - "grad_norm": 3.8991878032684326, - "learning_rate": 4.303177279737988e-06, - "loss": 0.5378, - "step": 398 - }, - { - "epoch": 2.4478527607361964, - "grad_norm": 3.411949872970581, - "learning_rate": 4.299836457533313e-06, - "loss": 0.3423, - "step": 399 - }, - { - "epoch": 2.4539877300613497, - "grad_norm": 3.445502996444702, - "learning_rate": 4.296488949466046e-06, - "loss": 0.5608, - "step": 400 - }, - { - "epoch": 2.460122699386503, - "grad_norm": 3.066798210144043, - "learning_rate": 4.293134767971193e-06, - "loss": 0.3214, - "step": 401 - }, - { - "epoch": 2.4662576687116564, - "grad_norm": 3.0581583976745605, - "learning_rate": 4.28977392550855e-06, - "loss": 0.5117, - "step": 402 - }, - { - "epoch": 2.4723926380368098, - "grad_norm": 4.207413673400879, - "learning_rate": 4.286406434562659e-06, - "loss": 0.2666, - "step": 403 - }, - { - "epoch": 2.478527607361963, - "grad_norm": 2.9934990406036377, - "learning_rate": 4.283032307642756e-06, - "loss": 0.2878, - "step": 404 - }, - { - "epoch": 2.4846625766871164, - "grad_norm": 3.800593614578247, - "learning_rate": 4.2796515572827305e-06, - "loss": 0.2619, - "step": 405 - }, - { - "epoch": 2.4907975460122698, - "grad_norm": 3.2029523849487305, - "learning_rate": 4.276264196041074e-06, - "loss": 0.1735, - "step": 406 - }, - { - "epoch": 2.4969325153374236, - "grad_norm": 3.515634059906006, - "learning_rate": 4.2728702365008356e-06, - "loss": 0.4741, - "step": 407 - }, - { - "epoch": 2.5030674846625764, - "grad_norm": 3.8354873657226562, - "learning_rate": 4.269469691269577e-06, - "loss": 0.3713, - "step": 408 - }, - { - "epoch": 2.5092024539877302, - "grad_norm": 3.902904510498047, - "learning_rate": 4.266062572979323e-06, - "loss": 0.5189, - "step": 409 - }, - { - "epoch": 2.5153374233128836, - "grad_norm": 3.3276097774505615, - "learning_rate": 4.262648894286515e-06, - "loss": 0.2461, - "step": 410 - }, - { - "epoch": 2.521472392638037, - "grad_norm": 2.9457011222839355, - "learning_rate": 4.259228667871963e-06, - "loss": 0.3013, - "step": 411 - }, - { - "epoch": 2.5276073619631902, - "grad_norm": 2.8941617012023926, - "learning_rate": 4.255801906440803e-06, - "loss": 0.2784, - "step": 412 - }, - { - "epoch": 2.5337423312883436, - "grad_norm": 2.949399471282959, - "learning_rate": 4.252368622722443e-06, - "loss": 0.457, - "step": 413 - }, - { - "epoch": 2.539877300613497, - "grad_norm": 3.342108726501465, - "learning_rate": 4.248928829470522e-06, - "loss": 0.487, - "step": 414 - }, - { - "epoch": 2.5460122699386503, - "grad_norm": 3.9556386470794678, - "learning_rate": 4.245482539462861e-06, - "loss": 0.6118, - "step": 415 - }, - { - "epoch": 2.5521472392638036, - "grad_norm": 3.6936280727386475, - "learning_rate": 4.242029765501411e-06, - "loss": 0.6131, - "step": 416 - }, - { - "epoch": 2.558282208588957, - "grad_norm": 2.79897403717041, - "learning_rate": 4.2385705204122104e-06, - "loss": 0.4209, - "step": 417 - }, - { - "epoch": 2.5644171779141103, - "grad_norm": 4.093318462371826, - "learning_rate": 4.235104817045338e-06, - "loss": 0.5375, - "step": 418 - }, - { - "epoch": 2.5705521472392636, - "grad_norm": 3.138263463973999, - "learning_rate": 4.231632668274861e-06, - "loss": 0.4682, - "step": 419 - }, - { - "epoch": 2.5766871165644174, - "grad_norm": 3.1465651988983154, - "learning_rate": 4.22815408699879e-06, - "loss": 0.2522, - "step": 420 - }, - { - "epoch": 2.5828220858895703, - "grad_norm": 3.5166101455688477, - "learning_rate": 4.22466908613903e-06, - "loss": 0.4776, - "step": 421 - }, - { - "epoch": 2.588957055214724, - "grad_norm": 2.8498189449310303, - "learning_rate": 4.221177678641333e-06, - "loss": 0.3067, - "step": 422 - }, - { - "epoch": 2.5950920245398774, - "grad_norm": 2.8046035766601562, - "learning_rate": 4.217679877475251e-06, - "loss": 0.2402, - "step": 423 - }, - { - "epoch": 2.6012269938650308, - "grad_norm": 4.204788684844971, - "learning_rate": 4.214175695634084e-06, - "loss": 0.2608, - "step": 424 - }, - { - "epoch": 2.607361963190184, - "grad_norm": 2.5569400787353516, - "learning_rate": 4.210665146134838e-06, - "loss": 0.2801, - "step": 425 - }, - { - "epoch": 2.6134969325153374, - "grad_norm": 3.5359091758728027, - "learning_rate": 4.20714824201817e-06, - "loss": 0.2027, - "step": 426 - }, - { - "epoch": 2.6196319018404908, - "grad_norm": 3.5132668018341064, - "learning_rate": 4.203624996348343e-06, - "loss": 0.4253, - "step": 427 - }, - { - "epoch": 2.625766871165644, - "grad_norm": 3.5076472759246826, - "learning_rate": 4.200095422213177e-06, - "loss": 0.3014, - "step": 428 - }, - { - "epoch": 2.6319018404907975, - "grad_norm": 3.6501238346099854, - "learning_rate": 4.196559532724004e-06, - "loss": 0.6526, - "step": 429 - }, - { - "epoch": 2.638036809815951, - "grad_norm": 2.849924325942993, - "learning_rate": 4.193017341015608e-06, - "loss": 0.4487, - "step": 430 - }, - { - "epoch": 2.644171779141104, - "grad_norm": 3.2228448390960693, - "learning_rate": 4.189468860246192e-06, - "loss": 0.5386, - "step": 431 - }, - { - "epoch": 2.6503067484662575, - "grad_norm": 2.532102108001709, - "learning_rate": 4.185914103597316e-06, - "loss": 0.3034, - "step": 432 - }, - { - "epoch": 2.6564417177914113, - "grad_norm": 2.862720251083374, - "learning_rate": 4.182353084273855e-06, - "loss": 0.5862, - "step": 433 - }, - { - "epoch": 2.662576687116564, - "grad_norm": 3.4617464542388916, - "learning_rate": 4.178785815503946e-06, - "loss": 0.3954, - "step": 434 - }, - { - "epoch": 2.668711656441718, - "grad_norm": 2.627758741378784, - "learning_rate": 4.1752123105389444e-06, - "loss": 0.4367, - "step": 435 - }, - { - "epoch": 2.6748466257668713, - "grad_norm": 3.2868380546569824, - "learning_rate": 4.171632582653368e-06, - "loss": 0.2997, - "step": 436 - }, - { - "epoch": 2.6809815950920246, - "grad_norm": 3.4260897636413574, - "learning_rate": 4.168046645144851e-06, - "loss": 0.3354, - "step": 437 - }, - { - "epoch": 2.687116564417178, - "grad_norm": 3.1415748596191406, - "learning_rate": 4.164454511334098e-06, - "loss": 0.5538, - "step": 438 - }, - { - "epoch": 2.6932515337423313, - "grad_norm": 3.3700919151306152, - "learning_rate": 4.160856194564828e-06, - "loss": 0.5731, - "step": 439 - }, - { - "epoch": 2.6993865030674846, - "grad_norm": 3.146968364715576, - "learning_rate": 4.157251708203728e-06, - "loss": 0.4429, - "step": 440 - }, - { - "epoch": 2.705521472392638, - "grad_norm": 3.7495830059051514, - "learning_rate": 4.153641065640402e-06, - "loss": 0.6361, - "step": 441 - }, - { - "epoch": 2.7116564417177913, - "grad_norm": 3.426499128341675, - "learning_rate": 4.150024280287327e-06, - "loss": 0.2418, - "step": 442 - }, - { - "epoch": 2.7177914110429446, - "grad_norm": 3.213719606399536, - "learning_rate": 4.146401365579795e-06, - "loss": 0.2549, - "step": 443 - }, - { - "epoch": 2.7239263803680984, - "grad_norm": 3.457742929458618, - "learning_rate": 4.142772334975868e-06, - "loss": 0.3822, - "step": 444 - }, - { - "epoch": 2.7300613496932513, - "grad_norm": 3.130410671234131, - "learning_rate": 4.139137201956324e-06, - "loss": 0.3107, - "step": 445 - }, - { - "epoch": 2.736196319018405, - "grad_norm": 2.7337112426757812, - "learning_rate": 4.1354959800246155e-06, - "loss": 0.2829, - "step": 446 - }, - { - "epoch": 2.7423312883435584, - "grad_norm": 3.427006483078003, - "learning_rate": 4.131848682706807e-06, - "loss": 0.3045, - "step": 447 - }, - { - "epoch": 2.7484662576687118, - "grad_norm": 3.3742318153381348, - "learning_rate": 4.128195323551536e-06, - "loss": 0.316, - "step": 448 - }, - { - "epoch": 2.754601226993865, - "grad_norm": 3.086738109588623, - "learning_rate": 4.1245359161299555e-06, - "loss": 0.5278, - "step": 449 - }, - { - "epoch": 2.7607361963190185, - "grad_norm": 3.4609954357147217, - "learning_rate": 4.120870474035687e-06, - "loss": 0.447, - "step": 450 - }, - { - "epoch": 2.766871165644172, - "grad_norm": 3.552663803100586, - "learning_rate": 4.1171990108847705e-06, - "loss": 0.6127, - "step": 451 - }, - { - "epoch": 2.773006134969325, - "grad_norm": 4.413427352905273, - "learning_rate": 4.113521540315609e-06, - "loss": 0.3304, - "step": 452 - }, - { - "epoch": 2.7791411042944785, - "grad_norm": 3.3408143520355225, - "learning_rate": 4.109838075988922e-06, - "loss": 0.5871, - "step": 453 - }, - { - "epoch": 2.785276073619632, - "grad_norm": 3.0659773349761963, - "learning_rate": 4.106148631587697e-06, - "loss": 0.3578, - "step": 454 - }, - { - "epoch": 2.791411042944785, - "grad_norm": 3.2854816913604736, - "learning_rate": 4.102453220817134e-06, - "loss": 0.4685, - "step": 455 - }, - { - "epoch": 2.7975460122699385, - "grad_norm": 3.4940855503082275, - "learning_rate": 4.098751857404595e-06, - "loss": 0.2818, - "step": 456 - }, - { - "epoch": 2.8036809815950923, - "grad_norm": 2.4630730152130127, - "learning_rate": 4.0950445550995566e-06, - "loss": 0.3497, - "step": 457 - }, - { - "epoch": 2.809815950920245, - "grad_norm": 3.3870959281921387, - "learning_rate": 4.091331327673554e-06, - "loss": 0.4954, - "step": 458 - }, - { - "epoch": 2.815950920245399, - "grad_norm": 2.3676836490631104, - "learning_rate": 4.087612188920135e-06, - "loss": 0.3884, - "step": 459 - }, - { - "epoch": 2.8220858895705523, - "grad_norm": 3.2477807998657227, - "learning_rate": 4.083887152654804e-06, - "loss": 0.375, - "step": 460 - }, - { - "epoch": 2.8282208588957056, - "grad_norm": 3.295673131942749, - "learning_rate": 4.080156232714976e-06, - "loss": 0.3272, - "step": 461 - }, - { - "epoch": 2.834355828220859, - "grad_norm": 2.800847291946411, - "learning_rate": 4.07641944295992e-06, - "loss": 0.2936, - "step": 462 - }, - { - "epoch": 2.8404907975460123, - "grad_norm": 3.443336009979248, - "learning_rate": 4.072676797270708e-06, - "loss": 0.2363, - "step": 463 - }, - { - "epoch": 2.8466257668711656, - "grad_norm": 3.1334242820739746, - "learning_rate": 4.0689283095501684e-06, - "loss": 0.4827, - "step": 464 - }, - { - "epoch": 2.852760736196319, - "grad_norm": 3.950672149658203, - "learning_rate": 4.06517399372283e-06, - "loss": 0.3163, - "step": 465 - }, - { - "epoch": 2.8588957055214723, - "grad_norm": 4.243579387664795, - "learning_rate": 4.061413863734869e-06, - "loss": 0.2827, - "step": 466 - }, - { - "epoch": 2.8650306748466257, - "grad_norm": 4.076017379760742, - "learning_rate": 4.057647933554063e-06, - "loss": 0.3466, - "step": 467 - }, - { - "epoch": 2.871165644171779, - "grad_norm": 2.846989631652832, - "learning_rate": 4.053876217169734e-06, - "loss": 0.4632, - "step": 468 - }, - { - "epoch": 2.8773006134969323, - "grad_norm": 2.74981689453125, - "learning_rate": 4.050098728592698e-06, - "loss": 0.2001, - "step": 469 - }, - { - "epoch": 2.883435582822086, - "grad_norm": 3.062068462371826, - "learning_rate": 4.046315481855211e-06, - "loss": 0.5425, - "step": 470 - }, - { - "epoch": 2.889570552147239, - "grad_norm": 2.8630964756011963, - "learning_rate": 4.0425264910109245e-06, - "loss": 0.424, - "step": 471 - }, - { - "epoch": 2.895705521472393, - "grad_norm": 3.537442922592163, - "learning_rate": 4.03873177013482e-06, - "loss": 0.2443, - "step": 472 - }, - { - "epoch": 2.901840490797546, - "grad_norm": 3.128535270690918, - "learning_rate": 4.034931333323173e-06, - "loss": 0.3734, - "step": 473 - }, - { - "epoch": 2.9079754601226995, - "grad_norm": 3.021897792816162, - "learning_rate": 4.031125194693484e-06, - "loss": 0.3762, - "step": 474 - }, - { - "epoch": 2.914110429447853, - "grad_norm": 3.0943546295166016, - "learning_rate": 4.0273133683844375e-06, - "loss": 0.3721, - "step": 475 - }, - { - "epoch": 2.920245398773006, - "grad_norm": 3.443448305130005, - "learning_rate": 4.023495868555848e-06, - "loss": 0.2868, - "step": 476 - }, - { - "epoch": 2.9263803680981595, - "grad_norm": 2.865227222442627, - "learning_rate": 4.0196727093886024e-06, - "loss": 0.5086, - "step": 477 - }, - { - "epoch": 2.932515337423313, - "grad_norm": 3.1272058486938477, - "learning_rate": 4.015843905084612e-06, - "loss": 0.4616, - "step": 478 - }, - { - "epoch": 2.938650306748466, - "grad_norm": 3.0584447383880615, - "learning_rate": 4.012009469866756e-06, - "loss": 0.403, - "step": 479 - }, - { - "epoch": 2.9447852760736195, - "grad_norm": 4.42616081237793, - "learning_rate": 4.008169417978836e-06, - "loss": 0.5801, - "step": 480 - }, - { - "epoch": 2.950920245398773, - "grad_norm": 2.8444535732269287, - "learning_rate": 4.004323763685511e-06, - "loss": 0.5808, - "step": 481 - }, - { - "epoch": 2.957055214723926, - "grad_norm": 2.591719627380371, - "learning_rate": 4.0004725212722565e-06, - "loss": 0.2584, - "step": 482 - }, - { - "epoch": 2.96319018404908, - "grad_norm": 2.5496113300323486, - "learning_rate": 3.996615705045302e-06, - "loss": 0.462, - "step": 483 - }, - { - "epoch": 2.969325153374233, - "grad_norm": 2.9932925701141357, - "learning_rate": 3.992753329331588e-06, - "loss": 0.3502, - "step": 484 - }, - { - "epoch": 2.9754601226993866, - "grad_norm": 3.136871337890625, - "learning_rate": 3.9888854084786995e-06, - "loss": 0.5989, - "step": 485 - }, - { - "epoch": 2.98159509202454, - "grad_norm": 3.6654274463653564, - "learning_rate": 3.985011956854826e-06, - "loss": 0.6772, - "step": 486 - }, - { - "epoch": 2.9877300613496933, - "grad_norm": 2.5398948192596436, - "learning_rate": 3.9811329888487004e-06, - "loss": 0.4192, - "step": 487 - }, - { - "epoch": 2.9938650306748467, - "grad_norm": 4.89943790435791, - "learning_rate": 3.977248518869545e-06, - "loss": 0.4031, - "step": 488 - }, - { - "epoch": 3.0, - "grad_norm": 3.4729995727539062, - "learning_rate": 3.973358561347024e-06, - "loss": 0.7764, - "step": 489 - } - ], - "logging_steps": 1, - "max_steps": 1630, - "num_input_tokens_seen": 0, - "num_train_epochs": 10, - "save_steps": 206, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 1.2108401575408435e+17, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-652/chat_template.jinja b/metallama3_8b/limo_filtered_correct/checkpoint-652/chat_template.jinja deleted file mode 100644 index 39bd0c9f7fe30aea14eda194fee17703da4a4dbf..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-652/chat_template.jinja +++ /dev/null @@ -1,5 +0,0 @@ -{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> - -'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> - -' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-652/config.json b/metallama3_8b/limo_filtered_correct/checkpoint-652/config.json deleted file mode 100644 index ec5612543540085e09eed37e81b17ae51d1a6973..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-652/config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 128000, - "eos_token_id": 128009, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 8192, - "mlp_bias": false, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 500000.0, - "tie_word_embeddings": false, - "torch_dtype": "float32", - "transformers_version": "4.55.0", - "use_cache": false, - "vocab_size": 128256 -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-652/generation_config.json b/metallama3_8b/limo_filtered_correct/checkpoint-652/generation_config.json deleted file mode 100644 index f53ccb516e57388491adda6b9950bcfa872e93ae..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-652/generation_config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 128000, - "eos_token_id": 128009, - "transformers_version": "4.55.0", - "use_cache": false -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-652/model-00001-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-652/model-00001-of-00007.safetensors deleted file mode 100644 index ab8ef56aabcd24876348d6dfa7d40f33a45b9fd7..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-652/model-00001-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1e10c6503687d57896b2eed32e2c67cd5e7e6ca9ad7511903618aee159aa35af -size 4886466168 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-652/model-00002-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-652/model-00002-of-00007.safetensors deleted file mode 100644 index 21282f7d9dcaf4a74727728b1f487ff4b538ecb7..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-652/model-00002-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:041ddf97f11993b20e184706730512ef4b0b442d2a746c8df8a40a0d897911f2 -size 4832007448 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-652/model-00003-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-652/model-00003-of-00007.safetensors deleted file mode 100644 index 471bb0d5f16d0aaaf33d0a5da3cea6228f8f33c6..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-652/model-00003-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fa3d32cbf6b61b946dd169baf7bb3fec131b5812f58d85dd6031df8c51af7d6e -size 4999813112 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-652/model-00004-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-652/model-00004-of-00007.safetensors deleted file mode 100644 index d0bff8d22c30947e1f38b7b187dbadb8914a3316..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-652/model-00004-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:da90e88d866817a6a204c4b8ec5f816321564f4577a1f3a4fba5272d07e76043 -size 4999813128 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-652/model-00005-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-652/model-00005-of-00007.safetensors deleted file mode 100644 index 49209bdfef86df8a2a5a6272c86f86510970de32..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-652/model-00005-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f7af2bc90db3f2abdda59dc30028460c65817ebb9ff4e95e44f9349db08a6838 -size 4832007496 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-652/model-00006-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-652/model-00006-of-00007.safetensors deleted file mode 100644 index 3b7162bacb7a006e22c54b746501754be9834fb8..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-652/model-00006-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:98c992bc513c61087954b3280dcba7281a6bd7c853ed265d8c9a1666bf90258a -size 4999813120 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-652/model-00007-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-652/model-00007-of-00007.safetensors deleted file mode 100644 index a3ed17c29f26b0acda764a0aa3c39da4347cc68c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-652/model-00007-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b546475b5bfd2aaf84fa5808b3c64ab79710e6b26b8160cd8173f8b3b7ce1e8a -size 2571158184 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-652/model.safetensors.index.json b/metallama3_8b/limo_filtered_correct/checkpoint-652/model.safetensors.index.json deleted file mode 100644 index 30d31d54f352f0c71ad48745af612a088822fa48..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-652/model.safetensors.index.json +++ /dev/null @@ -1,299 +0,0 @@ -{ - "metadata": { - "total_parameters": 2007565312, - "total_size": 32121044992 - }, - "weight_map": { - "lm_head.weight": "model-00007-of-00007.safetensors", - "model.embed_tokens.weight": "model-00001-of-00007.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.norm.weight": "model-00007-of-00007.safetensors" - } -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-652/rng_state_0.pth b/metallama3_8b/limo_filtered_correct/checkpoint-652/rng_state_0.pth deleted file mode 100644 index f8799407442db08820f995bcf1b9158f696af19f..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-652/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:70cc56408014c410353d4dd58ae9b03f4be043f5f800324f66fd8e20e99b840e -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-652/rng_state_1.pth b/metallama3_8b/limo_filtered_correct/checkpoint-652/rng_state_1.pth deleted file mode 100644 index aa0c3c6aeaabc038c714a3fcc9b78d186a4cab59..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-652/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:49d1438e98cc9c53a6852464635ce62e9788e61eb3646b73e33813f487c4b6ae -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-652/rng_state_2.pth b/metallama3_8b/limo_filtered_correct/checkpoint-652/rng_state_2.pth deleted file mode 100644 index 0f39416636e7990907141a415603582d33812fc9..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-652/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4388add9cec90932f8ff0100d27a0574d98e1bad52ff89d44e31967d2b4fbfde -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-652/rng_state_3.pth b/metallama3_8b/limo_filtered_correct/checkpoint-652/rng_state_3.pth deleted file mode 100644 index d3775bcd497f8ad74ece6675e0bbda89fb7ee6f4..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-652/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a705d6dfaae4f2c1b4b2be6b25a6eb521ffae6fcba21cc1531e97b60037ed079 -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-652/scheduler.pt b/metallama3_8b/limo_filtered_correct/checkpoint-652/scheduler.pt deleted file mode 100644 index ec65e0e3c63696a7d02cacbf0e9893ffc2e1aab3..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-652/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:92b3e1d51793cce5eaa4b8e3d32e125b08101b8ed937524973c1d2225a95b89e -size 1064 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-652/special_tokens_map.json b/metallama3_8b/limo_filtered_correct/checkpoint-652/special_tokens_map.json deleted file mode 100644 index 14daf4588e61b4e4983af0fccaba4d5500c0977c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-652/special_tokens_map.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "additional_special_tokens": [ - { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } - ], - "bos_token": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": "<|eot_id|>" -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-652/tokenizer.json b/metallama3_8b/limo_filtered_correct/checkpoint-652/tokenizer.json deleted file mode 100644 index 172311123ab62378f1f6d90f3068a676b7d939ed..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-652/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c1dcab308e7cf5970ea38815e0a62887d705c5b436f869ca27a5dcdd40c36a6 -size 17210148 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-652/tokenizer_config.json b/metallama3_8b/limo_filtered_correct/checkpoint-652/tokenizer_config.json deleted file mode 100644 index 6739fcd129e717b71b64001dcb25a03c143d66f5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-652/tokenizer_config.json +++ /dev/null @@ -1,2076 +0,0 @@ -{ - "added_tokens_decoder": { - "128000": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128001": { - "content": "<|end_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128002": { - "content": "<|reserved_special_token_0|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128003": { - "content": "<|reserved_special_token_1|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128004": { - "content": "<|reserved_special_token_2|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128005": { - "content": "<|reserved_special_token_3|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128006": { - "content": "<|start_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128007": { - "content": "<|end_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128008": { - "content": "<|reserved_special_token_4|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128009": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128010": { - "content": "<|reserved_special_token_5|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128011": { - "content": "<|reserved_special_token_6|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128012": { - "content": "<|reserved_special_token_7|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128013": { - "content": "<|reserved_special_token_8|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128014": { - "content": "<|reserved_special_token_9|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128015": { - "content": "<|reserved_special_token_10|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128016": { - "content": "<|reserved_special_token_11|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128017": { - "content": "<|reserved_special_token_12|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128018": { - "content": "<|reserved_special_token_13|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128019": { - "content": "<|reserved_special_token_14|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128020": { - "content": "<|reserved_special_token_15|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128021": { - "content": "<|reserved_special_token_16|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128022": { - "content": "<|reserved_special_token_17|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128023": { - "content": "<|reserved_special_token_18|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128024": { - "content": "<|reserved_special_token_19|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128025": { - "content": "<|reserved_special_token_20|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128026": { - "content": "<|reserved_special_token_21|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128027": { - "content": "<|reserved_special_token_22|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128028": { - "content": "<|reserved_special_token_23|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128029": { - "content": "<|reserved_special_token_24|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128030": { - "content": "<|reserved_special_token_25|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128031": { - "content": "<|reserved_special_token_26|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128032": { - "content": "<|reserved_special_token_27|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128033": { - "content": "<|reserved_special_token_28|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128034": { - "content": "<|reserved_special_token_29|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128035": { - "content": "<|reserved_special_token_30|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128036": { - "content": "<|reserved_special_token_31|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128037": { - "content": "<|reserved_special_token_32|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128038": { - "content": "<|reserved_special_token_33|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128039": { - "content": "<|reserved_special_token_34|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128040": { - "content": "<|reserved_special_token_35|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128041": { - "content": "<|reserved_special_token_36|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128042": { - "content": "<|reserved_special_token_37|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128043": { - "content": "<|reserved_special_token_38|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128044": { - "content": "<|reserved_special_token_39|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128045": { - "content": "<|reserved_special_token_40|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128046": { - "content": "<|reserved_special_token_41|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128047": { - "content": "<|reserved_special_token_42|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128048": { - "content": "<|reserved_special_token_43|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128049": { - "content": "<|reserved_special_token_44|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128050": { - "content": "<|reserved_special_token_45|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128051": { - "content": "<|reserved_special_token_46|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128052": { - "content": "<|reserved_special_token_47|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128053": { - "content": "<|reserved_special_token_48|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128054": { - "content": "<|reserved_special_token_49|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128055": { - "content": "<|reserved_special_token_50|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128056": { - "content": "<|reserved_special_token_51|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128057": { - "content": "<|reserved_special_token_52|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128058": { - "content": "<|reserved_special_token_53|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128059": { - "content": "<|reserved_special_token_54|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128060": { - "content": "<|reserved_special_token_55|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128061": { - "content": "<|reserved_special_token_56|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128062": { - "content": "<|reserved_special_token_57|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128063": { - "content": "<|reserved_special_token_58|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128064": { - "content": "<|reserved_special_token_59|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128065": { - "content": "<|reserved_special_token_60|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128066": { - "content": "<|reserved_special_token_61|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128067": { - "content": "<|reserved_special_token_62|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128068": { - "content": "<|reserved_special_token_63|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128069": { - "content": "<|reserved_special_token_64|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128070": { - "content": "<|reserved_special_token_65|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128071": { - "content": "<|reserved_special_token_66|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128072": { - "content": "<|reserved_special_token_67|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128073": { - "content": "<|reserved_special_token_68|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128074": { - "content": "<|reserved_special_token_69|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128075": { - "content": "<|reserved_special_token_70|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128076": { - "content": "<|reserved_special_token_71|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128077": { - "content": "<|reserved_special_token_72|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128078": { - "content": "<|reserved_special_token_73|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128079": { - "content": "<|reserved_special_token_74|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128080": { - "content": "<|reserved_special_token_75|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128081": { - "content": "<|reserved_special_token_76|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128082": { - "content": "<|reserved_special_token_77|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128083": { - "content": "<|reserved_special_token_78|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128084": { - "content": "<|reserved_special_token_79|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128085": { - "content": "<|reserved_special_token_80|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128086": { - "content": "<|reserved_special_token_81|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128087": { - "content": "<|reserved_special_token_82|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128088": { - "content": "<|reserved_special_token_83|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128089": { - "content": "<|reserved_special_token_84|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128090": { - "content": "<|reserved_special_token_85|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128091": { - "content": "<|reserved_special_token_86|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128092": { - "content": "<|reserved_special_token_87|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128093": { - "content": "<|reserved_special_token_88|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128094": { - "content": "<|reserved_special_token_89|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128095": { - "content": "<|reserved_special_token_90|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128096": { - "content": "<|reserved_special_token_91|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128097": { - "content": "<|reserved_special_token_92|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128098": { - "content": "<|reserved_special_token_93|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128099": { - "content": "<|reserved_special_token_94|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128100": { - "content": "<|reserved_special_token_95|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128101": { - "content": "<|reserved_special_token_96|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128102": { - "content": "<|reserved_special_token_97|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128103": { - "content": "<|reserved_special_token_98|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128104": { - "content": "<|reserved_special_token_99|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128105": { - "content": "<|reserved_special_token_100|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128106": { - "content": "<|reserved_special_token_101|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128107": { - "content": "<|reserved_special_token_102|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128108": { - "content": "<|reserved_special_token_103|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128109": { - "content": "<|reserved_special_token_104|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128110": { - "content": "<|reserved_special_token_105|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128111": { - "content": "<|reserved_special_token_106|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128112": { - "content": "<|reserved_special_token_107|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128113": { - "content": "<|reserved_special_token_108|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128114": { - "content": "<|reserved_special_token_109|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128115": { - "content": "<|reserved_special_token_110|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128116": { - "content": "<|reserved_special_token_111|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128117": { - "content": "<|reserved_special_token_112|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128118": { - "content": "<|reserved_special_token_113|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128119": { - "content": "<|reserved_special_token_114|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128120": { - "content": "<|reserved_special_token_115|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128121": { - "content": "<|reserved_special_token_116|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128122": { - "content": "<|reserved_special_token_117|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128123": { - "content": "<|reserved_special_token_118|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128124": { - "content": "<|reserved_special_token_119|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128125": { - "content": "<|reserved_special_token_120|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128126": { - "content": "<|reserved_special_token_121|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128127": { - "content": "<|reserved_special_token_122|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128128": { - "content": "<|reserved_special_token_123|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128129": { - "content": "<|reserved_special_token_124|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128130": { - "content": "<|reserved_special_token_125|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128131": { - "content": "<|reserved_special_token_126|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128132": { - "content": "<|reserved_special_token_127|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128133": { - "content": "<|reserved_special_token_128|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128134": { - "content": "<|reserved_special_token_129|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128135": { - "content": "<|reserved_special_token_130|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128136": { - "content": "<|reserved_special_token_131|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128137": { - "content": "<|reserved_special_token_132|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128138": { - "content": "<|reserved_special_token_133|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128139": { - "content": "<|reserved_special_token_134|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128140": { - "content": "<|reserved_special_token_135|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128141": { - "content": "<|reserved_special_token_136|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128142": { - "content": "<|reserved_special_token_137|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128143": { - "content": "<|reserved_special_token_138|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128144": { - "content": "<|reserved_special_token_139|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128145": { - "content": "<|reserved_special_token_140|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128146": { - "content": "<|reserved_special_token_141|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128147": { - "content": "<|reserved_special_token_142|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128148": { - "content": "<|reserved_special_token_143|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128149": { - "content": "<|reserved_special_token_144|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128150": { - "content": "<|reserved_special_token_145|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128151": { - "content": "<|reserved_special_token_146|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128152": { - "content": "<|reserved_special_token_147|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128153": { - "content": "<|reserved_special_token_148|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128154": { - "content": "<|reserved_special_token_149|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128155": { - "content": "<|reserved_special_token_150|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128156": { - "content": "<|reserved_special_token_151|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128157": { - "content": "<|reserved_special_token_152|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128158": { - "content": "<|reserved_special_token_153|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128159": { - "content": "<|reserved_special_token_154|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128160": { - "content": "<|reserved_special_token_155|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128161": { - "content": "<|reserved_special_token_156|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128162": { - "content": "<|reserved_special_token_157|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128163": { - "content": "<|reserved_special_token_158|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128164": { - "content": "<|reserved_special_token_159|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128165": { - "content": "<|reserved_special_token_160|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128166": { - "content": "<|reserved_special_token_161|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128167": { - "content": "<|reserved_special_token_162|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128168": { - "content": "<|reserved_special_token_163|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128169": { - "content": "<|reserved_special_token_164|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128170": { - "content": "<|reserved_special_token_165|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128171": { - "content": "<|reserved_special_token_166|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128172": { - "content": "<|reserved_special_token_167|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128173": { - "content": "<|reserved_special_token_168|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128174": { - "content": "<|reserved_special_token_169|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128175": { - "content": "<|reserved_special_token_170|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128176": { - "content": "<|reserved_special_token_171|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128177": { - "content": "<|reserved_special_token_172|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128178": { - "content": "<|reserved_special_token_173|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128179": { - "content": "<|reserved_special_token_174|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128180": { - "content": "<|reserved_special_token_175|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128181": { - "content": "<|reserved_special_token_176|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128182": { - "content": "<|reserved_special_token_177|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128183": { - "content": "<|reserved_special_token_178|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128184": { - "content": "<|reserved_special_token_179|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128185": { - "content": "<|reserved_special_token_180|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128186": { - "content": "<|reserved_special_token_181|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128187": { - "content": "<|reserved_special_token_182|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128188": { - "content": "<|reserved_special_token_183|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128189": { - "content": "<|reserved_special_token_184|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128190": { - "content": "<|reserved_special_token_185|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128191": { - "content": "<|reserved_special_token_186|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128192": { - "content": "<|reserved_special_token_187|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128193": { - "content": "<|reserved_special_token_188|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128194": { - "content": "<|reserved_special_token_189|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128195": { - "content": "<|reserved_special_token_190|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128196": { - "content": "<|reserved_special_token_191|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128197": { - "content": "<|reserved_special_token_192|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128198": { - "content": "<|reserved_special_token_193|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128199": { - "content": "<|reserved_special_token_194|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128200": { - "content": "<|reserved_special_token_195|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128201": { - "content": "<|reserved_special_token_196|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128202": { - "content": "<|reserved_special_token_197|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128203": { - "content": "<|reserved_special_token_198|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128204": { - "content": "<|reserved_special_token_199|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128205": { - "content": "<|reserved_special_token_200|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128206": { - "content": "<|reserved_special_token_201|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128207": { - "content": "<|reserved_special_token_202|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128208": { - "content": "<|reserved_special_token_203|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128209": { - "content": "<|reserved_special_token_204|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128210": { - "content": "<|reserved_special_token_205|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128211": { - "content": "<|reserved_special_token_206|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128212": { - "content": "<|reserved_special_token_207|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128213": { - "content": "<|reserved_special_token_208|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128214": { - "content": "<|reserved_special_token_209|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128215": { - "content": "<|reserved_special_token_210|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128216": { - "content": "<|reserved_special_token_211|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128217": { - "content": "<|reserved_special_token_212|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128218": { - "content": "<|reserved_special_token_213|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128219": { - "content": "<|reserved_special_token_214|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128220": { - "content": "<|reserved_special_token_215|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128221": { - "content": "<|reserved_special_token_216|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128222": { - "content": "<|reserved_special_token_217|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128223": { - "content": "<|reserved_special_token_218|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128224": { - "content": "<|reserved_special_token_219|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128225": { - "content": "<|reserved_special_token_220|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128226": { - "content": "<|reserved_special_token_221|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128227": { - "content": "<|reserved_special_token_222|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128228": { - "content": "<|reserved_special_token_223|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128229": { - "content": "<|reserved_special_token_224|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128230": { - "content": "<|reserved_special_token_225|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128231": { - "content": "<|reserved_special_token_226|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128232": { - "content": "<|reserved_special_token_227|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128233": { - "content": "<|reserved_special_token_228|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128234": { - "content": "<|reserved_special_token_229|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128235": { - "content": "<|reserved_special_token_230|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128236": { - "content": "<|reserved_special_token_231|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128237": { - "content": "<|reserved_special_token_232|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128238": { - "content": "<|reserved_special_token_233|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128239": { - "content": "<|reserved_special_token_234|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128240": { - "content": "<|reserved_special_token_235|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128241": { - "content": "<|reserved_special_token_236|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128242": { - "content": "<|reserved_special_token_237|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128243": { - "content": "<|reserved_special_token_238|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128244": { - "content": "<|reserved_special_token_239|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128245": { - "content": "<|reserved_special_token_240|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128246": { - "content": "<|reserved_special_token_241|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128247": { - "content": "<|reserved_special_token_242|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128248": { - "content": "<|reserved_special_token_243|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128249": { - "content": "<|reserved_special_token_244|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128250": { - "content": "<|reserved_special_token_245|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128251": { - "content": "<|reserved_special_token_246|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128252": { - "content": "<|reserved_special_token_247|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128253": { - "content": "<|reserved_special_token_248|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128254": { - "content": "<|reserved_special_token_249|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128255": { - "content": "<|reserved_special_token_250|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128256": { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - } - }, - "additional_special_tokens": [ - "<|eom_id|>" - ], - "bos_token": "<|begin_of_text|>", - "clean_up_tokenization_spaces": true, - "eos_token": "<|eot_id|>", - "extra_special_tokens": {}, - "model_input_names": [ - "input_ids", - "attention_mask" - ], - "model_max_length": 1000000000000000019884624838656, - "pad_token": "<|eot_id|>", - "padding_side": "right", - "split_special_tokens": false, - "tokenizer_class": "PreTrainedTokenizerFast" -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-652/trainer_state.json b/metallama3_8b/limo_filtered_correct/checkpoint-652/trainer_state.json deleted file mode 100644 index 57b2eb0325cb3a8017006ba1919865b57d94c90f..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-652/trainer_state.json +++ /dev/null @@ -1,4598 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 4.0, - "eval_steps": 500, - "global_step": 652, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.006134969325153374, - "grad_norm": 5.908512115478516, - "learning_rate": 5e-06, - "loss": 0.9606, - "step": 1 - }, - { - "epoch": 0.012269938650306749, - "grad_norm": 4.304474353790283, - "learning_rate": 4.999995356617983e-06, - "loss": 0.8609, - "step": 2 - }, - { - "epoch": 0.018404907975460124, - "grad_norm": 5.63697624206543, - "learning_rate": 4.999981426489179e-06, - "loss": 1.3543, - "step": 3 - }, - { - "epoch": 0.024539877300613498, - "grad_norm": 3.6674246788024902, - "learning_rate": 4.999958209665336e-06, - "loss": 0.787, - "step": 4 - }, - { - "epoch": 0.03067484662576687, - "grad_norm": 48.14854431152344, - "learning_rate": 4.999925706232695e-06, - "loss": 1.7786, - "step": 5 - }, - { - "epoch": 0.03680981595092025, - "grad_norm": 7.8689866065979, - "learning_rate": 4.999883916312e-06, - "loss": 1.2175, - "step": 6 - }, - { - "epoch": 0.04294478527607362, - "grad_norm": 5.119968891143799, - "learning_rate": 4.9998328400584864e-06, - "loss": 0.8998, - "step": 7 - }, - { - "epoch": 0.049079754601226995, - "grad_norm": 3.730757713317871, - "learning_rate": 4.999772477661888e-06, - "loss": 0.8419, - "step": 8 - }, - { - "epoch": 0.05521472392638037, - "grad_norm": 27.314565658569336, - "learning_rate": 4.999702829346432e-06, - "loss": 1.7948, - "step": 9 - }, - { - "epoch": 0.06134969325153374, - "grad_norm": 3.822697162628174, - "learning_rate": 4.999623895370843e-06, - "loss": 1.0461, - "step": 10 - }, - { - "epoch": 0.06748466257668712, - "grad_norm": 4.71220588684082, - "learning_rate": 4.999535676028338e-06, - "loss": 1.0, - "step": 11 - }, - { - "epoch": 0.0736196319018405, - "grad_norm": 3.2378087043762207, - "learning_rate": 4.999438171646624e-06, - "loss": 0.9475, - "step": 12 - }, - { - "epoch": 0.07975460122699386, - "grad_norm": 3.475543737411499, - "learning_rate": 4.999331382587901e-06, - "loss": 0.8654, - "step": 13 - }, - { - "epoch": 0.08588957055214724, - "grad_norm": 10.06365966796875, - "learning_rate": 4.999215309248861e-06, - "loss": 1.2042, - "step": 14 - }, - { - "epoch": 0.09202453987730061, - "grad_norm": 3.785153865814209, - "learning_rate": 4.999089952060681e-06, - "loss": 0.8846, - "step": 15 - }, - { - "epoch": 0.09815950920245399, - "grad_norm": 2.944488048553467, - "learning_rate": 4.998955311489025e-06, - "loss": 0.8805, - "step": 16 - }, - { - "epoch": 0.10429447852760736, - "grad_norm": 39.89304733276367, - "learning_rate": 4.998811388034046e-06, - "loss": 1.5882, - "step": 17 - }, - { - "epoch": 0.11042944785276074, - "grad_norm": 3.5883963108062744, - "learning_rate": 4.9986581822303746e-06, - "loss": 0.9222, - "step": 18 - }, - { - "epoch": 0.1165644171779141, - "grad_norm": 6.972247123718262, - "learning_rate": 4.998495694647127e-06, - "loss": 1.4088, - "step": 19 - }, - { - "epoch": 0.12269938650306748, - "grad_norm": 3.948991298675537, - "learning_rate": 4.998323925887895e-06, - "loss": 1.454, - "step": 20 - }, - { - "epoch": 0.12883435582822086, - "grad_norm": 3.8690035343170166, - "learning_rate": 4.998142876590749e-06, - "loss": 0.6335, - "step": 21 - }, - { - "epoch": 0.13496932515337423, - "grad_norm": 5.243765830993652, - "learning_rate": 4.997952547428236e-06, - "loss": 0.6725, - "step": 22 - }, - { - "epoch": 0.1411042944785276, - "grad_norm": 3.5994043350219727, - "learning_rate": 4.997752939107372e-06, - "loss": 0.7814, - "step": 23 - }, - { - "epoch": 0.147239263803681, - "grad_norm": 4.06965970993042, - "learning_rate": 4.997544052369642e-06, - "loss": 0.9683, - "step": 24 - }, - { - "epoch": 0.15337423312883436, - "grad_norm": 3.3247246742248535, - "learning_rate": 4.997325887990999e-06, - "loss": 0.9414, - "step": 25 - }, - { - "epoch": 0.15950920245398773, - "grad_norm": 5.811742782592773, - "learning_rate": 4.997098446781861e-06, - "loss": 0.8894, - "step": 26 - }, - { - "epoch": 0.1656441717791411, - "grad_norm": 2.661334753036499, - "learning_rate": 4.996861729587103e-06, - "loss": 0.7708, - "step": 27 - }, - { - "epoch": 0.17177914110429449, - "grad_norm": 2.863943576812744, - "learning_rate": 4.996615737286061e-06, - "loss": 0.6995, - "step": 28 - }, - { - "epoch": 0.17791411042944785, - "grad_norm": 20.376733779907227, - "learning_rate": 4.996360470792524e-06, - "loss": 1.2563, - "step": 29 - }, - { - "epoch": 0.18404907975460122, - "grad_norm": 3.62265682220459, - "learning_rate": 4.996095931054731e-06, - "loss": 0.7266, - "step": 30 - }, - { - "epoch": 0.1901840490797546, - "grad_norm": 3.915076732635498, - "learning_rate": 4.9958221190553705e-06, - "loss": 0.9227, - "step": 31 - }, - { - "epoch": 0.19631901840490798, - "grad_norm": 3.129855155944824, - "learning_rate": 4.995539035811572e-06, - "loss": 0.701, - "step": 32 - }, - { - "epoch": 0.20245398773006135, - "grad_norm": 2.7532224655151367, - "learning_rate": 4.9952466823749076e-06, - "loss": 0.6491, - "step": 33 - }, - { - "epoch": 0.2085889570552147, - "grad_norm": 2.8444128036499023, - "learning_rate": 4.9949450598313835e-06, - "loss": 0.8029, - "step": 34 - }, - { - "epoch": 0.2147239263803681, - "grad_norm": 2.57743239402771, - "learning_rate": 4.994634169301439e-06, - "loss": 0.8785, - "step": 35 - }, - { - "epoch": 0.22085889570552147, - "grad_norm": 3.280055284500122, - "learning_rate": 4.994314011939941e-06, - "loss": 1.034, - "step": 36 - }, - { - "epoch": 0.22699386503067484, - "grad_norm": 2.455838680267334, - "learning_rate": 4.99398458893618e-06, - "loss": 0.8557, - "step": 37 - }, - { - "epoch": 0.2331288343558282, - "grad_norm": 4.72681188583374, - "learning_rate": 4.993645901513865e-06, - "loss": 1.1904, - "step": 38 - }, - { - "epoch": 0.2392638036809816, - "grad_norm": 3.0585641860961914, - "learning_rate": 4.993297950931121e-06, - "loss": 0.7668, - "step": 39 - }, - { - "epoch": 0.24539877300613497, - "grad_norm": 2.4603540897369385, - "learning_rate": 4.9929407384804806e-06, - "loss": 0.8812, - "step": 40 - }, - { - "epoch": 0.25153374233128833, - "grad_norm": 2.9702436923980713, - "learning_rate": 4.992574265488883e-06, - "loss": 0.8878, - "step": 41 - }, - { - "epoch": 0.25766871165644173, - "grad_norm": 2.6973602771759033, - "learning_rate": 4.9921985333176694e-06, - "loss": 0.7251, - "step": 42 - }, - { - "epoch": 0.26380368098159507, - "grad_norm": 2.5542335510253906, - "learning_rate": 4.991813543362572e-06, - "loss": 0.6638, - "step": 43 - }, - { - "epoch": 0.26993865030674846, - "grad_norm": 3.7530782222747803, - "learning_rate": 4.991419297053716e-06, - "loss": 1.0725, - "step": 44 - }, - { - "epoch": 0.27607361963190186, - "grad_norm": 2.6483025550842285, - "learning_rate": 4.991015795855611e-06, - "loss": 0.7238, - "step": 45 - }, - { - "epoch": 0.2822085889570552, - "grad_norm": 3.434422492980957, - "learning_rate": 4.990603041267144e-06, - "loss": 0.9188, - "step": 46 - }, - { - "epoch": 0.2883435582822086, - "grad_norm": 2.914340019226074, - "learning_rate": 4.990181034821578e-06, - "loss": 0.6158, - "step": 47 - }, - { - "epoch": 0.294478527607362, - "grad_norm": 2.7211625576019287, - "learning_rate": 4.98974977808654e-06, - "loss": 0.7165, - "step": 48 - }, - { - "epoch": 0.3006134969325153, - "grad_norm": 2.8414249420166016, - "learning_rate": 4.989309272664026e-06, - "loss": 0.7277, - "step": 49 - }, - { - "epoch": 0.3067484662576687, - "grad_norm": 3.683204412460327, - "learning_rate": 4.988859520190381e-06, - "loss": 0.9793, - "step": 50 - }, - { - "epoch": 0.3128834355828221, - "grad_norm": 3.1732583045959473, - "learning_rate": 4.988400522336304e-06, - "loss": 0.8966, - "step": 51 - }, - { - "epoch": 0.31901840490797545, - "grad_norm": 2.7789194583892822, - "learning_rate": 4.9879322808068365e-06, - "loss": 0.8191, - "step": 52 - }, - { - "epoch": 0.32515337423312884, - "grad_norm": 2.754816770553589, - "learning_rate": 4.987454797341358e-06, - "loss": 0.6308, - "step": 53 - }, - { - "epoch": 0.3312883435582822, - "grad_norm": 2.730104684829712, - "learning_rate": 4.98696807371358e-06, - "loss": 0.8226, - "step": 54 - }, - { - "epoch": 0.3374233128834356, - "grad_norm": 3.2225449085235596, - "learning_rate": 4.986472111731536e-06, - "loss": 0.9184, - "step": 55 - }, - { - "epoch": 0.34355828220858897, - "grad_norm": 3.2684760093688965, - "learning_rate": 4.985966913237581e-06, - "loss": 0.6593, - "step": 56 - }, - { - "epoch": 0.3496932515337423, - "grad_norm": 2.43105411529541, - "learning_rate": 4.985452480108376e-06, - "loss": 0.6994, - "step": 57 - }, - { - "epoch": 0.3558282208588957, - "grad_norm": 7.366360664367676, - "learning_rate": 4.984928814254889e-06, - "loss": 1.1374, - "step": 58 - }, - { - "epoch": 0.3619631901840491, - "grad_norm": 2.81864333152771, - "learning_rate": 4.984395917622387e-06, - "loss": 0.8097, - "step": 59 - }, - { - "epoch": 0.36809815950920244, - "grad_norm": 3.1107730865478516, - "learning_rate": 4.9838537921904206e-06, - "loss": 0.8511, - "step": 60 - }, - { - "epoch": 0.37423312883435583, - "grad_norm": 2.460545301437378, - "learning_rate": 4.9833024399728295e-06, - "loss": 0.898, - "step": 61 - }, - { - "epoch": 0.3803680981595092, - "grad_norm": 2.921992778778076, - "learning_rate": 4.982741863017722e-06, - "loss": 0.6671, - "step": 62 - }, - { - "epoch": 0.38650306748466257, - "grad_norm": 3.3006443977355957, - "learning_rate": 4.982172063407479e-06, - "loss": 1.0559, - "step": 63 - }, - { - "epoch": 0.39263803680981596, - "grad_norm": 2.642587661743164, - "learning_rate": 4.9815930432587365e-06, - "loss": 0.6663, - "step": 64 - }, - { - "epoch": 0.3987730061349693, - "grad_norm": 2.905898094177246, - "learning_rate": 4.981004804722384e-06, - "loss": 0.6895, - "step": 65 - }, - { - "epoch": 0.4049079754601227, - "grad_norm": 2.9174182415008545, - "learning_rate": 4.980407349983556e-06, - "loss": 0.7982, - "step": 66 - }, - { - "epoch": 0.4110429447852761, - "grad_norm": 2.214322805404663, - "learning_rate": 4.979800681261619e-06, - "loss": 0.6808, - "step": 67 - }, - { - "epoch": 0.4171779141104294, - "grad_norm": 2.7152462005615234, - "learning_rate": 4.9791848008101705e-06, - "loss": 0.567, - "step": 68 - }, - { - "epoch": 0.4233128834355828, - "grad_norm": 2.5657734870910645, - "learning_rate": 4.978559710917024e-06, - "loss": 0.7745, - "step": 69 - }, - { - "epoch": 0.4294478527607362, - "grad_norm": 3.9103832244873047, - "learning_rate": 4.977925413904205e-06, - "loss": 0.9815, - "step": 70 - }, - { - "epoch": 0.43558282208588955, - "grad_norm": 4.610236644744873, - "learning_rate": 4.9772819121279395e-06, - "loss": 1.164, - "step": 71 - }, - { - "epoch": 0.44171779141104295, - "grad_norm": 3.01170015335083, - "learning_rate": 4.976629207978648e-06, - "loss": 0.7587, - "step": 72 - }, - { - "epoch": 0.44785276073619634, - "grad_norm": 3.175889253616333, - "learning_rate": 4.975967303880933e-06, - "loss": 0.58, - "step": 73 - }, - { - "epoch": 0.4539877300613497, - "grad_norm": 2.503741502761841, - "learning_rate": 4.975296202293575e-06, - "loss": 0.7253, - "step": 74 - }, - { - "epoch": 0.4601226993865031, - "grad_norm": 2.6778078079223633, - "learning_rate": 4.974615905709518e-06, - "loss": 0.7352, - "step": 75 - }, - { - "epoch": 0.4662576687116564, - "grad_norm": 5.950812816619873, - "learning_rate": 4.973926416655863e-06, - "loss": 1.0643, - "step": 76 - }, - { - "epoch": 0.4723926380368098, - "grad_norm": 3.0165305137634277, - "learning_rate": 4.973227737693858e-06, - "loss": 0.6699, - "step": 77 - }, - { - "epoch": 0.4785276073619632, - "grad_norm": 4.793259620666504, - "learning_rate": 4.972519871418894e-06, - "loss": 1.0315, - "step": 78 - }, - { - "epoch": 0.48466257668711654, - "grad_norm": 3.632815361022949, - "learning_rate": 4.971802820460481e-06, - "loss": 0.7003, - "step": 79 - }, - { - "epoch": 0.49079754601226994, - "grad_norm": 3.077507734298706, - "learning_rate": 4.971076587482254e-06, - "loss": 0.6776, - "step": 80 - }, - { - "epoch": 0.49693251533742333, - "grad_norm": 3.3886241912841797, - "learning_rate": 4.970341175181957e-06, - "loss": 0.7422, - "step": 81 - }, - { - "epoch": 0.5030674846625767, - "grad_norm": 2.71288800239563, - "learning_rate": 4.969596586291425e-06, - "loss": 0.7471, - "step": 82 - }, - { - "epoch": 0.50920245398773, - "grad_norm": 2.777920961380005, - "learning_rate": 4.968842823576592e-06, - "loss": 0.8111, - "step": 83 - }, - { - "epoch": 0.5153374233128835, - "grad_norm": 6.496985912322998, - "learning_rate": 4.968079889837461e-06, - "loss": 0.9965, - "step": 84 - }, - { - "epoch": 0.5214723926380368, - "grad_norm": 2.6163430213928223, - "learning_rate": 4.967307787908108e-06, - "loss": 0.6833, - "step": 85 - }, - { - "epoch": 0.5276073619631901, - "grad_norm": 3.244098663330078, - "learning_rate": 4.966526520656663e-06, - "loss": 0.8373, - "step": 86 - }, - { - "epoch": 0.5337423312883436, - "grad_norm": 2.9027860164642334, - "learning_rate": 4.965736090985305e-06, - "loss": 0.8529, - "step": 87 - }, - { - "epoch": 0.5398773006134969, - "grad_norm": 2.3786230087280273, - "learning_rate": 4.964936501830246e-06, - "loss": 0.6577, - "step": 88 - }, - { - "epoch": 0.5460122699386503, - "grad_norm": 7.3099045753479, - "learning_rate": 4.964127756161727e-06, - "loss": 1.1184, - "step": 89 - }, - { - "epoch": 0.5521472392638037, - "grad_norm": 3.068873167037964, - "learning_rate": 4.963309856983998e-06, - "loss": 0.7906, - "step": 90 - }, - { - "epoch": 0.558282208588957, - "grad_norm": 3.082547426223755, - "learning_rate": 4.9624828073353144e-06, - "loss": 0.8107, - "step": 91 - }, - { - "epoch": 0.5644171779141104, - "grad_norm": 2.4586973190307617, - "learning_rate": 4.961646610287922e-06, - "loss": 0.7421, - "step": 92 - }, - { - "epoch": 0.5705521472392638, - "grad_norm": 2.779277801513672, - "learning_rate": 4.960801268948047e-06, - "loss": 0.7134, - "step": 93 - }, - { - "epoch": 0.5766871165644172, - "grad_norm": 3.2255213260650635, - "learning_rate": 4.959946786455882e-06, - "loss": 0.5875, - "step": 94 - }, - { - "epoch": 0.5828220858895705, - "grad_norm": 2.783395528793335, - "learning_rate": 4.959083165985581e-06, - "loss": 0.6595, - "step": 95 - }, - { - "epoch": 0.588957055214724, - "grad_norm": 2.240114212036133, - "learning_rate": 4.958210410745237e-06, - "loss": 0.793, - "step": 96 - }, - { - "epoch": 0.5950920245398773, - "grad_norm": 2.9399421215057373, - "learning_rate": 4.957328523976879e-06, - "loss": 0.5896, - "step": 97 - }, - { - "epoch": 0.6012269938650306, - "grad_norm": 3.4449355602264404, - "learning_rate": 4.956437508956458e-06, - "loss": 0.8658, - "step": 98 - }, - { - "epoch": 0.6073619631901841, - "grad_norm": 4.273710250854492, - "learning_rate": 4.9555373689938325e-06, - "loss": 0.8316, - "step": 99 - }, - { - "epoch": 0.6134969325153374, - "grad_norm": 3.4222047328948975, - "learning_rate": 4.954628107432757e-06, - "loss": 1.0613, - "step": 100 - }, - { - "epoch": 0.6196319018404908, - "grad_norm": 2.5318963527679443, - "learning_rate": 4.95370972765087e-06, - "loss": 0.7194, - "step": 101 - }, - { - "epoch": 0.6257668711656442, - "grad_norm": 2.7852585315704346, - "learning_rate": 4.952782233059683e-06, - "loss": 0.5927, - "step": 102 - }, - { - "epoch": 0.6319018404907976, - "grad_norm": 2.6532323360443115, - "learning_rate": 4.951845627104565e-06, - "loss": 0.8505, - "step": 103 - }, - { - "epoch": 0.6380368098159509, - "grad_norm": 2.3213467597961426, - "learning_rate": 4.95089991326473e-06, - "loss": 0.8682, - "step": 104 - }, - { - "epoch": 0.6441717791411042, - "grad_norm": 2.607992649078369, - "learning_rate": 4.9499450950532305e-06, - "loss": 0.8735, - "step": 105 - }, - { - "epoch": 0.6503067484662577, - "grad_norm": 3.9820072650909424, - "learning_rate": 4.94898117601693e-06, - "loss": 1.0571, - "step": 106 - }, - { - "epoch": 0.656441717791411, - "grad_norm": 3.3878824710845947, - "learning_rate": 4.948008159736507e-06, - "loss": 0.7831, - "step": 107 - }, - { - "epoch": 0.6625766871165644, - "grad_norm": 2.6935670375823975, - "learning_rate": 4.94702604982643e-06, - "loss": 0.5968, - "step": 108 - }, - { - "epoch": 0.6687116564417178, - "grad_norm": 2.78190016746521, - "learning_rate": 4.9460348499349485e-06, - "loss": 0.7504, - "step": 109 - }, - { - "epoch": 0.6748466257668712, - "grad_norm": 2.973083972930908, - "learning_rate": 4.945034563744077e-06, - "loss": 0.6728, - "step": 110 - }, - { - "epoch": 0.6809815950920245, - "grad_norm": 2.631803512573242, - "learning_rate": 4.944025194969586e-06, - "loss": 0.609, - "step": 111 - }, - { - "epoch": 0.6871165644171779, - "grad_norm": 2.7443883419036865, - "learning_rate": 4.9430067473609825e-06, - "loss": 0.8713, - "step": 112 - }, - { - "epoch": 0.6932515337423313, - "grad_norm": 2.543769121170044, - "learning_rate": 4.941979224701499e-06, - "loss": 0.8035, - "step": 113 - }, - { - "epoch": 0.6993865030674846, - "grad_norm": 3.7799901962280273, - "learning_rate": 4.94094263080808e-06, - "loss": 0.9341, - "step": 114 - }, - { - "epoch": 0.7055214723926381, - "grad_norm": 3.1234734058380127, - "learning_rate": 4.939896969531367e-06, - "loss": 1.1066, - "step": 115 - }, - { - "epoch": 0.7116564417177914, - "grad_norm": 2.356036424636841, - "learning_rate": 4.938842244755683e-06, - "loss": 0.853, - "step": 116 - }, - { - "epoch": 0.7177914110429447, - "grad_norm": 3.6231274604797363, - "learning_rate": 4.937778460399022e-06, - "loss": 0.9116, - "step": 117 - }, - { - "epoch": 0.7239263803680982, - "grad_norm": 3.1277005672454834, - "learning_rate": 4.936705620413028e-06, - "loss": 0.5888, - "step": 118 - }, - { - "epoch": 0.7300613496932515, - "grad_norm": 2.7338361740112305, - "learning_rate": 4.935623728782986e-06, - "loss": 0.592, - "step": 119 - }, - { - "epoch": 0.7361963190184049, - "grad_norm": 2.748363733291626, - "learning_rate": 4.934532789527805e-06, - "loss": 0.8713, - "step": 120 - }, - { - "epoch": 0.7423312883435583, - "grad_norm": 4.460031986236572, - "learning_rate": 4.933432806700004e-06, - "loss": 0.6791, - "step": 121 - }, - { - "epoch": 0.7484662576687117, - "grad_norm": 2.392911911010742, - "learning_rate": 4.932323784385693e-06, - "loss": 0.7531, - "step": 122 - }, - { - "epoch": 0.754601226993865, - "grad_norm": 2.7804384231567383, - "learning_rate": 4.931205726704566e-06, - "loss": 0.7547, - "step": 123 - }, - { - "epoch": 0.7607361963190185, - "grad_norm": 2.7664780616760254, - "learning_rate": 4.930078637809878e-06, - "loss": 0.7849, - "step": 124 - }, - { - "epoch": 0.7668711656441718, - "grad_norm": 2.592808723449707, - "learning_rate": 4.928942521888431e-06, - "loss": 0.7015, - "step": 125 - }, - { - "epoch": 0.7730061349693251, - "grad_norm": 2.7080585956573486, - "learning_rate": 4.927797383160561e-06, - "loss": 1.0028, - "step": 126 - }, - { - "epoch": 0.7791411042944786, - "grad_norm": 2.7941503524780273, - "learning_rate": 4.926643225880123e-06, - "loss": 0.602, - "step": 127 - }, - { - "epoch": 0.7852760736196319, - "grad_norm": 3.2796623706817627, - "learning_rate": 4.925480054334471e-06, - "loss": 0.7473, - "step": 128 - }, - { - "epoch": 0.7914110429447853, - "grad_norm": 2.7623610496520996, - "learning_rate": 4.924307872844444e-06, - "loss": 1.0573, - "step": 129 - }, - { - "epoch": 0.7975460122699386, - "grad_norm": 2.6224453449249268, - "learning_rate": 4.923126685764351e-06, - "loss": 0.7399, - "step": 130 - }, - { - "epoch": 0.803680981595092, - "grad_norm": 17.736326217651367, - "learning_rate": 4.921936497481956e-06, - "loss": 0.9548, - "step": 131 - }, - { - "epoch": 0.8098159509202454, - "grad_norm": 2.504213333129883, - "learning_rate": 4.920737312418456e-06, - "loss": 0.6748, - "step": 132 - }, - { - "epoch": 0.8159509202453987, - "grad_norm": 3.617077350616455, - "learning_rate": 4.919529135028473e-06, - "loss": 0.8431, - "step": 133 - }, - { - "epoch": 0.8220858895705522, - "grad_norm": 2.6559832096099854, - "learning_rate": 4.918311969800027e-06, - "loss": 0.7243, - "step": 134 - }, - { - "epoch": 0.8282208588957055, - "grad_norm": 2.7539305686950684, - "learning_rate": 4.917085821254532e-06, - "loss": 0.7845, - "step": 135 - }, - { - "epoch": 0.8343558282208589, - "grad_norm": 3.3587615489959717, - "learning_rate": 4.915850693946766e-06, - "loss": 0.4891, - "step": 136 - }, - { - "epoch": 0.8404907975460123, - "grad_norm": 3.064354181289673, - "learning_rate": 4.914606592464865e-06, - "loss": 0.7917, - "step": 137 - }, - { - "epoch": 0.8466257668711656, - "grad_norm": 3.2505199909210205, - "learning_rate": 4.9133535214303e-06, - "loss": 0.9681, - "step": 138 - }, - { - "epoch": 0.852760736196319, - "grad_norm": 3.8027830123901367, - "learning_rate": 4.91209148549786e-06, - "loss": 0.9275, - "step": 139 - }, - { - "epoch": 0.8588957055214724, - "grad_norm": 2.4154372215270996, - "learning_rate": 4.910820489355637e-06, - "loss": 0.7259, - "step": 140 - }, - { - "epoch": 0.8650306748466258, - "grad_norm": 2.892462968826294, - "learning_rate": 4.909540537725007e-06, - "loss": 0.6061, - "step": 141 - }, - { - "epoch": 0.8711656441717791, - "grad_norm": 3.3398196697235107, - "learning_rate": 4.908251635360616e-06, - "loss": 1.0559, - "step": 142 - }, - { - "epoch": 0.8773006134969326, - "grad_norm": 3.022512197494507, - "learning_rate": 4.906953787050354e-06, - "loss": 0.7372, - "step": 143 - }, - { - "epoch": 0.8834355828220859, - "grad_norm": 2.658661365509033, - "learning_rate": 4.905646997615347e-06, - "loss": 0.6234, - "step": 144 - }, - { - "epoch": 0.8895705521472392, - "grad_norm": 3.454400062561035, - "learning_rate": 4.904331271909932e-06, - "loss": 0.8066, - "step": 145 - }, - { - "epoch": 0.8957055214723927, - "grad_norm": 3.1300277709960938, - "learning_rate": 4.903006614821645e-06, - "loss": 0.6861, - "step": 146 - }, - { - "epoch": 0.901840490797546, - "grad_norm": 2.362537145614624, - "learning_rate": 4.901673031271194e-06, - "loss": 0.6112, - "step": 147 - }, - { - "epoch": 0.9079754601226994, - "grad_norm": 3.375577688217163, - "learning_rate": 4.900330526212451e-06, - "loss": 0.6314, - "step": 148 - }, - { - "epoch": 0.9141104294478528, - "grad_norm": 2.955656051635742, - "learning_rate": 4.898979104632427e-06, - "loss": 0.889, - "step": 149 - }, - { - "epoch": 0.9202453987730062, - "grad_norm": 2.9285926818847656, - "learning_rate": 4.897618771551255e-06, - "loss": 0.6406, - "step": 150 - }, - { - "epoch": 0.9263803680981595, - "grad_norm": 2.131819725036621, - "learning_rate": 4.8962495320221714e-06, - "loss": 0.6368, - "step": 151 - }, - { - "epoch": 0.9325153374233128, - "grad_norm": 2.780649185180664, - "learning_rate": 4.8948713911315e-06, - "loss": 0.8642, - "step": 152 - }, - { - "epoch": 0.9386503067484663, - "grad_norm": 2.941500186920166, - "learning_rate": 4.8934843539986266e-06, - "loss": 0.714, - "step": 153 - }, - { - "epoch": 0.9447852760736196, - "grad_norm": 2.7729203701019287, - "learning_rate": 4.892088425775986e-06, - "loss": 0.8365, - "step": 154 - }, - { - "epoch": 0.950920245398773, - "grad_norm": 2.6887171268463135, - "learning_rate": 4.890683611649041e-06, - "loss": 0.7937, - "step": 155 - }, - { - "epoch": 0.9570552147239264, - "grad_norm": 3.7638463973999023, - "learning_rate": 4.8892699168362626e-06, - "loss": 0.7485, - "step": 156 - }, - { - "epoch": 0.9631901840490797, - "grad_norm": 2.8132755756378174, - "learning_rate": 4.887847346589111e-06, - "loss": 0.6467, - "step": 157 - }, - { - "epoch": 0.9693251533742331, - "grad_norm": 2.652247190475464, - "learning_rate": 4.886415906192015e-06, - "loss": 0.4651, - "step": 158 - }, - { - "epoch": 0.9754601226993865, - "grad_norm": 2.5854647159576416, - "learning_rate": 4.884975600962355e-06, - "loss": 0.8756, - "step": 159 - }, - { - "epoch": 0.9815950920245399, - "grad_norm": 3.1630544662475586, - "learning_rate": 4.883526436250441e-06, - "loss": 0.7339, - "step": 160 - }, - { - "epoch": 0.9877300613496932, - "grad_norm": 2.84452748298645, - "learning_rate": 4.8820684174394935e-06, - "loss": 0.7808, - "step": 161 - }, - { - "epoch": 0.9938650306748467, - "grad_norm": 3.604048490524292, - "learning_rate": 4.880601549945622e-06, - "loss": 0.96, - "step": 162 - }, - { - "epoch": 1.0, - "grad_norm": 2.302924871444702, - "learning_rate": 4.879125839217808e-06, - "loss": 0.8122, - "step": 163 - }, - { - "epoch": 1.0061349693251533, - "grad_norm": 3.1254405975341797, - "learning_rate": 4.8776412907378845e-06, - "loss": 0.7307, - "step": 164 - }, - { - "epoch": 1.0122699386503067, - "grad_norm": 2.745603322982788, - "learning_rate": 4.8761479100205085e-06, - "loss": 0.7554, - "step": 165 - }, - { - "epoch": 1.01840490797546, - "grad_norm": 2.494840145111084, - "learning_rate": 4.874645702613152e-06, - "loss": 0.4372, - "step": 166 - }, - { - "epoch": 1.0245398773006136, - "grad_norm": 2.3526735305786133, - "learning_rate": 4.873134674096072e-06, - "loss": 0.3597, - "step": 167 - }, - { - "epoch": 1.030674846625767, - "grad_norm": 2.945887804031372, - "learning_rate": 4.871614830082297e-06, - "loss": 0.5854, - "step": 168 - }, - { - "epoch": 1.0368098159509203, - "grad_norm": 3.5723934173583984, - "learning_rate": 4.870086176217597e-06, - "loss": 0.7978, - "step": 169 - }, - { - "epoch": 1.0429447852760736, - "grad_norm": 3.2997145652770996, - "learning_rate": 4.868548718180473e-06, - "loss": 0.5593, - "step": 170 - }, - { - "epoch": 1.049079754601227, - "grad_norm": 3.4120635986328125, - "learning_rate": 4.867002461682129e-06, - "loss": 0.4083, - "step": 171 - }, - { - "epoch": 1.0552147239263803, - "grad_norm": 2.697617292404175, - "learning_rate": 4.8654474124664505e-06, - "loss": 0.4752, - "step": 172 - }, - { - "epoch": 1.0613496932515338, - "grad_norm": 5.082247734069824, - "learning_rate": 4.863883576309991e-06, - "loss": 0.7435, - "step": 173 - }, - { - "epoch": 1.0674846625766872, - "grad_norm": 2.773864984512329, - "learning_rate": 4.8623109590219395e-06, - "loss": 0.4612, - "step": 174 - }, - { - "epoch": 1.0736196319018405, - "grad_norm": 3.429703712463379, - "learning_rate": 4.860729566444106e-06, - "loss": 0.4644, - "step": 175 - }, - { - "epoch": 1.0797546012269938, - "grad_norm": 2.997938394546509, - "learning_rate": 4.8591394044508985e-06, - "loss": 0.4852, - "step": 176 - }, - { - "epoch": 1.0858895705521472, - "grad_norm": 2.549513339996338, - "learning_rate": 4.857540478949302e-06, - "loss": 0.4574, - "step": 177 - }, - { - "epoch": 1.0920245398773005, - "grad_norm": 3.459400177001953, - "learning_rate": 4.855932795878852e-06, - "loss": 0.8095, - "step": 178 - }, - { - "epoch": 1.098159509202454, - "grad_norm": 2.8103644847869873, - "learning_rate": 4.854316361211619e-06, - "loss": 0.4578, - "step": 179 - }, - { - "epoch": 1.1042944785276074, - "grad_norm": 2.631221055984497, - "learning_rate": 4.852691180952183e-06, - "loss": 0.5473, - "step": 180 - }, - { - "epoch": 1.1104294478527608, - "grad_norm": 3.189946174621582, - "learning_rate": 4.851057261137608e-06, - "loss": 0.4313, - "step": 181 - }, - { - "epoch": 1.116564417177914, - "grad_norm": 2.891418933868408, - "learning_rate": 4.8494146078374274e-06, - "loss": 0.4197, - "step": 182 - }, - { - "epoch": 1.1226993865030674, - "grad_norm": 3.239637613296509, - "learning_rate": 4.847763227153612e-06, - "loss": 0.5865, - "step": 183 - }, - { - "epoch": 1.1288343558282208, - "grad_norm": 2.484644651412964, - "learning_rate": 4.846103125220557e-06, - "loss": 0.3866, - "step": 184 - }, - { - "epoch": 1.1349693251533743, - "grad_norm": 3.1045992374420166, - "learning_rate": 4.844434308205052e-06, - "loss": 0.5357, - "step": 185 - }, - { - "epoch": 1.1411042944785277, - "grad_norm": 2.648472309112549, - "learning_rate": 4.842756782306261e-06, - "loss": 0.4783, - "step": 186 - }, - { - "epoch": 1.147239263803681, - "grad_norm": 2.5685644149780273, - "learning_rate": 4.841070553755697e-06, - "loss": 0.3733, - "step": 187 - }, - { - "epoch": 1.1533742331288344, - "grad_norm": 3.7727200984954834, - "learning_rate": 4.839375628817205e-06, - "loss": 0.6039, - "step": 188 - }, - { - "epoch": 1.1595092024539877, - "grad_norm": 2.8237369060516357, - "learning_rate": 4.837672013786931e-06, - "loss": 0.5372, - "step": 189 - }, - { - "epoch": 1.165644171779141, - "grad_norm": 3.0312252044677734, - "learning_rate": 4.835959714993305e-06, - "loss": 0.5162, - "step": 190 - }, - { - "epoch": 1.1717791411042944, - "grad_norm": 2.821498394012451, - "learning_rate": 4.8342387387970105e-06, - "loss": 0.4537, - "step": 191 - }, - { - "epoch": 1.177914110429448, - "grad_norm": 2.7834129333496094, - "learning_rate": 4.832509091590968e-06, - "loss": 0.6165, - "step": 192 - }, - { - "epoch": 1.1840490797546013, - "grad_norm": 2.9274091720581055, - "learning_rate": 4.830770779800309e-06, - "loss": 0.7475, - "step": 193 - }, - { - "epoch": 1.1901840490797546, - "grad_norm": 2.813945770263672, - "learning_rate": 4.829023809882349e-06, - "loss": 0.4629, - "step": 194 - }, - { - "epoch": 1.196319018404908, - "grad_norm": 2.27876877784729, - "learning_rate": 4.827268188326567e-06, - "loss": 0.5208, - "step": 195 - }, - { - "epoch": 1.2024539877300613, - "grad_norm": 2.8444204330444336, - "learning_rate": 4.825503921654582e-06, - "loss": 0.6521, - "step": 196 - }, - { - "epoch": 1.2085889570552146, - "grad_norm": 3.3730578422546387, - "learning_rate": 4.823731016420122e-06, - "loss": 0.7491, - "step": 197 - }, - { - "epoch": 1.2147239263803682, - "grad_norm": 2.9717822074890137, - "learning_rate": 4.821949479209011e-06, - "loss": 0.3866, - "step": 198 - }, - { - "epoch": 1.2208588957055215, - "grad_norm": 2.6570653915405273, - "learning_rate": 4.820159316639133e-06, - "loss": 0.499, - "step": 199 - }, - { - "epoch": 1.2269938650306749, - "grad_norm": 2.819960117340088, - "learning_rate": 4.818360535360418e-06, - "loss": 0.556, - "step": 200 - }, - { - "epoch": 1.2331288343558282, - "grad_norm": 2.7912111282348633, - "learning_rate": 4.816553142054806e-06, - "loss": 0.3433, - "step": 201 - }, - { - "epoch": 1.2392638036809815, - "grad_norm": 2.6427981853485107, - "learning_rate": 4.814737143436232e-06, - "loss": 0.8808, - "step": 202 - }, - { - "epoch": 1.2453987730061349, - "grad_norm": 2.5917580127716064, - "learning_rate": 4.812912546250595e-06, - "loss": 0.5718, - "step": 203 - }, - { - "epoch": 1.2515337423312882, - "grad_norm": 3.770759344100952, - "learning_rate": 4.81107935727574e-06, - "loss": 0.9743, - "step": 204 - }, - { - "epoch": 1.2576687116564418, - "grad_norm": 2.558248996734619, - "learning_rate": 4.809237583321421e-06, - "loss": 0.2821, - "step": 205 - }, - { - "epoch": 1.2638036809815951, - "grad_norm": 2.692087173461914, - "learning_rate": 4.807387231229287e-06, - "loss": 0.7524, - "step": 206 - }, - { - "epoch": 1.2699386503067485, - "grad_norm": 2.661738157272339, - "learning_rate": 4.8055283078728525e-06, - "loss": 0.4304, - "step": 207 - }, - { - "epoch": 1.2760736196319018, - "grad_norm": 2.9232122898101807, - "learning_rate": 4.803660820157468e-06, - "loss": 0.6986, - "step": 208 - }, - { - "epoch": 1.2822085889570551, - "grad_norm": 2.665097951889038, - "learning_rate": 4.801784775020303e-06, - "loss": 0.7112, - "step": 209 - }, - { - "epoch": 1.2883435582822087, - "grad_norm": 2.4504497051239014, - "learning_rate": 4.799900179430312e-06, - "loss": 0.4125, - "step": 210 - }, - { - "epoch": 1.294478527607362, - "grad_norm": 3.076204538345337, - "learning_rate": 4.798007040388212e-06, - "loss": 0.7057, - "step": 211 - }, - { - "epoch": 1.3006134969325154, - "grad_norm": 2.406977653503418, - "learning_rate": 4.7961053649264585e-06, - "loss": 0.708, - "step": 212 - }, - { - "epoch": 1.3067484662576687, - "grad_norm": 2.6545324325561523, - "learning_rate": 4.794195160109215e-06, - "loss": 0.7608, - "step": 213 - }, - { - "epoch": 1.312883435582822, - "grad_norm": 4.3817033767700195, - "learning_rate": 4.7922764330323315e-06, - "loss": 0.4779, - "step": 214 - }, - { - "epoch": 1.3190184049079754, - "grad_norm": 3.534566879272461, - "learning_rate": 4.790349190823313e-06, - "loss": 0.5464, - "step": 215 - }, - { - "epoch": 1.3251533742331287, - "grad_norm": 3.0323140621185303, - "learning_rate": 4.788413440641297e-06, - "loss": 0.6198, - "step": 216 - }, - { - "epoch": 1.331288343558282, - "grad_norm": 2.612746238708496, - "learning_rate": 4.786469189677026e-06, - "loss": 0.6695, - "step": 217 - }, - { - "epoch": 1.3374233128834356, - "grad_norm": 3.0299434661865234, - "learning_rate": 4.784516445152821e-06, - "loss": 0.4902, - "step": 218 - }, - { - "epoch": 1.343558282208589, - "grad_norm": 3.4521942138671875, - "learning_rate": 4.78255521432255e-06, - "loss": 0.7411, - "step": 219 - }, - { - "epoch": 1.3496932515337423, - "grad_norm": 2.6712653636932373, - "learning_rate": 4.780585504471612e-06, - "loss": 0.8767, - "step": 220 - }, - { - "epoch": 1.3558282208588956, - "grad_norm": 2.5099475383758545, - "learning_rate": 4.778607322916896e-06, - "loss": 0.4266, - "step": 221 - }, - { - "epoch": 1.3619631901840492, - "grad_norm": 2.641799211502075, - "learning_rate": 4.776620677006766e-06, - "loss": 0.4982, - "step": 222 - }, - { - "epoch": 1.3680981595092025, - "grad_norm": 3.1119771003723145, - "learning_rate": 4.7746255741210256e-06, - "loss": 0.6012, - "step": 223 - }, - { - "epoch": 1.3742331288343559, - "grad_norm": 3.9957170486450195, - "learning_rate": 4.772622021670897e-06, - "loss": 0.7585, - "step": 224 - }, - { - "epoch": 1.3803680981595092, - "grad_norm": 3.1070823669433594, - "learning_rate": 4.770610027098983e-06, - "loss": 0.5266, - "step": 225 - }, - { - "epoch": 1.3865030674846626, - "grad_norm": 2.7630460262298584, - "learning_rate": 4.7685895978792564e-06, - "loss": 0.6261, - "step": 226 - }, - { - "epoch": 1.392638036809816, - "grad_norm": 2.6509556770324707, - "learning_rate": 4.766560741517014e-06, - "loss": 0.7081, - "step": 227 - }, - { - "epoch": 1.3987730061349692, - "grad_norm": 3.0212976932525635, - "learning_rate": 4.76452346554886e-06, - "loss": 0.5041, - "step": 228 - }, - { - "epoch": 1.4049079754601226, - "grad_norm": 3.0454728603363037, - "learning_rate": 4.762477777542676e-06, - "loss": 0.49, - "step": 229 - }, - { - "epoch": 1.4110429447852761, - "grad_norm": 3.4296791553497314, - "learning_rate": 4.7604236850975905e-06, - "loss": 0.7056, - "step": 230 - }, - { - "epoch": 1.4171779141104295, - "grad_norm": 4.1885600090026855, - "learning_rate": 4.7583611958439514e-06, - "loss": 0.7762, - "step": 231 - }, - { - "epoch": 1.4233128834355828, - "grad_norm": 3.065854072570801, - "learning_rate": 4.7562903174433e-06, - "loss": 0.5347, - "step": 232 - }, - { - "epoch": 1.4294478527607362, - "grad_norm": 2.793851852416992, - "learning_rate": 4.75421105758834e-06, - "loss": 0.503, - "step": 233 - }, - { - "epoch": 1.4355828220858895, - "grad_norm": 3.123730421066284, - "learning_rate": 4.752123424002908e-06, - "loss": 0.5081, - "step": 234 - }, - { - "epoch": 1.441717791411043, - "grad_norm": 3.230161666870117, - "learning_rate": 4.750027424441949e-06, - "loss": 0.7523, - "step": 235 - }, - { - "epoch": 1.4478527607361964, - "grad_norm": 2.4970247745513916, - "learning_rate": 4.747923066691487e-06, - "loss": 0.5575, - "step": 236 - }, - { - "epoch": 1.4539877300613497, - "grad_norm": 2.9880685806274414, - "learning_rate": 4.745810358568588e-06, - "loss": 0.7264, - "step": 237 - }, - { - "epoch": 1.460122699386503, - "grad_norm": 2.555328369140625, - "learning_rate": 4.743689307921342e-06, - "loss": 0.4545, - "step": 238 - }, - { - "epoch": 1.4662576687116564, - "grad_norm": 3.144932746887207, - "learning_rate": 4.741559922628828e-06, - "loss": 0.5429, - "step": 239 - }, - { - "epoch": 1.4723926380368098, - "grad_norm": 3.059807062149048, - "learning_rate": 4.739422210601085e-06, - "loss": 0.5086, - "step": 240 - }, - { - "epoch": 1.478527607361963, - "grad_norm": 3.374303102493286, - "learning_rate": 4.7372761797790836e-06, - "loss": 0.6109, - "step": 241 - }, - { - "epoch": 1.4846625766871164, - "grad_norm": 2.4506947994232178, - "learning_rate": 4.735121838134697e-06, - "loss": 0.4317, - "step": 242 - }, - { - "epoch": 1.49079754601227, - "grad_norm": 2.9039974212646484, - "learning_rate": 4.732959193670672e-06, - "loss": 0.6414, - "step": 243 - }, - { - "epoch": 1.4969325153374233, - "grad_norm": 2.9412453174591064, - "learning_rate": 4.730788254420593e-06, - "loss": 0.5166, - "step": 244 - }, - { - "epoch": 1.5030674846625767, - "grad_norm": 2.500716209411621, - "learning_rate": 4.728609028448862e-06, - "loss": 0.4982, - "step": 245 - }, - { - "epoch": 1.50920245398773, - "grad_norm": 2.4233803749084473, - "learning_rate": 4.726421523850662e-06, - "loss": 0.7552, - "step": 246 - }, - { - "epoch": 1.5153374233128836, - "grad_norm": 2.357003688812256, - "learning_rate": 4.7242257487519275e-06, - "loss": 0.4365, - "step": 247 - }, - { - "epoch": 1.521472392638037, - "grad_norm": 2.6406495571136475, - "learning_rate": 4.722021711309317e-06, - "loss": 0.6002, - "step": 248 - }, - { - "epoch": 1.5276073619631902, - "grad_norm": 2.736884832382202, - "learning_rate": 4.7198094197101826e-06, - "loss": 0.4993, - "step": 249 - }, - { - "epoch": 1.5337423312883436, - "grad_norm": 3.5238845348358154, - "learning_rate": 4.7175888821725335e-06, - "loss": 0.4637, - "step": 250 - }, - { - "epoch": 1.539877300613497, - "grad_norm": 3.3783695697784424, - "learning_rate": 4.715360106945015e-06, - "loss": 0.9711, - "step": 251 - }, - { - "epoch": 1.5460122699386503, - "grad_norm": 2.9685862064361572, - "learning_rate": 4.713123102306869e-06, - "loss": 0.5452, - "step": 252 - }, - { - "epoch": 1.5521472392638036, - "grad_norm": 3.143733263015747, - "learning_rate": 4.710877876567912e-06, - "loss": 0.5034, - "step": 253 - }, - { - "epoch": 1.558282208588957, - "grad_norm": 2.8005623817443848, - "learning_rate": 4.708624438068494e-06, - "loss": 0.4236, - "step": 254 - }, - { - "epoch": 1.5644171779141103, - "grad_norm": 2.66581130027771, - "learning_rate": 4.706362795179476e-06, - "loss": 0.6095, - "step": 255 - }, - { - "epoch": 1.5705521472392638, - "grad_norm": 4.598043441772461, - "learning_rate": 4.7040929563021975e-06, - "loss": 0.738, - "step": 256 - }, - { - "epoch": 1.5766871165644172, - "grad_norm": 3.5643506050109863, - "learning_rate": 4.70181492986844e-06, - "loss": 0.6726, - "step": 257 - }, - { - "epoch": 1.5828220858895705, - "grad_norm": 2.865339994430542, - "learning_rate": 4.699528724340401e-06, - "loss": 0.4862, - "step": 258 - }, - { - "epoch": 1.588957055214724, - "grad_norm": 2.95529842376709, - "learning_rate": 4.6972343482106615e-06, - "loss": 0.5003, - "step": 259 - }, - { - "epoch": 1.5950920245398774, - "grad_norm": 2.45206356048584, - "learning_rate": 4.6949318100021546e-06, - "loss": 0.6734, - "step": 260 - }, - { - "epoch": 1.6012269938650308, - "grad_norm": 2.6789939403533936, - "learning_rate": 4.6926211182681295e-06, - "loss": 0.5639, - "step": 261 - }, - { - "epoch": 1.607361963190184, - "grad_norm": 3.307732582092285, - "learning_rate": 4.690302281592128e-06, - "loss": 0.7032, - "step": 262 - }, - { - "epoch": 1.6134969325153374, - "grad_norm": 2.8950445652008057, - "learning_rate": 4.687975308587944e-06, - "loss": 0.4937, - "step": 263 - }, - { - "epoch": 1.6196319018404908, - "grad_norm": 2.969377040863037, - "learning_rate": 4.685640207899598e-06, - "loss": 0.5829, - "step": 264 - }, - { - "epoch": 1.6257668711656441, - "grad_norm": 3.106433391571045, - "learning_rate": 4.683296988201301e-06, - "loss": 0.3805, - "step": 265 - }, - { - "epoch": 1.6319018404907975, - "grad_norm": 3.5599050521850586, - "learning_rate": 4.680945658197425e-06, - "loss": 0.7939, - "step": 266 - }, - { - "epoch": 1.6380368098159508, - "grad_norm": 5.008603096008301, - "learning_rate": 4.6785862266224695e-06, - "loss": 0.7511, - "step": 267 - }, - { - "epoch": 1.6441717791411041, - "grad_norm": 3.1393773555755615, - "learning_rate": 4.676218702241026e-06, - "loss": 0.8984, - "step": 268 - }, - { - "epoch": 1.6503067484662577, - "grad_norm": 3.0241408348083496, - "learning_rate": 4.673843093847753e-06, - "loss": 0.5473, - "step": 269 - }, - { - "epoch": 1.656441717791411, - "grad_norm": 2.9029417037963867, - "learning_rate": 4.6714594102673355e-06, - "loss": 0.6626, - "step": 270 - }, - { - "epoch": 1.6625766871165644, - "grad_norm": 3.4709246158599854, - "learning_rate": 4.669067660354456e-06, - "loss": 0.5015, - "step": 271 - }, - { - "epoch": 1.668711656441718, - "grad_norm": 2.988635778427124, - "learning_rate": 4.666667852993761e-06, - "loss": 0.5384, - "step": 272 - }, - { - "epoch": 1.6748466257668713, - "grad_norm": 3.418140411376953, - "learning_rate": 4.664259997099829e-06, - "loss": 0.7491, - "step": 273 - }, - { - "epoch": 1.6809815950920246, - "grad_norm": 2.592416763305664, - "learning_rate": 4.661844101617135e-06, - "loss": 0.6451, - "step": 274 - }, - { - "epoch": 1.687116564417178, - "grad_norm": 3.1174306869506836, - "learning_rate": 4.6594201755200205e-06, - "loss": 0.6299, - "step": 275 - }, - { - "epoch": 1.6932515337423313, - "grad_norm": 2.6569998264312744, - "learning_rate": 4.656988227812658e-06, - "loss": 0.4477, - "step": 276 - }, - { - "epoch": 1.6993865030674846, - "grad_norm": 3.5733959674835205, - "learning_rate": 4.654548267529015e-06, - "loss": 0.5473, - "step": 277 - }, - { - "epoch": 1.705521472392638, - "grad_norm": 2.7240824699401855, - "learning_rate": 4.652100303732827e-06, - "loss": 0.496, - "step": 278 - }, - { - "epoch": 1.7116564417177913, - "grad_norm": 4.1965460777282715, - "learning_rate": 4.64964434551756e-06, - "loss": 0.932, - "step": 279 - }, - { - "epoch": 1.7177914110429446, - "grad_norm": 2.3237173557281494, - "learning_rate": 4.647180402006372e-06, - "loss": 0.4648, - "step": 280 - }, - { - "epoch": 1.7239263803680982, - "grad_norm": 3.395045042037964, - "learning_rate": 4.644708482352093e-06, - "loss": 0.7237, - "step": 281 - }, - { - "epoch": 1.7300613496932515, - "grad_norm": 3.238593816757202, - "learning_rate": 4.6422285957371735e-06, - "loss": 0.5531, - "step": 282 - }, - { - "epoch": 1.7361963190184049, - "grad_norm": 3.9651403427124023, - "learning_rate": 4.639740751373663e-06, - "loss": 0.6706, - "step": 283 - }, - { - "epoch": 1.7423312883435584, - "grad_norm": 3.0042061805725098, - "learning_rate": 4.63724495850317e-06, - "loss": 0.56, - "step": 284 - }, - { - "epoch": 1.7484662576687118, - "grad_norm": 3.094310760498047, - "learning_rate": 4.634741226396832e-06, - "loss": 0.6138, - "step": 285 - }, - { - "epoch": 1.7546012269938651, - "grad_norm": 2.838168144226074, - "learning_rate": 4.632229564355275e-06, - "loss": 0.4908, - "step": 286 - }, - { - "epoch": 1.7607361963190185, - "grad_norm": 3.3452796936035156, - "learning_rate": 4.629709981708586e-06, - "loss": 0.8181, - "step": 287 - }, - { - "epoch": 1.7668711656441718, - "grad_norm": 2.6630783081054688, - "learning_rate": 4.6271824878162704e-06, - "loss": 0.5625, - "step": 288 - }, - { - "epoch": 1.7730061349693251, - "grad_norm": 2.583650588989258, - "learning_rate": 4.624647092067226e-06, - "loss": 0.3416, - "step": 289 - }, - { - "epoch": 1.7791411042944785, - "grad_norm": 2.73132586479187, - "learning_rate": 4.622103803879702e-06, - "loss": 0.3889, - "step": 290 - }, - { - "epoch": 1.7852760736196318, - "grad_norm": 4.1010260581970215, - "learning_rate": 4.619552632701263e-06, - "loss": 0.611, - "step": 291 - }, - { - "epoch": 1.7914110429447851, - "grad_norm": 4.53068208694458, - "learning_rate": 4.61699358800876e-06, - "loss": 0.7219, - "step": 292 - }, - { - "epoch": 1.7975460122699385, - "grad_norm": 3.4877254962921143, - "learning_rate": 4.614426679308291e-06, - "loss": 0.6402, - "step": 293 - }, - { - "epoch": 1.803680981595092, - "grad_norm": 2.9445226192474365, - "learning_rate": 4.611851916135166e-06, - "loss": 0.509, - "step": 294 - }, - { - "epoch": 1.8098159509202454, - "grad_norm": 2.6622228622436523, - "learning_rate": 4.609269308053872e-06, - "loss": 0.6167, - "step": 295 - }, - { - "epoch": 1.8159509202453987, - "grad_norm": 3.131530523300171, - "learning_rate": 4.606678864658039e-06, - "loss": 0.8039, - "step": 296 - }, - { - "epoch": 1.8220858895705523, - "grad_norm": 3.212188482284546, - "learning_rate": 4.604080595570399e-06, - "loss": 0.5754, - "step": 297 - }, - { - "epoch": 1.8282208588957056, - "grad_norm": 3.522850275039673, - "learning_rate": 4.601474510442759e-06, - "loss": 0.4432, - "step": 298 - }, - { - "epoch": 1.834355828220859, - "grad_norm": 2.5877151489257812, - "learning_rate": 4.598860618955957e-06, - "loss": 0.6541, - "step": 299 - }, - { - "epoch": 1.8404907975460123, - "grad_norm": 2.803833484649658, - "learning_rate": 4.596238930819832e-06, - "loss": 0.5824, - "step": 300 - }, - { - "epoch": 1.8466257668711656, - "grad_norm": 2.7125494480133057, - "learning_rate": 4.5936094557731815e-06, - "loss": 0.6976, - "step": 301 - }, - { - "epoch": 1.852760736196319, - "grad_norm": 3.6549370288848877, - "learning_rate": 4.590972203583732e-06, - "loss": 0.7105, - "step": 302 - }, - { - "epoch": 1.8588957055214723, - "grad_norm": 3.3241944313049316, - "learning_rate": 4.588327184048099e-06, - "loss": 0.7446, - "step": 303 - }, - { - "epoch": 1.8650306748466257, - "grad_norm": 2.8388822078704834, - "learning_rate": 4.585674406991752e-06, - "loss": 0.4926, - "step": 304 - }, - { - "epoch": 1.871165644171779, - "grad_norm": 2.9760420322418213, - "learning_rate": 4.5830138822689755e-06, - "loss": 0.7368, - "step": 305 - }, - { - "epoch": 1.8773006134969326, - "grad_norm": 2.5437633991241455, - "learning_rate": 4.5803456197628374e-06, - "loss": 0.4678, - "step": 306 - }, - { - "epoch": 1.883435582822086, - "grad_norm": 3.0044775009155273, - "learning_rate": 4.577669629385145e-06, - "loss": 0.4241, - "step": 307 - }, - { - "epoch": 1.8895705521472392, - "grad_norm": 2.6150901317596436, - "learning_rate": 4.574985921076418e-06, - "loss": 0.5327, - "step": 308 - }, - { - "epoch": 1.8957055214723928, - "grad_norm": 2.4425182342529297, - "learning_rate": 4.572294504805841e-06, - "loss": 0.7504, - "step": 309 - }, - { - "epoch": 1.9018404907975461, - "grad_norm": 2.9920194149017334, - "learning_rate": 4.569595390571232e-06, - "loss": 0.5194, - "step": 310 - }, - { - "epoch": 1.9079754601226995, - "grad_norm": 2.701087713241577, - "learning_rate": 4.566888588399007e-06, - "loss": 0.6862, - "step": 311 - }, - { - "epoch": 1.9141104294478528, - "grad_norm": 7.628893852233887, - "learning_rate": 4.564174108344139e-06, - "loss": 0.6867, - "step": 312 - }, - { - "epoch": 1.9202453987730062, - "grad_norm": 2.712947130203247, - "learning_rate": 4.561451960490123e-06, - "loss": 0.6942, - "step": 313 - }, - { - "epoch": 1.9263803680981595, - "grad_norm": 3.0063202381134033, - "learning_rate": 4.558722154948937e-06, - "loss": 0.6346, - "step": 314 - }, - { - "epoch": 1.9325153374233128, - "grad_norm": 2.957218647003174, - "learning_rate": 4.5559847018610034e-06, - "loss": 0.464, - "step": 315 - }, - { - "epoch": 1.9386503067484662, - "grad_norm": 3.322282552719116, - "learning_rate": 4.553239611395156e-06, - "loss": 0.6334, - "step": 316 - }, - { - "epoch": 1.9447852760736195, - "grad_norm": 3.0638647079467773, - "learning_rate": 4.550486893748596e-06, - "loss": 0.4227, - "step": 317 - }, - { - "epoch": 1.9509202453987728, - "grad_norm": 3.079087257385254, - "learning_rate": 4.547726559146862e-06, - "loss": 0.3719, - "step": 318 - }, - { - "epoch": 1.9570552147239264, - "grad_norm": 2.409914255142212, - "learning_rate": 4.544958617843782e-06, - "loss": 0.3331, - "step": 319 - }, - { - "epoch": 1.9631901840490797, - "grad_norm": 3.3441262245178223, - "learning_rate": 4.542183080121444e-06, - "loss": 0.6931, - "step": 320 - }, - { - "epoch": 1.969325153374233, - "grad_norm": 2.6624436378479004, - "learning_rate": 4.539399956290152e-06, - "loss": 0.6578, - "step": 321 - }, - { - "epoch": 1.9754601226993866, - "grad_norm": 3.463789224624634, - "learning_rate": 4.536609256688396e-06, - "loss": 0.5748, - "step": 322 - }, - { - "epoch": 1.98159509202454, - "grad_norm": 3.6827807426452637, - "learning_rate": 4.533810991682799e-06, - "loss": 0.5249, - "step": 323 - }, - { - "epoch": 1.9877300613496933, - "grad_norm": 4.125547409057617, - "learning_rate": 4.531005171668093e-06, - "loss": 0.3065, - "step": 324 - }, - { - "epoch": 1.9938650306748467, - "grad_norm": 2.935978412628174, - "learning_rate": 4.528191807067074e-06, - "loss": 0.5523, - "step": 325 - }, - { - "epoch": 2.0, - "grad_norm": 2.654388427734375, - "learning_rate": 4.525370908330564e-06, - "loss": 0.4157, - "step": 326 - }, - { - "epoch": 2.0061349693251533, - "grad_norm": 3.213925838470459, - "learning_rate": 4.522542485937369e-06, - "loss": 0.4243, - "step": 327 - }, - { - "epoch": 2.0122699386503067, - "grad_norm": 3.5483286380767822, - "learning_rate": 4.519706550394248e-06, - "loss": 0.4137, - "step": 328 - }, - { - "epoch": 2.01840490797546, - "grad_norm": 3.32084059715271, - "learning_rate": 4.516863112235864e-06, - "loss": 0.5389, - "step": 329 - }, - { - "epoch": 2.0245398773006134, - "grad_norm": 3.427666425704956, - "learning_rate": 4.514012182024756e-06, - "loss": 0.285, - "step": 330 - }, - { - "epoch": 2.0306748466257667, - "grad_norm": 3.3269975185394287, - "learning_rate": 4.511153770351288e-06, - "loss": 0.4877, - "step": 331 - }, - { - "epoch": 2.03680981595092, - "grad_norm": 5.258850574493408, - "learning_rate": 4.508287887833619e-06, - "loss": 0.5168, - "step": 332 - }, - { - "epoch": 2.042944785276074, - "grad_norm": 4.316092491149902, - "learning_rate": 4.505414545117658e-06, - "loss": 0.4791, - "step": 333 - }, - { - "epoch": 2.049079754601227, - "grad_norm": 3.952056884765625, - "learning_rate": 4.502533752877028e-06, - "loss": 0.3014, - "step": 334 - }, - { - "epoch": 2.0552147239263805, - "grad_norm": 4.0617194175720215, - "learning_rate": 4.499645521813024e-06, - "loss": 0.4313, - "step": 335 - }, - { - "epoch": 2.061349693251534, - "grad_norm": 3.7869274616241455, - "learning_rate": 4.496749862654574e-06, - "loss": 0.4807, - "step": 336 - }, - { - "epoch": 2.067484662576687, - "grad_norm": 3.8181991577148438, - "learning_rate": 4.4938467861582e-06, - "loss": 0.4002, - "step": 337 - }, - { - "epoch": 2.0736196319018405, - "grad_norm": 3.8289854526519775, - "learning_rate": 4.490936303107975e-06, - "loss": 0.618, - "step": 338 - }, - { - "epoch": 2.079754601226994, - "grad_norm": 3.121443271636963, - "learning_rate": 4.488018424315488e-06, - "loss": 0.4203, - "step": 339 - }, - { - "epoch": 2.085889570552147, - "grad_norm": 3.141782283782959, - "learning_rate": 4.4850931606198e-06, - "loss": 0.3618, - "step": 340 - }, - { - "epoch": 2.0920245398773005, - "grad_norm": 3.1279287338256836, - "learning_rate": 4.482160522887404e-06, - "loss": 0.4571, - "step": 341 - }, - { - "epoch": 2.098159509202454, - "grad_norm": 3.2418482303619385, - "learning_rate": 4.479220522012185e-06, - "loss": 0.2674, - "step": 342 - }, - { - "epoch": 2.104294478527607, - "grad_norm": 10.230683326721191, - "learning_rate": 4.476273168915382e-06, - "loss": 0.5479, - "step": 343 - }, - { - "epoch": 2.1104294478527605, - "grad_norm": 3.588361978530884, - "learning_rate": 4.473318474545544e-06, - "loss": 0.3654, - "step": 344 - }, - { - "epoch": 2.116564417177914, - "grad_norm": 3.0913164615631104, - "learning_rate": 4.470356449878489e-06, - "loss": 0.2704, - "step": 345 - }, - { - "epoch": 2.1226993865030677, - "grad_norm": 3.972447633743286, - "learning_rate": 4.467387105917269e-06, - "loss": 0.3029, - "step": 346 - }, - { - "epoch": 2.128834355828221, - "grad_norm": 3.7174713611602783, - "learning_rate": 4.464410453692122e-06, - "loss": 0.6536, - "step": 347 - }, - { - "epoch": 2.1349693251533743, - "grad_norm": 3.9333994388580322, - "learning_rate": 4.461426504260434e-06, - "loss": 0.3806, - "step": 348 - }, - { - "epoch": 2.1411042944785277, - "grad_norm": 4.752816200256348, - "learning_rate": 4.458435268706699e-06, - "loss": 0.4019, - "step": 349 - }, - { - "epoch": 2.147239263803681, - "grad_norm": 2.505603790283203, - "learning_rate": 4.455436758142477e-06, - "loss": 0.2348, - "step": 350 - }, - { - "epoch": 2.1533742331288344, - "grad_norm": 3.3050570487976074, - "learning_rate": 4.452430983706351e-06, - "loss": 0.505, - "step": 351 - }, - { - "epoch": 2.1595092024539877, - "grad_norm": 5.387442588806152, - "learning_rate": 4.44941795656389e-06, - "loss": 0.399, - "step": 352 - }, - { - "epoch": 2.165644171779141, - "grad_norm": 3.4759480953216553, - "learning_rate": 4.446397687907601e-06, - "loss": 0.5664, - "step": 353 - }, - { - "epoch": 2.1717791411042944, - "grad_norm": 2.949445962905884, - "learning_rate": 4.4433701889568935e-06, - "loss": 0.2128, - "step": 354 - }, - { - "epoch": 2.1779141104294477, - "grad_norm": 3.2884252071380615, - "learning_rate": 4.440335470958035e-06, - "loss": 0.3138, - "step": 355 - }, - { - "epoch": 2.184049079754601, - "grad_norm": 3.1605632305145264, - "learning_rate": 4.437293545184111e-06, - "loss": 0.349, - "step": 356 - }, - { - "epoch": 2.190184049079755, - "grad_norm": 2.9996821880340576, - "learning_rate": 4.434244422934976e-06, - "loss": 0.343, - "step": 357 - }, - { - "epoch": 2.196319018404908, - "grad_norm": 3.6373324394226074, - "learning_rate": 4.431188115537226e-06, - "loss": 0.5656, - "step": 358 - }, - { - "epoch": 2.2024539877300615, - "grad_norm": 4.667621612548828, - "learning_rate": 4.428124634344141e-06, - "loss": 0.2335, - "step": 359 - }, - { - "epoch": 2.208588957055215, - "grad_norm": 3.815484046936035, - "learning_rate": 4.425053990735653e-06, - "loss": 0.2173, - "step": 360 - }, - { - "epoch": 2.214723926380368, - "grad_norm": 4.689478874206543, - "learning_rate": 4.421976196118297e-06, - "loss": 0.5071, - "step": 361 - }, - { - "epoch": 2.2208588957055215, - "grad_norm": 4.016942024230957, - "learning_rate": 4.4188912619251765e-06, - "loss": 0.384, - "step": 362 - }, - { - "epoch": 2.226993865030675, - "grad_norm": 3.5336828231811523, - "learning_rate": 4.415799199615912e-06, - "loss": 0.3133, - "step": 363 - }, - { - "epoch": 2.233128834355828, - "grad_norm": 2.9195592403411865, - "learning_rate": 4.4127000206766055e-06, - "loss": 0.3847, - "step": 364 - }, - { - "epoch": 2.2392638036809815, - "grad_norm": 2.6843531131744385, - "learning_rate": 4.409593736619795e-06, - "loss": 0.3539, - "step": 365 - }, - { - "epoch": 2.245398773006135, - "grad_norm": 2.8692703247070312, - "learning_rate": 4.40648035898441e-06, - "loss": 0.3664, - "step": 366 - }, - { - "epoch": 2.2515337423312882, - "grad_norm": 2.820422649383545, - "learning_rate": 4.403359899335732e-06, - "loss": 0.4606, - "step": 367 - }, - { - "epoch": 2.2576687116564416, - "grad_norm": 3.8641669750213623, - "learning_rate": 4.400232369265351e-06, - "loss": 0.2931, - "step": 368 - }, - { - "epoch": 2.263803680981595, - "grad_norm": 2.75347638130188, - "learning_rate": 4.39709778039112e-06, - "loss": 0.3393, - "step": 369 - }, - { - "epoch": 2.2699386503067487, - "grad_norm": 15.150428771972656, - "learning_rate": 4.393956144357113e-06, - "loss": 0.65, - "step": 370 - }, - { - "epoch": 2.276073619631902, - "grad_norm": 2.4876065254211426, - "learning_rate": 4.390807472833585e-06, - "loss": 0.372, - "step": 371 - }, - { - "epoch": 2.2822085889570554, - "grad_norm": 2.7328054904937744, - "learning_rate": 4.3876517775169216e-06, - "loss": 0.2802, - "step": 372 - }, - { - "epoch": 2.2883435582822087, - "grad_norm": 2.903221368789673, - "learning_rate": 4.384489070129604e-06, - "loss": 0.1964, - "step": 373 - }, - { - "epoch": 2.294478527607362, - "grad_norm": 3.9368724822998047, - "learning_rate": 4.381319362420158e-06, - "loss": 0.4272, - "step": 374 - }, - { - "epoch": 2.3006134969325154, - "grad_norm": 5.431981086730957, - "learning_rate": 4.378142666163114e-06, - "loss": 0.4513, - "step": 375 - }, - { - "epoch": 2.3067484662576687, - "grad_norm": 3.661733627319336, - "learning_rate": 4.374958993158965e-06, - "loss": 0.6087, - "step": 376 - }, - { - "epoch": 2.312883435582822, - "grad_norm": 3.004450559616089, - "learning_rate": 4.371768355234116e-06, - "loss": 0.2206, - "step": 377 - }, - { - "epoch": 2.3190184049079754, - "grad_norm": 4.3785576820373535, - "learning_rate": 4.368570764240852e-06, - "loss": 0.6055, - "step": 378 - }, - { - "epoch": 2.3251533742331287, - "grad_norm": 3.4699394702911377, - "learning_rate": 4.365366232057279e-06, - "loss": 0.6286, - "step": 379 - }, - { - "epoch": 2.331288343558282, - "grad_norm": 2.6862998008728027, - "learning_rate": 4.3621547705872915e-06, - "loss": 0.2622, - "step": 380 - }, - { - "epoch": 2.3374233128834354, - "grad_norm": 3.056382179260254, - "learning_rate": 4.358936391760524e-06, - "loss": 0.3439, - "step": 381 - }, - { - "epoch": 2.3435582822085887, - "grad_norm": 2.6211307048797607, - "learning_rate": 4.355711107532305e-06, - "loss": 0.3677, - "step": 382 - }, - { - "epoch": 2.3496932515337425, - "grad_norm": 2.682060956954956, - "learning_rate": 4.3524789298836175e-06, - "loss": 0.3068, - "step": 383 - }, - { - "epoch": 2.355828220858896, - "grad_norm": 3.482539415359497, - "learning_rate": 4.349239870821049e-06, - "loss": 0.3737, - "step": 384 - }, - { - "epoch": 2.361963190184049, - "grad_norm": 2.8645472526550293, - "learning_rate": 4.345993942376752e-06, - "loss": 0.2837, - "step": 385 - }, - { - "epoch": 2.3680981595092025, - "grad_norm": 3.6142354011535645, - "learning_rate": 4.342741156608392e-06, - "loss": 0.6162, - "step": 386 - }, - { - "epoch": 2.374233128834356, - "grad_norm": 3.0748162269592285, - "learning_rate": 4.3394815255991135e-06, - "loss": 0.2986, - "step": 387 - }, - { - "epoch": 2.3803680981595092, - "grad_norm": 5.090906620025635, - "learning_rate": 4.336215061457485e-06, - "loss": 0.5383, - "step": 388 - }, - { - "epoch": 2.3865030674846626, - "grad_norm": 3.9235823154449463, - "learning_rate": 4.332941776317458e-06, - "loss": 0.4179, - "step": 389 - }, - { - "epoch": 2.392638036809816, - "grad_norm": 3.482926368713379, - "learning_rate": 4.329661682338325e-06, - "loss": 0.3938, - "step": 390 - }, - { - "epoch": 2.3987730061349692, - "grad_norm": 4.274583339691162, - "learning_rate": 4.32637479170467e-06, - "loss": 0.3349, - "step": 391 - }, - { - "epoch": 2.4049079754601226, - "grad_norm": 3.326012372970581, - "learning_rate": 4.323081116626322e-06, - "loss": 0.3336, - "step": 392 - }, - { - "epoch": 2.411042944785276, - "grad_norm": 3.174591541290283, - "learning_rate": 4.319780669338316e-06, - "loss": 0.2983, - "step": 393 - }, - { - "epoch": 2.4171779141104293, - "grad_norm": 3.9073634147644043, - "learning_rate": 4.31647346210084e-06, - "loss": 0.8401, - "step": 394 - }, - { - "epoch": 2.4233128834355826, - "grad_norm": 3.4787721633911133, - "learning_rate": 4.313159507199197e-06, - "loss": 0.2583, - "step": 395 - }, - { - "epoch": 2.4294478527607364, - "grad_norm": 3.19903564453125, - "learning_rate": 4.309838816943755e-06, - "loss": 0.2861, - "step": 396 - }, - { - "epoch": 2.4355828220858897, - "grad_norm": 3.184246778488159, - "learning_rate": 4.306511403669897e-06, - "loss": 0.2956, - "step": 397 - }, - { - "epoch": 2.441717791411043, - "grad_norm": 3.8991878032684326, - "learning_rate": 4.303177279737988e-06, - "loss": 0.5378, - "step": 398 - }, - { - "epoch": 2.4478527607361964, - "grad_norm": 3.411949872970581, - "learning_rate": 4.299836457533313e-06, - "loss": 0.3423, - "step": 399 - }, - { - "epoch": 2.4539877300613497, - "grad_norm": 3.445502996444702, - "learning_rate": 4.296488949466046e-06, - "loss": 0.5608, - "step": 400 - }, - { - "epoch": 2.460122699386503, - "grad_norm": 3.066798210144043, - "learning_rate": 4.293134767971193e-06, - "loss": 0.3214, - "step": 401 - }, - { - "epoch": 2.4662576687116564, - "grad_norm": 3.0581583976745605, - "learning_rate": 4.28977392550855e-06, - "loss": 0.5117, - "step": 402 - }, - { - "epoch": 2.4723926380368098, - "grad_norm": 4.207413673400879, - "learning_rate": 4.286406434562659e-06, - "loss": 0.2666, - "step": 403 - }, - { - "epoch": 2.478527607361963, - "grad_norm": 2.9934990406036377, - "learning_rate": 4.283032307642756e-06, - "loss": 0.2878, - "step": 404 - }, - { - "epoch": 2.4846625766871164, - "grad_norm": 3.800593614578247, - "learning_rate": 4.2796515572827305e-06, - "loss": 0.2619, - "step": 405 - }, - { - "epoch": 2.4907975460122698, - "grad_norm": 3.2029523849487305, - "learning_rate": 4.276264196041074e-06, - "loss": 0.1735, - "step": 406 - }, - { - "epoch": 2.4969325153374236, - "grad_norm": 3.515634059906006, - "learning_rate": 4.2728702365008356e-06, - "loss": 0.4741, - "step": 407 - }, - { - "epoch": 2.5030674846625764, - "grad_norm": 3.8354873657226562, - "learning_rate": 4.269469691269577e-06, - "loss": 0.3713, - "step": 408 - }, - { - "epoch": 2.5092024539877302, - "grad_norm": 3.902904510498047, - "learning_rate": 4.266062572979323e-06, - "loss": 0.5189, - "step": 409 - }, - { - "epoch": 2.5153374233128836, - "grad_norm": 3.3276097774505615, - "learning_rate": 4.262648894286515e-06, - "loss": 0.2461, - "step": 410 - }, - { - "epoch": 2.521472392638037, - "grad_norm": 2.9457011222839355, - "learning_rate": 4.259228667871963e-06, - "loss": 0.3013, - "step": 411 - }, - { - "epoch": 2.5276073619631902, - "grad_norm": 2.8941617012023926, - "learning_rate": 4.255801906440803e-06, - "loss": 0.2784, - "step": 412 - }, - { - "epoch": 2.5337423312883436, - "grad_norm": 2.949399471282959, - "learning_rate": 4.252368622722443e-06, - "loss": 0.457, - "step": 413 - }, - { - "epoch": 2.539877300613497, - "grad_norm": 3.342108726501465, - "learning_rate": 4.248928829470522e-06, - "loss": 0.487, - "step": 414 - }, - { - "epoch": 2.5460122699386503, - "grad_norm": 3.9556386470794678, - "learning_rate": 4.245482539462861e-06, - "loss": 0.6118, - "step": 415 - }, - { - "epoch": 2.5521472392638036, - "grad_norm": 3.6936280727386475, - "learning_rate": 4.242029765501411e-06, - "loss": 0.6131, - "step": 416 - }, - { - "epoch": 2.558282208588957, - "grad_norm": 2.79897403717041, - "learning_rate": 4.2385705204122104e-06, - "loss": 0.4209, - "step": 417 - }, - { - "epoch": 2.5644171779141103, - "grad_norm": 4.093318462371826, - "learning_rate": 4.235104817045338e-06, - "loss": 0.5375, - "step": 418 - }, - { - "epoch": 2.5705521472392636, - "grad_norm": 3.138263463973999, - "learning_rate": 4.231632668274861e-06, - "loss": 0.4682, - "step": 419 - }, - { - "epoch": 2.5766871165644174, - "grad_norm": 3.1465651988983154, - "learning_rate": 4.22815408699879e-06, - "loss": 0.2522, - "step": 420 - }, - { - "epoch": 2.5828220858895703, - "grad_norm": 3.5166101455688477, - "learning_rate": 4.22466908613903e-06, - "loss": 0.4776, - "step": 421 - }, - { - "epoch": 2.588957055214724, - "grad_norm": 2.8498189449310303, - "learning_rate": 4.221177678641333e-06, - "loss": 0.3067, - "step": 422 - }, - { - "epoch": 2.5950920245398774, - "grad_norm": 2.8046035766601562, - "learning_rate": 4.217679877475251e-06, - "loss": 0.2402, - "step": 423 - }, - { - "epoch": 2.6012269938650308, - "grad_norm": 4.204788684844971, - "learning_rate": 4.214175695634084e-06, - "loss": 0.2608, - "step": 424 - }, - { - "epoch": 2.607361963190184, - "grad_norm": 2.5569400787353516, - "learning_rate": 4.210665146134838e-06, - "loss": 0.2801, - "step": 425 - }, - { - "epoch": 2.6134969325153374, - "grad_norm": 3.5359091758728027, - "learning_rate": 4.20714824201817e-06, - "loss": 0.2027, - "step": 426 - }, - { - "epoch": 2.6196319018404908, - "grad_norm": 3.5132668018341064, - "learning_rate": 4.203624996348343e-06, - "loss": 0.4253, - "step": 427 - }, - { - "epoch": 2.625766871165644, - "grad_norm": 3.5076472759246826, - "learning_rate": 4.200095422213177e-06, - "loss": 0.3014, - "step": 428 - }, - { - "epoch": 2.6319018404907975, - "grad_norm": 3.6501238346099854, - "learning_rate": 4.196559532724004e-06, - "loss": 0.6526, - "step": 429 - }, - { - "epoch": 2.638036809815951, - "grad_norm": 2.849924325942993, - "learning_rate": 4.193017341015608e-06, - "loss": 0.4487, - "step": 430 - }, - { - "epoch": 2.644171779141104, - "grad_norm": 3.2228448390960693, - "learning_rate": 4.189468860246192e-06, - "loss": 0.5386, - "step": 431 - }, - { - "epoch": 2.6503067484662575, - "grad_norm": 2.532102108001709, - "learning_rate": 4.185914103597316e-06, - "loss": 0.3034, - "step": 432 - }, - { - "epoch": 2.6564417177914113, - "grad_norm": 2.862720251083374, - "learning_rate": 4.182353084273855e-06, - "loss": 0.5862, - "step": 433 - }, - { - "epoch": 2.662576687116564, - "grad_norm": 3.4617464542388916, - "learning_rate": 4.178785815503946e-06, - "loss": 0.3954, - "step": 434 - }, - { - "epoch": 2.668711656441718, - "grad_norm": 2.627758741378784, - "learning_rate": 4.1752123105389444e-06, - "loss": 0.4367, - "step": 435 - }, - { - "epoch": 2.6748466257668713, - "grad_norm": 3.2868380546569824, - "learning_rate": 4.171632582653368e-06, - "loss": 0.2997, - "step": 436 - }, - { - "epoch": 2.6809815950920246, - "grad_norm": 3.4260897636413574, - "learning_rate": 4.168046645144851e-06, - "loss": 0.3354, - "step": 437 - }, - { - "epoch": 2.687116564417178, - "grad_norm": 3.1415748596191406, - "learning_rate": 4.164454511334098e-06, - "loss": 0.5538, - "step": 438 - }, - { - "epoch": 2.6932515337423313, - "grad_norm": 3.3700919151306152, - "learning_rate": 4.160856194564828e-06, - "loss": 0.5731, - "step": 439 - }, - { - "epoch": 2.6993865030674846, - "grad_norm": 3.146968364715576, - "learning_rate": 4.157251708203728e-06, - "loss": 0.4429, - "step": 440 - }, - { - "epoch": 2.705521472392638, - "grad_norm": 3.7495830059051514, - "learning_rate": 4.153641065640402e-06, - "loss": 0.6361, - "step": 441 - }, - { - "epoch": 2.7116564417177913, - "grad_norm": 3.426499128341675, - "learning_rate": 4.150024280287327e-06, - "loss": 0.2418, - "step": 442 - }, - { - "epoch": 2.7177914110429446, - "grad_norm": 3.213719606399536, - "learning_rate": 4.146401365579795e-06, - "loss": 0.2549, - "step": 443 - }, - { - "epoch": 2.7239263803680984, - "grad_norm": 3.457742929458618, - "learning_rate": 4.142772334975868e-06, - "loss": 0.3822, - "step": 444 - }, - { - "epoch": 2.7300613496932513, - "grad_norm": 3.130410671234131, - "learning_rate": 4.139137201956324e-06, - "loss": 0.3107, - "step": 445 - }, - { - "epoch": 2.736196319018405, - "grad_norm": 2.7337112426757812, - "learning_rate": 4.1354959800246155e-06, - "loss": 0.2829, - "step": 446 - }, - { - "epoch": 2.7423312883435584, - "grad_norm": 3.427006483078003, - "learning_rate": 4.131848682706807e-06, - "loss": 0.3045, - "step": 447 - }, - { - "epoch": 2.7484662576687118, - "grad_norm": 3.3742318153381348, - "learning_rate": 4.128195323551536e-06, - "loss": 0.316, - "step": 448 - }, - { - "epoch": 2.754601226993865, - "grad_norm": 3.086738109588623, - "learning_rate": 4.1245359161299555e-06, - "loss": 0.5278, - "step": 449 - }, - { - "epoch": 2.7607361963190185, - "grad_norm": 3.4609954357147217, - "learning_rate": 4.120870474035687e-06, - "loss": 0.447, - "step": 450 - }, - { - "epoch": 2.766871165644172, - "grad_norm": 3.552663803100586, - "learning_rate": 4.1171990108847705e-06, - "loss": 0.6127, - "step": 451 - }, - { - "epoch": 2.773006134969325, - "grad_norm": 4.413427352905273, - "learning_rate": 4.113521540315609e-06, - "loss": 0.3304, - "step": 452 - }, - { - "epoch": 2.7791411042944785, - "grad_norm": 3.3408143520355225, - "learning_rate": 4.109838075988922e-06, - "loss": 0.5871, - "step": 453 - }, - { - "epoch": 2.785276073619632, - "grad_norm": 3.0659773349761963, - "learning_rate": 4.106148631587697e-06, - "loss": 0.3578, - "step": 454 - }, - { - "epoch": 2.791411042944785, - "grad_norm": 3.2854816913604736, - "learning_rate": 4.102453220817134e-06, - "loss": 0.4685, - "step": 455 - }, - { - "epoch": 2.7975460122699385, - "grad_norm": 3.4940855503082275, - "learning_rate": 4.098751857404595e-06, - "loss": 0.2818, - "step": 456 - }, - { - "epoch": 2.8036809815950923, - "grad_norm": 2.4630730152130127, - "learning_rate": 4.0950445550995566e-06, - "loss": 0.3497, - "step": 457 - }, - { - "epoch": 2.809815950920245, - "grad_norm": 3.3870959281921387, - "learning_rate": 4.091331327673554e-06, - "loss": 0.4954, - "step": 458 - }, - { - "epoch": 2.815950920245399, - "grad_norm": 2.3676836490631104, - "learning_rate": 4.087612188920135e-06, - "loss": 0.3884, - "step": 459 - }, - { - "epoch": 2.8220858895705523, - "grad_norm": 3.2477807998657227, - "learning_rate": 4.083887152654804e-06, - "loss": 0.375, - "step": 460 - }, - { - "epoch": 2.8282208588957056, - "grad_norm": 3.295673131942749, - "learning_rate": 4.080156232714976e-06, - "loss": 0.3272, - "step": 461 - }, - { - "epoch": 2.834355828220859, - "grad_norm": 2.800847291946411, - "learning_rate": 4.07641944295992e-06, - "loss": 0.2936, - "step": 462 - }, - { - "epoch": 2.8404907975460123, - "grad_norm": 3.443336009979248, - "learning_rate": 4.072676797270708e-06, - "loss": 0.2363, - "step": 463 - }, - { - "epoch": 2.8466257668711656, - "grad_norm": 3.1334242820739746, - "learning_rate": 4.0689283095501684e-06, - "loss": 0.4827, - "step": 464 - }, - { - "epoch": 2.852760736196319, - "grad_norm": 3.950672149658203, - "learning_rate": 4.06517399372283e-06, - "loss": 0.3163, - "step": 465 - }, - { - "epoch": 2.8588957055214723, - "grad_norm": 4.243579387664795, - "learning_rate": 4.061413863734869e-06, - "loss": 0.2827, - "step": 466 - }, - { - "epoch": 2.8650306748466257, - "grad_norm": 4.076017379760742, - "learning_rate": 4.057647933554063e-06, - "loss": 0.3466, - "step": 467 - }, - { - "epoch": 2.871165644171779, - "grad_norm": 2.846989631652832, - "learning_rate": 4.053876217169734e-06, - "loss": 0.4632, - "step": 468 - }, - { - "epoch": 2.8773006134969323, - "grad_norm": 2.74981689453125, - "learning_rate": 4.050098728592698e-06, - "loss": 0.2001, - "step": 469 - }, - { - "epoch": 2.883435582822086, - "grad_norm": 3.062068462371826, - "learning_rate": 4.046315481855211e-06, - "loss": 0.5425, - "step": 470 - }, - { - "epoch": 2.889570552147239, - "grad_norm": 2.8630964756011963, - "learning_rate": 4.0425264910109245e-06, - "loss": 0.424, - "step": 471 - }, - { - "epoch": 2.895705521472393, - "grad_norm": 3.537442922592163, - "learning_rate": 4.03873177013482e-06, - "loss": 0.2443, - "step": 472 - }, - { - "epoch": 2.901840490797546, - "grad_norm": 3.128535270690918, - "learning_rate": 4.034931333323173e-06, - "loss": 0.3734, - "step": 473 - }, - { - "epoch": 2.9079754601226995, - "grad_norm": 3.021897792816162, - "learning_rate": 4.031125194693484e-06, - "loss": 0.3762, - "step": 474 - }, - { - "epoch": 2.914110429447853, - "grad_norm": 3.0943546295166016, - "learning_rate": 4.0273133683844375e-06, - "loss": 0.3721, - "step": 475 - }, - { - "epoch": 2.920245398773006, - "grad_norm": 3.443448305130005, - "learning_rate": 4.023495868555848e-06, - "loss": 0.2868, - "step": 476 - }, - { - "epoch": 2.9263803680981595, - "grad_norm": 2.865227222442627, - "learning_rate": 4.0196727093886024e-06, - "loss": 0.5086, - "step": 477 - }, - { - "epoch": 2.932515337423313, - "grad_norm": 3.1272058486938477, - "learning_rate": 4.015843905084612e-06, - "loss": 0.4616, - "step": 478 - }, - { - "epoch": 2.938650306748466, - "grad_norm": 3.0584447383880615, - "learning_rate": 4.012009469866756e-06, - "loss": 0.403, - "step": 479 - }, - { - "epoch": 2.9447852760736195, - "grad_norm": 4.42616081237793, - "learning_rate": 4.008169417978836e-06, - "loss": 0.5801, - "step": 480 - }, - { - "epoch": 2.950920245398773, - "grad_norm": 2.8444535732269287, - "learning_rate": 4.004323763685511e-06, - "loss": 0.5808, - "step": 481 - }, - { - "epoch": 2.957055214723926, - "grad_norm": 2.591719627380371, - "learning_rate": 4.0004725212722565e-06, - "loss": 0.2584, - "step": 482 - }, - { - "epoch": 2.96319018404908, - "grad_norm": 2.5496113300323486, - "learning_rate": 3.996615705045302e-06, - "loss": 0.462, - "step": 483 - }, - { - "epoch": 2.969325153374233, - "grad_norm": 2.9932925701141357, - "learning_rate": 3.992753329331588e-06, - "loss": 0.3502, - "step": 484 - }, - { - "epoch": 2.9754601226993866, - "grad_norm": 3.136871337890625, - "learning_rate": 3.9888854084786995e-06, - "loss": 0.5989, - "step": 485 - }, - { - "epoch": 2.98159509202454, - "grad_norm": 3.6654274463653564, - "learning_rate": 3.985011956854826e-06, - "loss": 0.6772, - "step": 486 - }, - { - "epoch": 2.9877300613496933, - "grad_norm": 2.5398948192596436, - "learning_rate": 3.9811329888487004e-06, - "loss": 0.4192, - "step": 487 - }, - { - "epoch": 2.9938650306748467, - "grad_norm": 4.89943790435791, - "learning_rate": 3.977248518869545e-06, - "loss": 0.4031, - "step": 488 - }, - { - "epoch": 3.0, - "grad_norm": 3.4729995727539062, - "learning_rate": 3.973358561347024e-06, - "loss": 0.7764, - "step": 489 - }, - { - "epoch": 3.0061349693251533, - "grad_norm": 5.331607818603516, - "learning_rate": 3.969463130731183e-06, - "loss": 0.3267, - "step": 490 - }, - { - "epoch": 3.0122699386503067, - "grad_norm": 3.453650712966919, - "learning_rate": 3.965562241492401e-06, - "loss": 0.2719, - "step": 491 - }, - { - "epoch": 3.01840490797546, - "grad_norm": 3.232313632965088, - "learning_rate": 3.9616559081213335e-06, - "loss": 0.1825, - "step": 492 - }, - { - "epoch": 3.0245398773006134, - "grad_norm": 3.4860260486602783, - "learning_rate": 3.957744145128858e-06, - "loss": 0.1854, - "step": 493 - }, - { - "epoch": 3.0306748466257667, - "grad_norm": 3.4357805252075195, - "learning_rate": 3.953826967046021e-06, - "loss": 0.2224, - "step": 494 - }, - { - "epoch": 3.03680981595092, - "grad_norm": 4.557503700256348, - "learning_rate": 3.9499043884239894e-06, - "loss": 0.349, - "step": 495 - }, - { - "epoch": 3.042944785276074, - "grad_norm": 4.685214042663574, - "learning_rate": 3.945976423833987e-06, - "loss": 0.175, - "step": 496 - }, - { - "epoch": 3.049079754601227, - "grad_norm": 3.7430171966552734, - "learning_rate": 3.942043087867244e-06, - "loss": 0.2773, - "step": 497 - }, - { - "epoch": 3.0552147239263805, - "grad_norm": 3.756450653076172, - "learning_rate": 3.938104395134947e-06, - "loss": 0.4445, - "step": 498 - }, - { - "epoch": 3.061349693251534, - "grad_norm": 4.049175262451172, - "learning_rate": 3.9341603602681805e-06, - "loss": 0.3046, - "step": 499 - }, - { - "epoch": 3.067484662576687, - "grad_norm": 3.7689461708068848, - "learning_rate": 3.930210997917871e-06, - "loss": 0.2544, - "step": 500 - }, - { - "epoch": 3.0736196319018405, - "grad_norm": 4.027602195739746, - "learning_rate": 3.92625632275474e-06, - "loss": 0.3154, - "step": 501 - }, - { - "epoch": 3.079754601226994, - "grad_norm": 2.8449292182922363, - "learning_rate": 3.922296349469239e-06, - "loss": 0.2804, - "step": 502 - }, - { - "epoch": 3.085889570552147, - "grad_norm": 2.9555234909057617, - "learning_rate": 3.918331092771505e-06, - "loss": 0.2393, - "step": 503 - }, - { - "epoch": 3.0920245398773005, - "grad_norm": 2.621042013168335, - "learning_rate": 3.914360567391296e-06, - "loss": 0.1403, - "step": 504 - }, - { - "epoch": 3.098159509202454, - "grad_norm": 3.2348620891571045, - "learning_rate": 3.910384788077949e-06, - "loss": 0.1537, - "step": 505 - }, - { - "epoch": 3.104294478527607, - "grad_norm": 3.030179977416992, - "learning_rate": 3.906403769600311e-06, - "loss": 0.2921, - "step": 506 - }, - { - "epoch": 3.1104294478527605, - "grad_norm": 3.146428346633911, - "learning_rate": 3.902417526746694e-06, - "loss": 0.2036, - "step": 507 - }, - { - "epoch": 3.116564417177914, - "grad_norm": 3.6201512813568115, - "learning_rate": 3.898426074324818e-06, - "loss": 0.2655, - "step": 508 - }, - { - "epoch": 3.1226993865030677, - "grad_norm": 3.7674012184143066, - "learning_rate": 3.8944294271617524e-06, - "loss": 0.3938, - "step": 509 - }, - { - "epoch": 3.128834355828221, - "grad_norm": 4.54722785949707, - "learning_rate": 3.890427600103865e-06, - "loss": 0.3051, - "step": 510 - }, - { - "epoch": 3.1349693251533743, - "grad_norm": 4.228236675262451, - "learning_rate": 3.886420608016767e-06, - "loss": 0.3719, - "step": 511 - }, - { - "epoch": 3.1411042944785277, - "grad_norm": 4.355110168457031, - "learning_rate": 3.882408465785252e-06, - "loss": 0.1863, - "step": 512 - }, - { - "epoch": 3.147239263803681, - "grad_norm": 3.451460838317871, - "learning_rate": 3.878391188313249e-06, - "loss": 0.1479, - "step": 513 - }, - { - "epoch": 3.1533742331288344, - "grad_norm": 4.395524501800537, - "learning_rate": 3.87436879052376e-06, - "loss": 0.238, - "step": 514 - }, - { - "epoch": 3.1595092024539877, - "grad_norm": 2.940717935562134, - "learning_rate": 3.870341287358809e-06, - "loss": 0.2069, - "step": 515 - }, - { - "epoch": 3.165644171779141, - "grad_norm": 2.5817320346832275, - "learning_rate": 3.8663086937793845e-06, - "loss": 0.1189, - "step": 516 - }, - { - "epoch": 3.1717791411042944, - "grad_norm": 3.9863343238830566, - "learning_rate": 3.862271024765385e-06, - "loss": 0.3434, - "step": 517 - }, - { - "epoch": 3.1779141104294477, - "grad_norm": 3.609004259109497, - "learning_rate": 3.8582282953155626e-06, - "loss": 0.1602, - "step": 518 - }, - { - "epoch": 3.184049079754601, - "grad_norm": 3.207533121109009, - "learning_rate": 3.854180520447465e-06, - "loss": 0.3452, - "step": 519 - }, - { - "epoch": 3.190184049079755, - "grad_norm": 3.593388795852661, - "learning_rate": 3.850127715197387e-06, - "loss": 0.2832, - "step": 520 - }, - { - "epoch": 3.196319018404908, - "grad_norm": 3.409064531326294, - "learning_rate": 3.846069894620306e-06, - "loss": 0.1481, - "step": 521 - }, - { - "epoch": 3.2024539877300615, - "grad_norm": 3.461498737335205, - "learning_rate": 3.84200707378983e-06, - "loss": 0.1283, - "step": 522 - }, - { - "epoch": 3.208588957055215, - "grad_norm": 3.708467483520508, - "learning_rate": 3.8379392677981434e-06, - "loss": 0.2468, - "step": 523 - }, - { - "epoch": 3.214723926380368, - "grad_norm": 2.802381753921509, - "learning_rate": 3.833866491755947e-06, - "loss": 0.2685, - "step": 524 - }, - { - "epoch": 3.2208588957055215, - "grad_norm": 3.0787744522094727, - "learning_rate": 3.8297887607924044e-06, - "loss": 0.2595, - "step": 525 - }, - { - "epoch": 3.226993865030675, - "grad_norm": 3.3952548503875732, - "learning_rate": 3.825706090055088e-06, - "loss": 0.4099, - "step": 526 - }, - { - "epoch": 3.233128834355828, - "grad_norm": 3.3497085571289062, - "learning_rate": 3.821618494709916e-06, - "loss": 0.287, - "step": 527 - }, - { - "epoch": 3.2392638036809815, - "grad_norm": 4.050611972808838, - "learning_rate": 3.817525989941102e-06, - "loss": 0.2369, - "step": 528 - }, - { - "epoch": 3.245398773006135, - "grad_norm": 2.87642240524292, - "learning_rate": 3.8134285909510972e-06, - "loss": 0.2751, - "step": 529 - }, - { - "epoch": 3.2515337423312882, - "grad_norm": 3.821941614151001, - "learning_rate": 3.8093263129605305e-06, - "loss": 0.2363, - "step": 530 - }, - { - "epoch": 3.2576687116564416, - "grad_norm": 2.8066117763519287, - "learning_rate": 3.80521917120816e-06, - "loss": 0.094, - "step": 531 - }, - { - "epoch": 3.263803680981595, - "grad_norm": 3.849768877029419, - "learning_rate": 3.801107180950806e-06, - "loss": 0.4117, - "step": 532 - }, - { - "epoch": 3.2699386503067487, - "grad_norm": 2.4161250591278076, - "learning_rate": 3.7969903574633028e-06, - "loss": 0.1183, - "step": 533 - }, - { - "epoch": 3.276073619631902, - "grad_norm": 3.6743111610412598, - "learning_rate": 3.792868716038437e-06, - "loss": 0.2296, - "step": 534 - }, - { - "epoch": 3.2822085889570554, - "grad_norm": 4.378123760223389, - "learning_rate": 3.7887422719868937e-06, - "loss": 0.2678, - "step": 535 - }, - { - "epoch": 3.2883435582822087, - "grad_norm": 4.816481590270996, - "learning_rate": 3.784611040637198e-06, - "loss": 0.4887, - "step": 536 - }, - { - "epoch": 3.294478527607362, - "grad_norm": 3.5712430477142334, - "learning_rate": 3.7804750373356576e-06, - "loss": 0.3827, - "step": 537 - }, - { - "epoch": 3.3006134969325154, - "grad_norm": 3.6877355575561523, - "learning_rate": 3.776334277446307e-06, - "loss": 0.3233, - "step": 538 - }, - { - "epoch": 3.3067484662576687, - "grad_norm": 3.442706346511841, - "learning_rate": 3.7721887763508512e-06, - "loss": 0.1256, - "step": 539 - }, - { - "epoch": 3.312883435582822, - "grad_norm": 3.9265615940093994, - "learning_rate": 3.7680385494486053e-06, - "loss": 0.3845, - "step": 540 - }, - { - "epoch": 3.3190184049079754, - "grad_norm": 3.5030126571655273, - "learning_rate": 3.7638836121564414e-06, - "loss": 0.2905, - "step": 541 - }, - { - "epoch": 3.3251533742331287, - "grad_norm": 3.6685378551483154, - "learning_rate": 3.7597239799087283e-06, - "loss": 0.3561, - "step": 542 - }, - { - "epoch": 3.331288343558282, - "grad_norm": 3.8484046459198, - "learning_rate": 3.7555596681572736e-06, - "loss": 0.1157, - "step": 543 - }, - { - "epoch": 3.3374233128834354, - "grad_norm": 3.7977402210235596, - "learning_rate": 3.751390692371272e-06, - "loss": 0.3049, - "step": 544 - }, - { - "epoch": 3.3435582822085887, - "grad_norm": 3.4409852027893066, - "learning_rate": 3.7472170680372398e-06, - "loss": 0.1626, - "step": 545 - }, - { - "epoch": 3.3496932515337425, - "grad_norm": 3.801541328430176, - "learning_rate": 3.7430388106589632e-06, - "loss": 0.2414, - "step": 546 - }, - { - "epoch": 3.355828220858896, - "grad_norm": 4.025203704833984, - "learning_rate": 3.738855935757438e-06, - "loss": 0.3441, - "step": 547 - }, - { - "epoch": 3.361963190184049, - "grad_norm": 4.242798805236816, - "learning_rate": 3.7346684588708135e-06, - "loss": 0.5244, - "step": 548 - }, - { - "epoch": 3.3680981595092025, - "grad_norm": 3.0516819953918457, - "learning_rate": 3.7304763955543332e-06, - "loss": 0.1984, - "step": 549 - }, - { - "epoch": 3.374233128834356, - "grad_norm": 3.894667625427246, - "learning_rate": 3.726279761380279e-06, - "loss": 0.2715, - "step": 550 - }, - { - "epoch": 3.3803680981595092, - "grad_norm": 3.171208143234253, - "learning_rate": 3.72207857193791e-06, - "loss": 0.1537, - "step": 551 - }, - { - "epoch": 3.3865030674846626, - "grad_norm": 4.344860553741455, - "learning_rate": 3.7178728428334092e-06, - "loss": 0.2388, - "step": 552 - }, - { - "epoch": 3.392638036809816, - "grad_norm": 2.766317367553711, - "learning_rate": 3.7136625896898226e-06, - "loss": 0.1726, - "step": 553 - }, - { - "epoch": 3.3987730061349692, - "grad_norm": 3.550662040710449, - "learning_rate": 3.7094478281470003e-06, - "loss": 0.2942, - "step": 554 - }, - { - "epoch": 3.4049079754601226, - "grad_norm": 3.4576945304870605, - "learning_rate": 3.7052285738615412e-06, - "loss": 0.1665, - "step": 555 - }, - { - "epoch": 3.411042944785276, - "grad_norm": 4.026793003082275, - "learning_rate": 3.7010048425067317e-06, - "loss": 0.3954, - "step": 556 - }, - { - "epoch": 3.4171779141104293, - "grad_norm": 4.600133419036865, - "learning_rate": 3.696776649772492e-06, - "loss": 0.3207, - "step": 557 - }, - { - "epoch": 3.4233128834355826, - "grad_norm": 4.747331142425537, - "learning_rate": 3.692544011365312e-06, - "loss": 0.1325, - "step": 558 - }, - { - "epoch": 3.4294478527607364, - "grad_norm": 3.781464099884033, - "learning_rate": 3.6883069430081986e-06, - "loss": 0.1644, - "step": 559 - }, - { - "epoch": 3.4355828220858897, - "grad_norm": 2.905986785888672, - "learning_rate": 3.6840654604406135e-06, - "loss": 0.2469, - "step": 560 - }, - { - "epoch": 3.441717791411043, - "grad_norm": 2.3747711181640625, - "learning_rate": 3.679819579418414e-06, - "loss": 0.1146, - "step": 561 - }, - { - "epoch": 3.4478527607361964, - "grad_norm": 3.2683632373809814, - "learning_rate": 3.6755693157137995e-06, - "loss": 0.3236, - "step": 562 - }, - { - "epoch": 3.4539877300613497, - "grad_norm": 3.7750496864318848, - "learning_rate": 3.6713146851152487e-06, - "loss": 0.399, - "step": 563 - }, - { - "epoch": 3.460122699386503, - "grad_norm": 3.3912384510040283, - "learning_rate": 3.667055703427461e-06, - "loss": 0.1259, - "step": 564 - }, - { - "epoch": 3.4662576687116564, - "grad_norm": 3.0224430561065674, - "learning_rate": 3.6627923864713e-06, - "loss": 0.1835, - "step": 565 - }, - { - "epoch": 3.4723926380368098, - "grad_norm": 3.642258405685425, - "learning_rate": 3.658524750083733e-06, - "loss": 0.2763, - "step": 566 - }, - { - "epoch": 3.478527607361963, - "grad_norm": 3.409890651702881, - "learning_rate": 3.654252810117773e-06, - "loss": 0.2496, - "step": 567 - }, - { - "epoch": 3.4846625766871164, - "grad_norm": 3.0416476726531982, - "learning_rate": 3.6499765824424195e-06, - "loss": 0.1287, - "step": 568 - }, - { - "epoch": 3.4907975460122698, - "grad_norm": 3.1963987350463867, - "learning_rate": 3.6456960829425987e-06, - "loss": 0.1747, - "step": 569 - }, - { - "epoch": 3.4969325153374236, - "grad_norm": 3.198448657989502, - "learning_rate": 3.641411327519107e-06, - "loss": 0.1913, - "step": 570 - }, - { - "epoch": 3.5030674846625764, - "grad_norm": 3.7023441791534424, - "learning_rate": 3.6371223320885492e-06, - "loss": 0.3224, - "step": 571 - }, - { - "epoch": 3.5092024539877302, - "grad_norm": 4.54288387298584, - "learning_rate": 3.6328291125832803e-06, - "loss": 0.2364, - "step": 572 - }, - { - "epoch": 3.5153374233128836, - "grad_norm": 3.5064890384674072, - "learning_rate": 3.628531684951347e-06, - "loss": 0.2552, - "step": 573 - }, - { - "epoch": 3.521472392638037, - "grad_norm": 3.987583875656128, - "learning_rate": 3.6242300651564276e-06, - "loss": 0.3232, - "step": 574 - }, - { - "epoch": 3.5276073619631902, - "grad_norm": 3.179642915725708, - "learning_rate": 3.6199242691777745e-06, - "loss": 0.32, - "step": 575 - }, - { - "epoch": 3.5337423312883436, - "grad_norm": 3.3078157901763916, - "learning_rate": 3.6156143130101516e-06, - "loss": 0.2922, - "step": 576 - }, - { - "epoch": 3.539877300613497, - "grad_norm": 3.1628613471984863, - "learning_rate": 3.6113002126637765e-06, - "loss": 0.2005, - "step": 577 - }, - { - "epoch": 3.5460122699386503, - "grad_norm": 3.4515540599823, - "learning_rate": 3.606981984164263e-06, - "loss": 0.2138, - "step": 578 - }, - { - "epoch": 3.5521472392638036, - "grad_norm": 5.132473945617676, - "learning_rate": 3.6026596435525578e-06, - "loss": 0.4382, - "step": 579 - }, - { - "epoch": 3.558282208588957, - "grad_norm": 3.397614002227783, - "learning_rate": 3.5983332068848855e-06, - "loss": 0.3326, - "step": 580 - }, - { - "epoch": 3.5644171779141103, - "grad_norm": 4.79497766494751, - "learning_rate": 3.5940026902326825e-06, - "loss": 0.4748, - "step": 581 - }, - { - "epoch": 3.5705521472392636, - "grad_norm": 3.7675018310546875, - "learning_rate": 3.5896681096825446e-06, - "loss": 0.2692, - "step": 582 - }, - { - "epoch": 3.5766871165644174, - "grad_norm": 3.0637521743774414, - "learning_rate": 3.5853294813361614e-06, - "loss": 0.3658, - "step": 583 - }, - { - "epoch": 3.5828220858895703, - "grad_norm": 2.8949790000915527, - "learning_rate": 3.5809868213102623e-06, - "loss": 0.1661, - "step": 584 - }, - { - "epoch": 3.588957055214724, - "grad_norm": 3.163419246673584, - "learning_rate": 3.5766401457365485e-06, - "loss": 0.1233, - "step": 585 - }, - { - "epoch": 3.5950920245398774, - "grad_norm": 3.1787965297698975, - "learning_rate": 3.5722894707616417e-06, - "loss": 0.278, - "step": 586 - }, - { - "epoch": 3.6012269938650308, - "grad_norm": 2.9397857189178467, - "learning_rate": 3.5679348125470175e-06, - "loss": 0.1541, - "step": 587 - }, - { - "epoch": 3.607361963190184, - "grad_norm": 3.2690396308898926, - "learning_rate": 3.56357618726895e-06, - "loss": 0.1575, - "step": 588 - }, - { - "epoch": 3.6134969325153374, - "grad_norm": 5.444014072418213, - "learning_rate": 3.5592136111184483e-06, - "loss": 0.8079, - "step": 589 - }, - { - "epoch": 3.6196319018404908, - "grad_norm": 3.1688313484191895, - "learning_rate": 3.554847100301199e-06, - "loss": 0.341, - "step": 590 - }, - { - "epoch": 3.625766871165644, - "grad_norm": 2.469212532043457, - "learning_rate": 3.550476671037505e-06, - "loss": 0.1625, - "step": 591 - }, - { - "epoch": 3.6319018404907975, - "grad_norm": 3.3956527709960938, - "learning_rate": 3.546102339562223e-06, - "loss": 0.199, - "step": 592 - }, - { - "epoch": 3.638036809815951, - "grad_norm": 2.7287702560424805, - "learning_rate": 3.5417241221247078e-06, - "loss": 0.1493, - "step": 593 - }, - { - "epoch": 3.644171779141104, - "grad_norm": 3.5046865940093994, - "learning_rate": 3.5373420349887477e-06, - "loss": 0.2765, - "step": 594 - }, - { - "epoch": 3.6503067484662575, - "grad_norm": 3.121476650238037, - "learning_rate": 3.5329560944325065e-06, - "loss": 0.2833, - "step": 595 - }, - { - "epoch": 3.6564417177914113, - "grad_norm": 3.276463270187378, - "learning_rate": 3.528566316748462e-06, - "loss": 0.1237, - "step": 596 - }, - { - "epoch": 3.662576687116564, - "grad_norm": 3.382840633392334, - "learning_rate": 3.524172718243347e-06, - "loss": 0.1599, - "step": 597 - }, - { - "epoch": 3.668711656441718, - "grad_norm": 4.801311492919922, - "learning_rate": 3.5197753152380854e-06, - "loss": 0.2997, - "step": 598 - }, - { - "epoch": 3.6748466257668713, - "grad_norm": 4.117336273193359, - "learning_rate": 3.515374124067736e-06, - "loss": 0.2021, - "step": 599 - }, - { - "epoch": 3.6809815950920246, - "grad_norm": 3.611438035964966, - "learning_rate": 3.5109691610814263e-06, - "loss": 0.1726, - "step": 600 - }, - { - "epoch": 3.687116564417178, - "grad_norm": 4.5179972648620605, - "learning_rate": 3.5065604426422995e-06, - "loss": 0.1377, - "step": 601 - }, - { - "epoch": 3.6932515337423313, - "grad_norm": 3.561061382293701, - "learning_rate": 3.502147985127445e-06, - "loss": 0.1497, - "step": 602 - }, - { - "epoch": 3.6993865030674846, - "grad_norm": 3.3497917652130127, - "learning_rate": 3.4977318049278443e-06, - "loss": 0.1589, - "step": 603 - }, - { - "epoch": 3.705521472392638, - "grad_norm": 3.2725470066070557, - "learning_rate": 3.4933119184483065e-06, - "loss": 0.1364, - "step": 604 - }, - { - "epoch": 3.7116564417177913, - "grad_norm": 3.228956460952759, - "learning_rate": 3.4888883421074076e-06, - "loss": 0.177, - "step": 605 - }, - { - "epoch": 3.7177914110429446, - "grad_norm": 3.7648911476135254, - "learning_rate": 3.484461092337434e-06, - "loss": 0.122, - "step": 606 - }, - { - "epoch": 3.7239263803680984, - "grad_norm": 3.5322585105895996, - "learning_rate": 3.4800301855843137e-06, - "loss": 0.2664, - "step": 607 - }, - { - "epoch": 3.7300613496932513, - "grad_norm": 2.951073169708252, - "learning_rate": 3.4755956383075613e-06, - "loss": 0.12, - "step": 608 - }, - { - "epoch": 3.736196319018405, - "grad_norm": 3.0577664375305176, - "learning_rate": 3.471157466980214e-06, - "loss": 0.3926, - "step": 609 - }, - { - "epoch": 3.7423312883435584, - "grad_norm": 4.089846134185791, - "learning_rate": 3.466715688088772e-06, - "loss": 0.6233, - "step": 610 - }, - { - "epoch": 3.7484662576687118, - "grad_norm": 3.081340789794922, - "learning_rate": 3.462270318133136e-06, - "loss": 0.2456, - "step": 611 - }, - { - "epoch": 3.754601226993865, - "grad_norm": 3.034712553024292, - "learning_rate": 3.4578213736265474e-06, - "loss": 0.2683, - "step": 612 - }, - { - "epoch": 3.7607361963190185, - "grad_norm": 3.459815740585327, - "learning_rate": 3.4533688710955255e-06, - "loss": 0.3796, - "step": 613 - }, - { - "epoch": 3.766871165644172, - "grad_norm": 3.523737907409668, - "learning_rate": 3.448912827079805e-06, - "loss": 0.3326, - "step": 614 - }, - { - "epoch": 3.773006134969325, - "grad_norm": 3.333219289779663, - "learning_rate": 3.4444532581322793e-06, - "loss": 0.206, - "step": 615 - }, - { - "epoch": 3.7791411042944785, - "grad_norm": 3.582387685775757, - "learning_rate": 3.4399901808189327e-06, - "loss": 0.244, - "step": 616 - }, - { - "epoch": 3.785276073619632, - "grad_norm": 3.4887266159057617, - "learning_rate": 3.435523611718785e-06, - "loss": 0.1796, - "step": 617 - }, - { - "epoch": 3.791411042944785, - "grad_norm": 4.89408016204834, - "learning_rate": 3.4310535674238242e-06, - "loss": 0.188, - "step": 618 - }, - { - "epoch": 3.7975460122699385, - "grad_norm": 4.338910102844238, - "learning_rate": 3.42658006453895e-06, - "loss": 0.3039, - "step": 619 - }, - { - "epoch": 3.8036809815950923, - "grad_norm": 4.107708930969238, - "learning_rate": 3.4221031196819083e-06, - "loss": 0.3383, - "step": 620 - }, - { - "epoch": 3.809815950920245, - "grad_norm": 3.698777675628662, - "learning_rate": 3.4176227494832305e-06, - "loss": 0.1721, - "step": 621 - }, - { - "epoch": 3.815950920245399, - "grad_norm": 2.6659226417541504, - "learning_rate": 3.413138970586174e-06, - "loss": 0.2211, - "step": 622 - }, - { - "epoch": 3.8220858895705523, - "grad_norm": 3.2398436069488525, - "learning_rate": 3.4086517996466574e-06, - "loss": 0.1871, - "step": 623 - }, - { - "epoch": 3.8282208588957056, - "grad_norm": 4.9128804206848145, - "learning_rate": 3.404161253333199e-06, - "loss": 0.3874, - "step": 624 - }, - { - "epoch": 3.834355828220859, - "grad_norm": 3.508789300918579, - "learning_rate": 3.3996673483268573e-06, - "loss": 0.1739, - "step": 625 - }, - { - "epoch": 3.8404907975460123, - "grad_norm": 3.3016927242279053, - "learning_rate": 3.3951701013211665e-06, - "loss": 0.274, - "step": 626 - }, - { - "epoch": 3.8466257668711656, - "grad_norm": 3.8941333293914795, - "learning_rate": 3.3906695290220736e-06, - "loss": 0.3568, - "step": 627 - }, - { - "epoch": 3.852760736196319, - "grad_norm": 3.512354850769043, - "learning_rate": 3.3861656481478816e-06, - "loss": 0.157, - "step": 628 - }, - { - "epoch": 3.8588957055214723, - "grad_norm": 3.482649326324463, - "learning_rate": 3.3816584754291814e-06, - "loss": 0.1218, - "step": 629 - }, - { - "epoch": 3.8650306748466257, - "grad_norm": 3.1490275859832764, - "learning_rate": 3.377148027608793e-06, - "loss": 0.2234, - "step": 630 - }, - { - "epoch": 3.871165644171779, - "grad_norm": 3.2172653675079346, - "learning_rate": 3.3726343214417023e-06, - "loss": 0.3329, - "step": 631 - }, - { - "epoch": 3.8773006134969323, - "grad_norm": 4.167707443237305, - "learning_rate": 3.3681173736949984e-06, - "loss": 0.1384, - "step": 632 - }, - { - "epoch": 3.883435582822086, - "grad_norm": 3.4743919372558594, - "learning_rate": 3.3635972011478134e-06, - "loss": 0.3807, - "step": 633 - }, - { - "epoch": 3.889570552147239, - "grad_norm": 3.6892173290252686, - "learning_rate": 3.3590738205912566e-06, - "loss": 0.194, - "step": 634 - }, - { - "epoch": 3.895705521472393, - "grad_norm": 3.262967824935913, - "learning_rate": 3.354547248828356e-06, - "loss": 0.202, - "step": 635 - }, - { - "epoch": 3.901840490797546, - "grad_norm": 3.8871562480926514, - "learning_rate": 3.3500175026739916e-06, - "loss": 0.2471, - "step": 636 - }, - { - "epoch": 3.9079754601226995, - "grad_norm": 3.5097084045410156, - "learning_rate": 3.3454845989548385e-06, - "loss": 0.1112, - "step": 637 - }, - { - "epoch": 3.914110429447853, - "grad_norm": 4.163944721221924, - "learning_rate": 3.3409485545092995e-06, - "loss": 0.3368, - "step": 638 - }, - { - "epoch": 3.920245398773006, - "grad_norm": 3.6405045986175537, - "learning_rate": 3.336409386187444e-06, - "loss": 0.1863, - "step": 639 - }, - { - "epoch": 3.9263803680981595, - "grad_norm": 3.2477526664733887, - "learning_rate": 3.331867110850946e-06, - "loss": 0.1491, - "step": 640 - }, - { - "epoch": 3.932515337423313, - "grad_norm": 3.933753490447998, - "learning_rate": 3.327321745373021e-06, - "loss": 0.2484, - "step": 641 - }, - { - "epoch": 3.938650306748466, - "grad_norm": 3.2475059032440186, - "learning_rate": 3.322773306638364e-06, - "loss": 0.2126, - "step": 642 - }, - { - "epoch": 3.9447852760736195, - "grad_norm": 2.628467321395874, - "learning_rate": 3.318221811543086e-06, - "loss": 0.1649, - "step": 643 - }, - { - "epoch": 3.950920245398773, - "grad_norm": 3.2612411975860596, - "learning_rate": 3.313667276994651e-06, - "loss": 0.1442, - "step": 644 - }, - { - "epoch": 3.957055214723926, - "grad_norm": 3.8058395385742188, - "learning_rate": 3.309109719911814e-06, - "loss": 0.359, - "step": 645 - }, - { - "epoch": 3.96319018404908, - "grad_norm": 3.3450071811676025, - "learning_rate": 3.304549157224558e-06, - "loss": 0.4042, - "step": 646 - }, - { - "epoch": 3.969325153374233, - "grad_norm": 3.079601287841797, - "learning_rate": 3.299985605874031e-06, - "loss": 0.1699, - "step": 647 - }, - { - "epoch": 3.9754601226993866, - "grad_norm": 3.8963980674743652, - "learning_rate": 3.295419082812483e-06, - "loss": 0.1888, - "step": 648 - }, - { - "epoch": 3.98159509202454, - "grad_norm": 3.307405948638916, - "learning_rate": 3.2908496050032024e-06, - "loss": 0.2824, - "step": 649 - }, - { - "epoch": 3.9877300613496933, - "grad_norm": 3.227478265762329, - "learning_rate": 3.2862771894204544e-06, - "loss": 0.3038, - "step": 650 - }, - { - "epoch": 3.9938650306748467, - "grad_norm": 4.046506881713867, - "learning_rate": 3.2817018530494164e-06, - "loss": 0.3266, - "step": 651 - }, - { - "epoch": 4.0, - "grad_norm": 7.775874614715576, - "learning_rate": 3.277123612886116e-06, - "loss": 0.2998, - "step": 652 - } - ], - "logging_steps": 1, - "max_steps": 1630, - "num_input_tokens_seen": 0, - "num_train_epochs": 10, - "save_steps": 206, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 1.6158788192252723e+17, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-815/chat_template.jinja b/metallama3_8b/limo_filtered_correct/checkpoint-815/chat_template.jinja deleted file mode 100644 index 39bd0c9f7fe30aea14eda194fee17703da4a4dbf..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-815/chat_template.jinja +++ /dev/null @@ -1,5 +0,0 @@ -{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> - -'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> - -' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-815/config.json b/metallama3_8b/limo_filtered_correct/checkpoint-815/config.json deleted file mode 100644 index ec5612543540085e09eed37e81b17ae51d1a6973..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-815/config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 128000, - "eos_token_id": 128009, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 8192, - "mlp_bias": false, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 500000.0, - "tie_word_embeddings": false, - "torch_dtype": "float32", - "transformers_version": "4.55.0", - "use_cache": false, - "vocab_size": 128256 -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-815/generation_config.json b/metallama3_8b/limo_filtered_correct/checkpoint-815/generation_config.json deleted file mode 100644 index f53ccb516e57388491adda6b9950bcfa872e93ae..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-815/generation_config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 128000, - "eos_token_id": 128009, - "transformers_version": "4.55.0", - "use_cache": false -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-815/model-00001-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-815/model-00001-of-00007.safetensors deleted file mode 100644 index 766ba3dc95e00ed66acc562934c3d055c810e934..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-815/model-00001-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0bbb447c161dca4006f58e9e449167832acb637e5562306f21b3b6fe1d6ed34e -size 4886466168 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-815/model-00002-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-815/model-00002-of-00007.safetensors deleted file mode 100644 index e7e5224d4d0bf307fdb631f894f731a30636c2e3..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-815/model-00002-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f570f2b24536ba0e9289e17b88c0da61c997f9fa2cfa070f7d69954f785c9abb -size 4832007448 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-815/model-00003-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-815/model-00003-of-00007.safetensors deleted file mode 100644 index 06b047e2069983cc09401e1f0a1591c0e5604c21..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-815/model-00003-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ecca404892f96aa1e7755541013b7e974b6b0a8aefd70b1a3dad4b62d69588ef -size 4999813112 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-815/model-00004-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-815/model-00004-of-00007.safetensors deleted file mode 100644 index 5f183da242413d311dc2b103f6d4a09ba55031f9..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-815/model-00004-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2ae64ab4e22fa77a7ffb672cadd1813cb45f0ce492a91a534331dd80a629f927 -size 4999813128 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-815/model-00005-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-815/model-00005-of-00007.safetensors deleted file mode 100644 index fd27f38541890c9367832f41c228db2668c1dfe2..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-815/model-00005-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a0f5c899a27289252c122f88dbfc55380b5a09de9f6f4e3688380be8b09f08af -size 4832007496 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-815/model-00006-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-815/model-00006-of-00007.safetensors deleted file mode 100644 index e8ba1601d6e84592aa6c350f3e45cd16a9126e60..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-815/model-00006-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cdfce2807c5b5091ddf4a9a987e5b555d18cc54f2a43d8b97c9e2aea940e9bf1 -size 4999813120 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-815/model-00007-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-815/model-00007-of-00007.safetensors deleted file mode 100644 index 5bdbd7b6aebdc0a7027afa372ebc3fc38fd94483..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-815/model-00007-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7693de5c5087455290b3d8e0b892205be443afb7fc508bc48a455f2a06182425 -size 2571158184 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-815/model.safetensors.index.json b/metallama3_8b/limo_filtered_correct/checkpoint-815/model.safetensors.index.json deleted file mode 100644 index 30d31d54f352f0c71ad48745af612a088822fa48..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-815/model.safetensors.index.json +++ /dev/null @@ -1,299 +0,0 @@ -{ - "metadata": { - "total_parameters": 2007565312, - "total_size": 32121044992 - }, - "weight_map": { - "lm_head.weight": "model-00007-of-00007.safetensors", - "model.embed_tokens.weight": "model-00001-of-00007.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.norm.weight": "model-00007-of-00007.safetensors" - } -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-815/rng_state_0.pth b/metallama3_8b/limo_filtered_correct/checkpoint-815/rng_state_0.pth deleted file mode 100644 index 3fb9a88bbbee1d828823dc0792895d385b4be47e..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-815/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5c5e18f922d0af74d820247ae97bee506ab412554a58345ddf2558abc94ee3e3 -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-815/rng_state_1.pth b/metallama3_8b/limo_filtered_correct/checkpoint-815/rng_state_1.pth deleted file mode 100644 index cc3d4a3c6ff4b588e0b24552f5cc78610d1a3f42..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-815/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2a2dcca6d9741f46592359768ea2212b9321da6408d1fd7d3a80b017bf37f434 -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-815/rng_state_2.pth b/metallama3_8b/limo_filtered_correct/checkpoint-815/rng_state_2.pth deleted file mode 100644 index 0ea7e83be3a9fc39999b7084bcf14ba0f491317b..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-815/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:69420ece2c255923c5cbb3c6c9c4a6b9cb38fb57e5d3033c8b7d436a1faf6f13 -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-815/rng_state_3.pth b/metallama3_8b/limo_filtered_correct/checkpoint-815/rng_state_3.pth deleted file mode 100644 index 88e70a1e21ef6d40a7016a6221703385b6c1cdc6..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-815/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:66f278b40a1e23b88a657c4e5d03afa8dbbbe14dfeb16f6b4beedaece6cdd0b9 -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-815/scheduler.pt b/metallama3_8b/limo_filtered_correct/checkpoint-815/scheduler.pt deleted file mode 100644 index 5fc60585ab08e39dd8044a298016c34eda99d3be..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-815/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bd310910282e00e3284cdaf0e5f0ab6462ca8221d64e6d8398373cf54e7ebb7b -size 1064 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-815/special_tokens_map.json b/metallama3_8b/limo_filtered_correct/checkpoint-815/special_tokens_map.json deleted file mode 100644 index 14daf4588e61b4e4983af0fccaba4d5500c0977c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-815/special_tokens_map.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "additional_special_tokens": [ - { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } - ], - "bos_token": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": "<|eot_id|>" -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-815/tokenizer.json b/metallama3_8b/limo_filtered_correct/checkpoint-815/tokenizer.json deleted file mode 100644 index 172311123ab62378f1f6d90f3068a676b7d939ed..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-815/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c1dcab308e7cf5970ea38815e0a62887d705c5b436f869ca27a5dcdd40c36a6 -size 17210148 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-815/tokenizer_config.json b/metallama3_8b/limo_filtered_correct/checkpoint-815/tokenizer_config.json deleted file mode 100644 index 6739fcd129e717b71b64001dcb25a03c143d66f5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-815/tokenizer_config.json +++ /dev/null @@ -1,2076 +0,0 @@ -{ - "added_tokens_decoder": { - "128000": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128001": { - "content": "<|end_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128002": { - "content": "<|reserved_special_token_0|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128003": { - "content": "<|reserved_special_token_1|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128004": { - "content": "<|reserved_special_token_2|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128005": { - "content": "<|reserved_special_token_3|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128006": { - "content": "<|start_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128007": { - "content": "<|end_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128008": { - "content": "<|reserved_special_token_4|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128009": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128010": { - "content": "<|reserved_special_token_5|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128011": { - "content": "<|reserved_special_token_6|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128012": { - "content": "<|reserved_special_token_7|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128013": { - "content": "<|reserved_special_token_8|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128014": { - "content": "<|reserved_special_token_9|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128015": { - "content": "<|reserved_special_token_10|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128016": { - "content": "<|reserved_special_token_11|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128017": { - "content": "<|reserved_special_token_12|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128018": { - "content": "<|reserved_special_token_13|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128019": { - "content": "<|reserved_special_token_14|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128020": { - "content": "<|reserved_special_token_15|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128021": { - "content": "<|reserved_special_token_16|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128022": { - "content": "<|reserved_special_token_17|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128023": { - "content": "<|reserved_special_token_18|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128024": { - "content": "<|reserved_special_token_19|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128025": { - "content": "<|reserved_special_token_20|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128026": { - "content": "<|reserved_special_token_21|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128027": { - "content": "<|reserved_special_token_22|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128028": { - "content": "<|reserved_special_token_23|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128029": { - "content": "<|reserved_special_token_24|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128030": { - "content": "<|reserved_special_token_25|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128031": { - "content": "<|reserved_special_token_26|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128032": { - "content": "<|reserved_special_token_27|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128033": { - "content": "<|reserved_special_token_28|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128034": { - "content": "<|reserved_special_token_29|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128035": { - "content": "<|reserved_special_token_30|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128036": { - "content": "<|reserved_special_token_31|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128037": { - "content": "<|reserved_special_token_32|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128038": { - "content": "<|reserved_special_token_33|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128039": { - "content": "<|reserved_special_token_34|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128040": { - "content": "<|reserved_special_token_35|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128041": { - "content": "<|reserved_special_token_36|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128042": { - "content": "<|reserved_special_token_37|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128043": { - "content": "<|reserved_special_token_38|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128044": { - "content": "<|reserved_special_token_39|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128045": { - "content": "<|reserved_special_token_40|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128046": { - "content": "<|reserved_special_token_41|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128047": { - "content": "<|reserved_special_token_42|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128048": { - "content": "<|reserved_special_token_43|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128049": { - "content": "<|reserved_special_token_44|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128050": { - "content": "<|reserved_special_token_45|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128051": { - "content": "<|reserved_special_token_46|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128052": { - "content": "<|reserved_special_token_47|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128053": { - "content": "<|reserved_special_token_48|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128054": { - "content": "<|reserved_special_token_49|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128055": { - "content": "<|reserved_special_token_50|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128056": { - "content": "<|reserved_special_token_51|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128057": { - "content": "<|reserved_special_token_52|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128058": { - "content": "<|reserved_special_token_53|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128059": { - "content": "<|reserved_special_token_54|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128060": { - "content": "<|reserved_special_token_55|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128061": { - "content": "<|reserved_special_token_56|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128062": { - "content": "<|reserved_special_token_57|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128063": { - "content": "<|reserved_special_token_58|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128064": { - "content": "<|reserved_special_token_59|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128065": { - "content": "<|reserved_special_token_60|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128066": { - "content": "<|reserved_special_token_61|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128067": { - "content": "<|reserved_special_token_62|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128068": { - "content": "<|reserved_special_token_63|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128069": { - "content": "<|reserved_special_token_64|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128070": { - "content": "<|reserved_special_token_65|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128071": { - "content": "<|reserved_special_token_66|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128072": { - "content": "<|reserved_special_token_67|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128073": { - "content": "<|reserved_special_token_68|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128074": { - "content": "<|reserved_special_token_69|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128075": { - "content": "<|reserved_special_token_70|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128076": { - "content": "<|reserved_special_token_71|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128077": { - "content": "<|reserved_special_token_72|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128078": { - "content": "<|reserved_special_token_73|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128079": { - "content": "<|reserved_special_token_74|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128080": { - "content": "<|reserved_special_token_75|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128081": { - "content": "<|reserved_special_token_76|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128082": { - "content": "<|reserved_special_token_77|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128083": { - "content": "<|reserved_special_token_78|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128084": { - "content": "<|reserved_special_token_79|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128085": { - "content": "<|reserved_special_token_80|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128086": { - "content": "<|reserved_special_token_81|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128087": { - "content": "<|reserved_special_token_82|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128088": { - "content": "<|reserved_special_token_83|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128089": { - "content": "<|reserved_special_token_84|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128090": { - "content": "<|reserved_special_token_85|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128091": { - "content": "<|reserved_special_token_86|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128092": { - "content": "<|reserved_special_token_87|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128093": { - "content": "<|reserved_special_token_88|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128094": { - "content": "<|reserved_special_token_89|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128095": { - "content": "<|reserved_special_token_90|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128096": { - "content": "<|reserved_special_token_91|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128097": { - "content": "<|reserved_special_token_92|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128098": { - "content": "<|reserved_special_token_93|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128099": { - "content": "<|reserved_special_token_94|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128100": { - "content": "<|reserved_special_token_95|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128101": { - "content": "<|reserved_special_token_96|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128102": { - "content": "<|reserved_special_token_97|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128103": { - "content": "<|reserved_special_token_98|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128104": { - "content": "<|reserved_special_token_99|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128105": { - "content": "<|reserved_special_token_100|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128106": { - "content": "<|reserved_special_token_101|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128107": { - "content": "<|reserved_special_token_102|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128108": { - "content": "<|reserved_special_token_103|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128109": { - "content": "<|reserved_special_token_104|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128110": { - "content": "<|reserved_special_token_105|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128111": { - "content": "<|reserved_special_token_106|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128112": { - "content": "<|reserved_special_token_107|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128113": { - "content": "<|reserved_special_token_108|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128114": { - "content": "<|reserved_special_token_109|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128115": { - "content": "<|reserved_special_token_110|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128116": { - "content": "<|reserved_special_token_111|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128117": { - "content": "<|reserved_special_token_112|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128118": { - "content": "<|reserved_special_token_113|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128119": { - "content": "<|reserved_special_token_114|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128120": { - "content": "<|reserved_special_token_115|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128121": { - "content": "<|reserved_special_token_116|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128122": { - "content": "<|reserved_special_token_117|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128123": { - "content": "<|reserved_special_token_118|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128124": { - "content": "<|reserved_special_token_119|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128125": { - "content": "<|reserved_special_token_120|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128126": { - "content": "<|reserved_special_token_121|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128127": { - "content": "<|reserved_special_token_122|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128128": { - "content": "<|reserved_special_token_123|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128129": { - "content": "<|reserved_special_token_124|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128130": { - "content": "<|reserved_special_token_125|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128131": { - "content": "<|reserved_special_token_126|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128132": { - "content": "<|reserved_special_token_127|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128133": { - "content": "<|reserved_special_token_128|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128134": { - "content": "<|reserved_special_token_129|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128135": { - "content": "<|reserved_special_token_130|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128136": { - "content": "<|reserved_special_token_131|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128137": { - "content": "<|reserved_special_token_132|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128138": { - "content": "<|reserved_special_token_133|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128139": { - "content": "<|reserved_special_token_134|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128140": { - "content": "<|reserved_special_token_135|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128141": { - "content": "<|reserved_special_token_136|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128142": { - "content": "<|reserved_special_token_137|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128143": { - "content": "<|reserved_special_token_138|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128144": { - "content": "<|reserved_special_token_139|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128145": { - "content": "<|reserved_special_token_140|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128146": { - "content": "<|reserved_special_token_141|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128147": { - "content": "<|reserved_special_token_142|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128148": { - "content": "<|reserved_special_token_143|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128149": { - "content": "<|reserved_special_token_144|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128150": { - "content": "<|reserved_special_token_145|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128151": { - "content": "<|reserved_special_token_146|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128152": { - "content": "<|reserved_special_token_147|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128153": { - "content": "<|reserved_special_token_148|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128154": { - "content": "<|reserved_special_token_149|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128155": { - "content": "<|reserved_special_token_150|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128156": { - "content": "<|reserved_special_token_151|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128157": { - "content": "<|reserved_special_token_152|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128158": { - "content": "<|reserved_special_token_153|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128159": { - "content": "<|reserved_special_token_154|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128160": { - "content": "<|reserved_special_token_155|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128161": { - "content": "<|reserved_special_token_156|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128162": { - "content": "<|reserved_special_token_157|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128163": { - "content": "<|reserved_special_token_158|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128164": { - "content": "<|reserved_special_token_159|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128165": { - "content": "<|reserved_special_token_160|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128166": { - "content": "<|reserved_special_token_161|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128167": { - "content": "<|reserved_special_token_162|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128168": { - "content": "<|reserved_special_token_163|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128169": { - "content": "<|reserved_special_token_164|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128170": { - "content": "<|reserved_special_token_165|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128171": { - "content": "<|reserved_special_token_166|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128172": { - "content": "<|reserved_special_token_167|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128173": { - "content": "<|reserved_special_token_168|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128174": { - "content": "<|reserved_special_token_169|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128175": { - "content": "<|reserved_special_token_170|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128176": { - "content": "<|reserved_special_token_171|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128177": { - "content": "<|reserved_special_token_172|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128178": { - "content": "<|reserved_special_token_173|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128179": { - "content": "<|reserved_special_token_174|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128180": { - "content": "<|reserved_special_token_175|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128181": { - "content": "<|reserved_special_token_176|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128182": { - "content": "<|reserved_special_token_177|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128183": { - "content": "<|reserved_special_token_178|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128184": { - "content": "<|reserved_special_token_179|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128185": { - "content": "<|reserved_special_token_180|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128186": { - "content": "<|reserved_special_token_181|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128187": { - "content": "<|reserved_special_token_182|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128188": { - "content": "<|reserved_special_token_183|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128189": { - "content": "<|reserved_special_token_184|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128190": { - "content": "<|reserved_special_token_185|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128191": { - "content": "<|reserved_special_token_186|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128192": { - "content": "<|reserved_special_token_187|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128193": { - "content": "<|reserved_special_token_188|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128194": { - "content": "<|reserved_special_token_189|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128195": { - "content": "<|reserved_special_token_190|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128196": { - "content": "<|reserved_special_token_191|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128197": { - "content": "<|reserved_special_token_192|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128198": { - "content": "<|reserved_special_token_193|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128199": { - "content": "<|reserved_special_token_194|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128200": { - "content": "<|reserved_special_token_195|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128201": { - "content": "<|reserved_special_token_196|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128202": { - "content": "<|reserved_special_token_197|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128203": { - "content": "<|reserved_special_token_198|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128204": { - "content": "<|reserved_special_token_199|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128205": { - "content": "<|reserved_special_token_200|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128206": { - "content": "<|reserved_special_token_201|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128207": { - "content": "<|reserved_special_token_202|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128208": { - "content": "<|reserved_special_token_203|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128209": { - "content": "<|reserved_special_token_204|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128210": { - "content": "<|reserved_special_token_205|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128211": { - "content": "<|reserved_special_token_206|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128212": { - "content": "<|reserved_special_token_207|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128213": { - "content": "<|reserved_special_token_208|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128214": { - "content": "<|reserved_special_token_209|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128215": { - "content": "<|reserved_special_token_210|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128216": { - "content": "<|reserved_special_token_211|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128217": { - "content": "<|reserved_special_token_212|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128218": { - "content": "<|reserved_special_token_213|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128219": { - "content": "<|reserved_special_token_214|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128220": { - "content": "<|reserved_special_token_215|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128221": { - "content": "<|reserved_special_token_216|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128222": { - "content": "<|reserved_special_token_217|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128223": { - "content": "<|reserved_special_token_218|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128224": { - "content": "<|reserved_special_token_219|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128225": { - "content": "<|reserved_special_token_220|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128226": { - "content": "<|reserved_special_token_221|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128227": { - "content": "<|reserved_special_token_222|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128228": { - "content": "<|reserved_special_token_223|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128229": { - "content": "<|reserved_special_token_224|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128230": { - "content": "<|reserved_special_token_225|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128231": { - "content": "<|reserved_special_token_226|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128232": { - "content": "<|reserved_special_token_227|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128233": { - "content": "<|reserved_special_token_228|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128234": { - "content": "<|reserved_special_token_229|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128235": { - "content": "<|reserved_special_token_230|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128236": { - "content": "<|reserved_special_token_231|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128237": { - "content": "<|reserved_special_token_232|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128238": { - "content": "<|reserved_special_token_233|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128239": { - "content": "<|reserved_special_token_234|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128240": { - "content": "<|reserved_special_token_235|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128241": { - "content": "<|reserved_special_token_236|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128242": { - "content": "<|reserved_special_token_237|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128243": { - "content": "<|reserved_special_token_238|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128244": { - "content": "<|reserved_special_token_239|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128245": { - "content": "<|reserved_special_token_240|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128246": { - "content": "<|reserved_special_token_241|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128247": { - "content": "<|reserved_special_token_242|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128248": { - "content": "<|reserved_special_token_243|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128249": { - "content": "<|reserved_special_token_244|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128250": { - "content": "<|reserved_special_token_245|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128251": { - "content": "<|reserved_special_token_246|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128252": { - "content": "<|reserved_special_token_247|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128253": { - "content": "<|reserved_special_token_248|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128254": { - "content": "<|reserved_special_token_249|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128255": { - "content": "<|reserved_special_token_250|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128256": { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - } - }, - "additional_special_tokens": [ - "<|eom_id|>" - ], - "bos_token": "<|begin_of_text|>", - "clean_up_tokenization_spaces": true, - "eos_token": "<|eot_id|>", - "extra_special_tokens": {}, - "model_input_names": [ - "input_ids", - "attention_mask" - ], - "model_max_length": 1000000000000000019884624838656, - "pad_token": "<|eot_id|>", - "padding_side": "right", - "split_special_tokens": false, - "tokenizer_class": "PreTrainedTokenizerFast" -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-815/trainer_state.json b/metallama3_8b/limo_filtered_correct/checkpoint-815/trainer_state.json deleted file mode 100644 index 042a4e7653942cf0d71cc476332cba486bd4ab17..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-815/trainer_state.json +++ /dev/null @@ -1,5739 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 5.0, - "eval_steps": 500, - "global_step": 815, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.006134969325153374, - "grad_norm": 5.908512115478516, - "learning_rate": 5e-06, - "loss": 0.9606, - "step": 1 - }, - { - "epoch": 0.012269938650306749, - "grad_norm": 4.304474353790283, - "learning_rate": 4.999995356617983e-06, - "loss": 0.8609, - "step": 2 - }, - { - "epoch": 0.018404907975460124, - "grad_norm": 5.63697624206543, - "learning_rate": 4.999981426489179e-06, - "loss": 1.3543, - "step": 3 - }, - { - "epoch": 0.024539877300613498, - "grad_norm": 3.6674246788024902, - "learning_rate": 4.999958209665336e-06, - "loss": 0.787, - "step": 4 - }, - { - "epoch": 0.03067484662576687, - "grad_norm": 48.14854431152344, - "learning_rate": 4.999925706232695e-06, - "loss": 1.7786, - "step": 5 - }, - { - "epoch": 0.03680981595092025, - "grad_norm": 7.8689866065979, - "learning_rate": 4.999883916312e-06, - "loss": 1.2175, - "step": 6 - }, - { - "epoch": 0.04294478527607362, - "grad_norm": 5.119968891143799, - "learning_rate": 4.9998328400584864e-06, - "loss": 0.8998, - "step": 7 - }, - { - "epoch": 0.049079754601226995, - "grad_norm": 3.730757713317871, - "learning_rate": 4.999772477661888e-06, - "loss": 0.8419, - "step": 8 - }, - { - "epoch": 0.05521472392638037, - "grad_norm": 27.314565658569336, - "learning_rate": 4.999702829346432e-06, - "loss": 1.7948, - "step": 9 - }, - { - "epoch": 0.06134969325153374, - "grad_norm": 3.822697162628174, - "learning_rate": 4.999623895370843e-06, - "loss": 1.0461, - "step": 10 - }, - { - "epoch": 0.06748466257668712, - "grad_norm": 4.71220588684082, - "learning_rate": 4.999535676028338e-06, - "loss": 1.0, - "step": 11 - }, - { - "epoch": 0.0736196319018405, - "grad_norm": 3.2378087043762207, - "learning_rate": 4.999438171646624e-06, - "loss": 0.9475, - "step": 12 - }, - { - "epoch": 0.07975460122699386, - "grad_norm": 3.475543737411499, - "learning_rate": 4.999331382587901e-06, - "loss": 0.8654, - "step": 13 - }, - { - "epoch": 0.08588957055214724, - "grad_norm": 10.06365966796875, - "learning_rate": 4.999215309248861e-06, - "loss": 1.2042, - "step": 14 - }, - { - "epoch": 0.09202453987730061, - "grad_norm": 3.785153865814209, - "learning_rate": 4.999089952060681e-06, - "loss": 0.8846, - "step": 15 - }, - { - "epoch": 0.09815950920245399, - "grad_norm": 2.944488048553467, - "learning_rate": 4.998955311489025e-06, - "loss": 0.8805, - "step": 16 - }, - { - "epoch": 0.10429447852760736, - "grad_norm": 39.89304733276367, - "learning_rate": 4.998811388034046e-06, - "loss": 1.5882, - "step": 17 - }, - { - "epoch": 0.11042944785276074, - "grad_norm": 3.5883963108062744, - "learning_rate": 4.9986581822303746e-06, - "loss": 0.9222, - "step": 18 - }, - { - "epoch": 0.1165644171779141, - "grad_norm": 6.972247123718262, - "learning_rate": 4.998495694647127e-06, - "loss": 1.4088, - "step": 19 - }, - { - "epoch": 0.12269938650306748, - "grad_norm": 3.948991298675537, - "learning_rate": 4.998323925887895e-06, - "loss": 1.454, - "step": 20 - }, - { - "epoch": 0.12883435582822086, - "grad_norm": 3.8690035343170166, - "learning_rate": 4.998142876590749e-06, - "loss": 0.6335, - "step": 21 - }, - { - "epoch": 0.13496932515337423, - "grad_norm": 5.243765830993652, - "learning_rate": 4.997952547428236e-06, - "loss": 0.6725, - "step": 22 - }, - { - "epoch": 0.1411042944785276, - "grad_norm": 3.5994043350219727, - "learning_rate": 4.997752939107372e-06, - "loss": 0.7814, - "step": 23 - }, - { - "epoch": 0.147239263803681, - "grad_norm": 4.06965970993042, - "learning_rate": 4.997544052369642e-06, - "loss": 0.9683, - "step": 24 - }, - { - "epoch": 0.15337423312883436, - "grad_norm": 3.3247246742248535, - "learning_rate": 4.997325887990999e-06, - "loss": 0.9414, - "step": 25 - }, - { - "epoch": 0.15950920245398773, - "grad_norm": 5.811742782592773, - "learning_rate": 4.997098446781861e-06, - "loss": 0.8894, - "step": 26 - }, - { - "epoch": 0.1656441717791411, - "grad_norm": 2.661334753036499, - "learning_rate": 4.996861729587103e-06, - "loss": 0.7708, - "step": 27 - }, - { - "epoch": 0.17177914110429449, - "grad_norm": 2.863943576812744, - "learning_rate": 4.996615737286061e-06, - "loss": 0.6995, - "step": 28 - }, - { - "epoch": 0.17791411042944785, - "grad_norm": 20.376733779907227, - "learning_rate": 4.996360470792524e-06, - "loss": 1.2563, - "step": 29 - }, - { - "epoch": 0.18404907975460122, - "grad_norm": 3.62265682220459, - "learning_rate": 4.996095931054731e-06, - "loss": 0.7266, - "step": 30 - }, - { - "epoch": 0.1901840490797546, - "grad_norm": 3.915076732635498, - "learning_rate": 4.9958221190553705e-06, - "loss": 0.9227, - "step": 31 - }, - { - "epoch": 0.19631901840490798, - "grad_norm": 3.129855155944824, - "learning_rate": 4.995539035811572e-06, - "loss": 0.701, - "step": 32 - }, - { - "epoch": 0.20245398773006135, - "grad_norm": 2.7532224655151367, - "learning_rate": 4.9952466823749076e-06, - "loss": 0.6491, - "step": 33 - }, - { - "epoch": 0.2085889570552147, - "grad_norm": 2.8444128036499023, - "learning_rate": 4.9949450598313835e-06, - "loss": 0.8029, - "step": 34 - }, - { - "epoch": 0.2147239263803681, - "grad_norm": 2.57743239402771, - "learning_rate": 4.994634169301439e-06, - "loss": 0.8785, - "step": 35 - }, - { - "epoch": 0.22085889570552147, - "grad_norm": 3.280055284500122, - "learning_rate": 4.994314011939941e-06, - "loss": 1.034, - "step": 36 - }, - { - "epoch": 0.22699386503067484, - "grad_norm": 2.455838680267334, - "learning_rate": 4.99398458893618e-06, - "loss": 0.8557, - "step": 37 - }, - { - "epoch": 0.2331288343558282, - "grad_norm": 4.72681188583374, - "learning_rate": 4.993645901513865e-06, - "loss": 1.1904, - "step": 38 - }, - { - "epoch": 0.2392638036809816, - "grad_norm": 3.0585641860961914, - "learning_rate": 4.993297950931121e-06, - "loss": 0.7668, - "step": 39 - }, - { - "epoch": 0.24539877300613497, - "grad_norm": 2.4603540897369385, - "learning_rate": 4.9929407384804806e-06, - "loss": 0.8812, - "step": 40 - }, - { - "epoch": 0.25153374233128833, - "grad_norm": 2.9702436923980713, - "learning_rate": 4.992574265488883e-06, - "loss": 0.8878, - "step": 41 - }, - { - "epoch": 0.25766871165644173, - "grad_norm": 2.6973602771759033, - "learning_rate": 4.9921985333176694e-06, - "loss": 0.7251, - "step": 42 - }, - { - "epoch": 0.26380368098159507, - "grad_norm": 2.5542335510253906, - "learning_rate": 4.991813543362572e-06, - "loss": 0.6638, - "step": 43 - }, - { - "epoch": 0.26993865030674846, - "grad_norm": 3.7530782222747803, - "learning_rate": 4.991419297053716e-06, - "loss": 1.0725, - "step": 44 - }, - { - "epoch": 0.27607361963190186, - "grad_norm": 2.6483025550842285, - "learning_rate": 4.991015795855611e-06, - "loss": 0.7238, - "step": 45 - }, - { - "epoch": 0.2822085889570552, - "grad_norm": 3.434422492980957, - "learning_rate": 4.990603041267144e-06, - "loss": 0.9188, - "step": 46 - }, - { - "epoch": 0.2883435582822086, - "grad_norm": 2.914340019226074, - "learning_rate": 4.990181034821578e-06, - "loss": 0.6158, - "step": 47 - }, - { - "epoch": 0.294478527607362, - "grad_norm": 2.7211625576019287, - "learning_rate": 4.98974977808654e-06, - "loss": 0.7165, - "step": 48 - }, - { - "epoch": 0.3006134969325153, - "grad_norm": 2.8414249420166016, - "learning_rate": 4.989309272664026e-06, - "loss": 0.7277, - "step": 49 - }, - { - "epoch": 0.3067484662576687, - "grad_norm": 3.683204412460327, - "learning_rate": 4.988859520190381e-06, - "loss": 0.9793, - "step": 50 - }, - { - "epoch": 0.3128834355828221, - "grad_norm": 3.1732583045959473, - "learning_rate": 4.988400522336304e-06, - "loss": 0.8966, - "step": 51 - }, - { - "epoch": 0.31901840490797545, - "grad_norm": 2.7789194583892822, - "learning_rate": 4.9879322808068365e-06, - "loss": 0.8191, - "step": 52 - }, - { - "epoch": 0.32515337423312884, - "grad_norm": 2.754816770553589, - "learning_rate": 4.987454797341358e-06, - "loss": 0.6308, - "step": 53 - }, - { - "epoch": 0.3312883435582822, - "grad_norm": 2.730104684829712, - "learning_rate": 4.98696807371358e-06, - "loss": 0.8226, - "step": 54 - }, - { - "epoch": 0.3374233128834356, - "grad_norm": 3.2225449085235596, - "learning_rate": 4.986472111731536e-06, - "loss": 0.9184, - "step": 55 - }, - { - "epoch": 0.34355828220858897, - "grad_norm": 3.2684760093688965, - "learning_rate": 4.985966913237581e-06, - "loss": 0.6593, - "step": 56 - }, - { - "epoch": 0.3496932515337423, - "grad_norm": 2.43105411529541, - "learning_rate": 4.985452480108376e-06, - "loss": 0.6994, - "step": 57 - }, - { - "epoch": 0.3558282208588957, - "grad_norm": 7.366360664367676, - "learning_rate": 4.984928814254889e-06, - "loss": 1.1374, - "step": 58 - }, - { - "epoch": 0.3619631901840491, - "grad_norm": 2.81864333152771, - "learning_rate": 4.984395917622387e-06, - "loss": 0.8097, - "step": 59 - }, - { - "epoch": 0.36809815950920244, - "grad_norm": 3.1107730865478516, - "learning_rate": 4.9838537921904206e-06, - "loss": 0.8511, - "step": 60 - }, - { - "epoch": 0.37423312883435583, - "grad_norm": 2.460545301437378, - "learning_rate": 4.9833024399728295e-06, - "loss": 0.898, - "step": 61 - }, - { - "epoch": 0.3803680981595092, - "grad_norm": 2.921992778778076, - "learning_rate": 4.982741863017722e-06, - "loss": 0.6671, - "step": 62 - }, - { - "epoch": 0.38650306748466257, - "grad_norm": 3.3006443977355957, - "learning_rate": 4.982172063407479e-06, - "loss": 1.0559, - "step": 63 - }, - { - "epoch": 0.39263803680981596, - "grad_norm": 2.642587661743164, - "learning_rate": 4.9815930432587365e-06, - "loss": 0.6663, - "step": 64 - }, - { - "epoch": 0.3987730061349693, - "grad_norm": 2.905898094177246, - "learning_rate": 4.981004804722384e-06, - "loss": 0.6895, - "step": 65 - }, - { - "epoch": 0.4049079754601227, - "grad_norm": 2.9174182415008545, - "learning_rate": 4.980407349983556e-06, - "loss": 0.7982, - "step": 66 - }, - { - "epoch": 0.4110429447852761, - "grad_norm": 2.214322805404663, - "learning_rate": 4.979800681261619e-06, - "loss": 0.6808, - "step": 67 - }, - { - "epoch": 0.4171779141104294, - "grad_norm": 2.7152462005615234, - "learning_rate": 4.9791848008101705e-06, - "loss": 0.567, - "step": 68 - }, - { - "epoch": 0.4233128834355828, - "grad_norm": 2.5657734870910645, - "learning_rate": 4.978559710917024e-06, - "loss": 0.7745, - "step": 69 - }, - { - "epoch": 0.4294478527607362, - "grad_norm": 3.9103832244873047, - "learning_rate": 4.977925413904205e-06, - "loss": 0.9815, - "step": 70 - }, - { - "epoch": 0.43558282208588955, - "grad_norm": 4.610236644744873, - "learning_rate": 4.9772819121279395e-06, - "loss": 1.164, - "step": 71 - }, - { - "epoch": 0.44171779141104295, - "grad_norm": 3.01170015335083, - "learning_rate": 4.976629207978648e-06, - "loss": 0.7587, - "step": 72 - }, - { - "epoch": 0.44785276073619634, - "grad_norm": 3.175889253616333, - "learning_rate": 4.975967303880933e-06, - "loss": 0.58, - "step": 73 - }, - { - "epoch": 0.4539877300613497, - "grad_norm": 2.503741502761841, - "learning_rate": 4.975296202293575e-06, - "loss": 0.7253, - "step": 74 - }, - { - "epoch": 0.4601226993865031, - "grad_norm": 2.6778078079223633, - "learning_rate": 4.974615905709518e-06, - "loss": 0.7352, - "step": 75 - }, - { - "epoch": 0.4662576687116564, - "grad_norm": 5.950812816619873, - "learning_rate": 4.973926416655863e-06, - "loss": 1.0643, - "step": 76 - }, - { - "epoch": 0.4723926380368098, - "grad_norm": 3.0165305137634277, - "learning_rate": 4.973227737693858e-06, - "loss": 0.6699, - "step": 77 - }, - { - "epoch": 0.4785276073619632, - "grad_norm": 4.793259620666504, - "learning_rate": 4.972519871418894e-06, - "loss": 1.0315, - "step": 78 - }, - { - "epoch": 0.48466257668711654, - "grad_norm": 3.632815361022949, - "learning_rate": 4.971802820460481e-06, - "loss": 0.7003, - "step": 79 - }, - { - "epoch": 0.49079754601226994, - "grad_norm": 3.077507734298706, - "learning_rate": 4.971076587482254e-06, - "loss": 0.6776, - "step": 80 - }, - { - "epoch": 0.49693251533742333, - "grad_norm": 3.3886241912841797, - "learning_rate": 4.970341175181957e-06, - "loss": 0.7422, - "step": 81 - }, - { - "epoch": 0.5030674846625767, - "grad_norm": 2.71288800239563, - "learning_rate": 4.969596586291425e-06, - "loss": 0.7471, - "step": 82 - }, - { - "epoch": 0.50920245398773, - "grad_norm": 2.777920961380005, - "learning_rate": 4.968842823576592e-06, - "loss": 0.8111, - "step": 83 - }, - { - "epoch": 0.5153374233128835, - "grad_norm": 6.496985912322998, - "learning_rate": 4.968079889837461e-06, - "loss": 0.9965, - "step": 84 - }, - { - "epoch": 0.5214723926380368, - "grad_norm": 2.6163430213928223, - "learning_rate": 4.967307787908108e-06, - "loss": 0.6833, - "step": 85 - }, - { - "epoch": 0.5276073619631901, - "grad_norm": 3.244098663330078, - "learning_rate": 4.966526520656663e-06, - "loss": 0.8373, - "step": 86 - }, - { - "epoch": 0.5337423312883436, - "grad_norm": 2.9027860164642334, - "learning_rate": 4.965736090985305e-06, - "loss": 0.8529, - "step": 87 - }, - { - "epoch": 0.5398773006134969, - "grad_norm": 2.3786230087280273, - "learning_rate": 4.964936501830246e-06, - "loss": 0.6577, - "step": 88 - }, - { - "epoch": 0.5460122699386503, - "grad_norm": 7.3099045753479, - "learning_rate": 4.964127756161727e-06, - "loss": 1.1184, - "step": 89 - }, - { - "epoch": 0.5521472392638037, - "grad_norm": 3.068873167037964, - "learning_rate": 4.963309856983998e-06, - "loss": 0.7906, - "step": 90 - }, - { - "epoch": 0.558282208588957, - "grad_norm": 3.082547426223755, - "learning_rate": 4.9624828073353144e-06, - "loss": 0.8107, - "step": 91 - }, - { - "epoch": 0.5644171779141104, - "grad_norm": 2.4586973190307617, - "learning_rate": 4.961646610287922e-06, - "loss": 0.7421, - "step": 92 - }, - { - "epoch": 0.5705521472392638, - "grad_norm": 2.779277801513672, - "learning_rate": 4.960801268948047e-06, - "loss": 0.7134, - "step": 93 - }, - { - "epoch": 0.5766871165644172, - "grad_norm": 3.2255213260650635, - "learning_rate": 4.959946786455882e-06, - "loss": 0.5875, - "step": 94 - }, - { - "epoch": 0.5828220858895705, - "grad_norm": 2.783395528793335, - "learning_rate": 4.959083165985581e-06, - "loss": 0.6595, - "step": 95 - }, - { - "epoch": 0.588957055214724, - "grad_norm": 2.240114212036133, - "learning_rate": 4.958210410745237e-06, - "loss": 0.793, - "step": 96 - }, - { - "epoch": 0.5950920245398773, - "grad_norm": 2.9399421215057373, - "learning_rate": 4.957328523976879e-06, - "loss": 0.5896, - "step": 97 - }, - { - "epoch": 0.6012269938650306, - "grad_norm": 3.4449355602264404, - "learning_rate": 4.956437508956458e-06, - "loss": 0.8658, - "step": 98 - }, - { - "epoch": 0.6073619631901841, - "grad_norm": 4.273710250854492, - "learning_rate": 4.9555373689938325e-06, - "loss": 0.8316, - "step": 99 - }, - { - "epoch": 0.6134969325153374, - "grad_norm": 3.4222047328948975, - "learning_rate": 4.954628107432757e-06, - "loss": 1.0613, - "step": 100 - }, - { - "epoch": 0.6196319018404908, - "grad_norm": 2.5318963527679443, - "learning_rate": 4.95370972765087e-06, - "loss": 0.7194, - "step": 101 - }, - { - "epoch": 0.6257668711656442, - "grad_norm": 2.7852585315704346, - "learning_rate": 4.952782233059683e-06, - "loss": 0.5927, - "step": 102 - }, - { - "epoch": 0.6319018404907976, - "grad_norm": 2.6532323360443115, - "learning_rate": 4.951845627104565e-06, - "loss": 0.8505, - "step": 103 - }, - { - "epoch": 0.6380368098159509, - "grad_norm": 2.3213467597961426, - "learning_rate": 4.95089991326473e-06, - "loss": 0.8682, - "step": 104 - }, - { - "epoch": 0.6441717791411042, - "grad_norm": 2.607992649078369, - "learning_rate": 4.9499450950532305e-06, - "loss": 0.8735, - "step": 105 - }, - { - "epoch": 0.6503067484662577, - "grad_norm": 3.9820072650909424, - "learning_rate": 4.94898117601693e-06, - "loss": 1.0571, - "step": 106 - }, - { - "epoch": 0.656441717791411, - "grad_norm": 3.3878824710845947, - "learning_rate": 4.948008159736507e-06, - "loss": 0.7831, - "step": 107 - }, - { - "epoch": 0.6625766871165644, - "grad_norm": 2.6935670375823975, - "learning_rate": 4.94702604982643e-06, - "loss": 0.5968, - "step": 108 - }, - { - "epoch": 0.6687116564417178, - "grad_norm": 2.78190016746521, - "learning_rate": 4.9460348499349485e-06, - "loss": 0.7504, - "step": 109 - }, - { - "epoch": 0.6748466257668712, - "grad_norm": 2.973083972930908, - "learning_rate": 4.945034563744077e-06, - "loss": 0.6728, - "step": 110 - }, - { - "epoch": 0.6809815950920245, - "grad_norm": 2.631803512573242, - "learning_rate": 4.944025194969586e-06, - "loss": 0.609, - "step": 111 - }, - { - "epoch": 0.6871165644171779, - "grad_norm": 2.7443883419036865, - "learning_rate": 4.9430067473609825e-06, - "loss": 0.8713, - "step": 112 - }, - { - "epoch": 0.6932515337423313, - "grad_norm": 2.543769121170044, - "learning_rate": 4.941979224701499e-06, - "loss": 0.8035, - "step": 113 - }, - { - "epoch": 0.6993865030674846, - "grad_norm": 3.7799901962280273, - "learning_rate": 4.94094263080808e-06, - "loss": 0.9341, - "step": 114 - }, - { - "epoch": 0.7055214723926381, - "grad_norm": 3.1234734058380127, - "learning_rate": 4.939896969531367e-06, - "loss": 1.1066, - "step": 115 - }, - { - "epoch": 0.7116564417177914, - "grad_norm": 2.356036424636841, - "learning_rate": 4.938842244755683e-06, - "loss": 0.853, - "step": 116 - }, - { - "epoch": 0.7177914110429447, - "grad_norm": 3.6231274604797363, - "learning_rate": 4.937778460399022e-06, - "loss": 0.9116, - "step": 117 - }, - { - "epoch": 0.7239263803680982, - "grad_norm": 3.1277005672454834, - "learning_rate": 4.936705620413028e-06, - "loss": 0.5888, - "step": 118 - }, - { - "epoch": 0.7300613496932515, - "grad_norm": 2.7338361740112305, - "learning_rate": 4.935623728782986e-06, - "loss": 0.592, - "step": 119 - }, - { - "epoch": 0.7361963190184049, - "grad_norm": 2.748363733291626, - "learning_rate": 4.934532789527805e-06, - "loss": 0.8713, - "step": 120 - }, - { - "epoch": 0.7423312883435583, - "grad_norm": 4.460031986236572, - "learning_rate": 4.933432806700004e-06, - "loss": 0.6791, - "step": 121 - }, - { - "epoch": 0.7484662576687117, - "grad_norm": 2.392911911010742, - "learning_rate": 4.932323784385693e-06, - "loss": 0.7531, - "step": 122 - }, - { - "epoch": 0.754601226993865, - "grad_norm": 2.7804384231567383, - "learning_rate": 4.931205726704566e-06, - "loss": 0.7547, - "step": 123 - }, - { - "epoch": 0.7607361963190185, - "grad_norm": 2.7664780616760254, - "learning_rate": 4.930078637809878e-06, - "loss": 0.7849, - "step": 124 - }, - { - "epoch": 0.7668711656441718, - "grad_norm": 2.592808723449707, - "learning_rate": 4.928942521888431e-06, - "loss": 0.7015, - "step": 125 - }, - { - "epoch": 0.7730061349693251, - "grad_norm": 2.7080585956573486, - "learning_rate": 4.927797383160561e-06, - "loss": 1.0028, - "step": 126 - }, - { - "epoch": 0.7791411042944786, - "grad_norm": 2.7941503524780273, - "learning_rate": 4.926643225880123e-06, - "loss": 0.602, - "step": 127 - }, - { - "epoch": 0.7852760736196319, - "grad_norm": 3.2796623706817627, - "learning_rate": 4.925480054334471e-06, - "loss": 0.7473, - "step": 128 - }, - { - "epoch": 0.7914110429447853, - "grad_norm": 2.7623610496520996, - "learning_rate": 4.924307872844444e-06, - "loss": 1.0573, - "step": 129 - }, - { - "epoch": 0.7975460122699386, - "grad_norm": 2.6224453449249268, - "learning_rate": 4.923126685764351e-06, - "loss": 0.7399, - "step": 130 - }, - { - "epoch": 0.803680981595092, - "grad_norm": 17.736326217651367, - "learning_rate": 4.921936497481956e-06, - "loss": 0.9548, - "step": 131 - }, - { - "epoch": 0.8098159509202454, - "grad_norm": 2.504213333129883, - "learning_rate": 4.920737312418456e-06, - "loss": 0.6748, - "step": 132 - }, - { - "epoch": 0.8159509202453987, - "grad_norm": 3.617077350616455, - "learning_rate": 4.919529135028473e-06, - "loss": 0.8431, - "step": 133 - }, - { - "epoch": 0.8220858895705522, - "grad_norm": 2.6559832096099854, - "learning_rate": 4.918311969800027e-06, - "loss": 0.7243, - "step": 134 - }, - { - "epoch": 0.8282208588957055, - "grad_norm": 2.7539305686950684, - "learning_rate": 4.917085821254532e-06, - "loss": 0.7845, - "step": 135 - }, - { - "epoch": 0.8343558282208589, - "grad_norm": 3.3587615489959717, - "learning_rate": 4.915850693946766e-06, - "loss": 0.4891, - "step": 136 - }, - { - "epoch": 0.8404907975460123, - "grad_norm": 3.064354181289673, - "learning_rate": 4.914606592464865e-06, - "loss": 0.7917, - "step": 137 - }, - { - "epoch": 0.8466257668711656, - "grad_norm": 3.2505199909210205, - "learning_rate": 4.9133535214303e-06, - "loss": 0.9681, - "step": 138 - }, - { - "epoch": 0.852760736196319, - "grad_norm": 3.8027830123901367, - "learning_rate": 4.91209148549786e-06, - "loss": 0.9275, - "step": 139 - }, - { - "epoch": 0.8588957055214724, - "grad_norm": 2.4154372215270996, - "learning_rate": 4.910820489355637e-06, - "loss": 0.7259, - "step": 140 - }, - { - "epoch": 0.8650306748466258, - "grad_norm": 2.892462968826294, - "learning_rate": 4.909540537725007e-06, - "loss": 0.6061, - "step": 141 - }, - { - "epoch": 0.8711656441717791, - "grad_norm": 3.3398196697235107, - "learning_rate": 4.908251635360616e-06, - "loss": 1.0559, - "step": 142 - }, - { - "epoch": 0.8773006134969326, - "grad_norm": 3.022512197494507, - "learning_rate": 4.906953787050354e-06, - "loss": 0.7372, - "step": 143 - }, - { - "epoch": 0.8834355828220859, - "grad_norm": 2.658661365509033, - "learning_rate": 4.905646997615347e-06, - "loss": 0.6234, - "step": 144 - }, - { - "epoch": 0.8895705521472392, - "grad_norm": 3.454400062561035, - "learning_rate": 4.904331271909932e-06, - "loss": 0.8066, - "step": 145 - }, - { - "epoch": 0.8957055214723927, - "grad_norm": 3.1300277709960938, - "learning_rate": 4.903006614821645e-06, - "loss": 0.6861, - "step": 146 - }, - { - "epoch": 0.901840490797546, - "grad_norm": 2.362537145614624, - "learning_rate": 4.901673031271194e-06, - "loss": 0.6112, - "step": 147 - }, - { - "epoch": 0.9079754601226994, - "grad_norm": 3.375577688217163, - "learning_rate": 4.900330526212451e-06, - "loss": 0.6314, - "step": 148 - }, - { - "epoch": 0.9141104294478528, - "grad_norm": 2.955656051635742, - "learning_rate": 4.898979104632427e-06, - "loss": 0.889, - "step": 149 - }, - { - "epoch": 0.9202453987730062, - "grad_norm": 2.9285926818847656, - "learning_rate": 4.897618771551255e-06, - "loss": 0.6406, - "step": 150 - }, - { - "epoch": 0.9263803680981595, - "grad_norm": 2.131819725036621, - "learning_rate": 4.8962495320221714e-06, - "loss": 0.6368, - "step": 151 - }, - { - "epoch": 0.9325153374233128, - "grad_norm": 2.780649185180664, - "learning_rate": 4.8948713911315e-06, - "loss": 0.8642, - "step": 152 - }, - { - "epoch": 0.9386503067484663, - "grad_norm": 2.941500186920166, - "learning_rate": 4.8934843539986266e-06, - "loss": 0.714, - "step": 153 - }, - { - "epoch": 0.9447852760736196, - "grad_norm": 2.7729203701019287, - "learning_rate": 4.892088425775986e-06, - "loss": 0.8365, - "step": 154 - }, - { - "epoch": 0.950920245398773, - "grad_norm": 2.6887171268463135, - "learning_rate": 4.890683611649041e-06, - "loss": 0.7937, - "step": 155 - }, - { - "epoch": 0.9570552147239264, - "grad_norm": 3.7638463973999023, - "learning_rate": 4.8892699168362626e-06, - "loss": 0.7485, - "step": 156 - }, - { - "epoch": 0.9631901840490797, - "grad_norm": 2.8132755756378174, - "learning_rate": 4.887847346589111e-06, - "loss": 0.6467, - "step": 157 - }, - { - "epoch": 0.9693251533742331, - "grad_norm": 2.652247190475464, - "learning_rate": 4.886415906192015e-06, - "loss": 0.4651, - "step": 158 - }, - { - "epoch": 0.9754601226993865, - "grad_norm": 2.5854647159576416, - "learning_rate": 4.884975600962355e-06, - "loss": 0.8756, - "step": 159 - }, - { - "epoch": 0.9815950920245399, - "grad_norm": 3.1630544662475586, - "learning_rate": 4.883526436250441e-06, - "loss": 0.7339, - "step": 160 - }, - { - "epoch": 0.9877300613496932, - "grad_norm": 2.84452748298645, - "learning_rate": 4.8820684174394935e-06, - "loss": 0.7808, - "step": 161 - }, - { - "epoch": 0.9938650306748467, - "grad_norm": 3.604048490524292, - "learning_rate": 4.880601549945622e-06, - "loss": 0.96, - "step": 162 - }, - { - "epoch": 1.0, - "grad_norm": 2.302924871444702, - "learning_rate": 4.879125839217808e-06, - "loss": 0.8122, - "step": 163 - }, - { - "epoch": 1.0061349693251533, - "grad_norm": 3.1254405975341797, - "learning_rate": 4.8776412907378845e-06, - "loss": 0.7307, - "step": 164 - }, - { - "epoch": 1.0122699386503067, - "grad_norm": 2.745603322982788, - "learning_rate": 4.8761479100205085e-06, - "loss": 0.7554, - "step": 165 - }, - { - "epoch": 1.01840490797546, - "grad_norm": 2.494840145111084, - "learning_rate": 4.874645702613152e-06, - "loss": 0.4372, - "step": 166 - }, - { - "epoch": 1.0245398773006136, - "grad_norm": 2.3526735305786133, - "learning_rate": 4.873134674096072e-06, - "loss": 0.3597, - "step": 167 - }, - { - "epoch": 1.030674846625767, - "grad_norm": 2.945887804031372, - "learning_rate": 4.871614830082297e-06, - "loss": 0.5854, - "step": 168 - }, - { - "epoch": 1.0368098159509203, - "grad_norm": 3.5723934173583984, - "learning_rate": 4.870086176217597e-06, - "loss": 0.7978, - "step": 169 - }, - { - "epoch": 1.0429447852760736, - "grad_norm": 3.2997145652770996, - "learning_rate": 4.868548718180473e-06, - "loss": 0.5593, - "step": 170 - }, - { - "epoch": 1.049079754601227, - "grad_norm": 3.4120635986328125, - "learning_rate": 4.867002461682129e-06, - "loss": 0.4083, - "step": 171 - }, - { - "epoch": 1.0552147239263803, - "grad_norm": 2.697617292404175, - "learning_rate": 4.8654474124664505e-06, - "loss": 0.4752, - "step": 172 - }, - { - "epoch": 1.0613496932515338, - "grad_norm": 5.082247734069824, - "learning_rate": 4.863883576309991e-06, - "loss": 0.7435, - "step": 173 - }, - { - "epoch": 1.0674846625766872, - "grad_norm": 2.773864984512329, - "learning_rate": 4.8623109590219395e-06, - "loss": 0.4612, - "step": 174 - }, - { - "epoch": 1.0736196319018405, - "grad_norm": 3.429703712463379, - "learning_rate": 4.860729566444106e-06, - "loss": 0.4644, - "step": 175 - }, - { - "epoch": 1.0797546012269938, - "grad_norm": 2.997938394546509, - "learning_rate": 4.8591394044508985e-06, - "loss": 0.4852, - "step": 176 - }, - { - "epoch": 1.0858895705521472, - "grad_norm": 2.549513339996338, - "learning_rate": 4.857540478949302e-06, - "loss": 0.4574, - "step": 177 - }, - { - "epoch": 1.0920245398773005, - "grad_norm": 3.459400177001953, - "learning_rate": 4.855932795878852e-06, - "loss": 0.8095, - "step": 178 - }, - { - "epoch": 1.098159509202454, - "grad_norm": 2.8103644847869873, - "learning_rate": 4.854316361211619e-06, - "loss": 0.4578, - "step": 179 - }, - { - "epoch": 1.1042944785276074, - "grad_norm": 2.631221055984497, - "learning_rate": 4.852691180952183e-06, - "loss": 0.5473, - "step": 180 - }, - { - "epoch": 1.1104294478527608, - "grad_norm": 3.189946174621582, - "learning_rate": 4.851057261137608e-06, - "loss": 0.4313, - "step": 181 - }, - { - "epoch": 1.116564417177914, - "grad_norm": 2.891418933868408, - "learning_rate": 4.8494146078374274e-06, - "loss": 0.4197, - "step": 182 - }, - { - "epoch": 1.1226993865030674, - "grad_norm": 3.239637613296509, - "learning_rate": 4.847763227153612e-06, - "loss": 0.5865, - "step": 183 - }, - { - "epoch": 1.1288343558282208, - "grad_norm": 2.484644651412964, - "learning_rate": 4.846103125220557e-06, - "loss": 0.3866, - "step": 184 - }, - { - "epoch": 1.1349693251533743, - "grad_norm": 3.1045992374420166, - "learning_rate": 4.844434308205052e-06, - "loss": 0.5357, - "step": 185 - }, - { - "epoch": 1.1411042944785277, - "grad_norm": 2.648472309112549, - "learning_rate": 4.842756782306261e-06, - "loss": 0.4783, - "step": 186 - }, - { - "epoch": 1.147239263803681, - "grad_norm": 2.5685644149780273, - "learning_rate": 4.841070553755697e-06, - "loss": 0.3733, - "step": 187 - }, - { - "epoch": 1.1533742331288344, - "grad_norm": 3.7727200984954834, - "learning_rate": 4.839375628817205e-06, - "loss": 0.6039, - "step": 188 - }, - { - "epoch": 1.1595092024539877, - "grad_norm": 2.8237369060516357, - "learning_rate": 4.837672013786931e-06, - "loss": 0.5372, - "step": 189 - }, - { - "epoch": 1.165644171779141, - "grad_norm": 3.0312252044677734, - "learning_rate": 4.835959714993305e-06, - "loss": 0.5162, - "step": 190 - }, - { - "epoch": 1.1717791411042944, - "grad_norm": 2.821498394012451, - "learning_rate": 4.8342387387970105e-06, - "loss": 0.4537, - "step": 191 - }, - { - "epoch": 1.177914110429448, - "grad_norm": 2.7834129333496094, - "learning_rate": 4.832509091590968e-06, - "loss": 0.6165, - "step": 192 - }, - { - "epoch": 1.1840490797546013, - "grad_norm": 2.9274091720581055, - "learning_rate": 4.830770779800309e-06, - "loss": 0.7475, - "step": 193 - }, - { - "epoch": 1.1901840490797546, - "grad_norm": 2.813945770263672, - "learning_rate": 4.829023809882349e-06, - "loss": 0.4629, - "step": 194 - }, - { - "epoch": 1.196319018404908, - "grad_norm": 2.27876877784729, - "learning_rate": 4.827268188326567e-06, - "loss": 0.5208, - "step": 195 - }, - { - "epoch": 1.2024539877300613, - "grad_norm": 2.8444204330444336, - "learning_rate": 4.825503921654582e-06, - "loss": 0.6521, - "step": 196 - }, - { - "epoch": 1.2085889570552146, - "grad_norm": 3.3730578422546387, - "learning_rate": 4.823731016420122e-06, - "loss": 0.7491, - "step": 197 - }, - { - "epoch": 1.2147239263803682, - "grad_norm": 2.9717822074890137, - "learning_rate": 4.821949479209011e-06, - "loss": 0.3866, - "step": 198 - }, - { - "epoch": 1.2208588957055215, - "grad_norm": 2.6570653915405273, - "learning_rate": 4.820159316639133e-06, - "loss": 0.499, - "step": 199 - }, - { - "epoch": 1.2269938650306749, - "grad_norm": 2.819960117340088, - "learning_rate": 4.818360535360418e-06, - "loss": 0.556, - "step": 200 - }, - { - "epoch": 1.2331288343558282, - "grad_norm": 2.7912111282348633, - "learning_rate": 4.816553142054806e-06, - "loss": 0.3433, - "step": 201 - }, - { - "epoch": 1.2392638036809815, - "grad_norm": 2.6427981853485107, - "learning_rate": 4.814737143436232e-06, - "loss": 0.8808, - "step": 202 - }, - { - "epoch": 1.2453987730061349, - "grad_norm": 2.5917580127716064, - "learning_rate": 4.812912546250595e-06, - "loss": 0.5718, - "step": 203 - }, - { - "epoch": 1.2515337423312882, - "grad_norm": 3.770759344100952, - "learning_rate": 4.81107935727574e-06, - "loss": 0.9743, - "step": 204 - }, - { - "epoch": 1.2576687116564418, - "grad_norm": 2.558248996734619, - "learning_rate": 4.809237583321421e-06, - "loss": 0.2821, - "step": 205 - }, - { - "epoch": 1.2638036809815951, - "grad_norm": 2.692087173461914, - "learning_rate": 4.807387231229287e-06, - "loss": 0.7524, - "step": 206 - }, - { - "epoch": 1.2699386503067485, - "grad_norm": 2.661738157272339, - "learning_rate": 4.8055283078728525e-06, - "loss": 0.4304, - "step": 207 - }, - { - "epoch": 1.2760736196319018, - "grad_norm": 2.9232122898101807, - "learning_rate": 4.803660820157468e-06, - "loss": 0.6986, - "step": 208 - }, - { - "epoch": 1.2822085889570551, - "grad_norm": 2.665097951889038, - "learning_rate": 4.801784775020303e-06, - "loss": 0.7112, - "step": 209 - }, - { - "epoch": 1.2883435582822087, - "grad_norm": 2.4504497051239014, - "learning_rate": 4.799900179430312e-06, - "loss": 0.4125, - "step": 210 - }, - { - "epoch": 1.294478527607362, - "grad_norm": 3.076204538345337, - "learning_rate": 4.798007040388212e-06, - "loss": 0.7057, - "step": 211 - }, - { - "epoch": 1.3006134969325154, - "grad_norm": 2.406977653503418, - "learning_rate": 4.7961053649264585e-06, - "loss": 0.708, - "step": 212 - }, - { - "epoch": 1.3067484662576687, - "grad_norm": 2.6545324325561523, - "learning_rate": 4.794195160109215e-06, - "loss": 0.7608, - "step": 213 - }, - { - "epoch": 1.312883435582822, - "grad_norm": 4.3817033767700195, - "learning_rate": 4.7922764330323315e-06, - "loss": 0.4779, - "step": 214 - }, - { - "epoch": 1.3190184049079754, - "grad_norm": 3.534566879272461, - "learning_rate": 4.790349190823313e-06, - "loss": 0.5464, - "step": 215 - }, - { - "epoch": 1.3251533742331287, - "grad_norm": 3.0323140621185303, - "learning_rate": 4.788413440641297e-06, - "loss": 0.6198, - "step": 216 - }, - { - "epoch": 1.331288343558282, - "grad_norm": 2.612746238708496, - "learning_rate": 4.786469189677026e-06, - "loss": 0.6695, - "step": 217 - }, - { - "epoch": 1.3374233128834356, - "grad_norm": 3.0299434661865234, - "learning_rate": 4.784516445152821e-06, - "loss": 0.4902, - "step": 218 - }, - { - "epoch": 1.343558282208589, - "grad_norm": 3.4521942138671875, - "learning_rate": 4.78255521432255e-06, - "loss": 0.7411, - "step": 219 - }, - { - "epoch": 1.3496932515337423, - "grad_norm": 2.6712653636932373, - "learning_rate": 4.780585504471612e-06, - "loss": 0.8767, - "step": 220 - }, - { - "epoch": 1.3558282208588956, - "grad_norm": 2.5099475383758545, - "learning_rate": 4.778607322916896e-06, - "loss": 0.4266, - "step": 221 - }, - { - "epoch": 1.3619631901840492, - "grad_norm": 2.641799211502075, - "learning_rate": 4.776620677006766e-06, - "loss": 0.4982, - "step": 222 - }, - { - "epoch": 1.3680981595092025, - "grad_norm": 3.1119771003723145, - "learning_rate": 4.7746255741210256e-06, - "loss": 0.6012, - "step": 223 - }, - { - "epoch": 1.3742331288343559, - "grad_norm": 3.9957170486450195, - "learning_rate": 4.772622021670897e-06, - "loss": 0.7585, - "step": 224 - }, - { - "epoch": 1.3803680981595092, - "grad_norm": 3.1070823669433594, - "learning_rate": 4.770610027098983e-06, - "loss": 0.5266, - "step": 225 - }, - { - "epoch": 1.3865030674846626, - "grad_norm": 2.7630460262298584, - "learning_rate": 4.7685895978792564e-06, - "loss": 0.6261, - "step": 226 - }, - { - "epoch": 1.392638036809816, - "grad_norm": 2.6509556770324707, - "learning_rate": 4.766560741517014e-06, - "loss": 0.7081, - "step": 227 - }, - { - "epoch": 1.3987730061349692, - "grad_norm": 3.0212976932525635, - "learning_rate": 4.76452346554886e-06, - "loss": 0.5041, - "step": 228 - }, - { - "epoch": 1.4049079754601226, - "grad_norm": 3.0454728603363037, - "learning_rate": 4.762477777542676e-06, - "loss": 0.49, - "step": 229 - }, - { - "epoch": 1.4110429447852761, - "grad_norm": 3.4296791553497314, - "learning_rate": 4.7604236850975905e-06, - "loss": 0.7056, - "step": 230 - }, - { - "epoch": 1.4171779141104295, - "grad_norm": 4.1885600090026855, - "learning_rate": 4.7583611958439514e-06, - "loss": 0.7762, - "step": 231 - }, - { - "epoch": 1.4233128834355828, - "grad_norm": 3.065854072570801, - "learning_rate": 4.7562903174433e-06, - "loss": 0.5347, - "step": 232 - }, - { - "epoch": 1.4294478527607362, - "grad_norm": 2.793851852416992, - "learning_rate": 4.75421105758834e-06, - "loss": 0.503, - "step": 233 - }, - { - "epoch": 1.4355828220858895, - "grad_norm": 3.123730421066284, - "learning_rate": 4.752123424002908e-06, - "loss": 0.5081, - "step": 234 - }, - { - "epoch": 1.441717791411043, - "grad_norm": 3.230161666870117, - "learning_rate": 4.750027424441949e-06, - "loss": 0.7523, - "step": 235 - }, - { - "epoch": 1.4478527607361964, - "grad_norm": 2.4970247745513916, - "learning_rate": 4.747923066691487e-06, - "loss": 0.5575, - "step": 236 - }, - { - "epoch": 1.4539877300613497, - "grad_norm": 2.9880685806274414, - "learning_rate": 4.745810358568588e-06, - "loss": 0.7264, - "step": 237 - }, - { - "epoch": 1.460122699386503, - "grad_norm": 2.555328369140625, - "learning_rate": 4.743689307921342e-06, - "loss": 0.4545, - "step": 238 - }, - { - "epoch": 1.4662576687116564, - "grad_norm": 3.144932746887207, - "learning_rate": 4.741559922628828e-06, - "loss": 0.5429, - "step": 239 - }, - { - "epoch": 1.4723926380368098, - "grad_norm": 3.059807062149048, - "learning_rate": 4.739422210601085e-06, - "loss": 0.5086, - "step": 240 - }, - { - "epoch": 1.478527607361963, - "grad_norm": 3.374303102493286, - "learning_rate": 4.7372761797790836e-06, - "loss": 0.6109, - "step": 241 - }, - { - "epoch": 1.4846625766871164, - "grad_norm": 2.4506947994232178, - "learning_rate": 4.735121838134697e-06, - "loss": 0.4317, - "step": 242 - }, - { - "epoch": 1.49079754601227, - "grad_norm": 2.9039974212646484, - "learning_rate": 4.732959193670672e-06, - "loss": 0.6414, - "step": 243 - }, - { - "epoch": 1.4969325153374233, - "grad_norm": 2.9412453174591064, - "learning_rate": 4.730788254420593e-06, - "loss": 0.5166, - "step": 244 - }, - { - "epoch": 1.5030674846625767, - "grad_norm": 2.500716209411621, - "learning_rate": 4.728609028448862e-06, - "loss": 0.4982, - "step": 245 - }, - { - "epoch": 1.50920245398773, - "grad_norm": 2.4233803749084473, - "learning_rate": 4.726421523850662e-06, - "loss": 0.7552, - "step": 246 - }, - { - "epoch": 1.5153374233128836, - "grad_norm": 2.357003688812256, - "learning_rate": 4.7242257487519275e-06, - "loss": 0.4365, - "step": 247 - }, - { - "epoch": 1.521472392638037, - "grad_norm": 2.6406495571136475, - "learning_rate": 4.722021711309317e-06, - "loss": 0.6002, - "step": 248 - }, - { - "epoch": 1.5276073619631902, - "grad_norm": 2.736884832382202, - "learning_rate": 4.7198094197101826e-06, - "loss": 0.4993, - "step": 249 - }, - { - "epoch": 1.5337423312883436, - "grad_norm": 3.5238845348358154, - "learning_rate": 4.7175888821725335e-06, - "loss": 0.4637, - "step": 250 - }, - { - "epoch": 1.539877300613497, - "grad_norm": 3.3783695697784424, - "learning_rate": 4.715360106945015e-06, - "loss": 0.9711, - "step": 251 - }, - { - "epoch": 1.5460122699386503, - "grad_norm": 2.9685862064361572, - "learning_rate": 4.713123102306869e-06, - "loss": 0.5452, - "step": 252 - }, - { - "epoch": 1.5521472392638036, - "grad_norm": 3.143733263015747, - "learning_rate": 4.710877876567912e-06, - "loss": 0.5034, - "step": 253 - }, - { - "epoch": 1.558282208588957, - "grad_norm": 2.8005623817443848, - "learning_rate": 4.708624438068494e-06, - "loss": 0.4236, - "step": 254 - }, - { - "epoch": 1.5644171779141103, - "grad_norm": 2.66581130027771, - "learning_rate": 4.706362795179476e-06, - "loss": 0.6095, - "step": 255 - }, - { - "epoch": 1.5705521472392638, - "grad_norm": 4.598043441772461, - "learning_rate": 4.7040929563021975e-06, - "loss": 0.738, - "step": 256 - }, - { - "epoch": 1.5766871165644172, - "grad_norm": 3.5643506050109863, - "learning_rate": 4.70181492986844e-06, - "loss": 0.6726, - "step": 257 - }, - { - "epoch": 1.5828220858895705, - "grad_norm": 2.865339994430542, - "learning_rate": 4.699528724340401e-06, - "loss": 0.4862, - "step": 258 - }, - { - "epoch": 1.588957055214724, - "grad_norm": 2.95529842376709, - "learning_rate": 4.6972343482106615e-06, - "loss": 0.5003, - "step": 259 - }, - { - "epoch": 1.5950920245398774, - "grad_norm": 2.45206356048584, - "learning_rate": 4.6949318100021546e-06, - "loss": 0.6734, - "step": 260 - }, - { - "epoch": 1.6012269938650308, - "grad_norm": 2.6789939403533936, - "learning_rate": 4.6926211182681295e-06, - "loss": 0.5639, - "step": 261 - }, - { - "epoch": 1.607361963190184, - "grad_norm": 3.307732582092285, - "learning_rate": 4.690302281592128e-06, - "loss": 0.7032, - "step": 262 - }, - { - "epoch": 1.6134969325153374, - "grad_norm": 2.8950445652008057, - "learning_rate": 4.687975308587944e-06, - "loss": 0.4937, - "step": 263 - }, - { - "epoch": 1.6196319018404908, - "grad_norm": 2.969377040863037, - "learning_rate": 4.685640207899598e-06, - "loss": 0.5829, - "step": 264 - }, - { - "epoch": 1.6257668711656441, - "grad_norm": 3.106433391571045, - "learning_rate": 4.683296988201301e-06, - "loss": 0.3805, - "step": 265 - }, - { - "epoch": 1.6319018404907975, - "grad_norm": 3.5599050521850586, - "learning_rate": 4.680945658197425e-06, - "loss": 0.7939, - "step": 266 - }, - { - "epoch": 1.6380368098159508, - "grad_norm": 5.008603096008301, - "learning_rate": 4.6785862266224695e-06, - "loss": 0.7511, - "step": 267 - }, - { - "epoch": 1.6441717791411041, - "grad_norm": 3.1393773555755615, - "learning_rate": 4.676218702241026e-06, - "loss": 0.8984, - "step": 268 - }, - { - "epoch": 1.6503067484662577, - "grad_norm": 3.0241408348083496, - "learning_rate": 4.673843093847753e-06, - "loss": 0.5473, - "step": 269 - }, - { - "epoch": 1.656441717791411, - "grad_norm": 2.9029417037963867, - "learning_rate": 4.6714594102673355e-06, - "loss": 0.6626, - "step": 270 - }, - { - "epoch": 1.6625766871165644, - "grad_norm": 3.4709246158599854, - "learning_rate": 4.669067660354456e-06, - "loss": 0.5015, - "step": 271 - }, - { - "epoch": 1.668711656441718, - "grad_norm": 2.988635778427124, - "learning_rate": 4.666667852993761e-06, - "loss": 0.5384, - "step": 272 - }, - { - "epoch": 1.6748466257668713, - "grad_norm": 3.418140411376953, - "learning_rate": 4.664259997099829e-06, - "loss": 0.7491, - "step": 273 - }, - { - "epoch": 1.6809815950920246, - "grad_norm": 2.592416763305664, - "learning_rate": 4.661844101617135e-06, - "loss": 0.6451, - "step": 274 - }, - { - "epoch": 1.687116564417178, - "grad_norm": 3.1174306869506836, - "learning_rate": 4.6594201755200205e-06, - "loss": 0.6299, - "step": 275 - }, - { - "epoch": 1.6932515337423313, - "grad_norm": 2.6569998264312744, - "learning_rate": 4.656988227812658e-06, - "loss": 0.4477, - "step": 276 - }, - { - "epoch": 1.6993865030674846, - "grad_norm": 3.5733959674835205, - "learning_rate": 4.654548267529015e-06, - "loss": 0.5473, - "step": 277 - }, - { - "epoch": 1.705521472392638, - "grad_norm": 2.7240824699401855, - "learning_rate": 4.652100303732827e-06, - "loss": 0.496, - "step": 278 - }, - { - "epoch": 1.7116564417177913, - "grad_norm": 4.1965460777282715, - "learning_rate": 4.64964434551756e-06, - "loss": 0.932, - "step": 279 - }, - { - "epoch": 1.7177914110429446, - "grad_norm": 2.3237173557281494, - "learning_rate": 4.647180402006372e-06, - "loss": 0.4648, - "step": 280 - }, - { - "epoch": 1.7239263803680982, - "grad_norm": 3.395045042037964, - "learning_rate": 4.644708482352093e-06, - "loss": 0.7237, - "step": 281 - }, - { - "epoch": 1.7300613496932515, - "grad_norm": 3.238593816757202, - "learning_rate": 4.6422285957371735e-06, - "loss": 0.5531, - "step": 282 - }, - { - "epoch": 1.7361963190184049, - "grad_norm": 3.9651403427124023, - "learning_rate": 4.639740751373663e-06, - "loss": 0.6706, - "step": 283 - }, - { - "epoch": 1.7423312883435584, - "grad_norm": 3.0042061805725098, - "learning_rate": 4.63724495850317e-06, - "loss": 0.56, - "step": 284 - }, - { - "epoch": 1.7484662576687118, - "grad_norm": 3.094310760498047, - "learning_rate": 4.634741226396832e-06, - "loss": 0.6138, - "step": 285 - }, - { - "epoch": 1.7546012269938651, - "grad_norm": 2.838168144226074, - "learning_rate": 4.632229564355275e-06, - "loss": 0.4908, - "step": 286 - }, - { - "epoch": 1.7607361963190185, - "grad_norm": 3.3452796936035156, - "learning_rate": 4.629709981708586e-06, - "loss": 0.8181, - "step": 287 - }, - { - "epoch": 1.7668711656441718, - "grad_norm": 2.6630783081054688, - "learning_rate": 4.6271824878162704e-06, - "loss": 0.5625, - "step": 288 - }, - { - "epoch": 1.7730061349693251, - "grad_norm": 2.583650588989258, - "learning_rate": 4.624647092067226e-06, - "loss": 0.3416, - "step": 289 - }, - { - "epoch": 1.7791411042944785, - "grad_norm": 2.73132586479187, - "learning_rate": 4.622103803879702e-06, - "loss": 0.3889, - "step": 290 - }, - { - "epoch": 1.7852760736196318, - "grad_norm": 4.1010260581970215, - "learning_rate": 4.619552632701263e-06, - "loss": 0.611, - "step": 291 - }, - { - "epoch": 1.7914110429447851, - "grad_norm": 4.53068208694458, - "learning_rate": 4.61699358800876e-06, - "loss": 0.7219, - "step": 292 - }, - { - "epoch": 1.7975460122699385, - "grad_norm": 3.4877254962921143, - "learning_rate": 4.614426679308291e-06, - "loss": 0.6402, - "step": 293 - }, - { - "epoch": 1.803680981595092, - "grad_norm": 2.9445226192474365, - "learning_rate": 4.611851916135166e-06, - "loss": 0.509, - "step": 294 - }, - { - "epoch": 1.8098159509202454, - "grad_norm": 2.6622228622436523, - "learning_rate": 4.609269308053872e-06, - "loss": 0.6167, - "step": 295 - }, - { - "epoch": 1.8159509202453987, - "grad_norm": 3.131530523300171, - "learning_rate": 4.606678864658039e-06, - "loss": 0.8039, - "step": 296 - }, - { - "epoch": 1.8220858895705523, - "grad_norm": 3.212188482284546, - "learning_rate": 4.604080595570399e-06, - "loss": 0.5754, - "step": 297 - }, - { - "epoch": 1.8282208588957056, - "grad_norm": 3.522850275039673, - "learning_rate": 4.601474510442759e-06, - "loss": 0.4432, - "step": 298 - }, - { - "epoch": 1.834355828220859, - "grad_norm": 2.5877151489257812, - "learning_rate": 4.598860618955957e-06, - "loss": 0.6541, - "step": 299 - }, - { - "epoch": 1.8404907975460123, - "grad_norm": 2.803833484649658, - "learning_rate": 4.596238930819832e-06, - "loss": 0.5824, - "step": 300 - }, - { - "epoch": 1.8466257668711656, - "grad_norm": 2.7125494480133057, - "learning_rate": 4.5936094557731815e-06, - "loss": 0.6976, - "step": 301 - }, - { - "epoch": 1.852760736196319, - "grad_norm": 3.6549370288848877, - "learning_rate": 4.590972203583732e-06, - "loss": 0.7105, - "step": 302 - }, - { - "epoch": 1.8588957055214723, - "grad_norm": 3.3241944313049316, - "learning_rate": 4.588327184048099e-06, - "loss": 0.7446, - "step": 303 - }, - { - "epoch": 1.8650306748466257, - "grad_norm": 2.8388822078704834, - "learning_rate": 4.585674406991752e-06, - "loss": 0.4926, - "step": 304 - }, - { - "epoch": 1.871165644171779, - "grad_norm": 2.9760420322418213, - "learning_rate": 4.5830138822689755e-06, - "loss": 0.7368, - "step": 305 - }, - { - "epoch": 1.8773006134969326, - "grad_norm": 2.5437633991241455, - "learning_rate": 4.5803456197628374e-06, - "loss": 0.4678, - "step": 306 - }, - { - "epoch": 1.883435582822086, - "grad_norm": 3.0044775009155273, - "learning_rate": 4.577669629385145e-06, - "loss": 0.4241, - "step": 307 - }, - { - "epoch": 1.8895705521472392, - "grad_norm": 2.6150901317596436, - "learning_rate": 4.574985921076418e-06, - "loss": 0.5327, - "step": 308 - }, - { - "epoch": 1.8957055214723928, - "grad_norm": 2.4425182342529297, - "learning_rate": 4.572294504805841e-06, - "loss": 0.7504, - "step": 309 - }, - { - "epoch": 1.9018404907975461, - "grad_norm": 2.9920194149017334, - "learning_rate": 4.569595390571232e-06, - "loss": 0.5194, - "step": 310 - }, - { - "epoch": 1.9079754601226995, - "grad_norm": 2.701087713241577, - "learning_rate": 4.566888588399007e-06, - "loss": 0.6862, - "step": 311 - }, - { - "epoch": 1.9141104294478528, - "grad_norm": 7.628893852233887, - "learning_rate": 4.564174108344139e-06, - "loss": 0.6867, - "step": 312 - }, - { - "epoch": 1.9202453987730062, - "grad_norm": 2.712947130203247, - "learning_rate": 4.561451960490123e-06, - "loss": 0.6942, - "step": 313 - }, - { - "epoch": 1.9263803680981595, - "grad_norm": 3.0063202381134033, - "learning_rate": 4.558722154948937e-06, - "loss": 0.6346, - "step": 314 - }, - { - "epoch": 1.9325153374233128, - "grad_norm": 2.957218647003174, - "learning_rate": 4.5559847018610034e-06, - "loss": 0.464, - "step": 315 - }, - { - "epoch": 1.9386503067484662, - "grad_norm": 3.322282552719116, - "learning_rate": 4.553239611395156e-06, - "loss": 0.6334, - "step": 316 - }, - { - "epoch": 1.9447852760736195, - "grad_norm": 3.0638647079467773, - "learning_rate": 4.550486893748596e-06, - "loss": 0.4227, - "step": 317 - }, - { - "epoch": 1.9509202453987728, - "grad_norm": 3.079087257385254, - "learning_rate": 4.547726559146862e-06, - "loss": 0.3719, - "step": 318 - }, - { - "epoch": 1.9570552147239264, - "grad_norm": 2.409914255142212, - "learning_rate": 4.544958617843782e-06, - "loss": 0.3331, - "step": 319 - }, - { - "epoch": 1.9631901840490797, - "grad_norm": 3.3441262245178223, - "learning_rate": 4.542183080121444e-06, - "loss": 0.6931, - "step": 320 - }, - { - "epoch": 1.969325153374233, - "grad_norm": 2.6624436378479004, - "learning_rate": 4.539399956290152e-06, - "loss": 0.6578, - "step": 321 - }, - { - "epoch": 1.9754601226993866, - "grad_norm": 3.463789224624634, - "learning_rate": 4.536609256688396e-06, - "loss": 0.5748, - "step": 322 - }, - { - "epoch": 1.98159509202454, - "grad_norm": 3.6827807426452637, - "learning_rate": 4.533810991682799e-06, - "loss": 0.5249, - "step": 323 - }, - { - "epoch": 1.9877300613496933, - "grad_norm": 4.125547409057617, - "learning_rate": 4.531005171668093e-06, - "loss": 0.3065, - "step": 324 - }, - { - "epoch": 1.9938650306748467, - "grad_norm": 2.935978412628174, - "learning_rate": 4.528191807067074e-06, - "loss": 0.5523, - "step": 325 - }, - { - "epoch": 2.0, - "grad_norm": 2.654388427734375, - "learning_rate": 4.525370908330564e-06, - "loss": 0.4157, - "step": 326 - }, - { - "epoch": 2.0061349693251533, - "grad_norm": 3.213925838470459, - "learning_rate": 4.522542485937369e-06, - "loss": 0.4243, - "step": 327 - }, - { - "epoch": 2.0122699386503067, - "grad_norm": 3.5483286380767822, - "learning_rate": 4.519706550394248e-06, - "loss": 0.4137, - "step": 328 - }, - { - "epoch": 2.01840490797546, - "grad_norm": 3.32084059715271, - "learning_rate": 4.516863112235864e-06, - "loss": 0.5389, - "step": 329 - }, - { - "epoch": 2.0245398773006134, - "grad_norm": 3.427666425704956, - "learning_rate": 4.514012182024756e-06, - "loss": 0.285, - "step": 330 - }, - { - "epoch": 2.0306748466257667, - "grad_norm": 3.3269975185394287, - "learning_rate": 4.511153770351288e-06, - "loss": 0.4877, - "step": 331 - }, - { - "epoch": 2.03680981595092, - "grad_norm": 5.258850574493408, - "learning_rate": 4.508287887833619e-06, - "loss": 0.5168, - "step": 332 - }, - { - "epoch": 2.042944785276074, - "grad_norm": 4.316092491149902, - "learning_rate": 4.505414545117658e-06, - "loss": 0.4791, - "step": 333 - }, - { - "epoch": 2.049079754601227, - "grad_norm": 3.952056884765625, - "learning_rate": 4.502533752877028e-06, - "loss": 0.3014, - "step": 334 - }, - { - "epoch": 2.0552147239263805, - "grad_norm": 4.0617194175720215, - "learning_rate": 4.499645521813024e-06, - "loss": 0.4313, - "step": 335 - }, - { - "epoch": 2.061349693251534, - "grad_norm": 3.7869274616241455, - "learning_rate": 4.496749862654574e-06, - "loss": 0.4807, - "step": 336 - }, - { - "epoch": 2.067484662576687, - "grad_norm": 3.8181991577148438, - "learning_rate": 4.4938467861582e-06, - "loss": 0.4002, - "step": 337 - }, - { - "epoch": 2.0736196319018405, - "grad_norm": 3.8289854526519775, - "learning_rate": 4.490936303107975e-06, - "loss": 0.618, - "step": 338 - }, - { - "epoch": 2.079754601226994, - "grad_norm": 3.121443271636963, - "learning_rate": 4.488018424315488e-06, - "loss": 0.4203, - "step": 339 - }, - { - "epoch": 2.085889570552147, - "grad_norm": 3.141782283782959, - "learning_rate": 4.4850931606198e-06, - "loss": 0.3618, - "step": 340 - }, - { - "epoch": 2.0920245398773005, - "grad_norm": 3.1279287338256836, - "learning_rate": 4.482160522887404e-06, - "loss": 0.4571, - "step": 341 - }, - { - "epoch": 2.098159509202454, - "grad_norm": 3.2418482303619385, - "learning_rate": 4.479220522012185e-06, - "loss": 0.2674, - "step": 342 - }, - { - "epoch": 2.104294478527607, - "grad_norm": 10.230683326721191, - "learning_rate": 4.476273168915382e-06, - "loss": 0.5479, - "step": 343 - }, - { - "epoch": 2.1104294478527605, - "grad_norm": 3.588361978530884, - "learning_rate": 4.473318474545544e-06, - "loss": 0.3654, - "step": 344 - }, - { - "epoch": 2.116564417177914, - "grad_norm": 3.0913164615631104, - "learning_rate": 4.470356449878489e-06, - "loss": 0.2704, - "step": 345 - }, - { - "epoch": 2.1226993865030677, - "grad_norm": 3.972447633743286, - "learning_rate": 4.467387105917269e-06, - "loss": 0.3029, - "step": 346 - }, - { - "epoch": 2.128834355828221, - "grad_norm": 3.7174713611602783, - "learning_rate": 4.464410453692122e-06, - "loss": 0.6536, - "step": 347 - }, - { - "epoch": 2.1349693251533743, - "grad_norm": 3.9333994388580322, - "learning_rate": 4.461426504260434e-06, - "loss": 0.3806, - "step": 348 - }, - { - "epoch": 2.1411042944785277, - "grad_norm": 4.752816200256348, - "learning_rate": 4.458435268706699e-06, - "loss": 0.4019, - "step": 349 - }, - { - "epoch": 2.147239263803681, - "grad_norm": 2.505603790283203, - "learning_rate": 4.455436758142477e-06, - "loss": 0.2348, - "step": 350 - }, - { - "epoch": 2.1533742331288344, - "grad_norm": 3.3050570487976074, - "learning_rate": 4.452430983706351e-06, - "loss": 0.505, - "step": 351 - }, - { - "epoch": 2.1595092024539877, - "grad_norm": 5.387442588806152, - "learning_rate": 4.44941795656389e-06, - "loss": 0.399, - "step": 352 - }, - { - "epoch": 2.165644171779141, - "grad_norm": 3.4759480953216553, - "learning_rate": 4.446397687907601e-06, - "loss": 0.5664, - "step": 353 - }, - { - "epoch": 2.1717791411042944, - "grad_norm": 2.949445962905884, - "learning_rate": 4.4433701889568935e-06, - "loss": 0.2128, - "step": 354 - }, - { - "epoch": 2.1779141104294477, - "grad_norm": 3.2884252071380615, - "learning_rate": 4.440335470958035e-06, - "loss": 0.3138, - "step": 355 - }, - { - "epoch": 2.184049079754601, - "grad_norm": 3.1605632305145264, - "learning_rate": 4.437293545184111e-06, - "loss": 0.349, - "step": 356 - }, - { - "epoch": 2.190184049079755, - "grad_norm": 2.9996821880340576, - "learning_rate": 4.434244422934976e-06, - "loss": 0.343, - "step": 357 - }, - { - "epoch": 2.196319018404908, - "grad_norm": 3.6373324394226074, - "learning_rate": 4.431188115537226e-06, - "loss": 0.5656, - "step": 358 - }, - { - "epoch": 2.2024539877300615, - "grad_norm": 4.667621612548828, - "learning_rate": 4.428124634344141e-06, - "loss": 0.2335, - "step": 359 - }, - { - "epoch": 2.208588957055215, - "grad_norm": 3.815484046936035, - "learning_rate": 4.425053990735653e-06, - "loss": 0.2173, - "step": 360 - }, - { - "epoch": 2.214723926380368, - "grad_norm": 4.689478874206543, - "learning_rate": 4.421976196118297e-06, - "loss": 0.5071, - "step": 361 - }, - { - "epoch": 2.2208588957055215, - "grad_norm": 4.016942024230957, - "learning_rate": 4.4188912619251765e-06, - "loss": 0.384, - "step": 362 - }, - { - "epoch": 2.226993865030675, - "grad_norm": 3.5336828231811523, - "learning_rate": 4.415799199615912e-06, - "loss": 0.3133, - "step": 363 - }, - { - "epoch": 2.233128834355828, - "grad_norm": 2.9195592403411865, - "learning_rate": 4.4127000206766055e-06, - "loss": 0.3847, - "step": 364 - }, - { - "epoch": 2.2392638036809815, - "grad_norm": 2.6843531131744385, - "learning_rate": 4.409593736619795e-06, - "loss": 0.3539, - "step": 365 - }, - { - "epoch": 2.245398773006135, - "grad_norm": 2.8692703247070312, - "learning_rate": 4.40648035898441e-06, - "loss": 0.3664, - "step": 366 - }, - { - "epoch": 2.2515337423312882, - "grad_norm": 2.820422649383545, - "learning_rate": 4.403359899335732e-06, - "loss": 0.4606, - "step": 367 - }, - { - "epoch": 2.2576687116564416, - "grad_norm": 3.8641669750213623, - "learning_rate": 4.400232369265351e-06, - "loss": 0.2931, - "step": 368 - }, - { - "epoch": 2.263803680981595, - "grad_norm": 2.75347638130188, - "learning_rate": 4.39709778039112e-06, - "loss": 0.3393, - "step": 369 - }, - { - "epoch": 2.2699386503067487, - "grad_norm": 15.150428771972656, - "learning_rate": 4.393956144357113e-06, - "loss": 0.65, - "step": 370 - }, - { - "epoch": 2.276073619631902, - "grad_norm": 2.4876065254211426, - "learning_rate": 4.390807472833585e-06, - "loss": 0.372, - "step": 371 - }, - { - "epoch": 2.2822085889570554, - "grad_norm": 2.7328054904937744, - "learning_rate": 4.3876517775169216e-06, - "loss": 0.2802, - "step": 372 - }, - { - "epoch": 2.2883435582822087, - "grad_norm": 2.903221368789673, - "learning_rate": 4.384489070129604e-06, - "loss": 0.1964, - "step": 373 - }, - { - "epoch": 2.294478527607362, - "grad_norm": 3.9368724822998047, - "learning_rate": 4.381319362420158e-06, - "loss": 0.4272, - "step": 374 - }, - { - "epoch": 2.3006134969325154, - "grad_norm": 5.431981086730957, - "learning_rate": 4.378142666163114e-06, - "loss": 0.4513, - "step": 375 - }, - { - "epoch": 2.3067484662576687, - "grad_norm": 3.661733627319336, - "learning_rate": 4.374958993158965e-06, - "loss": 0.6087, - "step": 376 - }, - { - "epoch": 2.312883435582822, - "grad_norm": 3.004450559616089, - "learning_rate": 4.371768355234116e-06, - "loss": 0.2206, - "step": 377 - }, - { - "epoch": 2.3190184049079754, - "grad_norm": 4.3785576820373535, - "learning_rate": 4.368570764240852e-06, - "loss": 0.6055, - "step": 378 - }, - { - "epoch": 2.3251533742331287, - "grad_norm": 3.4699394702911377, - "learning_rate": 4.365366232057279e-06, - "loss": 0.6286, - "step": 379 - }, - { - "epoch": 2.331288343558282, - "grad_norm": 2.6862998008728027, - "learning_rate": 4.3621547705872915e-06, - "loss": 0.2622, - "step": 380 - }, - { - "epoch": 2.3374233128834354, - "grad_norm": 3.056382179260254, - "learning_rate": 4.358936391760524e-06, - "loss": 0.3439, - "step": 381 - }, - { - "epoch": 2.3435582822085887, - "grad_norm": 2.6211307048797607, - "learning_rate": 4.355711107532305e-06, - "loss": 0.3677, - "step": 382 - }, - { - "epoch": 2.3496932515337425, - "grad_norm": 2.682060956954956, - "learning_rate": 4.3524789298836175e-06, - "loss": 0.3068, - "step": 383 - }, - { - "epoch": 2.355828220858896, - "grad_norm": 3.482539415359497, - "learning_rate": 4.349239870821049e-06, - "loss": 0.3737, - "step": 384 - }, - { - "epoch": 2.361963190184049, - "grad_norm": 2.8645472526550293, - "learning_rate": 4.345993942376752e-06, - "loss": 0.2837, - "step": 385 - }, - { - "epoch": 2.3680981595092025, - "grad_norm": 3.6142354011535645, - "learning_rate": 4.342741156608392e-06, - "loss": 0.6162, - "step": 386 - }, - { - "epoch": 2.374233128834356, - "grad_norm": 3.0748162269592285, - "learning_rate": 4.3394815255991135e-06, - "loss": 0.2986, - "step": 387 - }, - { - "epoch": 2.3803680981595092, - "grad_norm": 5.090906620025635, - "learning_rate": 4.336215061457485e-06, - "loss": 0.5383, - "step": 388 - }, - { - "epoch": 2.3865030674846626, - "grad_norm": 3.9235823154449463, - "learning_rate": 4.332941776317458e-06, - "loss": 0.4179, - "step": 389 - }, - { - "epoch": 2.392638036809816, - "grad_norm": 3.482926368713379, - "learning_rate": 4.329661682338325e-06, - "loss": 0.3938, - "step": 390 - }, - { - "epoch": 2.3987730061349692, - "grad_norm": 4.274583339691162, - "learning_rate": 4.32637479170467e-06, - "loss": 0.3349, - "step": 391 - }, - { - "epoch": 2.4049079754601226, - "grad_norm": 3.326012372970581, - "learning_rate": 4.323081116626322e-06, - "loss": 0.3336, - "step": 392 - }, - { - "epoch": 2.411042944785276, - "grad_norm": 3.174591541290283, - "learning_rate": 4.319780669338316e-06, - "loss": 0.2983, - "step": 393 - }, - { - "epoch": 2.4171779141104293, - "grad_norm": 3.9073634147644043, - "learning_rate": 4.31647346210084e-06, - "loss": 0.8401, - "step": 394 - }, - { - "epoch": 2.4233128834355826, - "grad_norm": 3.4787721633911133, - "learning_rate": 4.313159507199197e-06, - "loss": 0.2583, - "step": 395 - }, - { - "epoch": 2.4294478527607364, - "grad_norm": 3.19903564453125, - "learning_rate": 4.309838816943755e-06, - "loss": 0.2861, - "step": 396 - }, - { - "epoch": 2.4355828220858897, - "grad_norm": 3.184246778488159, - "learning_rate": 4.306511403669897e-06, - "loss": 0.2956, - "step": 397 - }, - { - "epoch": 2.441717791411043, - "grad_norm": 3.8991878032684326, - "learning_rate": 4.303177279737988e-06, - "loss": 0.5378, - "step": 398 - }, - { - "epoch": 2.4478527607361964, - "grad_norm": 3.411949872970581, - "learning_rate": 4.299836457533313e-06, - "loss": 0.3423, - "step": 399 - }, - { - "epoch": 2.4539877300613497, - "grad_norm": 3.445502996444702, - "learning_rate": 4.296488949466046e-06, - "loss": 0.5608, - "step": 400 - }, - { - "epoch": 2.460122699386503, - "grad_norm": 3.066798210144043, - "learning_rate": 4.293134767971193e-06, - "loss": 0.3214, - "step": 401 - }, - { - "epoch": 2.4662576687116564, - "grad_norm": 3.0581583976745605, - "learning_rate": 4.28977392550855e-06, - "loss": 0.5117, - "step": 402 - }, - { - "epoch": 2.4723926380368098, - "grad_norm": 4.207413673400879, - "learning_rate": 4.286406434562659e-06, - "loss": 0.2666, - "step": 403 - }, - { - "epoch": 2.478527607361963, - "grad_norm": 2.9934990406036377, - "learning_rate": 4.283032307642756e-06, - "loss": 0.2878, - "step": 404 - }, - { - "epoch": 2.4846625766871164, - "grad_norm": 3.800593614578247, - "learning_rate": 4.2796515572827305e-06, - "loss": 0.2619, - "step": 405 - }, - { - "epoch": 2.4907975460122698, - "grad_norm": 3.2029523849487305, - "learning_rate": 4.276264196041074e-06, - "loss": 0.1735, - "step": 406 - }, - { - "epoch": 2.4969325153374236, - "grad_norm": 3.515634059906006, - "learning_rate": 4.2728702365008356e-06, - "loss": 0.4741, - "step": 407 - }, - { - "epoch": 2.5030674846625764, - "grad_norm": 3.8354873657226562, - "learning_rate": 4.269469691269577e-06, - "loss": 0.3713, - "step": 408 - }, - { - "epoch": 2.5092024539877302, - "grad_norm": 3.902904510498047, - "learning_rate": 4.266062572979323e-06, - "loss": 0.5189, - "step": 409 - }, - { - "epoch": 2.5153374233128836, - "grad_norm": 3.3276097774505615, - "learning_rate": 4.262648894286515e-06, - "loss": 0.2461, - "step": 410 - }, - { - "epoch": 2.521472392638037, - "grad_norm": 2.9457011222839355, - "learning_rate": 4.259228667871963e-06, - "loss": 0.3013, - "step": 411 - }, - { - "epoch": 2.5276073619631902, - "grad_norm": 2.8941617012023926, - "learning_rate": 4.255801906440803e-06, - "loss": 0.2784, - "step": 412 - }, - { - "epoch": 2.5337423312883436, - "grad_norm": 2.949399471282959, - "learning_rate": 4.252368622722443e-06, - "loss": 0.457, - "step": 413 - }, - { - "epoch": 2.539877300613497, - "grad_norm": 3.342108726501465, - "learning_rate": 4.248928829470522e-06, - "loss": 0.487, - "step": 414 - }, - { - "epoch": 2.5460122699386503, - "grad_norm": 3.9556386470794678, - "learning_rate": 4.245482539462861e-06, - "loss": 0.6118, - "step": 415 - }, - { - "epoch": 2.5521472392638036, - "grad_norm": 3.6936280727386475, - "learning_rate": 4.242029765501411e-06, - "loss": 0.6131, - "step": 416 - }, - { - "epoch": 2.558282208588957, - "grad_norm": 2.79897403717041, - "learning_rate": 4.2385705204122104e-06, - "loss": 0.4209, - "step": 417 - }, - { - "epoch": 2.5644171779141103, - "grad_norm": 4.093318462371826, - "learning_rate": 4.235104817045338e-06, - "loss": 0.5375, - "step": 418 - }, - { - "epoch": 2.5705521472392636, - "grad_norm": 3.138263463973999, - "learning_rate": 4.231632668274861e-06, - "loss": 0.4682, - "step": 419 - }, - { - "epoch": 2.5766871165644174, - "grad_norm": 3.1465651988983154, - "learning_rate": 4.22815408699879e-06, - "loss": 0.2522, - "step": 420 - }, - { - "epoch": 2.5828220858895703, - "grad_norm": 3.5166101455688477, - "learning_rate": 4.22466908613903e-06, - "loss": 0.4776, - "step": 421 - }, - { - "epoch": 2.588957055214724, - "grad_norm": 2.8498189449310303, - "learning_rate": 4.221177678641333e-06, - "loss": 0.3067, - "step": 422 - }, - { - "epoch": 2.5950920245398774, - "grad_norm": 2.8046035766601562, - "learning_rate": 4.217679877475251e-06, - "loss": 0.2402, - "step": 423 - }, - { - "epoch": 2.6012269938650308, - "grad_norm": 4.204788684844971, - "learning_rate": 4.214175695634084e-06, - "loss": 0.2608, - "step": 424 - }, - { - "epoch": 2.607361963190184, - "grad_norm": 2.5569400787353516, - "learning_rate": 4.210665146134838e-06, - "loss": 0.2801, - "step": 425 - }, - { - "epoch": 2.6134969325153374, - "grad_norm": 3.5359091758728027, - "learning_rate": 4.20714824201817e-06, - "loss": 0.2027, - "step": 426 - }, - { - "epoch": 2.6196319018404908, - "grad_norm": 3.5132668018341064, - "learning_rate": 4.203624996348343e-06, - "loss": 0.4253, - "step": 427 - }, - { - "epoch": 2.625766871165644, - "grad_norm": 3.5076472759246826, - "learning_rate": 4.200095422213177e-06, - "loss": 0.3014, - "step": 428 - }, - { - "epoch": 2.6319018404907975, - "grad_norm": 3.6501238346099854, - "learning_rate": 4.196559532724004e-06, - "loss": 0.6526, - "step": 429 - }, - { - "epoch": 2.638036809815951, - "grad_norm": 2.849924325942993, - "learning_rate": 4.193017341015608e-06, - "loss": 0.4487, - "step": 430 - }, - { - "epoch": 2.644171779141104, - "grad_norm": 3.2228448390960693, - "learning_rate": 4.189468860246192e-06, - "loss": 0.5386, - "step": 431 - }, - { - "epoch": 2.6503067484662575, - "grad_norm": 2.532102108001709, - "learning_rate": 4.185914103597316e-06, - "loss": 0.3034, - "step": 432 - }, - { - "epoch": 2.6564417177914113, - "grad_norm": 2.862720251083374, - "learning_rate": 4.182353084273855e-06, - "loss": 0.5862, - "step": 433 - }, - { - "epoch": 2.662576687116564, - "grad_norm": 3.4617464542388916, - "learning_rate": 4.178785815503946e-06, - "loss": 0.3954, - "step": 434 - }, - { - "epoch": 2.668711656441718, - "grad_norm": 2.627758741378784, - "learning_rate": 4.1752123105389444e-06, - "loss": 0.4367, - "step": 435 - }, - { - "epoch": 2.6748466257668713, - "grad_norm": 3.2868380546569824, - "learning_rate": 4.171632582653368e-06, - "loss": 0.2997, - "step": 436 - }, - { - "epoch": 2.6809815950920246, - "grad_norm": 3.4260897636413574, - "learning_rate": 4.168046645144851e-06, - "loss": 0.3354, - "step": 437 - }, - { - "epoch": 2.687116564417178, - "grad_norm": 3.1415748596191406, - "learning_rate": 4.164454511334098e-06, - "loss": 0.5538, - "step": 438 - }, - { - "epoch": 2.6932515337423313, - "grad_norm": 3.3700919151306152, - "learning_rate": 4.160856194564828e-06, - "loss": 0.5731, - "step": 439 - }, - { - "epoch": 2.6993865030674846, - "grad_norm": 3.146968364715576, - "learning_rate": 4.157251708203728e-06, - "loss": 0.4429, - "step": 440 - }, - { - "epoch": 2.705521472392638, - "grad_norm": 3.7495830059051514, - "learning_rate": 4.153641065640402e-06, - "loss": 0.6361, - "step": 441 - }, - { - "epoch": 2.7116564417177913, - "grad_norm": 3.426499128341675, - "learning_rate": 4.150024280287327e-06, - "loss": 0.2418, - "step": 442 - }, - { - "epoch": 2.7177914110429446, - "grad_norm": 3.213719606399536, - "learning_rate": 4.146401365579795e-06, - "loss": 0.2549, - "step": 443 - }, - { - "epoch": 2.7239263803680984, - "grad_norm": 3.457742929458618, - "learning_rate": 4.142772334975868e-06, - "loss": 0.3822, - "step": 444 - }, - { - "epoch": 2.7300613496932513, - "grad_norm": 3.130410671234131, - "learning_rate": 4.139137201956324e-06, - "loss": 0.3107, - "step": 445 - }, - { - "epoch": 2.736196319018405, - "grad_norm": 2.7337112426757812, - "learning_rate": 4.1354959800246155e-06, - "loss": 0.2829, - "step": 446 - }, - { - "epoch": 2.7423312883435584, - "grad_norm": 3.427006483078003, - "learning_rate": 4.131848682706807e-06, - "loss": 0.3045, - "step": 447 - }, - { - "epoch": 2.7484662576687118, - "grad_norm": 3.3742318153381348, - "learning_rate": 4.128195323551536e-06, - "loss": 0.316, - "step": 448 - }, - { - "epoch": 2.754601226993865, - "grad_norm": 3.086738109588623, - "learning_rate": 4.1245359161299555e-06, - "loss": 0.5278, - "step": 449 - }, - { - "epoch": 2.7607361963190185, - "grad_norm": 3.4609954357147217, - "learning_rate": 4.120870474035687e-06, - "loss": 0.447, - "step": 450 - }, - { - "epoch": 2.766871165644172, - "grad_norm": 3.552663803100586, - "learning_rate": 4.1171990108847705e-06, - "loss": 0.6127, - "step": 451 - }, - { - "epoch": 2.773006134969325, - "grad_norm": 4.413427352905273, - "learning_rate": 4.113521540315609e-06, - "loss": 0.3304, - "step": 452 - }, - { - "epoch": 2.7791411042944785, - "grad_norm": 3.3408143520355225, - "learning_rate": 4.109838075988922e-06, - "loss": 0.5871, - "step": 453 - }, - { - "epoch": 2.785276073619632, - "grad_norm": 3.0659773349761963, - "learning_rate": 4.106148631587697e-06, - "loss": 0.3578, - "step": 454 - }, - { - "epoch": 2.791411042944785, - "grad_norm": 3.2854816913604736, - "learning_rate": 4.102453220817134e-06, - "loss": 0.4685, - "step": 455 - }, - { - "epoch": 2.7975460122699385, - "grad_norm": 3.4940855503082275, - "learning_rate": 4.098751857404595e-06, - "loss": 0.2818, - "step": 456 - }, - { - "epoch": 2.8036809815950923, - "grad_norm": 2.4630730152130127, - "learning_rate": 4.0950445550995566e-06, - "loss": 0.3497, - "step": 457 - }, - { - "epoch": 2.809815950920245, - "grad_norm": 3.3870959281921387, - "learning_rate": 4.091331327673554e-06, - "loss": 0.4954, - "step": 458 - }, - { - "epoch": 2.815950920245399, - "grad_norm": 2.3676836490631104, - "learning_rate": 4.087612188920135e-06, - "loss": 0.3884, - "step": 459 - }, - { - "epoch": 2.8220858895705523, - "grad_norm": 3.2477807998657227, - "learning_rate": 4.083887152654804e-06, - "loss": 0.375, - "step": 460 - }, - { - "epoch": 2.8282208588957056, - "grad_norm": 3.295673131942749, - "learning_rate": 4.080156232714976e-06, - "loss": 0.3272, - "step": 461 - }, - { - "epoch": 2.834355828220859, - "grad_norm": 2.800847291946411, - "learning_rate": 4.07641944295992e-06, - "loss": 0.2936, - "step": 462 - }, - { - "epoch": 2.8404907975460123, - "grad_norm": 3.443336009979248, - "learning_rate": 4.072676797270708e-06, - "loss": 0.2363, - "step": 463 - }, - { - "epoch": 2.8466257668711656, - "grad_norm": 3.1334242820739746, - "learning_rate": 4.0689283095501684e-06, - "loss": 0.4827, - "step": 464 - }, - { - "epoch": 2.852760736196319, - "grad_norm": 3.950672149658203, - "learning_rate": 4.06517399372283e-06, - "loss": 0.3163, - "step": 465 - }, - { - "epoch": 2.8588957055214723, - "grad_norm": 4.243579387664795, - "learning_rate": 4.061413863734869e-06, - "loss": 0.2827, - "step": 466 - }, - { - "epoch": 2.8650306748466257, - "grad_norm": 4.076017379760742, - "learning_rate": 4.057647933554063e-06, - "loss": 0.3466, - "step": 467 - }, - { - "epoch": 2.871165644171779, - "grad_norm": 2.846989631652832, - "learning_rate": 4.053876217169734e-06, - "loss": 0.4632, - "step": 468 - }, - { - "epoch": 2.8773006134969323, - "grad_norm": 2.74981689453125, - "learning_rate": 4.050098728592698e-06, - "loss": 0.2001, - "step": 469 - }, - { - "epoch": 2.883435582822086, - "grad_norm": 3.062068462371826, - "learning_rate": 4.046315481855211e-06, - "loss": 0.5425, - "step": 470 - }, - { - "epoch": 2.889570552147239, - "grad_norm": 2.8630964756011963, - "learning_rate": 4.0425264910109245e-06, - "loss": 0.424, - "step": 471 - }, - { - "epoch": 2.895705521472393, - "grad_norm": 3.537442922592163, - "learning_rate": 4.03873177013482e-06, - "loss": 0.2443, - "step": 472 - }, - { - "epoch": 2.901840490797546, - "grad_norm": 3.128535270690918, - "learning_rate": 4.034931333323173e-06, - "loss": 0.3734, - "step": 473 - }, - { - "epoch": 2.9079754601226995, - "grad_norm": 3.021897792816162, - "learning_rate": 4.031125194693484e-06, - "loss": 0.3762, - "step": 474 - }, - { - "epoch": 2.914110429447853, - "grad_norm": 3.0943546295166016, - "learning_rate": 4.0273133683844375e-06, - "loss": 0.3721, - "step": 475 - }, - { - "epoch": 2.920245398773006, - "grad_norm": 3.443448305130005, - "learning_rate": 4.023495868555848e-06, - "loss": 0.2868, - "step": 476 - }, - { - "epoch": 2.9263803680981595, - "grad_norm": 2.865227222442627, - "learning_rate": 4.0196727093886024e-06, - "loss": 0.5086, - "step": 477 - }, - { - "epoch": 2.932515337423313, - "grad_norm": 3.1272058486938477, - "learning_rate": 4.015843905084612e-06, - "loss": 0.4616, - "step": 478 - }, - { - "epoch": 2.938650306748466, - "grad_norm": 3.0584447383880615, - "learning_rate": 4.012009469866756e-06, - "loss": 0.403, - "step": 479 - }, - { - "epoch": 2.9447852760736195, - "grad_norm": 4.42616081237793, - "learning_rate": 4.008169417978836e-06, - "loss": 0.5801, - "step": 480 - }, - { - "epoch": 2.950920245398773, - "grad_norm": 2.8444535732269287, - "learning_rate": 4.004323763685511e-06, - "loss": 0.5808, - "step": 481 - }, - { - "epoch": 2.957055214723926, - "grad_norm": 2.591719627380371, - "learning_rate": 4.0004725212722565e-06, - "loss": 0.2584, - "step": 482 - }, - { - "epoch": 2.96319018404908, - "grad_norm": 2.5496113300323486, - "learning_rate": 3.996615705045302e-06, - "loss": 0.462, - "step": 483 - }, - { - "epoch": 2.969325153374233, - "grad_norm": 2.9932925701141357, - "learning_rate": 3.992753329331588e-06, - "loss": 0.3502, - "step": 484 - }, - { - "epoch": 2.9754601226993866, - "grad_norm": 3.136871337890625, - "learning_rate": 3.9888854084786995e-06, - "loss": 0.5989, - "step": 485 - }, - { - "epoch": 2.98159509202454, - "grad_norm": 3.6654274463653564, - "learning_rate": 3.985011956854826e-06, - "loss": 0.6772, - "step": 486 - }, - { - "epoch": 2.9877300613496933, - "grad_norm": 2.5398948192596436, - "learning_rate": 3.9811329888487004e-06, - "loss": 0.4192, - "step": 487 - }, - { - "epoch": 2.9938650306748467, - "grad_norm": 4.89943790435791, - "learning_rate": 3.977248518869545e-06, - "loss": 0.4031, - "step": 488 - }, - { - "epoch": 3.0, - "grad_norm": 3.4729995727539062, - "learning_rate": 3.973358561347024e-06, - "loss": 0.7764, - "step": 489 - }, - { - "epoch": 3.0061349693251533, - "grad_norm": 5.331607818603516, - "learning_rate": 3.969463130731183e-06, - "loss": 0.3267, - "step": 490 - }, - { - "epoch": 3.0122699386503067, - "grad_norm": 3.453650712966919, - "learning_rate": 3.965562241492401e-06, - "loss": 0.2719, - "step": 491 - }, - { - "epoch": 3.01840490797546, - "grad_norm": 3.232313632965088, - "learning_rate": 3.9616559081213335e-06, - "loss": 0.1825, - "step": 492 - }, - { - "epoch": 3.0245398773006134, - "grad_norm": 3.4860260486602783, - "learning_rate": 3.957744145128858e-06, - "loss": 0.1854, - "step": 493 - }, - { - "epoch": 3.0306748466257667, - "grad_norm": 3.4357805252075195, - "learning_rate": 3.953826967046021e-06, - "loss": 0.2224, - "step": 494 - }, - { - "epoch": 3.03680981595092, - "grad_norm": 4.557503700256348, - "learning_rate": 3.9499043884239894e-06, - "loss": 0.349, - "step": 495 - }, - { - "epoch": 3.042944785276074, - "grad_norm": 4.685214042663574, - "learning_rate": 3.945976423833987e-06, - "loss": 0.175, - "step": 496 - }, - { - "epoch": 3.049079754601227, - "grad_norm": 3.7430171966552734, - "learning_rate": 3.942043087867244e-06, - "loss": 0.2773, - "step": 497 - }, - { - "epoch": 3.0552147239263805, - "grad_norm": 3.756450653076172, - "learning_rate": 3.938104395134947e-06, - "loss": 0.4445, - "step": 498 - }, - { - "epoch": 3.061349693251534, - "grad_norm": 4.049175262451172, - "learning_rate": 3.9341603602681805e-06, - "loss": 0.3046, - "step": 499 - }, - { - "epoch": 3.067484662576687, - "grad_norm": 3.7689461708068848, - "learning_rate": 3.930210997917871e-06, - "loss": 0.2544, - "step": 500 - }, - { - "epoch": 3.0736196319018405, - "grad_norm": 4.027602195739746, - "learning_rate": 3.92625632275474e-06, - "loss": 0.3154, - "step": 501 - }, - { - "epoch": 3.079754601226994, - "grad_norm": 2.8449292182922363, - "learning_rate": 3.922296349469239e-06, - "loss": 0.2804, - "step": 502 - }, - { - "epoch": 3.085889570552147, - "grad_norm": 2.9555234909057617, - "learning_rate": 3.918331092771505e-06, - "loss": 0.2393, - "step": 503 - }, - { - "epoch": 3.0920245398773005, - "grad_norm": 2.621042013168335, - "learning_rate": 3.914360567391296e-06, - "loss": 0.1403, - "step": 504 - }, - { - "epoch": 3.098159509202454, - "grad_norm": 3.2348620891571045, - "learning_rate": 3.910384788077949e-06, - "loss": 0.1537, - "step": 505 - }, - { - "epoch": 3.104294478527607, - "grad_norm": 3.030179977416992, - "learning_rate": 3.906403769600311e-06, - "loss": 0.2921, - "step": 506 - }, - { - "epoch": 3.1104294478527605, - "grad_norm": 3.146428346633911, - "learning_rate": 3.902417526746694e-06, - "loss": 0.2036, - "step": 507 - }, - { - "epoch": 3.116564417177914, - "grad_norm": 3.6201512813568115, - "learning_rate": 3.898426074324818e-06, - "loss": 0.2655, - "step": 508 - }, - { - "epoch": 3.1226993865030677, - "grad_norm": 3.7674012184143066, - "learning_rate": 3.8944294271617524e-06, - "loss": 0.3938, - "step": 509 - }, - { - "epoch": 3.128834355828221, - "grad_norm": 4.54722785949707, - "learning_rate": 3.890427600103865e-06, - "loss": 0.3051, - "step": 510 - }, - { - "epoch": 3.1349693251533743, - "grad_norm": 4.228236675262451, - "learning_rate": 3.886420608016767e-06, - "loss": 0.3719, - "step": 511 - }, - { - "epoch": 3.1411042944785277, - "grad_norm": 4.355110168457031, - "learning_rate": 3.882408465785252e-06, - "loss": 0.1863, - "step": 512 - }, - { - "epoch": 3.147239263803681, - "grad_norm": 3.451460838317871, - "learning_rate": 3.878391188313249e-06, - "loss": 0.1479, - "step": 513 - }, - { - "epoch": 3.1533742331288344, - "grad_norm": 4.395524501800537, - "learning_rate": 3.87436879052376e-06, - "loss": 0.238, - "step": 514 - }, - { - "epoch": 3.1595092024539877, - "grad_norm": 2.940717935562134, - "learning_rate": 3.870341287358809e-06, - "loss": 0.2069, - "step": 515 - }, - { - "epoch": 3.165644171779141, - "grad_norm": 2.5817320346832275, - "learning_rate": 3.8663086937793845e-06, - "loss": 0.1189, - "step": 516 - }, - { - "epoch": 3.1717791411042944, - "grad_norm": 3.9863343238830566, - "learning_rate": 3.862271024765385e-06, - "loss": 0.3434, - "step": 517 - }, - { - "epoch": 3.1779141104294477, - "grad_norm": 3.609004259109497, - "learning_rate": 3.8582282953155626e-06, - "loss": 0.1602, - "step": 518 - }, - { - "epoch": 3.184049079754601, - "grad_norm": 3.207533121109009, - "learning_rate": 3.854180520447465e-06, - "loss": 0.3452, - "step": 519 - }, - { - "epoch": 3.190184049079755, - "grad_norm": 3.593388795852661, - "learning_rate": 3.850127715197387e-06, - "loss": 0.2832, - "step": 520 - }, - { - "epoch": 3.196319018404908, - "grad_norm": 3.409064531326294, - "learning_rate": 3.846069894620306e-06, - "loss": 0.1481, - "step": 521 - }, - { - "epoch": 3.2024539877300615, - "grad_norm": 3.461498737335205, - "learning_rate": 3.84200707378983e-06, - "loss": 0.1283, - "step": 522 - }, - { - "epoch": 3.208588957055215, - "grad_norm": 3.708467483520508, - "learning_rate": 3.8379392677981434e-06, - "loss": 0.2468, - "step": 523 - }, - { - "epoch": 3.214723926380368, - "grad_norm": 2.802381753921509, - "learning_rate": 3.833866491755947e-06, - "loss": 0.2685, - "step": 524 - }, - { - "epoch": 3.2208588957055215, - "grad_norm": 3.0787744522094727, - "learning_rate": 3.8297887607924044e-06, - "loss": 0.2595, - "step": 525 - }, - { - "epoch": 3.226993865030675, - "grad_norm": 3.3952548503875732, - "learning_rate": 3.825706090055088e-06, - "loss": 0.4099, - "step": 526 - }, - { - "epoch": 3.233128834355828, - "grad_norm": 3.3497085571289062, - "learning_rate": 3.821618494709916e-06, - "loss": 0.287, - "step": 527 - }, - { - "epoch": 3.2392638036809815, - "grad_norm": 4.050611972808838, - "learning_rate": 3.817525989941102e-06, - "loss": 0.2369, - "step": 528 - }, - { - "epoch": 3.245398773006135, - "grad_norm": 2.87642240524292, - "learning_rate": 3.8134285909510972e-06, - "loss": 0.2751, - "step": 529 - }, - { - "epoch": 3.2515337423312882, - "grad_norm": 3.821941614151001, - "learning_rate": 3.8093263129605305e-06, - "loss": 0.2363, - "step": 530 - }, - { - "epoch": 3.2576687116564416, - "grad_norm": 2.8066117763519287, - "learning_rate": 3.80521917120816e-06, - "loss": 0.094, - "step": 531 - }, - { - "epoch": 3.263803680981595, - "grad_norm": 3.849768877029419, - "learning_rate": 3.801107180950806e-06, - "loss": 0.4117, - "step": 532 - }, - { - "epoch": 3.2699386503067487, - "grad_norm": 2.4161250591278076, - "learning_rate": 3.7969903574633028e-06, - "loss": 0.1183, - "step": 533 - }, - { - "epoch": 3.276073619631902, - "grad_norm": 3.6743111610412598, - "learning_rate": 3.792868716038437e-06, - "loss": 0.2296, - "step": 534 - }, - { - "epoch": 3.2822085889570554, - "grad_norm": 4.378123760223389, - "learning_rate": 3.7887422719868937e-06, - "loss": 0.2678, - "step": 535 - }, - { - "epoch": 3.2883435582822087, - "grad_norm": 4.816481590270996, - "learning_rate": 3.784611040637198e-06, - "loss": 0.4887, - "step": 536 - }, - { - "epoch": 3.294478527607362, - "grad_norm": 3.5712430477142334, - "learning_rate": 3.7804750373356576e-06, - "loss": 0.3827, - "step": 537 - }, - { - "epoch": 3.3006134969325154, - "grad_norm": 3.6877355575561523, - "learning_rate": 3.776334277446307e-06, - "loss": 0.3233, - "step": 538 - }, - { - "epoch": 3.3067484662576687, - "grad_norm": 3.442706346511841, - "learning_rate": 3.7721887763508512e-06, - "loss": 0.1256, - "step": 539 - }, - { - "epoch": 3.312883435582822, - "grad_norm": 3.9265615940093994, - "learning_rate": 3.7680385494486053e-06, - "loss": 0.3845, - "step": 540 - }, - { - "epoch": 3.3190184049079754, - "grad_norm": 3.5030126571655273, - "learning_rate": 3.7638836121564414e-06, - "loss": 0.2905, - "step": 541 - }, - { - "epoch": 3.3251533742331287, - "grad_norm": 3.6685378551483154, - "learning_rate": 3.7597239799087283e-06, - "loss": 0.3561, - "step": 542 - }, - { - "epoch": 3.331288343558282, - "grad_norm": 3.8484046459198, - "learning_rate": 3.7555596681572736e-06, - "loss": 0.1157, - "step": 543 - }, - { - "epoch": 3.3374233128834354, - "grad_norm": 3.7977402210235596, - "learning_rate": 3.751390692371272e-06, - "loss": 0.3049, - "step": 544 - }, - { - "epoch": 3.3435582822085887, - "grad_norm": 3.4409852027893066, - "learning_rate": 3.7472170680372398e-06, - "loss": 0.1626, - "step": 545 - }, - { - "epoch": 3.3496932515337425, - "grad_norm": 3.801541328430176, - "learning_rate": 3.7430388106589632e-06, - "loss": 0.2414, - "step": 546 - }, - { - "epoch": 3.355828220858896, - "grad_norm": 4.025203704833984, - "learning_rate": 3.738855935757438e-06, - "loss": 0.3441, - "step": 547 - }, - { - "epoch": 3.361963190184049, - "grad_norm": 4.242798805236816, - "learning_rate": 3.7346684588708135e-06, - "loss": 0.5244, - "step": 548 - }, - { - "epoch": 3.3680981595092025, - "grad_norm": 3.0516819953918457, - "learning_rate": 3.7304763955543332e-06, - "loss": 0.1984, - "step": 549 - }, - { - "epoch": 3.374233128834356, - "grad_norm": 3.894667625427246, - "learning_rate": 3.726279761380279e-06, - "loss": 0.2715, - "step": 550 - }, - { - "epoch": 3.3803680981595092, - "grad_norm": 3.171208143234253, - "learning_rate": 3.72207857193791e-06, - "loss": 0.1537, - "step": 551 - }, - { - "epoch": 3.3865030674846626, - "grad_norm": 4.344860553741455, - "learning_rate": 3.7178728428334092e-06, - "loss": 0.2388, - "step": 552 - }, - { - "epoch": 3.392638036809816, - "grad_norm": 2.766317367553711, - "learning_rate": 3.7136625896898226e-06, - "loss": 0.1726, - "step": 553 - }, - { - "epoch": 3.3987730061349692, - "grad_norm": 3.550662040710449, - "learning_rate": 3.7094478281470003e-06, - "loss": 0.2942, - "step": 554 - }, - { - "epoch": 3.4049079754601226, - "grad_norm": 3.4576945304870605, - "learning_rate": 3.7052285738615412e-06, - "loss": 0.1665, - "step": 555 - }, - { - "epoch": 3.411042944785276, - "grad_norm": 4.026793003082275, - "learning_rate": 3.7010048425067317e-06, - "loss": 0.3954, - "step": 556 - }, - { - "epoch": 3.4171779141104293, - "grad_norm": 4.600133419036865, - "learning_rate": 3.696776649772492e-06, - "loss": 0.3207, - "step": 557 - }, - { - "epoch": 3.4233128834355826, - "grad_norm": 4.747331142425537, - "learning_rate": 3.692544011365312e-06, - "loss": 0.1325, - "step": 558 - }, - { - "epoch": 3.4294478527607364, - "grad_norm": 3.781464099884033, - "learning_rate": 3.6883069430081986e-06, - "loss": 0.1644, - "step": 559 - }, - { - "epoch": 3.4355828220858897, - "grad_norm": 2.905986785888672, - "learning_rate": 3.6840654604406135e-06, - "loss": 0.2469, - "step": 560 - }, - { - "epoch": 3.441717791411043, - "grad_norm": 2.3747711181640625, - "learning_rate": 3.679819579418414e-06, - "loss": 0.1146, - "step": 561 - }, - { - "epoch": 3.4478527607361964, - "grad_norm": 3.2683632373809814, - "learning_rate": 3.6755693157137995e-06, - "loss": 0.3236, - "step": 562 - }, - { - "epoch": 3.4539877300613497, - "grad_norm": 3.7750496864318848, - "learning_rate": 3.6713146851152487e-06, - "loss": 0.399, - "step": 563 - }, - { - "epoch": 3.460122699386503, - "grad_norm": 3.3912384510040283, - "learning_rate": 3.667055703427461e-06, - "loss": 0.1259, - "step": 564 - }, - { - "epoch": 3.4662576687116564, - "grad_norm": 3.0224430561065674, - "learning_rate": 3.6627923864713e-06, - "loss": 0.1835, - "step": 565 - }, - { - "epoch": 3.4723926380368098, - "grad_norm": 3.642258405685425, - "learning_rate": 3.658524750083733e-06, - "loss": 0.2763, - "step": 566 - }, - { - "epoch": 3.478527607361963, - "grad_norm": 3.409890651702881, - "learning_rate": 3.654252810117773e-06, - "loss": 0.2496, - "step": 567 - }, - { - "epoch": 3.4846625766871164, - "grad_norm": 3.0416476726531982, - "learning_rate": 3.6499765824424195e-06, - "loss": 0.1287, - "step": 568 - }, - { - "epoch": 3.4907975460122698, - "grad_norm": 3.1963987350463867, - "learning_rate": 3.6456960829425987e-06, - "loss": 0.1747, - "step": 569 - }, - { - "epoch": 3.4969325153374236, - "grad_norm": 3.198448657989502, - "learning_rate": 3.641411327519107e-06, - "loss": 0.1913, - "step": 570 - }, - { - "epoch": 3.5030674846625764, - "grad_norm": 3.7023441791534424, - "learning_rate": 3.6371223320885492e-06, - "loss": 0.3224, - "step": 571 - }, - { - "epoch": 3.5092024539877302, - "grad_norm": 4.54288387298584, - "learning_rate": 3.6328291125832803e-06, - "loss": 0.2364, - "step": 572 - }, - { - "epoch": 3.5153374233128836, - "grad_norm": 3.5064890384674072, - "learning_rate": 3.628531684951347e-06, - "loss": 0.2552, - "step": 573 - }, - { - "epoch": 3.521472392638037, - "grad_norm": 3.987583875656128, - "learning_rate": 3.6242300651564276e-06, - "loss": 0.3232, - "step": 574 - }, - { - "epoch": 3.5276073619631902, - "grad_norm": 3.179642915725708, - "learning_rate": 3.6199242691777745e-06, - "loss": 0.32, - "step": 575 - }, - { - "epoch": 3.5337423312883436, - "grad_norm": 3.3078157901763916, - "learning_rate": 3.6156143130101516e-06, - "loss": 0.2922, - "step": 576 - }, - { - "epoch": 3.539877300613497, - "grad_norm": 3.1628613471984863, - "learning_rate": 3.6113002126637765e-06, - "loss": 0.2005, - "step": 577 - }, - { - "epoch": 3.5460122699386503, - "grad_norm": 3.4515540599823, - "learning_rate": 3.606981984164263e-06, - "loss": 0.2138, - "step": 578 - }, - { - "epoch": 3.5521472392638036, - "grad_norm": 5.132473945617676, - "learning_rate": 3.6026596435525578e-06, - "loss": 0.4382, - "step": 579 - }, - { - "epoch": 3.558282208588957, - "grad_norm": 3.397614002227783, - "learning_rate": 3.5983332068848855e-06, - "loss": 0.3326, - "step": 580 - }, - { - "epoch": 3.5644171779141103, - "grad_norm": 4.79497766494751, - "learning_rate": 3.5940026902326825e-06, - "loss": 0.4748, - "step": 581 - }, - { - "epoch": 3.5705521472392636, - "grad_norm": 3.7675018310546875, - "learning_rate": 3.5896681096825446e-06, - "loss": 0.2692, - "step": 582 - }, - { - "epoch": 3.5766871165644174, - "grad_norm": 3.0637521743774414, - "learning_rate": 3.5853294813361614e-06, - "loss": 0.3658, - "step": 583 - }, - { - "epoch": 3.5828220858895703, - "grad_norm": 2.8949790000915527, - "learning_rate": 3.5809868213102623e-06, - "loss": 0.1661, - "step": 584 - }, - { - "epoch": 3.588957055214724, - "grad_norm": 3.163419246673584, - "learning_rate": 3.5766401457365485e-06, - "loss": 0.1233, - "step": 585 - }, - { - "epoch": 3.5950920245398774, - "grad_norm": 3.1787965297698975, - "learning_rate": 3.5722894707616417e-06, - "loss": 0.278, - "step": 586 - }, - { - "epoch": 3.6012269938650308, - "grad_norm": 2.9397857189178467, - "learning_rate": 3.5679348125470175e-06, - "loss": 0.1541, - "step": 587 - }, - { - "epoch": 3.607361963190184, - "grad_norm": 3.2690396308898926, - "learning_rate": 3.56357618726895e-06, - "loss": 0.1575, - "step": 588 - }, - { - "epoch": 3.6134969325153374, - "grad_norm": 5.444014072418213, - "learning_rate": 3.5592136111184483e-06, - "loss": 0.8079, - "step": 589 - }, - { - "epoch": 3.6196319018404908, - "grad_norm": 3.1688313484191895, - "learning_rate": 3.554847100301199e-06, - "loss": 0.341, - "step": 590 - }, - { - "epoch": 3.625766871165644, - "grad_norm": 2.469212532043457, - "learning_rate": 3.550476671037505e-06, - "loss": 0.1625, - "step": 591 - }, - { - "epoch": 3.6319018404907975, - "grad_norm": 3.3956527709960938, - "learning_rate": 3.546102339562223e-06, - "loss": 0.199, - "step": 592 - }, - { - "epoch": 3.638036809815951, - "grad_norm": 2.7287702560424805, - "learning_rate": 3.5417241221247078e-06, - "loss": 0.1493, - "step": 593 - }, - { - "epoch": 3.644171779141104, - "grad_norm": 3.5046865940093994, - "learning_rate": 3.5373420349887477e-06, - "loss": 0.2765, - "step": 594 - }, - { - "epoch": 3.6503067484662575, - "grad_norm": 3.121476650238037, - "learning_rate": 3.5329560944325065e-06, - "loss": 0.2833, - "step": 595 - }, - { - "epoch": 3.6564417177914113, - "grad_norm": 3.276463270187378, - "learning_rate": 3.528566316748462e-06, - "loss": 0.1237, - "step": 596 - }, - { - "epoch": 3.662576687116564, - "grad_norm": 3.382840633392334, - "learning_rate": 3.524172718243347e-06, - "loss": 0.1599, - "step": 597 - }, - { - "epoch": 3.668711656441718, - "grad_norm": 4.801311492919922, - "learning_rate": 3.5197753152380854e-06, - "loss": 0.2997, - "step": 598 - }, - { - "epoch": 3.6748466257668713, - "grad_norm": 4.117336273193359, - "learning_rate": 3.515374124067736e-06, - "loss": 0.2021, - "step": 599 - }, - { - "epoch": 3.6809815950920246, - "grad_norm": 3.611438035964966, - "learning_rate": 3.5109691610814263e-06, - "loss": 0.1726, - "step": 600 - }, - { - "epoch": 3.687116564417178, - "grad_norm": 4.5179972648620605, - "learning_rate": 3.5065604426422995e-06, - "loss": 0.1377, - "step": 601 - }, - { - "epoch": 3.6932515337423313, - "grad_norm": 3.561061382293701, - "learning_rate": 3.502147985127445e-06, - "loss": 0.1497, - "step": 602 - }, - { - "epoch": 3.6993865030674846, - "grad_norm": 3.3497917652130127, - "learning_rate": 3.4977318049278443e-06, - "loss": 0.1589, - "step": 603 - }, - { - "epoch": 3.705521472392638, - "grad_norm": 3.2725470066070557, - "learning_rate": 3.4933119184483065e-06, - "loss": 0.1364, - "step": 604 - }, - { - "epoch": 3.7116564417177913, - "grad_norm": 3.228956460952759, - "learning_rate": 3.4888883421074076e-06, - "loss": 0.177, - "step": 605 - }, - { - "epoch": 3.7177914110429446, - "grad_norm": 3.7648911476135254, - "learning_rate": 3.484461092337434e-06, - "loss": 0.122, - "step": 606 - }, - { - "epoch": 3.7239263803680984, - "grad_norm": 3.5322585105895996, - "learning_rate": 3.4800301855843137e-06, - "loss": 0.2664, - "step": 607 - }, - { - "epoch": 3.7300613496932513, - "grad_norm": 2.951073169708252, - "learning_rate": 3.4755956383075613e-06, - "loss": 0.12, - "step": 608 - }, - { - "epoch": 3.736196319018405, - "grad_norm": 3.0577664375305176, - "learning_rate": 3.471157466980214e-06, - "loss": 0.3926, - "step": 609 - }, - { - "epoch": 3.7423312883435584, - "grad_norm": 4.089846134185791, - "learning_rate": 3.466715688088772e-06, - "loss": 0.6233, - "step": 610 - }, - { - "epoch": 3.7484662576687118, - "grad_norm": 3.081340789794922, - "learning_rate": 3.462270318133136e-06, - "loss": 0.2456, - "step": 611 - }, - { - "epoch": 3.754601226993865, - "grad_norm": 3.034712553024292, - "learning_rate": 3.4578213736265474e-06, - "loss": 0.2683, - "step": 612 - }, - { - "epoch": 3.7607361963190185, - "grad_norm": 3.459815740585327, - "learning_rate": 3.4533688710955255e-06, - "loss": 0.3796, - "step": 613 - }, - { - "epoch": 3.766871165644172, - "grad_norm": 3.523737907409668, - "learning_rate": 3.448912827079805e-06, - "loss": 0.3326, - "step": 614 - }, - { - "epoch": 3.773006134969325, - "grad_norm": 3.333219289779663, - "learning_rate": 3.4444532581322793e-06, - "loss": 0.206, - "step": 615 - }, - { - "epoch": 3.7791411042944785, - "grad_norm": 3.582387685775757, - "learning_rate": 3.4399901808189327e-06, - "loss": 0.244, - "step": 616 - }, - { - "epoch": 3.785276073619632, - "grad_norm": 3.4887266159057617, - "learning_rate": 3.435523611718785e-06, - "loss": 0.1796, - "step": 617 - }, - { - "epoch": 3.791411042944785, - "grad_norm": 4.89408016204834, - "learning_rate": 3.4310535674238242e-06, - "loss": 0.188, - "step": 618 - }, - { - "epoch": 3.7975460122699385, - "grad_norm": 4.338910102844238, - "learning_rate": 3.42658006453895e-06, - "loss": 0.3039, - "step": 619 - }, - { - "epoch": 3.8036809815950923, - "grad_norm": 4.107708930969238, - "learning_rate": 3.4221031196819083e-06, - "loss": 0.3383, - "step": 620 - }, - { - "epoch": 3.809815950920245, - "grad_norm": 3.698777675628662, - "learning_rate": 3.4176227494832305e-06, - "loss": 0.1721, - "step": 621 - }, - { - "epoch": 3.815950920245399, - "grad_norm": 2.6659226417541504, - "learning_rate": 3.413138970586174e-06, - "loss": 0.2211, - "step": 622 - }, - { - "epoch": 3.8220858895705523, - "grad_norm": 3.2398436069488525, - "learning_rate": 3.4086517996466574e-06, - "loss": 0.1871, - "step": 623 - }, - { - "epoch": 3.8282208588957056, - "grad_norm": 4.9128804206848145, - "learning_rate": 3.404161253333199e-06, - "loss": 0.3874, - "step": 624 - }, - { - "epoch": 3.834355828220859, - "grad_norm": 3.508789300918579, - "learning_rate": 3.3996673483268573e-06, - "loss": 0.1739, - "step": 625 - }, - { - "epoch": 3.8404907975460123, - "grad_norm": 3.3016927242279053, - "learning_rate": 3.3951701013211665e-06, - "loss": 0.274, - "step": 626 - }, - { - "epoch": 3.8466257668711656, - "grad_norm": 3.8941333293914795, - "learning_rate": 3.3906695290220736e-06, - "loss": 0.3568, - "step": 627 - }, - { - "epoch": 3.852760736196319, - "grad_norm": 3.512354850769043, - "learning_rate": 3.3861656481478816e-06, - "loss": 0.157, - "step": 628 - }, - { - "epoch": 3.8588957055214723, - "grad_norm": 3.482649326324463, - "learning_rate": 3.3816584754291814e-06, - "loss": 0.1218, - "step": 629 - }, - { - "epoch": 3.8650306748466257, - "grad_norm": 3.1490275859832764, - "learning_rate": 3.377148027608793e-06, - "loss": 0.2234, - "step": 630 - }, - { - "epoch": 3.871165644171779, - "grad_norm": 3.2172653675079346, - "learning_rate": 3.3726343214417023e-06, - "loss": 0.3329, - "step": 631 - }, - { - "epoch": 3.8773006134969323, - "grad_norm": 4.167707443237305, - "learning_rate": 3.3681173736949984e-06, - "loss": 0.1384, - "step": 632 - }, - { - "epoch": 3.883435582822086, - "grad_norm": 3.4743919372558594, - "learning_rate": 3.3635972011478134e-06, - "loss": 0.3807, - "step": 633 - }, - { - "epoch": 3.889570552147239, - "grad_norm": 3.6892173290252686, - "learning_rate": 3.3590738205912566e-06, - "loss": 0.194, - "step": 634 - }, - { - "epoch": 3.895705521472393, - "grad_norm": 3.262967824935913, - "learning_rate": 3.354547248828356e-06, - "loss": 0.202, - "step": 635 - }, - { - "epoch": 3.901840490797546, - "grad_norm": 3.8871562480926514, - "learning_rate": 3.3500175026739916e-06, - "loss": 0.2471, - "step": 636 - }, - { - "epoch": 3.9079754601226995, - "grad_norm": 3.5097084045410156, - "learning_rate": 3.3454845989548385e-06, - "loss": 0.1112, - "step": 637 - }, - { - "epoch": 3.914110429447853, - "grad_norm": 4.163944721221924, - "learning_rate": 3.3409485545092995e-06, - "loss": 0.3368, - "step": 638 - }, - { - "epoch": 3.920245398773006, - "grad_norm": 3.6405045986175537, - "learning_rate": 3.336409386187444e-06, - "loss": 0.1863, - "step": 639 - }, - { - "epoch": 3.9263803680981595, - "grad_norm": 3.2477526664733887, - "learning_rate": 3.331867110850946e-06, - "loss": 0.1491, - "step": 640 - }, - { - "epoch": 3.932515337423313, - "grad_norm": 3.933753490447998, - "learning_rate": 3.327321745373021e-06, - "loss": 0.2484, - "step": 641 - }, - { - "epoch": 3.938650306748466, - "grad_norm": 3.2475059032440186, - "learning_rate": 3.322773306638364e-06, - "loss": 0.2126, - "step": 642 - }, - { - "epoch": 3.9447852760736195, - "grad_norm": 2.628467321395874, - "learning_rate": 3.318221811543086e-06, - "loss": 0.1649, - "step": 643 - }, - { - "epoch": 3.950920245398773, - "grad_norm": 3.2612411975860596, - "learning_rate": 3.313667276994651e-06, - "loss": 0.1442, - "step": 644 - }, - { - "epoch": 3.957055214723926, - "grad_norm": 3.8058395385742188, - "learning_rate": 3.309109719911814e-06, - "loss": 0.359, - "step": 645 - }, - { - "epoch": 3.96319018404908, - "grad_norm": 3.3450071811676025, - "learning_rate": 3.304549157224558e-06, - "loss": 0.4042, - "step": 646 - }, - { - "epoch": 3.969325153374233, - "grad_norm": 3.079601287841797, - "learning_rate": 3.299985605874031e-06, - "loss": 0.1699, - "step": 647 - }, - { - "epoch": 3.9754601226993866, - "grad_norm": 3.8963980674743652, - "learning_rate": 3.295419082812483e-06, - "loss": 0.1888, - "step": 648 - }, - { - "epoch": 3.98159509202454, - "grad_norm": 3.307405948638916, - "learning_rate": 3.2908496050032024e-06, - "loss": 0.2824, - "step": 649 - }, - { - "epoch": 3.9877300613496933, - "grad_norm": 3.227478265762329, - "learning_rate": 3.2862771894204544e-06, - "loss": 0.3038, - "step": 650 - }, - { - "epoch": 3.9938650306748467, - "grad_norm": 4.046506881713867, - "learning_rate": 3.2817018530494164e-06, - "loss": 0.3266, - "step": 651 - }, - { - "epoch": 4.0, - "grad_norm": 7.775874614715576, - "learning_rate": 3.277123612886116e-06, - "loss": 0.2998, - "step": 652 - }, - { - "epoch": 4.006134969325154, - "grad_norm": 3.146462917327881, - "learning_rate": 3.272542485937369e-06, - "loss": 0.2764, - "step": 653 - }, - { - "epoch": 4.012269938650307, - "grad_norm": 3.0539863109588623, - "learning_rate": 3.2679584892207118e-06, - "loss": 0.1157, - "step": 654 - }, - { - "epoch": 4.0184049079754605, - "grad_norm": 3.634021520614624, - "learning_rate": 3.263371639764343e-06, - "loss": 0.0707, - "step": 655 - }, - { - "epoch": 4.024539877300613, - "grad_norm": 3.3474650382995605, - "learning_rate": 3.2587819546070596e-06, - "loss": 0.1067, - "step": 656 - }, - { - "epoch": 4.030674846625767, - "grad_norm": 4.409244537353516, - "learning_rate": 3.254189450798189e-06, - "loss": 0.0564, - "step": 657 - }, - { - "epoch": 4.03680981595092, - "grad_norm": 3.0446252822875977, - "learning_rate": 3.2495941453975312e-06, - "loss": 0.0535, - "step": 658 - }, - { - "epoch": 4.042944785276074, - "grad_norm": 4.014753818511963, - "learning_rate": 3.2449960554752935e-06, - "loss": 0.1245, - "step": 659 - }, - { - "epoch": 4.049079754601227, - "grad_norm": 3.188062906265259, - "learning_rate": 3.240395198112026e-06, - "loss": 0.0626, - "step": 660 - }, - { - "epoch": 4.0552147239263805, - "grad_norm": 3.006086826324463, - "learning_rate": 3.2357915903985605e-06, - "loss": 0.1198, - "step": 661 - }, - { - "epoch": 4.061349693251533, - "grad_norm": 2.8865551948547363, - "learning_rate": 3.2311852494359423e-06, - "loss": 0.0454, - "step": 662 - }, - { - "epoch": 4.067484662576687, - "grad_norm": 4.2888007164001465, - "learning_rate": 3.226576192335373e-06, - "loss": 0.2064, - "step": 663 - }, - { - "epoch": 4.07361963190184, - "grad_norm": 3.1414525508880615, - "learning_rate": 3.2219644362181436e-06, - "loss": 0.2183, - "step": 664 - }, - { - "epoch": 4.079754601226994, - "grad_norm": 2.556277275085449, - "learning_rate": 3.21734999821557e-06, - "loss": 0.0516, - "step": 665 - }, - { - "epoch": 4.085889570552148, - "grad_norm": 2.698118209838867, - "learning_rate": 3.2127328954689307e-06, - "loss": 0.0613, - "step": 666 - }, - { - "epoch": 4.0920245398773005, - "grad_norm": 2.869919538497925, - "learning_rate": 3.2081131451294025e-06, - "loss": 0.0583, - "step": 667 - }, - { - "epoch": 4.098159509202454, - "grad_norm": 3.8786919116973877, - "learning_rate": 3.2034907643579988e-06, - "loss": 0.0766, - "step": 668 - }, - { - "epoch": 4.104294478527607, - "grad_norm": 4.224637031555176, - "learning_rate": 3.1988657703255043e-06, - "loss": 0.1099, - "step": 669 - }, - { - "epoch": 4.110429447852761, - "grad_norm": 4.671669006347656, - "learning_rate": 3.194238180212409e-06, - "loss": 0.1663, - "step": 670 - }, - { - "epoch": 4.116564417177914, - "grad_norm": 3.2484257221221924, - "learning_rate": 3.1896080112088477e-06, - "loss": 0.0587, - "step": 671 - }, - { - "epoch": 4.122699386503068, - "grad_norm": 2.4808075428009033, - "learning_rate": 3.184975280514536e-06, - "loss": 0.0579, - "step": 672 - }, - { - "epoch": 4.128834355828221, - "grad_norm": 3.7106919288635254, - "learning_rate": 3.1803400053387044e-06, - "loss": 0.1083, - "step": 673 - }, - { - "epoch": 4.134969325153374, - "grad_norm": 3.008970260620117, - "learning_rate": 3.175702202900036e-06, - "loss": 0.1355, - "step": 674 - }, - { - "epoch": 4.141104294478527, - "grad_norm": 3.2640793323516846, - "learning_rate": 3.1710618904266006e-06, - "loss": 0.092, - "step": 675 - }, - { - "epoch": 4.147239263803681, - "grad_norm": 3.08042049407959, - "learning_rate": 3.166419085155793e-06, - "loss": 0.0563, - "step": 676 - }, - { - "epoch": 4.153374233128835, - "grad_norm": 2.993530511856079, - "learning_rate": 3.1617738043342695e-06, - "loss": 0.1773, - "step": 677 - }, - { - "epoch": 4.159509202453988, - "grad_norm": 2.6218204498291016, - "learning_rate": 3.157126065217879e-06, - "loss": 0.0489, - "step": 678 - }, - { - "epoch": 4.1656441717791415, - "grad_norm": 4.3173723220825195, - "learning_rate": 3.152475885071606e-06, - "loss": 0.1333, - "step": 679 - }, - { - "epoch": 4.171779141104294, - "grad_norm": 3.659149408340454, - "learning_rate": 3.147823281169498e-06, - "loss": 0.1501, - "step": 680 - }, - { - "epoch": 4.177914110429448, - "grad_norm": 3.0953338146209717, - "learning_rate": 3.143168270794612e-06, - "loss": 0.1067, - "step": 681 - }, - { - "epoch": 4.184049079754601, - "grad_norm": 3.5693907737731934, - "learning_rate": 3.1385108712389394e-06, - "loss": 0.2499, - "step": 682 - }, - { - "epoch": 4.190184049079755, - "grad_norm": 3.3022868633270264, - "learning_rate": 3.1338510998033484e-06, - "loss": 0.1748, - "step": 683 - }, - { - "epoch": 4.196319018404908, - "grad_norm": 3.7468113899230957, - "learning_rate": 3.129188973797519e-06, - "loss": 0.201, - "step": 684 - }, - { - "epoch": 4.2024539877300615, - "grad_norm": 2.8381078243255615, - "learning_rate": 3.124524510539875e-06, - "loss": 0.0735, - "step": 685 - }, - { - "epoch": 4.208588957055214, - "grad_norm": 2.84706974029541, - "learning_rate": 3.119857727357527e-06, - "loss": 0.1806, - "step": 686 - }, - { - "epoch": 4.214723926380368, - "grad_norm": 3.8130292892456055, - "learning_rate": 3.1151886415861993e-06, - "loss": 0.1811, - "step": 687 - }, - { - "epoch": 4.220858895705521, - "grad_norm": 3.528895378112793, - "learning_rate": 3.1105172705701708e-06, - "loss": 0.1634, - "step": 688 - }, - { - "epoch": 4.226993865030675, - "grad_norm": 5.028727054595947, - "learning_rate": 3.1058436316622103e-06, - "loss": 0.1625, - "step": 689 - }, - { - "epoch": 4.233128834355828, - "grad_norm": 4.606889247894287, - "learning_rate": 3.1011677422235093e-06, - "loss": 0.1791, - "step": 690 - }, - { - "epoch": 4.2392638036809815, - "grad_norm": 3.3620636463165283, - "learning_rate": 3.0964896196236217e-06, - "loss": 0.2233, - "step": 691 - }, - { - "epoch": 4.245398773006135, - "grad_norm": 3.7845852375030518, - "learning_rate": 3.0918092812403954e-06, - "loss": 0.1142, - "step": 692 - }, - { - "epoch": 4.251533742331288, - "grad_norm": 3.1204118728637695, - "learning_rate": 3.0871267444599098e-06, - "loss": 0.096, - "step": 693 - }, - { - "epoch": 4.257668711656442, - "grad_norm": 3.686067819595337, - "learning_rate": 3.0824420266764093e-06, - "loss": 0.2749, - "step": 694 - }, - { - "epoch": 4.263803680981595, - "grad_norm": 3.1680829524993896, - "learning_rate": 3.077755145292243e-06, - "loss": 0.2504, - "step": 695 - }, - { - "epoch": 4.269938650306749, - "grad_norm": 3.3179469108581543, - "learning_rate": 3.0730661177177957e-06, - "loss": 0.1324, - "step": 696 - }, - { - "epoch": 4.276073619631902, - "grad_norm": 3.1186370849609375, - "learning_rate": 3.0683749613714238e-06, - "loss": 0.0691, - "step": 697 - }, - { - "epoch": 4.282208588957055, - "grad_norm": 3.086834192276001, - "learning_rate": 3.063681693679391e-06, - "loss": 0.1026, - "step": 698 - }, - { - "epoch": 4.288343558282208, - "grad_norm": 4.629584312438965, - "learning_rate": 3.0589863320758063e-06, - "loss": 0.2646, - "step": 699 - }, - { - "epoch": 4.294478527607362, - "grad_norm": 3.9641213417053223, - "learning_rate": 3.0542888940025562e-06, - "loss": 0.1711, - "step": 700 - }, - { - "epoch": 4.300613496932515, - "grad_norm": 3.75014328956604, - "learning_rate": 3.0495893969092395e-06, - "loss": 0.0589, - "step": 701 - }, - { - "epoch": 4.306748466257669, - "grad_norm": 3.603290319442749, - "learning_rate": 3.044887858253105e-06, - "loss": 0.2244, - "step": 702 - }, - { - "epoch": 4.3128834355828225, - "grad_norm": 3.79404616355896, - "learning_rate": 3.040184295498984e-06, - "loss": 0.1506, - "step": 703 - }, - { - "epoch": 4.319018404907975, - "grad_norm": 3.0890021324157715, - "learning_rate": 3.035478726119228e-06, - "loss": 0.2343, - "step": 704 - }, - { - "epoch": 4.325153374233129, - "grad_norm": 3.6688191890716553, - "learning_rate": 3.0307711675936426e-06, - "loss": 0.0518, - "step": 705 - }, - { - "epoch": 4.331288343558282, - "grad_norm": 5.1836700439453125, - "learning_rate": 3.0260616374094208e-06, - "loss": 0.2363, - "step": 706 - }, - { - "epoch": 4.337423312883436, - "grad_norm": 2.7123284339904785, - "learning_rate": 3.0213501530610807e-06, - "loss": 0.0848, - "step": 707 - }, - { - "epoch": 4.343558282208589, - "grad_norm": 3.5661890506744385, - "learning_rate": 3.0166367320504005e-06, - "loss": 0.149, - "step": 708 - }, - { - "epoch": 4.3496932515337425, - "grad_norm": 3.6454737186431885, - "learning_rate": 3.0119213918863515e-06, - "loss": 0.1133, - "step": 709 - }, - { - "epoch": 4.355828220858895, - "grad_norm": 3.7534968852996826, - "learning_rate": 3.0072041500850343e-06, - "loss": 0.1358, - "step": 710 - }, - { - "epoch": 4.361963190184049, - "grad_norm": 3.40387225151062, - "learning_rate": 3.0024850241696128e-06, - "loss": 0.0706, - "step": 711 - }, - { - "epoch": 4.368098159509202, - "grad_norm": 3.250471591949463, - "learning_rate": 2.9977640316702512e-06, - "loss": 0.1977, - "step": 712 - }, - { - "epoch": 4.374233128834356, - "grad_norm": 3.417781352996826, - "learning_rate": 2.993041190124047e-06, - "loss": 0.2622, - "step": 713 - }, - { - "epoch": 4.38036809815951, - "grad_norm": 2.628434181213379, - "learning_rate": 2.9883165170749657e-06, - "loss": 0.1487, - "step": 714 - }, - { - "epoch": 4.386503067484663, - "grad_norm": 3.240264892578125, - "learning_rate": 2.9835900300737763e-06, - "loss": 0.0822, - "step": 715 - }, - { - "epoch": 4.392638036809816, - "grad_norm": 6.575517177581787, - "learning_rate": 2.9788617466779884e-06, - "loss": 0.3668, - "step": 716 - }, - { - "epoch": 4.398773006134969, - "grad_norm": 4.699089050292969, - "learning_rate": 2.974131684451781e-06, - "loss": 0.2432, - "step": 717 - }, - { - "epoch": 4.404907975460123, - "grad_norm": 2.9815752506256104, - "learning_rate": 2.9693998609659443e-06, - "loss": 0.0689, - "step": 718 - }, - { - "epoch": 4.411042944785276, - "grad_norm": 4.192755222320557, - "learning_rate": 2.9646662937978082e-06, - "loss": 0.1897, - "step": 719 - }, - { - "epoch": 4.41717791411043, - "grad_norm": 2.9729068279266357, - "learning_rate": 2.9599310005311824e-06, - "loss": 0.0457, - "step": 720 - }, - { - "epoch": 4.423312883435583, - "grad_norm": 4.234438896179199, - "learning_rate": 2.9551939987562866e-06, - "loss": 0.2307, - "step": 721 - }, - { - "epoch": 4.429447852760736, - "grad_norm": 3.3982434272766113, - "learning_rate": 2.950455306069688e-06, - "loss": 0.0637, - "step": 722 - }, - { - "epoch": 4.435582822085889, - "grad_norm": 4.539764404296875, - "learning_rate": 2.9457149400742357e-06, - "loss": 0.1924, - "step": 723 - }, - { - "epoch": 4.441717791411043, - "grad_norm": 4.039684772491455, - "learning_rate": 2.940972918378993e-06, - "loss": 0.1275, - "step": 724 - }, - { - "epoch": 4.447852760736196, - "grad_norm": 4.340360641479492, - "learning_rate": 2.936229258599174e-06, - "loss": 0.123, - "step": 725 - }, - { - "epoch": 4.45398773006135, - "grad_norm": 2.8720109462738037, - "learning_rate": 2.93148397835608e-06, - "loss": 0.0555, - "step": 726 - }, - { - "epoch": 4.460122699386503, - "grad_norm": 4.227811336517334, - "learning_rate": 2.926737095277029e-06, - "loss": 0.0991, - "step": 727 - }, - { - "epoch": 4.466257668711656, - "grad_norm": 2.8079142570495605, - "learning_rate": 2.921988626995295e-06, - "loss": 0.0628, - "step": 728 - }, - { - "epoch": 4.47239263803681, - "grad_norm": 4.195122241973877, - "learning_rate": 2.9172385911500385e-06, - "loss": 0.2333, - "step": 729 - }, - { - "epoch": 4.478527607361963, - "grad_norm": 3.223794460296631, - "learning_rate": 2.9124870053862447e-06, - "loss": 0.1317, - "step": 730 - }, - { - "epoch": 4.484662576687117, - "grad_norm": 3.5533759593963623, - "learning_rate": 2.907733887354657e-06, - "loss": 0.2285, - "step": 731 - }, - { - "epoch": 4.49079754601227, - "grad_norm": 3.535673141479492, - "learning_rate": 2.9029792547117088e-06, - "loss": 0.096, - "step": 732 - }, - { - "epoch": 4.4969325153374236, - "grad_norm": 4.031703948974609, - "learning_rate": 2.898223125119461e-06, - "loss": 0.1505, - "step": 733 - }, - { - "epoch": 4.5030674846625764, - "grad_norm": 2.823413610458374, - "learning_rate": 2.893465516245534e-06, - "loss": 0.0327, - "step": 734 - }, - { - "epoch": 4.50920245398773, - "grad_norm": 3.516738176345825, - "learning_rate": 2.8887064457630453e-06, - "loss": 0.0743, - "step": 735 - }, - { - "epoch": 4.515337423312883, - "grad_norm": 3.5523500442504883, - "learning_rate": 2.8839459313505407e-06, - "loss": 0.1768, - "step": 736 - }, - { - "epoch": 4.521472392638037, - "grad_norm": 3.2433223724365234, - "learning_rate": 2.879183990691929e-06, - "loss": 0.1598, - "step": 737 - }, - { - "epoch": 4.52760736196319, - "grad_norm": 3.0156848430633545, - "learning_rate": 2.8744206414764185e-06, - "loss": 0.0829, - "step": 738 - }, - { - "epoch": 4.533742331288344, - "grad_norm": 4.359529495239258, - "learning_rate": 2.8696559013984488e-06, - "loss": 0.1169, - "step": 739 - }, - { - "epoch": 4.539877300613497, - "grad_norm": 2.3862433433532715, - "learning_rate": 2.8648897881576274e-06, - "loss": 0.0962, - "step": 740 - }, - { - "epoch": 4.54601226993865, - "grad_norm": 2.7100136280059814, - "learning_rate": 2.8601223194586613e-06, - "loss": 0.1204, - "step": 741 - }, - { - "epoch": 4.552147239263804, - "grad_norm": 3.8116140365600586, - "learning_rate": 2.8553535130112935e-06, - "loss": 0.0685, - "step": 742 - }, - { - "epoch": 4.558282208588957, - "grad_norm": 2.9640142917633057, - "learning_rate": 2.850583386530235e-06, - "loss": 0.0692, - "step": 743 - }, - { - "epoch": 4.564417177914111, - "grad_norm": 3.264592170715332, - "learning_rate": 2.8458119577351035e-06, - "loss": 0.2128, - "step": 744 - }, - { - "epoch": 4.570552147239264, - "grad_norm": 3.230497360229492, - "learning_rate": 2.841039244350351e-06, - "loss": 0.2409, - "step": 745 - }, - { - "epoch": 4.576687116564417, - "grad_norm": 4.41513204574585, - "learning_rate": 2.8362652641052024e-06, - "loss": 0.1878, - "step": 746 - }, - { - "epoch": 4.58282208588957, - "grad_norm": 3.047248601913452, - "learning_rate": 2.83149003473359e-06, - "loss": 0.1303, - "step": 747 - }, - { - "epoch": 4.588957055214724, - "grad_norm": 2.399754047393799, - "learning_rate": 2.8267135739740836e-06, - "loss": 0.0577, - "step": 748 - }, - { - "epoch": 4.595092024539877, - "grad_norm": 4.608038425445557, - "learning_rate": 2.8219358995698307e-06, - "loss": 0.2329, - "step": 749 - }, - { - "epoch": 4.601226993865031, - "grad_norm": 3.537644147872925, - "learning_rate": 2.8171570292684846e-06, - "loss": 0.1329, - "step": 750 - }, - { - "epoch": 4.6073619631901845, - "grad_norm": 2.8099827766418457, - "learning_rate": 2.8123769808221407e-06, - "loss": 0.1512, - "step": 751 - }, - { - "epoch": 4.613496932515337, - "grad_norm": 3.3169758319854736, - "learning_rate": 2.8075957719872724e-06, - "loss": 0.1267, - "step": 752 - }, - { - "epoch": 4.61963190184049, - "grad_norm": 3.578435182571411, - "learning_rate": 2.8028134205246633e-06, - "loss": 0.147, - "step": 753 - }, - { - "epoch": 4.625766871165644, - "grad_norm": 3.544437885284424, - "learning_rate": 2.7980299441993415e-06, - "loss": 0.0947, - "step": 754 - }, - { - "epoch": 4.631901840490798, - "grad_norm": 3.798776388168335, - "learning_rate": 2.793245360780512e-06, - "loss": 0.1498, - "step": 755 - }, - { - "epoch": 4.638036809815951, - "grad_norm": 3.634991407394409, - "learning_rate": 2.788459688041495e-06, - "loss": 0.2504, - "step": 756 - }, - { - "epoch": 4.644171779141105, - "grad_norm": 20.123680114746094, - "learning_rate": 2.783672943759655e-06, - "loss": 0.2091, - "step": 757 - }, - { - "epoch": 4.6503067484662575, - "grad_norm": 3.9357221126556396, - "learning_rate": 2.778885145716339e-06, - "loss": 0.2045, - "step": 758 - }, - { - "epoch": 4.656441717791411, - "grad_norm": 3.3035309314727783, - "learning_rate": 2.7740963116968063e-06, - "loss": 0.1416, - "step": 759 - }, - { - "epoch": 4.662576687116564, - "grad_norm": 3.096985101699829, - "learning_rate": 2.7693064594901646e-06, - "loss": 0.0455, - "step": 760 - }, - { - "epoch": 4.668711656441718, - "grad_norm": 2.9855458736419678, - "learning_rate": 2.7645156068893075e-06, - "loss": 0.1496, - "step": 761 - }, - { - "epoch": 4.674846625766871, - "grad_norm": 3.9140093326568604, - "learning_rate": 2.759723771690839e-06, - "loss": 0.2061, - "step": 762 - }, - { - "epoch": 4.680981595092025, - "grad_norm": 3.590569496154785, - "learning_rate": 2.754930971695019e-06, - "loss": 0.1017, - "step": 763 - }, - { - "epoch": 4.6871165644171775, - "grad_norm": 3.527254581451416, - "learning_rate": 2.750137224705687e-06, - "loss": 0.1979, - "step": 764 - }, - { - "epoch": 4.693251533742331, - "grad_norm": 4.198459148406982, - "learning_rate": 2.745342548530202e-06, - "loss": 0.1667, - "step": 765 - }, - { - "epoch": 4.699386503067485, - "grad_norm": 2.0246167182922363, - "learning_rate": 2.7405469609793746e-06, - "loss": 0.0346, - "step": 766 - }, - { - "epoch": 4.705521472392638, - "grad_norm": 3.2045300006866455, - "learning_rate": 2.7357504798674004e-06, - "loss": 0.0596, - "step": 767 - }, - { - "epoch": 4.711656441717792, - "grad_norm": 2.736985921859741, - "learning_rate": 2.730953123011796e-06, - "loss": 0.0384, - "step": 768 - }, - { - "epoch": 4.717791411042945, - "grad_norm": 3.0621395111083984, - "learning_rate": 2.726154908233328e-06, - "loss": 0.0558, - "step": 769 - }, - { - "epoch": 4.723926380368098, - "grad_norm": 3.2280497550964355, - "learning_rate": 2.721355853355953e-06, - "loss": 0.2272, - "step": 770 - }, - { - "epoch": 4.730061349693251, - "grad_norm": 3.342226028442383, - "learning_rate": 2.716555976206748e-06, - "loss": 0.074, - "step": 771 - }, - { - "epoch": 4.736196319018405, - "grad_norm": 4.328624248504639, - "learning_rate": 2.7117552946158415e-06, - "loss": 0.1034, - "step": 772 - }, - { - "epoch": 4.742331288343558, - "grad_norm": 2.980215311050415, - "learning_rate": 2.706953826416353e-06, - "loss": 0.1199, - "step": 773 - }, - { - "epoch": 4.748466257668712, - "grad_norm": 2.622478485107422, - "learning_rate": 2.702151589444324e-06, - "loss": 0.0467, - "step": 774 - }, - { - "epoch": 4.754601226993865, - "grad_norm": 2.9958693981170654, - "learning_rate": 2.6973486015386507e-06, - "loss": 0.143, - "step": 775 - }, - { - "epoch": 4.7607361963190185, - "grad_norm": 4.548511505126953, - "learning_rate": 2.6925448805410197e-06, - "loss": 0.3594, - "step": 776 - }, - { - "epoch": 4.766871165644172, - "grad_norm": 3.3429481983184814, - "learning_rate": 2.6877404442958393e-06, - "loss": 0.1397, - "step": 777 - }, - { - "epoch": 4.773006134969325, - "grad_norm": 2.5820136070251465, - "learning_rate": 2.682935310650177e-06, - "loss": 0.054, - "step": 778 - }, - { - "epoch": 4.779141104294479, - "grad_norm": 4.047626495361328, - "learning_rate": 2.6781294974536886e-06, - "loss": 0.1284, - "step": 779 - }, - { - "epoch": 4.785276073619632, - "grad_norm": 3.0227510929107666, - "learning_rate": 2.673323022558557e-06, - "loss": 0.1441, - "step": 780 - }, - { - "epoch": 4.791411042944786, - "grad_norm": 4.731313705444336, - "learning_rate": 2.6685159038194202e-06, - "loss": 0.2859, - "step": 781 - }, - { - "epoch": 4.7975460122699385, - "grad_norm": 3.880655288696289, - "learning_rate": 2.6637081590933096e-06, - "loss": 0.1524, - "step": 782 - }, - { - "epoch": 4.803680981595092, - "grad_norm": 2.375474452972412, - "learning_rate": 2.6588998062395803e-06, - "loss": 0.0338, - "step": 783 - }, - { - "epoch": 4.809815950920245, - "grad_norm": 3.3587446212768555, - "learning_rate": 2.6540908631198498e-06, - "loss": 0.0755, - "step": 784 - }, - { - "epoch": 4.815950920245399, - "grad_norm": 2.767686367034912, - "learning_rate": 2.6492813475979243e-06, - "loss": 0.0631, - "step": 785 - }, - { - "epoch": 4.822085889570552, - "grad_norm": 3.88670015335083, - "learning_rate": 2.6444712775397397e-06, - "loss": 0.0853, - "step": 786 - }, - { - "epoch": 4.828220858895706, - "grad_norm": 3.543276309967041, - "learning_rate": 2.639660670813288e-06, - "loss": 0.1895, - "step": 787 - }, - { - "epoch": 4.8343558282208585, - "grad_norm": 3.659323215484619, - "learning_rate": 2.6348495452885598e-06, - "loss": 0.1745, - "step": 788 - }, - { - "epoch": 4.840490797546012, - "grad_norm": 3.0955021381378174, - "learning_rate": 2.630037918837468e-06, - "loss": 0.0846, - "step": 789 - }, - { - "epoch": 4.846625766871165, - "grad_norm": 3.4473249912261963, - "learning_rate": 2.6252258093337892e-06, - "loss": 0.0808, - "step": 790 - }, - { - "epoch": 4.852760736196319, - "grad_norm": 3.937120199203491, - "learning_rate": 2.6204132346530936e-06, - "loss": 0.2054, - "step": 791 - }, - { - "epoch": 4.858895705521473, - "grad_norm": 4.052806854248047, - "learning_rate": 2.6156002126726788e-06, - "loss": 0.1679, - "step": 792 - }, - { - "epoch": 4.865030674846626, - "grad_norm": 2.6694889068603516, - "learning_rate": 2.6107867612715043e-06, - "loss": 0.0534, - "step": 793 - }, - { - "epoch": 4.871165644171779, - "grad_norm": 3.594649076461792, - "learning_rate": 2.6059728983301267e-06, - "loss": 0.0899, - "step": 794 - }, - { - "epoch": 4.877300613496932, - "grad_norm": 2.7796030044555664, - "learning_rate": 2.601158641730629e-06, - "loss": 0.0596, - "step": 795 - }, - { - "epoch": 4.883435582822086, - "grad_norm": 4.618961334228516, - "learning_rate": 2.5963440093565567e-06, - "loss": 0.3858, - "step": 796 - }, - { - "epoch": 4.889570552147239, - "grad_norm": 3.0783939361572266, - "learning_rate": 2.5915290190928518e-06, - "loss": 0.12, - "step": 797 - }, - { - "epoch": 4.895705521472393, - "grad_norm": 4.078456878662109, - "learning_rate": 2.586713688825786e-06, - "loss": 0.1278, - "step": 798 - }, - { - "epoch": 4.901840490797546, - "grad_norm": 2.9439120292663574, - "learning_rate": 2.5818980364428935e-06, - "loss": 0.0847, - "step": 799 - }, - { - "epoch": 4.9079754601226995, - "grad_norm": 5.140681743621826, - "learning_rate": 2.5770820798329055e-06, - "loss": 0.1718, - "step": 800 - }, - { - "epoch": 4.914110429447852, - "grad_norm": 3.450190305709839, - "learning_rate": 2.572265836885682e-06, - "loss": 0.0895, - "step": 801 - }, - { - "epoch": 4.920245398773006, - "grad_norm": 3.1145224571228027, - "learning_rate": 2.567449325492149e-06, - "loss": 0.0652, - "step": 802 - }, - { - "epoch": 4.92638036809816, - "grad_norm": 2.851768732070923, - "learning_rate": 2.5626325635442283e-06, - "loss": 0.0877, - "step": 803 - }, - { - "epoch": 4.932515337423313, - "grad_norm": 3.3392980098724365, - "learning_rate": 2.5578155689347716e-06, - "loss": 0.2028, - "step": 804 - }, - { - "epoch": 4.938650306748467, - "grad_norm": 3.012439250946045, - "learning_rate": 2.5529983595574964e-06, - "loss": 0.031, - "step": 805 - }, - { - "epoch": 4.9447852760736195, - "grad_norm": 2.7732717990875244, - "learning_rate": 2.548180953306918e-06, - "loss": 0.0415, - "step": 806 - }, - { - "epoch": 4.950920245398773, - "grad_norm": 3.0423903465270996, - "learning_rate": 2.5433633680782817e-06, - "loss": 0.1188, - "step": 807 - }, - { - "epoch": 4.957055214723926, - "grad_norm": 5.056387901306152, - "learning_rate": 2.538545621767498e-06, - "loss": 0.1703, - "step": 808 - }, - { - "epoch": 4.96319018404908, - "grad_norm": 4.052585124969482, - "learning_rate": 2.533727732271077e-06, - "loss": 0.1455, - "step": 809 - }, - { - "epoch": 4.969325153374233, - "grad_norm": 3.4507904052734375, - "learning_rate": 2.5289097174860593e-06, - "loss": 0.0617, - "step": 810 - }, - { - "epoch": 4.975460122699387, - "grad_norm": 2.908266305923462, - "learning_rate": 2.524091595309952e-06, - "loss": 0.1173, - "step": 811 - }, - { - "epoch": 4.9815950920245395, - "grad_norm": 2.5857458114624023, - "learning_rate": 2.519273383640661e-06, - "loss": 0.0538, - "step": 812 - }, - { - "epoch": 4.987730061349693, - "grad_norm": 3.3518428802490234, - "learning_rate": 2.5144551003764227e-06, - "loss": 0.211, - "step": 813 - }, - { - "epoch": 4.993865030674847, - "grad_norm": 3.137981653213501, - "learning_rate": 2.509636763415742e-06, - "loss": 0.0944, - "step": 814 - }, - { - "epoch": 5.0, - "grad_norm": 2.8854241371154785, - "learning_rate": 2.5048183906573227e-06, - "loss": 0.098, - "step": 815 - } - ], - "logging_steps": 1, - "max_steps": 1630, - "num_input_tokens_seen": 0, - "num_train_epochs": 10, - "save_steps": 206, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 2.0186326817046528e+17, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-978/chat_template.jinja b/metallama3_8b/limo_filtered_correct/checkpoint-978/chat_template.jinja deleted file mode 100644 index 39bd0c9f7fe30aea14eda194fee17703da4a4dbf..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-978/chat_template.jinja +++ /dev/null @@ -1,5 +0,0 @@ -{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> - -'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> - -' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-978/config.json b/metallama3_8b/limo_filtered_correct/checkpoint-978/config.json deleted file mode 100644 index ec5612543540085e09eed37e81b17ae51d1a6973..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-978/config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 128000, - "eos_token_id": 128009, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 8192, - "mlp_bias": false, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 500000.0, - "tie_word_embeddings": false, - "torch_dtype": "float32", - "transformers_version": "4.55.0", - "use_cache": false, - "vocab_size": 128256 -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-978/generation_config.json b/metallama3_8b/limo_filtered_correct/checkpoint-978/generation_config.json deleted file mode 100644 index f53ccb516e57388491adda6b9950bcfa872e93ae..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-978/generation_config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 128000, - "eos_token_id": 128009, - "transformers_version": "4.55.0", - "use_cache": false -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-978/model-00001-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-978/model-00001-of-00007.safetensors deleted file mode 100644 index 2318ec7402a3157f974900f2074d61429b695631..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-978/model-00001-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2e599bda795ade1982f720639df07ce8cf8faf7d0fe047a5625d11dd734260ae -size 4886466168 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-978/model-00002-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-978/model-00002-of-00007.safetensors deleted file mode 100644 index f4eae26c071380eba79b3fbbb64ebd0015e94a40..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-978/model-00002-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5f9db6abd234a36db37b40ae060829dbe6b509134349b6d1f216b7fa1e066569 -size 4832007448 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-978/model-00003-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-978/model-00003-of-00007.safetensors deleted file mode 100644 index 546cf966570180acb9d0d6e78e97e29ed62c2ce2..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-978/model-00003-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2f0b50f99f64f9d2a4397048689f01bb0fc2d5785e303a747f8fb7acae2e74ff -size 4999813112 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-978/model-00004-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-978/model-00004-of-00007.safetensors deleted file mode 100644 index b10492dabeab835e0eed6e7a4e8e3e1f93388969..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-978/model-00004-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:921d09df2b253bc88000a7d410a6e8c5f084fe3b5b24ff2f61f63a912d2705da -size 4999813128 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-978/model-00005-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-978/model-00005-of-00007.safetensors deleted file mode 100644 index a1e7fc4d02587b346c700d229ba0a4e6bc375000..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-978/model-00005-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9b497591b1f4dd8473c1b4f5c615309680fd89dbd9ca303abf4817552fcd53da -size 4832007496 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-978/model-00006-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-978/model-00006-of-00007.safetensors deleted file mode 100644 index 3d00973d0d6b3bb7a1fd98484dba58bf21a3ef28..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-978/model-00006-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:464951c21bbe99b3f1a82677711f2d91a16e6f8e7c186370979db23e83fbf905 -size 4999813120 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-978/model-00007-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/checkpoint-978/model-00007-of-00007.safetensors deleted file mode 100644 index fc9ee5f32ab0179969aa92719975e86e5f3cb2c6..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-978/model-00007-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:79520f1e0132e363725514a0a9067aec0f0d0cb8e53137a500bb03e67a6b93c1 -size 2571158184 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-978/model.safetensors.index.json b/metallama3_8b/limo_filtered_correct/checkpoint-978/model.safetensors.index.json deleted file mode 100644 index 30d31d54f352f0c71ad48745af612a088822fa48..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-978/model.safetensors.index.json +++ /dev/null @@ -1,299 +0,0 @@ -{ - "metadata": { - "total_parameters": 2007565312, - "total_size": 32121044992 - }, - "weight_map": { - "lm_head.weight": "model-00007-of-00007.safetensors", - "model.embed_tokens.weight": "model-00001-of-00007.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.norm.weight": "model-00007-of-00007.safetensors" - } -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-978/rng_state_0.pth b/metallama3_8b/limo_filtered_correct/checkpoint-978/rng_state_0.pth deleted file mode 100644 index c54ea122b283c04f6b60c1eedefeb301763a8f9f..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-978/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:418a5f105ae834c3075024076916b2a9475918fe034c12d0dd5b6d91f1aba467 -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-978/rng_state_1.pth b/metallama3_8b/limo_filtered_correct/checkpoint-978/rng_state_1.pth deleted file mode 100644 index ea57ead2533e587fe50f62107d7cb32945fe1354..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-978/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6e07ace389d24bc1307b74f42a1e7b8f0117b0db853e2df64ff3f15cb92916a2 -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-978/rng_state_2.pth b/metallama3_8b/limo_filtered_correct/checkpoint-978/rng_state_2.pth deleted file mode 100644 index 4689a9445d07528dc4fd91011a7f034c11773a68..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-978/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:da6a990f346d7014dffb28fa2bc7d3b890bd3c53712503fce3656da48d3d6e50 -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-978/rng_state_3.pth b/metallama3_8b/limo_filtered_correct/checkpoint-978/rng_state_3.pth deleted file mode 100644 index 919b5e43a96a9afdeb196f402142bc3aab67f247..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-978/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e95f356ca38179b05993f55daece0223e96fa10b9a1b9ea2102a739211333f63 -size 15024 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-978/scheduler.pt b/metallama3_8b/limo_filtered_correct/checkpoint-978/scheduler.pt deleted file mode 100644 index f2a36b50f436972e40a595bfb6a747ab39f3f80d..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-978/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3d79159aa72bcc02d621bd3aedc7e0f8def5572b5e21719aabc962afd6eaca12 -size 1064 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-978/special_tokens_map.json b/metallama3_8b/limo_filtered_correct/checkpoint-978/special_tokens_map.json deleted file mode 100644 index 14daf4588e61b4e4983af0fccaba4d5500c0977c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-978/special_tokens_map.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "additional_special_tokens": [ - { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } - ], - "bos_token": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": "<|eot_id|>" -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-978/tokenizer.json b/metallama3_8b/limo_filtered_correct/checkpoint-978/tokenizer.json deleted file mode 100644 index 172311123ab62378f1f6d90f3068a676b7d939ed..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-978/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c1dcab308e7cf5970ea38815e0a62887d705c5b436f869ca27a5dcdd40c36a6 -size 17210148 diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-978/tokenizer_config.json b/metallama3_8b/limo_filtered_correct/checkpoint-978/tokenizer_config.json deleted file mode 100644 index 6739fcd129e717b71b64001dcb25a03c143d66f5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-978/tokenizer_config.json +++ /dev/null @@ -1,2076 +0,0 @@ -{ - "added_tokens_decoder": { - "128000": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128001": { - "content": "<|end_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128002": { - "content": "<|reserved_special_token_0|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128003": { - "content": "<|reserved_special_token_1|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128004": { - "content": "<|reserved_special_token_2|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128005": { - "content": "<|reserved_special_token_3|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128006": { - "content": "<|start_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128007": { - "content": "<|end_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128008": { - "content": "<|reserved_special_token_4|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128009": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128010": { - "content": "<|reserved_special_token_5|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128011": { - "content": "<|reserved_special_token_6|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128012": { - "content": "<|reserved_special_token_7|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128013": { - "content": "<|reserved_special_token_8|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128014": { - "content": "<|reserved_special_token_9|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128015": { - "content": "<|reserved_special_token_10|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128016": { - "content": "<|reserved_special_token_11|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128017": { - "content": "<|reserved_special_token_12|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128018": { - "content": "<|reserved_special_token_13|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128019": { - "content": "<|reserved_special_token_14|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128020": { - "content": "<|reserved_special_token_15|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128021": { - "content": "<|reserved_special_token_16|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128022": { - "content": "<|reserved_special_token_17|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128023": { - "content": "<|reserved_special_token_18|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128024": { - "content": "<|reserved_special_token_19|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128025": { - "content": "<|reserved_special_token_20|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128026": { - "content": "<|reserved_special_token_21|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128027": { - "content": "<|reserved_special_token_22|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128028": { - "content": "<|reserved_special_token_23|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128029": { - "content": "<|reserved_special_token_24|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128030": { - "content": "<|reserved_special_token_25|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128031": { - "content": "<|reserved_special_token_26|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128032": { - "content": "<|reserved_special_token_27|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128033": { - "content": "<|reserved_special_token_28|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128034": { - "content": "<|reserved_special_token_29|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128035": { - "content": "<|reserved_special_token_30|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128036": { - "content": "<|reserved_special_token_31|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128037": { - "content": "<|reserved_special_token_32|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128038": { - "content": "<|reserved_special_token_33|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128039": { - "content": "<|reserved_special_token_34|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128040": { - "content": "<|reserved_special_token_35|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128041": { - "content": "<|reserved_special_token_36|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128042": { - "content": "<|reserved_special_token_37|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128043": { - "content": "<|reserved_special_token_38|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128044": { - "content": "<|reserved_special_token_39|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128045": { - "content": "<|reserved_special_token_40|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128046": { - "content": "<|reserved_special_token_41|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128047": { - "content": "<|reserved_special_token_42|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128048": { - "content": "<|reserved_special_token_43|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128049": { - "content": "<|reserved_special_token_44|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128050": { - "content": "<|reserved_special_token_45|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128051": { - "content": "<|reserved_special_token_46|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128052": { - "content": "<|reserved_special_token_47|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128053": { - "content": "<|reserved_special_token_48|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128054": { - "content": "<|reserved_special_token_49|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128055": { - "content": "<|reserved_special_token_50|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128056": { - "content": "<|reserved_special_token_51|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128057": { - "content": "<|reserved_special_token_52|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128058": { - "content": "<|reserved_special_token_53|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128059": { - "content": "<|reserved_special_token_54|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128060": { - "content": "<|reserved_special_token_55|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128061": { - "content": "<|reserved_special_token_56|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128062": { - "content": "<|reserved_special_token_57|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128063": { - "content": "<|reserved_special_token_58|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128064": { - "content": "<|reserved_special_token_59|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128065": { - "content": "<|reserved_special_token_60|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128066": { - "content": "<|reserved_special_token_61|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128067": { - "content": "<|reserved_special_token_62|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128068": { - "content": "<|reserved_special_token_63|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128069": { - "content": "<|reserved_special_token_64|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128070": { - "content": "<|reserved_special_token_65|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128071": { - "content": "<|reserved_special_token_66|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128072": { - "content": "<|reserved_special_token_67|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128073": { - "content": "<|reserved_special_token_68|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128074": { - "content": "<|reserved_special_token_69|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128075": { - "content": "<|reserved_special_token_70|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128076": { - "content": "<|reserved_special_token_71|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128077": { - "content": "<|reserved_special_token_72|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128078": { - "content": "<|reserved_special_token_73|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128079": { - "content": "<|reserved_special_token_74|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128080": { - "content": "<|reserved_special_token_75|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128081": { - "content": "<|reserved_special_token_76|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128082": { - "content": "<|reserved_special_token_77|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128083": { - "content": "<|reserved_special_token_78|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128084": { - "content": "<|reserved_special_token_79|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128085": { - "content": "<|reserved_special_token_80|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128086": { - "content": "<|reserved_special_token_81|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128087": { - "content": "<|reserved_special_token_82|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128088": { - "content": "<|reserved_special_token_83|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128089": { - "content": "<|reserved_special_token_84|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128090": { - "content": "<|reserved_special_token_85|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128091": { - "content": "<|reserved_special_token_86|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128092": { - "content": "<|reserved_special_token_87|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128093": { - "content": "<|reserved_special_token_88|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128094": { - "content": "<|reserved_special_token_89|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128095": { - "content": "<|reserved_special_token_90|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128096": { - "content": "<|reserved_special_token_91|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128097": { - "content": "<|reserved_special_token_92|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128098": { - "content": "<|reserved_special_token_93|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128099": { - "content": "<|reserved_special_token_94|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128100": { - "content": "<|reserved_special_token_95|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128101": { - "content": "<|reserved_special_token_96|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128102": { - "content": "<|reserved_special_token_97|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128103": { - "content": "<|reserved_special_token_98|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128104": { - "content": "<|reserved_special_token_99|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128105": { - "content": "<|reserved_special_token_100|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128106": { - "content": "<|reserved_special_token_101|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128107": { - "content": "<|reserved_special_token_102|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128108": { - "content": "<|reserved_special_token_103|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128109": { - "content": "<|reserved_special_token_104|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128110": { - "content": "<|reserved_special_token_105|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128111": { - "content": "<|reserved_special_token_106|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128112": { - "content": "<|reserved_special_token_107|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128113": { - "content": "<|reserved_special_token_108|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128114": { - "content": "<|reserved_special_token_109|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128115": { - "content": "<|reserved_special_token_110|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128116": { - "content": "<|reserved_special_token_111|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128117": { - "content": "<|reserved_special_token_112|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128118": { - "content": "<|reserved_special_token_113|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128119": { - "content": "<|reserved_special_token_114|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128120": { - "content": "<|reserved_special_token_115|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128121": { - "content": "<|reserved_special_token_116|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128122": { - "content": "<|reserved_special_token_117|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128123": { - "content": "<|reserved_special_token_118|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128124": { - "content": "<|reserved_special_token_119|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128125": { - "content": "<|reserved_special_token_120|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128126": { - "content": "<|reserved_special_token_121|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128127": { - "content": "<|reserved_special_token_122|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128128": { - "content": "<|reserved_special_token_123|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128129": { - "content": "<|reserved_special_token_124|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128130": { - "content": "<|reserved_special_token_125|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128131": { - "content": "<|reserved_special_token_126|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128132": { - "content": "<|reserved_special_token_127|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128133": { - "content": "<|reserved_special_token_128|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128134": { - "content": "<|reserved_special_token_129|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128135": { - "content": "<|reserved_special_token_130|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128136": { - "content": "<|reserved_special_token_131|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128137": { - "content": "<|reserved_special_token_132|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128138": { - "content": "<|reserved_special_token_133|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128139": { - "content": "<|reserved_special_token_134|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128140": { - "content": "<|reserved_special_token_135|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128141": { - "content": "<|reserved_special_token_136|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128142": { - "content": "<|reserved_special_token_137|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128143": { - "content": "<|reserved_special_token_138|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128144": { - "content": "<|reserved_special_token_139|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128145": { - "content": "<|reserved_special_token_140|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128146": { - "content": "<|reserved_special_token_141|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128147": { - "content": "<|reserved_special_token_142|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128148": { - "content": "<|reserved_special_token_143|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128149": { - "content": "<|reserved_special_token_144|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128150": { - "content": "<|reserved_special_token_145|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128151": { - "content": "<|reserved_special_token_146|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128152": { - "content": "<|reserved_special_token_147|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128153": { - "content": "<|reserved_special_token_148|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128154": { - "content": "<|reserved_special_token_149|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128155": { - "content": "<|reserved_special_token_150|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128156": { - "content": "<|reserved_special_token_151|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128157": { - "content": "<|reserved_special_token_152|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128158": { - "content": "<|reserved_special_token_153|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128159": { - "content": "<|reserved_special_token_154|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128160": { - "content": "<|reserved_special_token_155|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128161": { - "content": "<|reserved_special_token_156|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128162": { - "content": "<|reserved_special_token_157|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128163": { - "content": "<|reserved_special_token_158|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128164": { - "content": "<|reserved_special_token_159|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128165": { - "content": "<|reserved_special_token_160|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128166": { - "content": "<|reserved_special_token_161|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128167": { - "content": "<|reserved_special_token_162|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128168": { - "content": "<|reserved_special_token_163|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128169": { - "content": "<|reserved_special_token_164|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128170": { - "content": "<|reserved_special_token_165|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128171": { - "content": "<|reserved_special_token_166|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128172": { - "content": "<|reserved_special_token_167|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128173": { - "content": "<|reserved_special_token_168|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128174": { - "content": "<|reserved_special_token_169|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128175": { - "content": "<|reserved_special_token_170|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128176": { - "content": "<|reserved_special_token_171|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128177": { - "content": "<|reserved_special_token_172|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128178": { - "content": "<|reserved_special_token_173|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128179": { - "content": "<|reserved_special_token_174|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128180": { - "content": "<|reserved_special_token_175|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128181": { - "content": "<|reserved_special_token_176|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128182": { - "content": "<|reserved_special_token_177|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128183": { - "content": "<|reserved_special_token_178|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128184": { - "content": "<|reserved_special_token_179|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128185": { - "content": "<|reserved_special_token_180|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128186": { - "content": "<|reserved_special_token_181|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128187": { - "content": "<|reserved_special_token_182|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128188": { - "content": "<|reserved_special_token_183|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128189": { - "content": "<|reserved_special_token_184|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128190": { - "content": "<|reserved_special_token_185|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128191": { - "content": "<|reserved_special_token_186|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128192": { - "content": "<|reserved_special_token_187|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128193": { - "content": "<|reserved_special_token_188|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128194": { - "content": "<|reserved_special_token_189|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128195": { - "content": "<|reserved_special_token_190|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128196": { - "content": "<|reserved_special_token_191|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128197": { - "content": "<|reserved_special_token_192|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128198": { - "content": "<|reserved_special_token_193|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128199": { - "content": "<|reserved_special_token_194|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128200": { - "content": "<|reserved_special_token_195|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128201": { - "content": "<|reserved_special_token_196|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128202": { - "content": "<|reserved_special_token_197|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128203": { - "content": "<|reserved_special_token_198|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128204": { - "content": "<|reserved_special_token_199|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128205": { - "content": "<|reserved_special_token_200|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128206": { - "content": "<|reserved_special_token_201|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128207": { - "content": "<|reserved_special_token_202|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128208": { - "content": "<|reserved_special_token_203|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128209": { - "content": "<|reserved_special_token_204|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128210": { - "content": "<|reserved_special_token_205|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128211": { - "content": "<|reserved_special_token_206|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128212": { - "content": "<|reserved_special_token_207|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128213": { - "content": "<|reserved_special_token_208|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128214": { - "content": "<|reserved_special_token_209|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128215": { - "content": "<|reserved_special_token_210|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128216": { - "content": "<|reserved_special_token_211|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128217": { - "content": "<|reserved_special_token_212|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128218": { - "content": "<|reserved_special_token_213|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128219": { - "content": "<|reserved_special_token_214|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128220": { - "content": "<|reserved_special_token_215|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128221": { - "content": "<|reserved_special_token_216|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128222": { - "content": "<|reserved_special_token_217|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128223": { - "content": "<|reserved_special_token_218|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128224": { - "content": "<|reserved_special_token_219|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128225": { - "content": "<|reserved_special_token_220|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128226": { - "content": "<|reserved_special_token_221|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128227": { - "content": "<|reserved_special_token_222|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128228": { - "content": "<|reserved_special_token_223|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128229": { - "content": "<|reserved_special_token_224|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128230": { - "content": "<|reserved_special_token_225|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128231": { - "content": "<|reserved_special_token_226|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128232": { - "content": "<|reserved_special_token_227|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128233": { - "content": "<|reserved_special_token_228|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128234": { - "content": "<|reserved_special_token_229|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128235": { - "content": "<|reserved_special_token_230|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128236": { - "content": "<|reserved_special_token_231|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128237": { - "content": "<|reserved_special_token_232|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128238": { - "content": "<|reserved_special_token_233|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128239": { - "content": "<|reserved_special_token_234|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128240": { - "content": "<|reserved_special_token_235|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128241": { - "content": "<|reserved_special_token_236|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128242": { - "content": "<|reserved_special_token_237|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128243": { - "content": "<|reserved_special_token_238|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128244": { - "content": "<|reserved_special_token_239|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128245": { - "content": "<|reserved_special_token_240|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128246": { - "content": "<|reserved_special_token_241|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128247": { - "content": "<|reserved_special_token_242|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128248": { - "content": "<|reserved_special_token_243|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128249": { - "content": "<|reserved_special_token_244|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128250": { - "content": "<|reserved_special_token_245|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128251": { - "content": "<|reserved_special_token_246|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128252": { - "content": "<|reserved_special_token_247|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128253": { - "content": "<|reserved_special_token_248|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128254": { - "content": "<|reserved_special_token_249|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128255": { - "content": "<|reserved_special_token_250|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128256": { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - } - }, - "additional_special_tokens": [ - "<|eom_id|>" - ], - "bos_token": "<|begin_of_text|>", - "clean_up_tokenization_spaces": true, - "eos_token": "<|eot_id|>", - "extra_special_tokens": {}, - "model_input_names": [ - "input_ids", - "attention_mask" - ], - "model_max_length": 1000000000000000019884624838656, - "pad_token": "<|eot_id|>", - "padding_side": "right", - "split_special_tokens": false, - "tokenizer_class": "PreTrainedTokenizerFast" -} diff --git a/metallama3_8b/limo_filtered_correct/checkpoint-978/trainer_state.json b/metallama3_8b/limo_filtered_correct/checkpoint-978/trainer_state.json deleted file mode 100644 index 17dff7da7fd9ce0c2508f372ba60a87e064d249b..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/checkpoint-978/trainer_state.json +++ /dev/null @@ -1,6880 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 6.0, - "eval_steps": 500, - "global_step": 978, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.006134969325153374, - "grad_norm": 5.908512115478516, - "learning_rate": 5e-06, - "loss": 0.9606, - "step": 1 - }, - { - "epoch": 0.012269938650306749, - "grad_norm": 4.304474353790283, - "learning_rate": 4.999995356617983e-06, - "loss": 0.8609, - "step": 2 - }, - { - "epoch": 0.018404907975460124, - "grad_norm": 5.63697624206543, - "learning_rate": 4.999981426489179e-06, - "loss": 1.3543, - "step": 3 - }, - { - "epoch": 0.024539877300613498, - "grad_norm": 3.6674246788024902, - "learning_rate": 4.999958209665336e-06, - "loss": 0.787, - "step": 4 - }, - { - "epoch": 0.03067484662576687, - "grad_norm": 48.14854431152344, - "learning_rate": 4.999925706232695e-06, - "loss": 1.7786, - "step": 5 - }, - { - "epoch": 0.03680981595092025, - "grad_norm": 7.8689866065979, - "learning_rate": 4.999883916312e-06, - "loss": 1.2175, - "step": 6 - }, - { - "epoch": 0.04294478527607362, - "grad_norm": 5.119968891143799, - "learning_rate": 4.9998328400584864e-06, - "loss": 0.8998, - "step": 7 - }, - { - "epoch": 0.049079754601226995, - "grad_norm": 3.730757713317871, - "learning_rate": 4.999772477661888e-06, - "loss": 0.8419, - "step": 8 - }, - { - "epoch": 0.05521472392638037, - "grad_norm": 27.314565658569336, - "learning_rate": 4.999702829346432e-06, - "loss": 1.7948, - "step": 9 - }, - { - "epoch": 0.06134969325153374, - "grad_norm": 3.822697162628174, - "learning_rate": 4.999623895370843e-06, - "loss": 1.0461, - "step": 10 - }, - { - "epoch": 0.06748466257668712, - "grad_norm": 4.71220588684082, - "learning_rate": 4.999535676028338e-06, - "loss": 1.0, - "step": 11 - }, - { - "epoch": 0.0736196319018405, - "grad_norm": 3.2378087043762207, - "learning_rate": 4.999438171646624e-06, - "loss": 0.9475, - "step": 12 - }, - { - "epoch": 0.07975460122699386, - "grad_norm": 3.475543737411499, - "learning_rate": 4.999331382587901e-06, - "loss": 0.8654, - "step": 13 - }, - { - "epoch": 0.08588957055214724, - "grad_norm": 10.06365966796875, - "learning_rate": 4.999215309248861e-06, - "loss": 1.2042, - "step": 14 - }, - { - "epoch": 0.09202453987730061, - "grad_norm": 3.785153865814209, - "learning_rate": 4.999089952060681e-06, - "loss": 0.8846, - "step": 15 - }, - { - "epoch": 0.09815950920245399, - "grad_norm": 2.944488048553467, - "learning_rate": 4.998955311489025e-06, - "loss": 0.8805, - "step": 16 - }, - { - "epoch": 0.10429447852760736, - "grad_norm": 39.89304733276367, - "learning_rate": 4.998811388034046e-06, - "loss": 1.5882, - "step": 17 - }, - { - "epoch": 0.11042944785276074, - "grad_norm": 3.5883963108062744, - "learning_rate": 4.9986581822303746e-06, - "loss": 0.9222, - "step": 18 - }, - { - "epoch": 0.1165644171779141, - "grad_norm": 6.972247123718262, - "learning_rate": 4.998495694647127e-06, - "loss": 1.4088, - "step": 19 - }, - { - "epoch": 0.12269938650306748, - "grad_norm": 3.948991298675537, - "learning_rate": 4.998323925887895e-06, - "loss": 1.454, - "step": 20 - }, - { - "epoch": 0.12883435582822086, - "grad_norm": 3.8690035343170166, - "learning_rate": 4.998142876590749e-06, - "loss": 0.6335, - "step": 21 - }, - { - "epoch": 0.13496932515337423, - "grad_norm": 5.243765830993652, - "learning_rate": 4.997952547428236e-06, - "loss": 0.6725, - "step": 22 - }, - { - "epoch": 0.1411042944785276, - "grad_norm": 3.5994043350219727, - "learning_rate": 4.997752939107372e-06, - "loss": 0.7814, - "step": 23 - }, - { - "epoch": 0.147239263803681, - "grad_norm": 4.06965970993042, - "learning_rate": 4.997544052369642e-06, - "loss": 0.9683, - "step": 24 - }, - { - "epoch": 0.15337423312883436, - "grad_norm": 3.3247246742248535, - "learning_rate": 4.997325887990999e-06, - "loss": 0.9414, - "step": 25 - }, - { - "epoch": 0.15950920245398773, - "grad_norm": 5.811742782592773, - "learning_rate": 4.997098446781861e-06, - "loss": 0.8894, - "step": 26 - }, - { - "epoch": 0.1656441717791411, - "grad_norm": 2.661334753036499, - "learning_rate": 4.996861729587103e-06, - "loss": 0.7708, - "step": 27 - }, - { - "epoch": 0.17177914110429449, - "grad_norm": 2.863943576812744, - "learning_rate": 4.996615737286061e-06, - "loss": 0.6995, - "step": 28 - }, - { - "epoch": 0.17791411042944785, - "grad_norm": 20.376733779907227, - "learning_rate": 4.996360470792524e-06, - "loss": 1.2563, - "step": 29 - }, - { - "epoch": 0.18404907975460122, - "grad_norm": 3.62265682220459, - "learning_rate": 4.996095931054731e-06, - "loss": 0.7266, - "step": 30 - }, - { - "epoch": 0.1901840490797546, - "grad_norm": 3.915076732635498, - "learning_rate": 4.9958221190553705e-06, - "loss": 0.9227, - "step": 31 - }, - { - "epoch": 0.19631901840490798, - "grad_norm": 3.129855155944824, - "learning_rate": 4.995539035811572e-06, - "loss": 0.701, - "step": 32 - }, - { - "epoch": 0.20245398773006135, - "grad_norm": 2.7532224655151367, - "learning_rate": 4.9952466823749076e-06, - "loss": 0.6491, - "step": 33 - }, - { - "epoch": 0.2085889570552147, - "grad_norm": 2.8444128036499023, - "learning_rate": 4.9949450598313835e-06, - "loss": 0.8029, - "step": 34 - }, - { - "epoch": 0.2147239263803681, - "grad_norm": 2.57743239402771, - "learning_rate": 4.994634169301439e-06, - "loss": 0.8785, - "step": 35 - }, - { - "epoch": 0.22085889570552147, - "grad_norm": 3.280055284500122, - "learning_rate": 4.994314011939941e-06, - "loss": 1.034, - "step": 36 - }, - { - "epoch": 0.22699386503067484, - "grad_norm": 2.455838680267334, - "learning_rate": 4.99398458893618e-06, - "loss": 0.8557, - "step": 37 - }, - { - "epoch": 0.2331288343558282, - "grad_norm": 4.72681188583374, - "learning_rate": 4.993645901513865e-06, - "loss": 1.1904, - "step": 38 - }, - { - "epoch": 0.2392638036809816, - "grad_norm": 3.0585641860961914, - "learning_rate": 4.993297950931121e-06, - "loss": 0.7668, - "step": 39 - }, - { - "epoch": 0.24539877300613497, - "grad_norm": 2.4603540897369385, - "learning_rate": 4.9929407384804806e-06, - "loss": 0.8812, - "step": 40 - }, - { - "epoch": 0.25153374233128833, - "grad_norm": 2.9702436923980713, - "learning_rate": 4.992574265488883e-06, - "loss": 0.8878, - "step": 41 - }, - { - "epoch": 0.25766871165644173, - "grad_norm": 2.6973602771759033, - "learning_rate": 4.9921985333176694e-06, - "loss": 0.7251, - "step": 42 - }, - { - "epoch": 0.26380368098159507, - "grad_norm": 2.5542335510253906, - "learning_rate": 4.991813543362572e-06, - "loss": 0.6638, - "step": 43 - }, - { - "epoch": 0.26993865030674846, - "grad_norm": 3.7530782222747803, - "learning_rate": 4.991419297053716e-06, - "loss": 1.0725, - "step": 44 - }, - { - "epoch": 0.27607361963190186, - "grad_norm": 2.6483025550842285, - "learning_rate": 4.991015795855611e-06, - "loss": 0.7238, - "step": 45 - }, - { - "epoch": 0.2822085889570552, - "grad_norm": 3.434422492980957, - "learning_rate": 4.990603041267144e-06, - "loss": 0.9188, - "step": 46 - }, - { - "epoch": 0.2883435582822086, - "grad_norm": 2.914340019226074, - "learning_rate": 4.990181034821578e-06, - "loss": 0.6158, - "step": 47 - }, - { - "epoch": 0.294478527607362, - "grad_norm": 2.7211625576019287, - "learning_rate": 4.98974977808654e-06, - "loss": 0.7165, - "step": 48 - }, - { - "epoch": 0.3006134969325153, - "grad_norm": 2.8414249420166016, - "learning_rate": 4.989309272664026e-06, - "loss": 0.7277, - "step": 49 - }, - { - "epoch": 0.3067484662576687, - "grad_norm": 3.683204412460327, - "learning_rate": 4.988859520190381e-06, - "loss": 0.9793, - "step": 50 - }, - { - "epoch": 0.3128834355828221, - "grad_norm": 3.1732583045959473, - "learning_rate": 4.988400522336304e-06, - "loss": 0.8966, - "step": 51 - }, - { - "epoch": 0.31901840490797545, - "grad_norm": 2.7789194583892822, - "learning_rate": 4.9879322808068365e-06, - "loss": 0.8191, - "step": 52 - }, - { - "epoch": 0.32515337423312884, - "grad_norm": 2.754816770553589, - "learning_rate": 4.987454797341358e-06, - "loss": 0.6308, - "step": 53 - }, - { - "epoch": 0.3312883435582822, - "grad_norm": 2.730104684829712, - "learning_rate": 4.98696807371358e-06, - "loss": 0.8226, - "step": 54 - }, - { - "epoch": 0.3374233128834356, - "grad_norm": 3.2225449085235596, - "learning_rate": 4.986472111731536e-06, - "loss": 0.9184, - "step": 55 - }, - { - "epoch": 0.34355828220858897, - "grad_norm": 3.2684760093688965, - "learning_rate": 4.985966913237581e-06, - "loss": 0.6593, - "step": 56 - }, - { - "epoch": 0.3496932515337423, - "grad_norm": 2.43105411529541, - "learning_rate": 4.985452480108376e-06, - "loss": 0.6994, - "step": 57 - }, - { - "epoch": 0.3558282208588957, - "grad_norm": 7.366360664367676, - "learning_rate": 4.984928814254889e-06, - "loss": 1.1374, - "step": 58 - }, - { - "epoch": 0.3619631901840491, - "grad_norm": 2.81864333152771, - "learning_rate": 4.984395917622387e-06, - "loss": 0.8097, - "step": 59 - }, - { - "epoch": 0.36809815950920244, - "grad_norm": 3.1107730865478516, - "learning_rate": 4.9838537921904206e-06, - "loss": 0.8511, - "step": 60 - }, - { - "epoch": 0.37423312883435583, - "grad_norm": 2.460545301437378, - "learning_rate": 4.9833024399728295e-06, - "loss": 0.898, - "step": 61 - }, - { - "epoch": 0.3803680981595092, - "grad_norm": 2.921992778778076, - "learning_rate": 4.982741863017722e-06, - "loss": 0.6671, - "step": 62 - }, - { - "epoch": 0.38650306748466257, - "grad_norm": 3.3006443977355957, - "learning_rate": 4.982172063407479e-06, - "loss": 1.0559, - "step": 63 - }, - { - "epoch": 0.39263803680981596, - "grad_norm": 2.642587661743164, - "learning_rate": 4.9815930432587365e-06, - "loss": 0.6663, - "step": 64 - }, - { - "epoch": 0.3987730061349693, - "grad_norm": 2.905898094177246, - "learning_rate": 4.981004804722384e-06, - "loss": 0.6895, - "step": 65 - }, - { - "epoch": 0.4049079754601227, - "grad_norm": 2.9174182415008545, - "learning_rate": 4.980407349983556e-06, - "loss": 0.7982, - "step": 66 - }, - { - "epoch": 0.4110429447852761, - "grad_norm": 2.214322805404663, - "learning_rate": 4.979800681261619e-06, - "loss": 0.6808, - "step": 67 - }, - { - "epoch": 0.4171779141104294, - "grad_norm": 2.7152462005615234, - "learning_rate": 4.9791848008101705e-06, - "loss": 0.567, - "step": 68 - }, - { - "epoch": 0.4233128834355828, - "grad_norm": 2.5657734870910645, - "learning_rate": 4.978559710917024e-06, - "loss": 0.7745, - "step": 69 - }, - { - "epoch": 0.4294478527607362, - "grad_norm": 3.9103832244873047, - "learning_rate": 4.977925413904205e-06, - "loss": 0.9815, - "step": 70 - }, - { - "epoch": 0.43558282208588955, - "grad_norm": 4.610236644744873, - "learning_rate": 4.9772819121279395e-06, - "loss": 1.164, - "step": 71 - }, - { - "epoch": 0.44171779141104295, - "grad_norm": 3.01170015335083, - "learning_rate": 4.976629207978648e-06, - "loss": 0.7587, - "step": 72 - }, - { - "epoch": 0.44785276073619634, - "grad_norm": 3.175889253616333, - "learning_rate": 4.975967303880933e-06, - "loss": 0.58, - "step": 73 - }, - { - "epoch": 0.4539877300613497, - "grad_norm": 2.503741502761841, - "learning_rate": 4.975296202293575e-06, - "loss": 0.7253, - "step": 74 - }, - { - "epoch": 0.4601226993865031, - "grad_norm": 2.6778078079223633, - "learning_rate": 4.974615905709518e-06, - "loss": 0.7352, - "step": 75 - }, - { - "epoch": 0.4662576687116564, - "grad_norm": 5.950812816619873, - "learning_rate": 4.973926416655863e-06, - "loss": 1.0643, - "step": 76 - }, - { - "epoch": 0.4723926380368098, - "grad_norm": 3.0165305137634277, - "learning_rate": 4.973227737693858e-06, - "loss": 0.6699, - "step": 77 - }, - { - "epoch": 0.4785276073619632, - "grad_norm": 4.793259620666504, - "learning_rate": 4.972519871418894e-06, - "loss": 1.0315, - "step": 78 - }, - { - "epoch": 0.48466257668711654, - "grad_norm": 3.632815361022949, - "learning_rate": 4.971802820460481e-06, - "loss": 0.7003, - "step": 79 - }, - { - "epoch": 0.49079754601226994, - "grad_norm": 3.077507734298706, - "learning_rate": 4.971076587482254e-06, - "loss": 0.6776, - "step": 80 - }, - { - "epoch": 0.49693251533742333, - "grad_norm": 3.3886241912841797, - "learning_rate": 4.970341175181957e-06, - "loss": 0.7422, - "step": 81 - }, - { - "epoch": 0.5030674846625767, - "grad_norm": 2.71288800239563, - "learning_rate": 4.969596586291425e-06, - "loss": 0.7471, - "step": 82 - }, - { - "epoch": 0.50920245398773, - "grad_norm": 2.777920961380005, - "learning_rate": 4.968842823576592e-06, - "loss": 0.8111, - "step": 83 - }, - { - "epoch": 0.5153374233128835, - "grad_norm": 6.496985912322998, - "learning_rate": 4.968079889837461e-06, - "loss": 0.9965, - "step": 84 - }, - { - "epoch": 0.5214723926380368, - "grad_norm": 2.6163430213928223, - "learning_rate": 4.967307787908108e-06, - "loss": 0.6833, - "step": 85 - }, - { - "epoch": 0.5276073619631901, - "grad_norm": 3.244098663330078, - "learning_rate": 4.966526520656663e-06, - "loss": 0.8373, - "step": 86 - }, - { - "epoch": 0.5337423312883436, - "grad_norm": 2.9027860164642334, - "learning_rate": 4.965736090985305e-06, - "loss": 0.8529, - "step": 87 - }, - { - "epoch": 0.5398773006134969, - "grad_norm": 2.3786230087280273, - "learning_rate": 4.964936501830246e-06, - "loss": 0.6577, - "step": 88 - }, - { - "epoch": 0.5460122699386503, - "grad_norm": 7.3099045753479, - "learning_rate": 4.964127756161727e-06, - "loss": 1.1184, - "step": 89 - }, - { - "epoch": 0.5521472392638037, - "grad_norm": 3.068873167037964, - "learning_rate": 4.963309856983998e-06, - "loss": 0.7906, - "step": 90 - }, - { - "epoch": 0.558282208588957, - "grad_norm": 3.082547426223755, - "learning_rate": 4.9624828073353144e-06, - "loss": 0.8107, - "step": 91 - }, - { - "epoch": 0.5644171779141104, - "grad_norm": 2.4586973190307617, - "learning_rate": 4.961646610287922e-06, - "loss": 0.7421, - "step": 92 - }, - { - "epoch": 0.5705521472392638, - "grad_norm": 2.779277801513672, - "learning_rate": 4.960801268948047e-06, - "loss": 0.7134, - "step": 93 - }, - { - "epoch": 0.5766871165644172, - "grad_norm": 3.2255213260650635, - "learning_rate": 4.959946786455882e-06, - "loss": 0.5875, - "step": 94 - }, - { - "epoch": 0.5828220858895705, - "grad_norm": 2.783395528793335, - "learning_rate": 4.959083165985581e-06, - "loss": 0.6595, - "step": 95 - }, - { - "epoch": 0.588957055214724, - "grad_norm": 2.240114212036133, - "learning_rate": 4.958210410745237e-06, - "loss": 0.793, - "step": 96 - }, - { - "epoch": 0.5950920245398773, - "grad_norm": 2.9399421215057373, - "learning_rate": 4.957328523976879e-06, - "loss": 0.5896, - "step": 97 - }, - { - "epoch": 0.6012269938650306, - "grad_norm": 3.4449355602264404, - "learning_rate": 4.956437508956458e-06, - "loss": 0.8658, - "step": 98 - }, - { - "epoch": 0.6073619631901841, - "grad_norm": 4.273710250854492, - "learning_rate": 4.9555373689938325e-06, - "loss": 0.8316, - "step": 99 - }, - { - "epoch": 0.6134969325153374, - "grad_norm": 3.4222047328948975, - "learning_rate": 4.954628107432757e-06, - "loss": 1.0613, - "step": 100 - }, - { - "epoch": 0.6196319018404908, - "grad_norm": 2.5318963527679443, - "learning_rate": 4.95370972765087e-06, - "loss": 0.7194, - "step": 101 - }, - { - "epoch": 0.6257668711656442, - "grad_norm": 2.7852585315704346, - "learning_rate": 4.952782233059683e-06, - "loss": 0.5927, - "step": 102 - }, - { - "epoch": 0.6319018404907976, - "grad_norm": 2.6532323360443115, - "learning_rate": 4.951845627104565e-06, - "loss": 0.8505, - "step": 103 - }, - { - "epoch": 0.6380368098159509, - "grad_norm": 2.3213467597961426, - "learning_rate": 4.95089991326473e-06, - "loss": 0.8682, - "step": 104 - }, - { - "epoch": 0.6441717791411042, - "grad_norm": 2.607992649078369, - "learning_rate": 4.9499450950532305e-06, - "loss": 0.8735, - "step": 105 - }, - { - "epoch": 0.6503067484662577, - "grad_norm": 3.9820072650909424, - "learning_rate": 4.94898117601693e-06, - "loss": 1.0571, - "step": 106 - }, - { - "epoch": 0.656441717791411, - "grad_norm": 3.3878824710845947, - "learning_rate": 4.948008159736507e-06, - "loss": 0.7831, - "step": 107 - }, - { - "epoch": 0.6625766871165644, - "grad_norm": 2.6935670375823975, - "learning_rate": 4.94702604982643e-06, - "loss": 0.5968, - "step": 108 - }, - { - "epoch": 0.6687116564417178, - "grad_norm": 2.78190016746521, - "learning_rate": 4.9460348499349485e-06, - "loss": 0.7504, - "step": 109 - }, - { - "epoch": 0.6748466257668712, - "grad_norm": 2.973083972930908, - "learning_rate": 4.945034563744077e-06, - "loss": 0.6728, - "step": 110 - }, - { - "epoch": 0.6809815950920245, - "grad_norm": 2.631803512573242, - "learning_rate": 4.944025194969586e-06, - "loss": 0.609, - "step": 111 - }, - { - "epoch": 0.6871165644171779, - "grad_norm": 2.7443883419036865, - "learning_rate": 4.9430067473609825e-06, - "loss": 0.8713, - "step": 112 - }, - { - "epoch": 0.6932515337423313, - "grad_norm": 2.543769121170044, - "learning_rate": 4.941979224701499e-06, - "loss": 0.8035, - "step": 113 - }, - { - "epoch": 0.6993865030674846, - "grad_norm": 3.7799901962280273, - "learning_rate": 4.94094263080808e-06, - "loss": 0.9341, - "step": 114 - }, - { - "epoch": 0.7055214723926381, - "grad_norm": 3.1234734058380127, - "learning_rate": 4.939896969531367e-06, - "loss": 1.1066, - "step": 115 - }, - { - "epoch": 0.7116564417177914, - "grad_norm": 2.356036424636841, - "learning_rate": 4.938842244755683e-06, - "loss": 0.853, - "step": 116 - }, - { - "epoch": 0.7177914110429447, - "grad_norm": 3.6231274604797363, - "learning_rate": 4.937778460399022e-06, - "loss": 0.9116, - "step": 117 - }, - { - "epoch": 0.7239263803680982, - "grad_norm": 3.1277005672454834, - "learning_rate": 4.936705620413028e-06, - "loss": 0.5888, - "step": 118 - }, - { - "epoch": 0.7300613496932515, - "grad_norm": 2.7338361740112305, - "learning_rate": 4.935623728782986e-06, - "loss": 0.592, - "step": 119 - }, - { - "epoch": 0.7361963190184049, - "grad_norm": 2.748363733291626, - "learning_rate": 4.934532789527805e-06, - "loss": 0.8713, - "step": 120 - }, - { - "epoch": 0.7423312883435583, - "grad_norm": 4.460031986236572, - "learning_rate": 4.933432806700004e-06, - "loss": 0.6791, - "step": 121 - }, - { - "epoch": 0.7484662576687117, - "grad_norm": 2.392911911010742, - "learning_rate": 4.932323784385693e-06, - "loss": 0.7531, - "step": 122 - }, - { - "epoch": 0.754601226993865, - "grad_norm": 2.7804384231567383, - "learning_rate": 4.931205726704566e-06, - "loss": 0.7547, - "step": 123 - }, - { - "epoch": 0.7607361963190185, - "grad_norm": 2.7664780616760254, - "learning_rate": 4.930078637809878e-06, - "loss": 0.7849, - "step": 124 - }, - { - "epoch": 0.7668711656441718, - "grad_norm": 2.592808723449707, - "learning_rate": 4.928942521888431e-06, - "loss": 0.7015, - "step": 125 - }, - { - "epoch": 0.7730061349693251, - "grad_norm": 2.7080585956573486, - "learning_rate": 4.927797383160561e-06, - "loss": 1.0028, - "step": 126 - }, - { - "epoch": 0.7791411042944786, - "grad_norm": 2.7941503524780273, - "learning_rate": 4.926643225880123e-06, - "loss": 0.602, - "step": 127 - }, - { - "epoch": 0.7852760736196319, - "grad_norm": 3.2796623706817627, - "learning_rate": 4.925480054334471e-06, - "loss": 0.7473, - "step": 128 - }, - { - "epoch": 0.7914110429447853, - "grad_norm": 2.7623610496520996, - "learning_rate": 4.924307872844444e-06, - "loss": 1.0573, - "step": 129 - }, - { - "epoch": 0.7975460122699386, - "grad_norm": 2.6224453449249268, - "learning_rate": 4.923126685764351e-06, - "loss": 0.7399, - "step": 130 - }, - { - "epoch": 0.803680981595092, - "grad_norm": 17.736326217651367, - "learning_rate": 4.921936497481956e-06, - "loss": 0.9548, - "step": 131 - }, - { - "epoch": 0.8098159509202454, - "grad_norm": 2.504213333129883, - "learning_rate": 4.920737312418456e-06, - "loss": 0.6748, - "step": 132 - }, - { - "epoch": 0.8159509202453987, - "grad_norm": 3.617077350616455, - "learning_rate": 4.919529135028473e-06, - "loss": 0.8431, - "step": 133 - }, - { - "epoch": 0.8220858895705522, - "grad_norm": 2.6559832096099854, - "learning_rate": 4.918311969800027e-06, - "loss": 0.7243, - "step": 134 - }, - { - "epoch": 0.8282208588957055, - "grad_norm": 2.7539305686950684, - "learning_rate": 4.917085821254532e-06, - "loss": 0.7845, - "step": 135 - }, - { - "epoch": 0.8343558282208589, - "grad_norm": 3.3587615489959717, - "learning_rate": 4.915850693946766e-06, - "loss": 0.4891, - "step": 136 - }, - { - "epoch": 0.8404907975460123, - "grad_norm": 3.064354181289673, - "learning_rate": 4.914606592464865e-06, - "loss": 0.7917, - "step": 137 - }, - { - "epoch": 0.8466257668711656, - "grad_norm": 3.2505199909210205, - "learning_rate": 4.9133535214303e-06, - "loss": 0.9681, - "step": 138 - }, - { - "epoch": 0.852760736196319, - "grad_norm": 3.8027830123901367, - "learning_rate": 4.91209148549786e-06, - "loss": 0.9275, - "step": 139 - }, - { - "epoch": 0.8588957055214724, - "grad_norm": 2.4154372215270996, - "learning_rate": 4.910820489355637e-06, - "loss": 0.7259, - "step": 140 - }, - { - "epoch": 0.8650306748466258, - "grad_norm": 2.892462968826294, - "learning_rate": 4.909540537725007e-06, - "loss": 0.6061, - "step": 141 - }, - { - "epoch": 0.8711656441717791, - "grad_norm": 3.3398196697235107, - "learning_rate": 4.908251635360616e-06, - "loss": 1.0559, - "step": 142 - }, - { - "epoch": 0.8773006134969326, - "grad_norm": 3.022512197494507, - "learning_rate": 4.906953787050354e-06, - "loss": 0.7372, - "step": 143 - }, - { - "epoch": 0.8834355828220859, - "grad_norm": 2.658661365509033, - "learning_rate": 4.905646997615347e-06, - "loss": 0.6234, - "step": 144 - }, - { - "epoch": 0.8895705521472392, - "grad_norm": 3.454400062561035, - "learning_rate": 4.904331271909932e-06, - "loss": 0.8066, - "step": 145 - }, - { - "epoch": 0.8957055214723927, - "grad_norm": 3.1300277709960938, - "learning_rate": 4.903006614821645e-06, - "loss": 0.6861, - "step": 146 - }, - { - "epoch": 0.901840490797546, - "grad_norm": 2.362537145614624, - "learning_rate": 4.901673031271194e-06, - "loss": 0.6112, - "step": 147 - }, - { - "epoch": 0.9079754601226994, - "grad_norm": 3.375577688217163, - "learning_rate": 4.900330526212451e-06, - "loss": 0.6314, - "step": 148 - }, - { - "epoch": 0.9141104294478528, - "grad_norm": 2.955656051635742, - "learning_rate": 4.898979104632427e-06, - "loss": 0.889, - "step": 149 - }, - { - "epoch": 0.9202453987730062, - "grad_norm": 2.9285926818847656, - "learning_rate": 4.897618771551255e-06, - "loss": 0.6406, - "step": 150 - }, - { - "epoch": 0.9263803680981595, - "grad_norm": 2.131819725036621, - "learning_rate": 4.8962495320221714e-06, - "loss": 0.6368, - "step": 151 - }, - { - "epoch": 0.9325153374233128, - "grad_norm": 2.780649185180664, - "learning_rate": 4.8948713911315e-06, - "loss": 0.8642, - "step": 152 - }, - { - "epoch": 0.9386503067484663, - "grad_norm": 2.941500186920166, - "learning_rate": 4.8934843539986266e-06, - "loss": 0.714, - "step": 153 - }, - { - "epoch": 0.9447852760736196, - "grad_norm": 2.7729203701019287, - "learning_rate": 4.892088425775986e-06, - "loss": 0.8365, - "step": 154 - }, - { - "epoch": 0.950920245398773, - "grad_norm": 2.6887171268463135, - "learning_rate": 4.890683611649041e-06, - "loss": 0.7937, - "step": 155 - }, - { - "epoch": 0.9570552147239264, - "grad_norm": 3.7638463973999023, - "learning_rate": 4.8892699168362626e-06, - "loss": 0.7485, - "step": 156 - }, - { - "epoch": 0.9631901840490797, - "grad_norm": 2.8132755756378174, - "learning_rate": 4.887847346589111e-06, - "loss": 0.6467, - "step": 157 - }, - { - "epoch": 0.9693251533742331, - "grad_norm": 2.652247190475464, - "learning_rate": 4.886415906192015e-06, - "loss": 0.4651, - "step": 158 - }, - { - "epoch": 0.9754601226993865, - "grad_norm": 2.5854647159576416, - "learning_rate": 4.884975600962355e-06, - "loss": 0.8756, - "step": 159 - }, - { - "epoch": 0.9815950920245399, - "grad_norm": 3.1630544662475586, - "learning_rate": 4.883526436250441e-06, - "loss": 0.7339, - "step": 160 - }, - { - "epoch": 0.9877300613496932, - "grad_norm": 2.84452748298645, - "learning_rate": 4.8820684174394935e-06, - "loss": 0.7808, - "step": 161 - }, - { - "epoch": 0.9938650306748467, - "grad_norm": 3.604048490524292, - "learning_rate": 4.880601549945622e-06, - "loss": 0.96, - "step": 162 - }, - { - "epoch": 1.0, - "grad_norm": 2.302924871444702, - "learning_rate": 4.879125839217808e-06, - "loss": 0.8122, - "step": 163 - }, - { - "epoch": 1.0061349693251533, - "grad_norm": 3.1254405975341797, - "learning_rate": 4.8776412907378845e-06, - "loss": 0.7307, - "step": 164 - }, - { - "epoch": 1.0122699386503067, - "grad_norm": 2.745603322982788, - "learning_rate": 4.8761479100205085e-06, - "loss": 0.7554, - "step": 165 - }, - { - "epoch": 1.01840490797546, - "grad_norm": 2.494840145111084, - "learning_rate": 4.874645702613152e-06, - "loss": 0.4372, - "step": 166 - }, - { - "epoch": 1.0245398773006136, - "grad_norm": 2.3526735305786133, - "learning_rate": 4.873134674096072e-06, - "loss": 0.3597, - "step": 167 - }, - { - "epoch": 1.030674846625767, - "grad_norm": 2.945887804031372, - "learning_rate": 4.871614830082297e-06, - "loss": 0.5854, - "step": 168 - }, - { - "epoch": 1.0368098159509203, - "grad_norm": 3.5723934173583984, - "learning_rate": 4.870086176217597e-06, - "loss": 0.7978, - "step": 169 - }, - { - "epoch": 1.0429447852760736, - "grad_norm": 3.2997145652770996, - "learning_rate": 4.868548718180473e-06, - "loss": 0.5593, - "step": 170 - }, - { - "epoch": 1.049079754601227, - "grad_norm": 3.4120635986328125, - "learning_rate": 4.867002461682129e-06, - "loss": 0.4083, - "step": 171 - }, - { - "epoch": 1.0552147239263803, - "grad_norm": 2.697617292404175, - "learning_rate": 4.8654474124664505e-06, - "loss": 0.4752, - "step": 172 - }, - { - "epoch": 1.0613496932515338, - "grad_norm": 5.082247734069824, - "learning_rate": 4.863883576309991e-06, - "loss": 0.7435, - "step": 173 - }, - { - "epoch": 1.0674846625766872, - "grad_norm": 2.773864984512329, - "learning_rate": 4.8623109590219395e-06, - "loss": 0.4612, - "step": 174 - }, - { - "epoch": 1.0736196319018405, - "grad_norm": 3.429703712463379, - "learning_rate": 4.860729566444106e-06, - "loss": 0.4644, - "step": 175 - }, - { - "epoch": 1.0797546012269938, - "grad_norm": 2.997938394546509, - "learning_rate": 4.8591394044508985e-06, - "loss": 0.4852, - "step": 176 - }, - { - "epoch": 1.0858895705521472, - "grad_norm": 2.549513339996338, - "learning_rate": 4.857540478949302e-06, - "loss": 0.4574, - "step": 177 - }, - { - "epoch": 1.0920245398773005, - "grad_norm": 3.459400177001953, - "learning_rate": 4.855932795878852e-06, - "loss": 0.8095, - "step": 178 - }, - { - "epoch": 1.098159509202454, - "grad_norm": 2.8103644847869873, - "learning_rate": 4.854316361211619e-06, - "loss": 0.4578, - "step": 179 - }, - { - "epoch": 1.1042944785276074, - "grad_norm": 2.631221055984497, - "learning_rate": 4.852691180952183e-06, - "loss": 0.5473, - "step": 180 - }, - { - "epoch": 1.1104294478527608, - "grad_norm": 3.189946174621582, - "learning_rate": 4.851057261137608e-06, - "loss": 0.4313, - "step": 181 - }, - { - "epoch": 1.116564417177914, - "grad_norm": 2.891418933868408, - "learning_rate": 4.8494146078374274e-06, - "loss": 0.4197, - "step": 182 - }, - { - "epoch": 1.1226993865030674, - "grad_norm": 3.239637613296509, - "learning_rate": 4.847763227153612e-06, - "loss": 0.5865, - "step": 183 - }, - { - "epoch": 1.1288343558282208, - "grad_norm": 2.484644651412964, - "learning_rate": 4.846103125220557e-06, - "loss": 0.3866, - "step": 184 - }, - { - "epoch": 1.1349693251533743, - "grad_norm": 3.1045992374420166, - "learning_rate": 4.844434308205052e-06, - "loss": 0.5357, - "step": 185 - }, - { - "epoch": 1.1411042944785277, - "grad_norm": 2.648472309112549, - "learning_rate": 4.842756782306261e-06, - "loss": 0.4783, - "step": 186 - }, - { - "epoch": 1.147239263803681, - "grad_norm": 2.5685644149780273, - "learning_rate": 4.841070553755697e-06, - "loss": 0.3733, - "step": 187 - }, - { - "epoch": 1.1533742331288344, - "grad_norm": 3.7727200984954834, - "learning_rate": 4.839375628817205e-06, - "loss": 0.6039, - "step": 188 - }, - { - "epoch": 1.1595092024539877, - "grad_norm": 2.8237369060516357, - "learning_rate": 4.837672013786931e-06, - "loss": 0.5372, - "step": 189 - }, - { - "epoch": 1.165644171779141, - "grad_norm": 3.0312252044677734, - "learning_rate": 4.835959714993305e-06, - "loss": 0.5162, - "step": 190 - }, - { - "epoch": 1.1717791411042944, - "grad_norm": 2.821498394012451, - "learning_rate": 4.8342387387970105e-06, - "loss": 0.4537, - "step": 191 - }, - { - "epoch": 1.177914110429448, - "grad_norm": 2.7834129333496094, - "learning_rate": 4.832509091590968e-06, - "loss": 0.6165, - "step": 192 - }, - { - "epoch": 1.1840490797546013, - "grad_norm": 2.9274091720581055, - "learning_rate": 4.830770779800309e-06, - "loss": 0.7475, - "step": 193 - }, - { - "epoch": 1.1901840490797546, - "grad_norm": 2.813945770263672, - "learning_rate": 4.829023809882349e-06, - "loss": 0.4629, - "step": 194 - }, - { - "epoch": 1.196319018404908, - "grad_norm": 2.27876877784729, - "learning_rate": 4.827268188326567e-06, - "loss": 0.5208, - "step": 195 - }, - { - "epoch": 1.2024539877300613, - "grad_norm": 2.8444204330444336, - "learning_rate": 4.825503921654582e-06, - "loss": 0.6521, - "step": 196 - }, - { - "epoch": 1.2085889570552146, - "grad_norm": 3.3730578422546387, - "learning_rate": 4.823731016420122e-06, - "loss": 0.7491, - "step": 197 - }, - { - "epoch": 1.2147239263803682, - "grad_norm": 2.9717822074890137, - "learning_rate": 4.821949479209011e-06, - "loss": 0.3866, - "step": 198 - }, - { - "epoch": 1.2208588957055215, - "grad_norm": 2.6570653915405273, - "learning_rate": 4.820159316639133e-06, - "loss": 0.499, - "step": 199 - }, - { - "epoch": 1.2269938650306749, - "grad_norm": 2.819960117340088, - "learning_rate": 4.818360535360418e-06, - "loss": 0.556, - "step": 200 - }, - { - "epoch": 1.2331288343558282, - "grad_norm": 2.7912111282348633, - "learning_rate": 4.816553142054806e-06, - "loss": 0.3433, - "step": 201 - }, - { - "epoch": 1.2392638036809815, - "grad_norm": 2.6427981853485107, - "learning_rate": 4.814737143436232e-06, - "loss": 0.8808, - "step": 202 - }, - { - "epoch": 1.2453987730061349, - "grad_norm": 2.5917580127716064, - "learning_rate": 4.812912546250595e-06, - "loss": 0.5718, - "step": 203 - }, - { - "epoch": 1.2515337423312882, - "grad_norm": 3.770759344100952, - "learning_rate": 4.81107935727574e-06, - "loss": 0.9743, - "step": 204 - }, - { - "epoch": 1.2576687116564418, - "grad_norm": 2.558248996734619, - "learning_rate": 4.809237583321421e-06, - "loss": 0.2821, - "step": 205 - }, - { - "epoch": 1.2638036809815951, - "grad_norm": 2.692087173461914, - "learning_rate": 4.807387231229287e-06, - "loss": 0.7524, - "step": 206 - }, - { - "epoch": 1.2699386503067485, - "grad_norm": 2.661738157272339, - "learning_rate": 4.8055283078728525e-06, - "loss": 0.4304, - "step": 207 - }, - { - "epoch": 1.2760736196319018, - "grad_norm": 2.9232122898101807, - "learning_rate": 4.803660820157468e-06, - "loss": 0.6986, - "step": 208 - }, - { - "epoch": 1.2822085889570551, - "grad_norm": 2.665097951889038, - "learning_rate": 4.801784775020303e-06, - "loss": 0.7112, - "step": 209 - }, - { - "epoch": 1.2883435582822087, - "grad_norm": 2.4504497051239014, - "learning_rate": 4.799900179430312e-06, - "loss": 0.4125, - "step": 210 - }, - { - "epoch": 1.294478527607362, - "grad_norm": 3.076204538345337, - "learning_rate": 4.798007040388212e-06, - "loss": 0.7057, - "step": 211 - }, - { - "epoch": 1.3006134969325154, - "grad_norm": 2.406977653503418, - "learning_rate": 4.7961053649264585e-06, - "loss": 0.708, - "step": 212 - }, - { - "epoch": 1.3067484662576687, - "grad_norm": 2.6545324325561523, - "learning_rate": 4.794195160109215e-06, - "loss": 0.7608, - "step": 213 - }, - { - "epoch": 1.312883435582822, - "grad_norm": 4.3817033767700195, - "learning_rate": 4.7922764330323315e-06, - "loss": 0.4779, - "step": 214 - }, - { - "epoch": 1.3190184049079754, - "grad_norm": 3.534566879272461, - "learning_rate": 4.790349190823313e-06, - "loss": 0.5464, - "step": 215 - }, - { - "epoch": 1.3251533742331287, - "grad_norm": 3.0323140621185303, - "learning_rate": 4.788413440641297e-06, - "loss": 0.6198, - "step": 216 - }, - { - "epoch": 1.331288343558282, - "grad_norm": 2.612746238708496, - "learning_rate": 4.786469189677026e-06, - "loss": 0.6695, - "step": 217 - }, - { - "epoch": 1.3374233128834356, - "grad_norm": 3.0299434661865234, - "learning_rate": 4.784516445152821e-06, - "loss": 0.4902, - "step": 218 - }, - { - "epoch": 1.343558282208589, - "grad_norm": 3.4521942138671875, - "learning_rate": 4.78255521432255e-06, - "loss": 0.7411, - "step": 219 - }, - { - "epoch": 1.3496932515337423, - "grad_norm": 2.6712653636932373, - "learning_rate": 4.780585504471612e-06, - "loss": 0.8767, - "step": 220 - }, - { - "epoch": 1.3558282208588956, - "grad_norm": 2.5099475383758545, - "learning_rate": 4.778607322916896e-06, - "loss": 0.4266, - "step": 221 - }, - { - "epoch": 1.3619631901840492, - "grad_norm": 2.641799211502075, - "learning_rate": 4.776620677006766e-06, - "loss": 0.4982, - "step": 222 - }, - { - "epoch": 1.3680981595092025, - "grad_norm": 3.1119771003723145, - "learning_rate": 4.7746255741210256e-06, - "loss": 0.6012, - "step": 223 - }, - { - "epoch": 1.3742331288343559, - "grad_norm": 3.9957170486450195, - "learning_rate": 4.772622021670897e-06, - "loss": 0.7585, - "step": 224 - }, - { - "epoch": 1.3803680981595092, - "grad_norm": 3.1070823669433594, - "learning_rate": 4.770610027098983e-06, - "loss": 0.5266, - "step": 225 - }, - { - "epoch": 1.3865030674846626, - "grad_norm": 2.7630460262298584, - "learning_rate": 4.7685895978792564e-06, - "loss": 0.6261, - "step": 226 - }, - { - "epoch": 1.392638036809816, - "grad_norm": 2.6509556770324707, - "learning_rate": 4.766560741517014e-06, - "loss": 0.7081, - "step": 227 - }, - { - "epoch": 1.3987730061349692, - "grad_norm": 3.0212976932525635, - "learning_rate": 4.76452346554886e-06, - "loss": 0.5041, - "step": 228 - }, - { - "epoch": 1.4049079754601226, - "grad_norm": 3.0454728603363037, - "learning_rate": 4.762477777542676e-06, - "loss": 0.49, - "step": 229 - }, - { - "epoch": 1.4110429447852761, - "grad_norm": 3.4296791553497314, - "learning_rate": 4.7604236850975905e-06, - "loss": 0.7056, - "step": 230 - }, - { - "epoch": 1.4171779141104295, - "grad_norm": 4.1885600090026855, - "learning_rate": 4.7583611958439514e-06, - "loss": 0.7762, - "step": 231 - }, - { - "epoch": 1.4233128834355828, - "grad_norm": 3.065854072570801, - "learning_rate": 4.7562903174433e-06, - "loss": 0.5347, - "step": 232 - }, - { - "epoch": 1.4294478527607362, - "grad_norm": 2.793851852416992, - "learning_rate": 4.75421105758834e-06, - "loss": 0.503, - "step": 233 - }, - { - "epoch": 1.4355828220858895, - "grad_norm": 3.123730421066284, - "learning_rate": 4.752123424002908e-06, - "loss": 0.5081, - "step": 234 - }, - { - "epoch": 1.441717791411043, - "grad_norm": 3.230161666870117, - "learning_rate": 4.750027424441949e-06, - "loss": 0.7523, - "step": 235 - }, - { - "epoch": 1.4478527607361964, - "grad_norm": 2.4970247745513916, - "learning_rate": 4.747923066691487e-06, - "loss": 0.5575, - "step": 236 - }, - { - "epoch": 1.4539877300613497, - "grad_norm": 2.9880685806274414, - "learning_rate": 4.745810358568588e-06, - "loss": 0.7264, - "step": 237 - }, - { - "epoch": 1.460122699386503, - "grad_norm": 2.555328369140625, - "learning_rate": 4.743689307921342e-06, - "loss": 0.4545, - "step": 238 - }, - { - "epoch": 1.4662576687116564, - "grad_norm": 3.144932746887207, - "learning_rate": 4.741559922628828e-06, - "loss": 0.5429, - "step": 239 - }, - { - "epoch": 1.4723926380368098, - "grad_norm": 3.059807062149048, - "learning_rate": 4.739422210601085e-06, - "loss": 0.5086, - "step": 240 - }, - { - "epoch": 1.478527607361963, - "grad_norm": 3.374303102493286, - "learning_rate": 4.7372761797790836e-06, - "loss": 0.6109, - "step": 241 - }, - { - "epoch": 1.4846625766871164, - "grad_norm": 2.4506947994232178, - "learning_rate": 4.735121838134697e-06, - "loss": 0.4317, - "step": 242 - }, - { - "epoch": 1.49079754601227, - "grad_norm": 2.9039974212646484, - "learning_rate": 4.732959193670672e-06, - "loss": 0.6414, - "step": 243 - }, - { - "epoch": 1.4969325153374233, - "grad_norm": 2.9412453174591064, - "learning_rate": 4.730788254420593e-06, - "loss": 0.5166, - "step": 244 - }, - { - "epoch": 1.5030674846625767, - "grad_norm": 2.500716209411621, - "learning_rate": 4.728609028448862e-06, - "loss": 0.4982, - "step": 245 - }, - { - "epoch": 1.50920245398773, - "grad_norm": 2.4233803749084473, - "learning_rate": 4.726421523850662e-06, - "loss": 0.7552, - "step": 246 - }, - { - "epoch": 1.5153374233128836, - "grad_norm": 2.357003688812256, - "learning_rate": 4.7242257487519275e-06, - "loss": 0.4365, - "step": 247 - }, - { - "epoch": 1.521472392638037, - "grad_norm": 2.6406495571136475, - "learning_rate": 4.722021711309317e-06, - "loss": 0.6002, - "step": 248 - }, - { - "epoch": 1.5276073619631902, - "grad_norm": 2.736884832382202, - "learning_rate": 4.7198094197101826e-06, - "loss": 0.4993, - "step": 249 - }, - { - "epoch": 1.5337423312883436, - "grad_norm": 3.5238845348358154, - "learning_rate": 4.7175888821725335e-06, - "loss": 0.4637, - "step": 250 - }, - { - "epoch": 1.539877300613497, - "grad_norm": 3.3783695697784424, - "learning_rate": 4.715360106945015e-06, - "loss": 0.9711, - "step": 251 - }, - { - "epoch": 1.5460122699386503, - "grad_norm": 2.9685862064361572, - "learning_rate": 4.713123102306869e-06, - "loss": 0.5452, - "step": 252 - }, - { - "epoch": 1.5521472392638036, - "grad_norm": 3.143733263015747, - "learning_rate": 4.710877876567912e-06, - "loss": 0.5034, - "step": 253 - }, - { - "epoch": 1.558282208588957, - "grad_norm": 2.8005623817443848, - "learning_rate": 4.708624438068494e-06, - "loss": 0.4236, - "step": 254 - }, - { - "epoch": 1.5644171779141103, - "grad_norm": 2.66581130027771, - "learning_rate": 4.706362795179476e-06, - "loss": 0.6095, - "step": 255 - }, - { - "epoch": 1.5705521472392638, - "grad_norm": 4.598043441772461, - "learning_rate": 4.7040929563021975e-06, - "loss": 0.738, - "step": 256 - }, - { - "epoch": 1.5766871165644172, - "grad_norm": 3.5643506050109863, - "learning_rate": 4.70181492986844e-06, - "loss": 0.6726, - "step": 257 - }, - { - "epoch": 1.5828220858895705, - "grad_norm": 2.865339994430542, - "learning_rate": 4.699528724340401e-06, - "loss": 0.4862, - "step": 258 - }, - { - "epoch": 1.588957055214724, - "grad_norm": 2.95529842376709, - "learning_rate": 4.6972343482106615e-06, - "loss": 0.5003, - "step": 259 - }, - { - "epoch": 1.5950920245398774, - "grad_norm": 2.45206356048584, - "learning_rate": 4.6949318100021546e-06, - "loss": 0.6734, - "step": 260 - }, - { - "epoch": 1.6012269938650308, - "grad_norm": 2.6789939403533936, - "learning_rate": 4.6926211182681295e-06, - "loss": 0.5639, - "step": 261 - }, - { - "epoch": 1.607361963190184, - "grad_norm": 3.307732582092285, - "learning_rate": 4.690302281592128e-06, - "loss": 0.7032, - "step": 262 - }, - { - "epoch": 1.6134969325153374, - "grad_norm": 2.8950445652008057, - "learning_rate": 4.687975308587944e-06, - "loss": 0.4937, - "step": 263 - }, - { - "epoch": 1.6196319018404908, - "grad_norm": 2.969377040863037, - "learning_rate": 4.685640207899598e-06, - "loss": 0.5829, - "step": 264 - }, - { - "epoch": 1.6257668711656441, - "grad_norm": 3.106433391571045, - "learning_rate": 4.683296988201301e-06, - "loss": 0.3805, - "step": 265 - }, - { - "epoch": 1.6319018404907975, - "grad_norm": 3.5599050521850586, - "learning_rate": 4.680945658197425e-06, - "loss": 0.7939, - "step": 266 - }, - { - "epoch": 1.6380368098159508, - "grad_norm": 5.008603096008301, - "learning_rate": 4.6785862266224695e-06, - "loss": 0.7511, - "step": 267 - }, - { - "epoch": 1.6441717791411041, - "grad_norm": 3.1393773555755615, - "learning_rate": 4.676218702241026e-06, - "loss": 0.8984, - "step": 268 - }, - { - "epoch": 1.6503067484662577, - "grad_norm": 3.0241408348083496, - "learning_rate": 4.673843093847753e-06, - "loss": 0.5473, - "step": 269 - }, - { - "epoch": 1.656441717791411, - "grad_norm": 2.9029417037963867, - "learning_rate": 4.6714594102673355e-06, - "loss": 0.6626, - "step": 270 - }, - { - "epoch": 1.6625766871165644, - "grad_norm": 3.4709246158599854, - "learning_rate": 4.669067660354456e-06, - "loss": 0.5015, - "step": 271 - }, - { - "epoch": 1.668711656441718, - "grad_norm": 2.988635778427124, - "learning_rate": 4.666667852993761e-06, - "loss": 0.5384, - "step": 272 - }, - { - "epoch": 1.6748466257668713, - "grad_norm": 3.418140411376953, - "learning_rate": 4.664259997099829e-06, - "loss": 0.7491, - "step": 273 - }, - { - "epoch": 1.6809815950920246, - "grad_norm": 2.592416763305664, - "learning_rate": 4.661844101617135e-06, - "loss": 0.6451, - "step": 274 - }, - { - "epoch": 1.687116564417178, - "grad_norm": 3.1174306869506836, - "learning_rate": 4.6594201755200205e-06, - "loss": 0.6299, - "step": 275 - }, - { - "epoch": 1.6932515337423313, - "grad_norm": 2.6569998264312744, - "learning_rate": 4.656988227812658e-06, - "loss": 0.4477, - "step": 276 - }, - { - "epoch": 1.6993865030674846, - "grad_norm": 3.5733959674835205, - "learning_rate": 4.654548267529015e-06, - "loss": 0.5473, - "step": 277 - }, - { - "epoch": 1.705521472392638, - "grad_norm": 2.7240824699401855, - "learning_rate": 4.652100303732827e-06, - "loss": 0.496, - "step": 278 - }, - { - "epoch": 1.7116564417177913, - "grad_norm": 4.1965460777282715, - "learning_rate": 4.64964434551756e-06, - "loss": 0.932, - "step": 279 - }, - { - "epoch": 1.7177914110429446, - "grad_norm": 2.3237173557281494, - "learning_rate": 4.647180402006372e-06, - "loss": 0.4648, - "step": 280 - }, - { - "epoch": 1.7239263803680982, - "grad_norm": 3.395045042037964, - "learning_rate": 4.644708482352093e-06, - "loss": 0.7237, - "step": 281 - }, - { - "epoch": 1.7300613496932515, - "grad_norm": 3.238593816757202, - "learning_rate": 4.6422285957371735e-06, - "loss": 0.5531, - "step": 282 - }, - { - "epoch": 1.7361963190184049, - "grad_norm": 3.9651403427124023, - "learning_rate": 4.639740751373663e-06, - "loss": 0.6706, - "step": 283 - }, - { - "epoch": 1.7423312883435584, - "grad_norm": 3.0042061805725098, - "learning_rate": 4.63724495850317e-06, - "loss": 0.56, - "step": 284 - }, - { - "epoch": 1.7484662576687118, - "grad_norm": 3.094310760498047, - "learning_rate": 4.634741226396832e-06, - "loss": 0.6138, - "step": 285 - }, - { - "epoch": 1.7546012269938651, - "grad_norm": 2.838168144226074, - "learning_rate": 4.632229564355275e-06, - "loss": 0.4908, - "step": 286 - }, - { - "epoch": 1.7607361963190185, - "grad_norm": 3.3452796936035156, - "learning_rate": 4.629709981708586e-06, - "loss": 0.8181, - "step": 287 - }, - { - "epoch": 1.7668711656441718, - "grad_norm": 2.6630783081054688, - "learning_rate": 4.6271824878162704e-06, - "loss": 0.5625, - "step": 288 - }, - { - "epoch": 1.7730061349693251, - "grad_norm": 2.583650588989258, - "learning_rate": 4.624647092067226e-06, - "loss": 0.3416, - "step": 289 - }, - { - "epoch": 1.7791411042944785, - "grad_norm": 2.73132586479187, - "learning_rate": 4.622103803879702e-06, - "loss": 0.3889, - "step": 290 - }, - { - "epoch": 1.7852760736196318, - "grad_norm": 4.1010260581970215, - "learning_rate": 4.619552632701263e-06, - "loss": 0.611, - "step": 291 - }, - { - "epoch": 1.7914110429447851, - "grad_norm": 4.53068208694458, - "learning_rate": 4.61699358800876e-06, - "loss": 0.7219, - "step": 292 - }, - { - "epoch": 1.7975460122699385, - "grad_norm": 3.4877254962921143, - "learning_rate": 4.614426679308291e-06, - "loss": 0.6402, - "step": 293 - }, - { - "epoch": 1.803680981595092, - "grad_norm": 2.9445226192474365, - "learning_rate": 4.611851916135166e-06, - "loss": 0.509, - "step": 294 - }, - { - "epoch": 1.8098159509202454, - "grad_norm": 2.6622228622436523, - "learning_rate": 4.609269308053872e-06, - "loss": 0.6167, - "step": 295 - }, - { - "epoch": 1.8159509202453987, - "grad_norm": 3.131530523300171, - "learning_rate": 4.606678864658039e-06, - "loss": 0.8039, - "step": 296 - }, - { - "epoch": 1.8220858895705523, - "grad_norm": 3.212188482284546, - "learning_rate": 4.604080595570399e-06, - "loss": 0.5754, - "step": 297 - }, - { - "epoch": 1.8282208588957056, - "grad_norm": 3.522850275039673, - "learning_rate": 4.601474510442759e-06, - "loss": 0.4432, - "step": 298 - }, - { - "epoch": 1.834355828220859, - "grad_norm": 2.5877151489257812, - "learning_rate": 4.598860618955957e-06, - "loss": 0.6541, - "step": 299 - }, - { - "epoch": 1.8404907975460123, - "grad_norm": 2.803833484649658, - "learning_rate": 4.596238930819832e-06, - "loss": 0.5824, - "step": 300 - }, - { - "epoch": 1.8466257668711656, - "grad_norm": 2.7125494480133057, - "learning_rate": 4.5936094557731815e-06, - "loss": 0.6976, - "step": 301 - }, - { - "epoch": 1.852760736196319, - "grad_norm": 3.6549370288848877, - "learning_rate": 4.590972203583732e-06, - "loss": 0.7105, - "step": 302 - }, - { - "epoch": 1.8588957055214723, - "grad_norm": 3.3241944313049316, - "learning_rate": 4.588327184048099e-06, - "loss": 0.7446, - "step": 303 - }, - { - "epoch": 1.8650306748466257, - "grad_norm": 2.8388822078704834, - "learning_rate": 4.585674406991752e-06, - "loss": 0.4926, - "step": 304 - }, - { - "epoch": 1.871165644171779, - "grad_norm": 2.9760420322418213, - "learning_rate": 4.5830138822689755e-06, - "loss": 0.7368, - "step": 305 - }, - { - "epoch": 1.8773006134969326, - "grad_norm": 2.5437633991241455, - "learning_rate": 4.5803456197628374e-06, - "loss": 0.4678, - "step": 306 - }, - { - "epoch": 1.883435582822086, - "grad_norm": 3.0044775009155273, - "learning_rate": 4.577669629385145e-06, - "loss": 0.4241, - "step": 307 - }, - { - "epoch": 1.8895705521472392, - "grad_norm": 2.6150901317596436, - "learning_rate": 4.574985921076418e-06, - "loss": 0.5327, - "step": 308 - }, - { - "epoch": 1.8957055214723928, - "grad_norm": 2.4425182342529297, - "learning_rate": 4.572294504805841e-06, - "loss": 0.7504, - "step": 309 - }, - { - "epoch": 1.9018404907975461, - "grad_norm": 2.9920194149017334, - "learning_rate": 4.569595390571232e-06, - "loss": 0.5194, - "step": 310 - }, - { - "epoch": 1.9079754601226995, - "grad_norm": 2.701087713241577, - "learning_rate": 4.566888588399007e-06, - "loss": 0.6862, - "step": 311 - }, - { - "epoch": 1.9141104294478528, - "grad_norm": 7.628893852233887, - "learning_rate": 4.564174108344139e-06, - "loss": 0.6867, - "step": 312 - }, - { - "epoch": 1.9202453987730062, - "grad_norm": 2.712947130203247, - "learning_rate": 4.561451960490123e-06, - "loss": 0.6942, - "step": 313 - }, - { - "epoch": 1.9263803680981595, - "grad_norm": 3.0063202381134033, - "learning_rate": 4.558722154948937e-06, - "loss": 0.6346, - "step": 314 - }, - { - "epoch": 1.9325153374233128, - "grad_norm": 2.957218647003174, - "learning_rate": 4.5559847018610034e-06, - "loss": 0.464, - "step": 315 - }, - { - "epoch": 1.9386503067484662, - "grad_norm": 3.322282552719116, - "learning_rate": 4.553239611395156e-06, - "loss": 0.6334, - "step": 316 - }, - { - "epoch": 1.9447852760736195, - "grad_norm": 3.0638647079467773, - "learning_rate": 4.550486893748596e-06, - "loss": 0.4227, - "step": 317 - }, - { - "epoch": 1.9509202453987728, - "grad_norm": 3.079087257385254, - "learning_rate": 4.547726559146862e-06, - "loss": 0.3719, - "step": 318 - }, - { - "epoch": 1.9570552147239264, - "grad_norm": 2.409914255142212, - "learning_rate": 4.544958617843782e-06, - "loss": 0.3331, - "step": 319 - }, - { - "epoch": 1.9631901840490797, - "grad_norm": 3.3441262245178223, - "learning_rate": 4.542183080121444e-06, - "loss": 0.6931, - "step": 320 - }, - { - "epoch": 1.969325153374233, - "grad_norm": 2.6624436378479004, - "learning_rate": 4.539399956290152e-06, - "loss": 0.6578, - "step": 321 - }, - { - "epoch": 1.9754601226993866, - "grad_norm": 3.463789224624634, - "learning_rate": 4.536609256688396e-06, - "loss": 0.5748, - "step": 322 - }, - { - "epoch": 1.98159509202454, - "grad_norm": 3.6827807426452637, - "learning_rate": 4.533810991682799e-06, - "loss": 0.5249, - "step": 323 - }, - { - "epoch": 1.9877300613496933, - "grad_norm": 4.125547409057617, - "learning_rate": 4.531005171668093e-06, - "loss": 0.3065, - "step": 324 - }, - { - "epoch": 1.9938650306748467, - "grad_norm": 2.935978412628174, - "learning_rate": 4.528191807067074e-06, - "loss": 0.5523, - "step": 325 - }, - { - "epoch": 2.0, - "grad_norm": 2.654388427734375, - "learning_rate": 4.525370908330564e-06, - "loss": 0.4157, - "step": 326 - }, - { - "epoch": 2.0061349693251533, - "grad_norm": 3.213925838470459, - "learning_rate": 4.522542485937369e-06, - "loss": 0.4243, - "step": 327 - }, - { - "epoch": 2.0122699386503067, - "grad_norm": 3.5483286380767822, - "learning_rate": 4.519706550394248e-06, - "loss": 0.4137, - "step": 328 - }, - { - "epoch": 2.01840490797546, - "grad_norm": 3.32084059715271, - "learning_rate": 4.516863112235864e-06, - "loss": 0.5389, - "step": 329 - }, - { - "epoch": 2.0245398773006134, - "grad_norm": 3.427666425704956, - "learning_rate": 4.514012182024756e-06, - "loss": 0.285, - "step": 330 - }, - { - "epoch": 2.0306748466257667, - "grad_norm": 3.3269975185394287, - "learning_rate": 4.511153770351288e-06, - "loss": 0.4877, - "step": 331 - }, - { - "epoch": 2.03680981595092, - "grad_norm": 5.258850574493408, - "learning_rate": 4.508287887833619e-06, - "loss": 0.5168, - "step": 332 - }, - { - "epoch": 2.042944785276074, - "grad_norm": 4.316092491149902, - "learning_rate": 4.505414545117658e-06, - "loss": 0.4791, - "step": 333 - }, - { - "epoch": 2.049079754601227, - "grad_norm": 3.952056884765625, - "learning_rate": 4.502533752877028e-06, - "loss": 0.3014, - "step": 334 - }, - { - "epoch": 2.0552147239263805, - "grad_norm": 4.0617194175720215, - "learning_rate": 4.499645521813024e-06, - "loss": 0.4313, - "step": 335 - }, - { - "epoch": 2.061349693251534, - "grad_norm": 3.7869274616241455, - "learning_rate": 4.496749862654574e-06, - "loss": 0.4807, - "step": 336 - }, - { - "epoch": 2.067484662576687, - "grad_norm": 3.8181991577148438, - "learning_rate": 4.4938467861582e-06, - "loss": 0.4002, - "step": 337 - }, - { - "epoch": 2.0736196319018405, - "grad_norm": 3.8289854526519775, - "learning_rate": 4.490936303107975e-06, - "loss": 0.618, - "step": 338 - }, - { - "epoch": 2.079754601226994, - "grad_norm": 3.121443271636963, - "learning_rate": 4.488018424315488e-06, - "loss": 0.4203, - "step": 339 - }, - { - "epoch": 2.085889570552147, - "grad_norm": 3.141782283782959, - "learning_rate": 4.4850931606198e-06, - "loss": 0.3618, - "step": 340 - }, - { - "epoch": 2.0920245398773005, - "grad_norm": 3.1279287338256836, - "learning_rate": 4.482160522887404e-06, - "loss": 0.4571, - "step": 341 - }, - { - "epoch": 2.098159509202454, - "grad_norm": 3.2418482303619385, - "learning_rate": 4.479220522012185e-06, - "loss": 0.2674, - "step": 342 - }, - { - "epoch": 2.104294478527607, - "grad_norm": 10.230683326721191, - "learning_rate": 4.476273168915382e-06, - "loss": 0.5479, - "step": 343 - }, - { - "epoch": 2.1104294478527605, - "grad_norm": 3.588361978530884, - "learning_rate": 4.473318474545544e-06, - "loss": 0.3654, - "step": 344 - }, - { - "epoch": 2.116564417177914, - "grad_norm": 3.0913164615631104, - "learning_rate": 4.470356449878489e-06, - "loss": 0.2704, - "step": 345 - }, - { - "epoch": 2.1226993865030677, - "grad_norm": 3.972447633743286, - "learning_rate": 4.467387105917269e-06, - "loss": 0.3029, - "step": 346 - }, - { - "epoch": 2.128834355828221, - "grad_norm": 3.7174713611602783, - "learning_rate": 4.464410453692122e-06, - "loss": 0.6536, - "step": 347 - }, - { - "epoch": 2.1349693251533743, - "grad_norm": 3.9333994388580322, - "learning_rate": 4.461426504260434e-06, - "loss": 0.3806, - "step": 348 - }, - { - "epoch": 2.1411042944785277, - "grad_norm": 4.752816200256348, - "learning_rate": 4.458435268706699e-06, - "loss": 0.4019, - "step": 349 - }, - { - "epoch": 2.147239263803681, - "grad_norm": 2.505603790283203, - "learning_rate": 4.455436758142477e-06, - "loss": 0.2348, - "step": 350 - }, - { - "epoch": 2.1533742331288344, - "grad_norm": 3.3050570487976074, - "learning_rate": 4.452430983706351e-06, - "loss": 0.505, - "step": 351 - }, - { - "epoch": 2.1595092024539877, - "grad_norm": 5.387442588806152, - "learning_rate": 4.44941795656389e-06, - "loss": 0.399, - "step": 352 - }, - { - "epoch": 2.165644171779141, - "grad_norm": 3.4759480953216553, - "learning_rate": 4.446397687907601e-06, - "loss": 0.5664, - "step": 353 - }, - { - "epoch": 2.1717791411042944, - "grad_norm": 2.949445962905884, - "learning_rate": 4.4433701889568935e-06, - "loss": 0.2128, - "step": 354 - }, - { - "epoch": 2.1779141104294477, - "grad_norm": 3.2884252071380615, - "learning_rate": 4.440335470958035e-06, - "loss": 0.3138, - "step": 355 - }, - { - "epoch": 2.184049079754601, - "grad_norm": 3.1605632305145264, - "learning_rate": 4.437293545184111e-06, - "loss": 0.349, - "step": 356 - }, - { - "epoch": 2.190184049079755, - "grad_norm": 2.9996821880340576, - "learning_rate": 4.434244422934976e-06, - "loss": 0.343, - "step": 357 - }, - { - "epoch": 2.196319018404908, - "grad_norm": 3.6373324394226074, - "learning_rate": 4.431188115537226e-06, - "loss": 0.5656, - "step": 358 - }, - { - "epoch": 2.2024539877300615, - "grad_norm": 4.667621612548828, - "learning_rate": 4.428124634344141e-06, - "loss": 0.2335, - "step": 359 - }, - { - "epoch": 2.208588957055215, - "grad_norm": 3.815484046936035, - "learning_rate": 4.425053990735653e-06, - "loss": 0.2173, - "step": 360 - }, - { - "epoch": 2.214723926380368, - "grad_norm": 4.689478874206543, - "learning_rate": 4.421976196118297e-06, - "loss": 0.5071, - "step": 361 - }, - { - "epoch": 2.2208588957055215, - "grad_norm": 4.016942024230957, - "learning_rate": 4.4188912619251765e-06, - "loss": 0.384, - "step": 362 - }, - { - "epoch": 2.226993865030675, - "grad_norm": 3.5336828231811523, - "learning_rate": 4.415799199615912e-06, - "loss": 0.3133, - "step": 363 - }, - { - "epoch": 2.233128834355828, - "grad_norm": 2.9195592403411865, - "learning_rate": 4.4127000206766055e-06, - "loss": 0.3847, - "step": 364 - }, - { - "epoch": 2.2392638036809815, - "grad_norm": 2.6843531131744385, - "learning_rate": 4.409593736619795e-06, - "loss": 0.3539, - "step": 365 - }, - { - "epoch": 2.245398773006135, - "grad_norm": 2.8692703247070312, - "learning_rate": 4.40648035898441e-06, - "loss": 0.3664, - "step": 366 - }, - { - "epoch": 2.2515337423312882, - "grad_norm": 2.820422649383545, - "learning_rate": 4.403359899335732e-06, - "loss": 0.4606, - "step": 367 - }, - { - "epoch": 2.2576687116564416, - "grad_norm": 3.8641669750213623, - "learning_rate": 4.400232369265351e-06, - "loss": 0.2931, - "step": 368 - }, - { - "epoch": 2.263803680981595, - "grad_norm": 2.75347638130188, - "learning_rate": 4.39709778039112e-06, - "loss": 0.3393, - "step": 369 - }, - { - "epoch": 2.2699386503067487, - "grad_norm": 15.150428771972656, - "learning_rate": 4.393956144357113e-06, - "loss": 0.65, - "step": 370 - }, - { - "epoch": 2.276073619631902, - "grad_norm": 2.4876065254211426, - "learning_rate": 4.390807472833585e-06, - "loss": 0.372, - "step": 371 - }, - { - "epoch": 2.2822085889570554, - "grad_norm": 2.7328054904937744, - "learning_rate": 4.3876517775169216e-06, - "loss": 0.2802, - "step": 372 - }, - { - "epoch": 2.2883435582822087, - "grad_norm": 2.903221368789673, - "learning_rate": 4.384489070129604e-06, - "loss": 0.1964, - "step": 373 - }, - { - "epoch": 2.294478527607362, - "grad_norm": 3.9368724822998047, - "learning_rate": 4.381319362420158e-06, - "loss": 0.4272, - "step": 374 - }, - { - "epoch": 2.3006134969325154, - "grad_norm": 5.431981086730957, - "learning_rate": 4.378142666163114e-06, - "loss": 0.4513, - "step": 375 - }, - { - "epoch": 2.3067484662576687, - "grad_norm": 3.661733627319336, - "learning_rate": 4.374958993158965e-06, - "loss": 0.6087, - "step": 376 - }, - { - "epoch": 2.312883435582822, - "grad_norm": 3.004450559616089, - "learning_rate": 4.371768355234116e-06, - "loss": 0.2206, - "step": 377 - }, - { - "epoch": 2.3190184049079754, - "grad_norm": 4.3785576820373535, - "learning_rate": 4.368570764240852e-06, - "loss": 0.6055, - "step": 378 - }, - { - "epoch": 2.3251533742331287, - "grad_norm": 3.4699394702911377, - "learning_rate": 4.365366232057279e-06, - "loss": 0.6286, - "step": 379 - }, - { - "epoch": 2.331288343558282, - "grad_norm": 2.6862998008728027, - "learning_rate": 4.3621547705872915e-06, - "loss": 0.2622, - "step": 380 - }, - { - "epoch": 2.3374233128834354, - "grad_norm": 3.056382179260254, - "learning_rate": 4.358936391760524e-06, - "loss": 0.3439, - "step": 381 - }, - { - "epoch": 2.3435582822085887, - "grad_norm": 2.6211307048797607, - "learning_rate": 4.355711107532305e-06, - "loss": 0.3677, - "step": 382 - }, - { - "epoch": 2.3496932515337425, - "grad_norm": 2.682060956954956, - "learning_rate": 4.3524789298836175e-06, - "loss": 0.3068, - "step": 383 - }, - { - "epoch": 2.355828220858896, - "grad_norm": 3.482539415359497, - "learning_rate": 4.349239870821049e-06, - "loss": 0.3737, - "step": 384 - }, - { - "epoch": 2.361963190184049, - "grad_norm": 2.8645472526550293, - "learning_rate": 4.345993942376752e-06, - "loss": 0.2837, - "step": 385 - }, - { - "epoch": 2.3680981595092025, - "grad_norm": 3.6142354011535645, - "learning_rate": 4.342741156608392e-06, - "loss": 0.6162, - "step": 386 - }, - { - "epoch": 2.374233128834356, - "grad_norm": 3.0748162269592285, - "learning_rate": 4.3394815255991135e-06, - "loss": 0.2986, - "step": 387 - }, - { - "epoch": 2.3803680981595092, - "grad_norm": 5.090906620025635, - "learning_rate": 4.336215061457485e-06, - "loss": 0.5383, - "step": 388 - }, - { - "epoch": 2.3865030674846626, - "grad_norm": 3.9235823154449463, - "learning_rate": 4.332941776317458e-06, - "loss": 0.4179, - "step": 389 - }, - { - "epoch": 2.392638036809816, - "grad_norm": 3.482926368713379, - "learning_rate": 4.329661682338325e-06, - "loss": 0.3938, - "step": 390 - }, - { - "epoch": 2.3987730061349692, - "grad_norm": 4.274583339691162, - "learning_rate": 4.32637479170467e-06, - "loss": 0.3349, - "step": 391 - }, - { - "epoch": 2.4049079754601226, - "grad_norm": 3.326012372970581, - "learning_rate": 4.323081116626322e-06, - "loss": 0.3336, - "step": 392 - }, - { - "epoch": 2.411042944785276, - "grad_norm": 3.174591541290283, - "learning_rate": 4.319780669338316e-06, - "loss": 0.2983, - "step": 393 - }, - { - "epoch": 2.4171779141104293, - "grad_norm": 3.9073634147644043, - "learning_rate": 4.31647346210084e-06, - "loss": 0.8401, - "step": 394 - }, - { - "epoch": 2.4233128834355826, - "grad_norm": 3.4787721633911133, - "learning_rate": 4.313159507199197e-06, - "loss": 0.2583, - "step": 395 - }, - { - "epoch": 2.4294478527607364, - "grad_norm": 3.19903564453125, - "learning_rate": 4.309838816943755e-06, - "loss": 0.2861, - "step": 396 - }, - { - "epoch": 2.4355828220858897, - "grad_norm": 3.184246778488159, - "learning_rate": 4.306511403669897e-06, - "loss": 0.2956, - "step": 397 - }, - { - "epoch": 2.441717791411043, - "grad_norm": 3.8991878032684326, - "learning_rate": 4.303177279737988e-06, - "loss": 0.5378, - "step": 398 - }, - { - "epoch": 2.4478527607361964, - "grad_norm": 3.411949872970581, - "learning_rate": 4.299836457533313e-06, - "loss": 0.3423, - "step": 399 - }, - { - "epoch": 2.4539877300613497, - "grad_norm": 3.445502996444702, - "learning_rate": 4.296488949466046e-06, - "loss": 0.5608, - "step": 400 - }, - { - "epoch": 2.460122699386503, - "grad_norm": 3.066798210144043, - "learning_rate": 4.293134767971193e-06, - "loss": 0.3214, - "step": 401 - }, - { - "epoch": 2.4662576687116564, - "grad_norm": 3.0581583976745605, - "learning_rate": 4.28977392550855e-06, - "loss": 0.5117, - "step": 402 - }, - { - "epoch": 2.4723926380368098, - "grad_norm": 4.207413673400879, - "learning_rate": 4.286406434562659e-06, - "loss": 0.2666, - "step": 403 - }, - { - "epoch": 2.478527607361963, - "grad_norm": 2.9934990406036377, - "learning_rate": 4.283032307642756e-06, - "loss": 0.2878, - "step": 404 - }, - { - "epoch": 2.4846625766871164, - "grad_norm": 3.800593614578247, - "learning_rate": 4.2796515572827305e-06, - "loss": 0.2619, - "step": 405 - }, - { - "epoch": 2.4907975460122698, - "grad_norm": 3.2029523849487305, - "learning_rate": 4.276264196041074e-06, - "loss": 0.1735, - "step": 406 - }, - { - "epoch": 2.4969325153374236, - "grad_norm": 3.515634059906006, - "learning_rate": 4.2728702365008356e-06, - "loss": 0.4741, - "step": 407 - }, - { - "epoch": 2.5030674846625764, - "grad_norm": 3.8354873657226562, - "learning_rate": 4.269469691269577e-06, - "loss": 0.3713, - "step": 408 - }, - { - "epoch": 2.5092024539877302, - "grad_norm": 3.902904510498047, - "learning_rate": 4.266062572979323e-06, - "loss": 0.5189, - "step": 409 - }, - { - "epoch": 2.5153374233128836, - "grad_norm": 3.3276097774505615, - "learning_rate": 4.262648894286515e-06, - "loss": 0.2461, - "step": 410 - }, - { - "epoch": 2.521472392638037, - "grad_norm": 2.9457011222839355, - "learning_rate": 4.259228667871963e-06, - "loss": 0.3013, - "step": 411 - }, - { - "epoch": 2.5276073619631902, - "grad_norm": 2.8941617012023926, - "learning_rate": 4.255801906440803e-06, - "loss": 0.2784, - "step": 412 - }, - { - "epoch": 2.5337423312883436, - "grad_norm": 2.949399471282959, - "learning_rate": 4.252368622722443e-06, - "loss": 0.457, - "step": 413 - }, - { - "epoch": 2.539877300613497, - "grad_norm": 3.342108726501465, - "learning_rate": 4.248928829470522e-06, - "loss": 0.487, - "step": 414 - }, - { - "epoch": 2.5460122699386503, - "grad_norm": 3.9556386470794678, - "learning_rate": 4.245482539462861e-06, - "loss": 0.6118, - "step": 415 - }, - { - "epoch": 2.5521472392638036, - "grad_norm": 3.6936280727386475, - "learning_rate": 4.242029765501411e-06, - "loss": 0.6131, - "step": 416 - }, - { - "epoch": 2.558282208588957, - "grad_norm": 2.79897403717041, - "learning_rate": 4.2385705204122104e-06, - "loss": 0.4209, - "step": 417 - }, - { - "epoch": 2.5644171779141103, - "grad_norm": 4.093318462371826, - "learning_rate": 4.235104817045338e-06, - "loss": 0.5375, - "step": 418 - }, - { - "epoch": 2.5705521472392636, - "grad_norm": 3.138263463973999, - "learning_rate": 4.231632668274861e-06, - "loss": 0.4682, - "step": 419 - }, - { - "epoch": 2.5766871165644174, - "grad_norm": 3.1465651988983154, - "learning_rate": 4.22815408699879e-06, - "loss": 0.2522, - "step": 420 - }, - { - "epoch": 2.5828220858895703, - "grad_norm": 3.5166101455688477, - "learning_rate": 4.22466908613903e-06, - "loss": 0.4776, - "step": 421 - }, - { - "epoch": 2.588957055214724, - "grad_norm": 2.8498189449310303, - "learning_rate": 4.221177678641333e-06, - "loss": 0.3067, - "step": 422 - }, - { - "epoch": 2.5950920245398774, - "grad_norm": 2.8046035766601562, - "learning_rate": 4.217679877475251e-06, - "loss": 0.2402, - "step": 423 - }, - { - "epoch": 2.6012269938650308, - "grad_norm": 4.204788684844971, - "learning_rate": 4.214175695634084e-06, - "loss": 0.2608, - "step": 424 - }, - { - "epoch": 2.607361963190184, - "grad_norm": 2.5569400787353516, - "learning_rate": 4.210665146134838e-06, - "loss": 0.2801, - "step": 425 - }, - { - "epoch": 2.6134969325153374, - "grad_norm": 3.5359091758728027, - "learning_rate": 4.20714824201817e-06, - "loss": 0.2027, - "step": 426 - }, - { - "epoch": 2.6196319018404908, - "grad_norm": 3.5132668018341064, - "learning_rate": 4.203624996348343e-06, - "loss": 0.4253, - "step": 427 - }, - { - "epoch": 2.625766871165644, - "grad_norm": 3.5076472759246826, - "learning_rate": 4.200095422213177e-06, - "loss": 0.3014, - "step": 428 - }, - { - "epoch": 2.6319018404907975, - "grad_norm": 3.6501238346099854, - "learning_rate": 4.196559532724004e-06, - "loss": 0.6526, - "step": 429 - }, - { - "epoch": 2.638036809815951, - "grad_norm": 2.849924325942993, - "learning_rate": 4.193017341015608e-06, - "loss": 0.4487, - "step": 430 - }, - { - "epoch": 2.644171779141104, - "grad_norm": 3.2228448390960693, - "learning_rate": 4.189468860246192e-06, - "loss": 0.5386, - "step": 431 - }, - { - "epoch": 2.6503067484662575, - "grad_norm": 2.532102108001709, - "learning_rate": 4.185914103597316e-06, - "loss": 0.3034, - "step": 432 - }, - { - "epoch": 2.6564417177914113, - "grad_norm": 2.862720251083374, - "learning_rate": 4.182353084273855e-06, - "loss": 0.5862, - "step": 433 - }, - { - "epoch": 2.662576687116564, - "grad_norm": 3.4617464542388916, - "learning_rate": 4.178785815503946e-06, - "loss": 0.3954, - "step": 434 - }, - { - "epoch": 2.668711656441718, - "grad_norm": 2.627758741378784, - "learning_rate": 4.1752123105389444e-06, - "loss": 0.4367, - "step": 435 - }, - { - "epoch": 2.6748466257668713, - "grad_norm": 3.2868380546569824, - "learning_rate": 4.171632582653368e-06, - "loss": 0.2997, - "step": 436 - }, - { - "epoch": 2.6809815950920246, - "grad_norm": 3.4260897636413574, - "learning_rate": 4.168046645144851e-06, - "loss": 0.3354, - "step": 437 - }, - { - "epoch": 2.687116564417178, - "grad_norm": 3.1415748596191406, - "learning_rate": 4.164454511334098e-06, - "loss": 0.5538, - "step": 438 - }, - { - "epoch": 2.6932515337423313, - "grad_norm": 3.3700919151306152, - "learning_rate": 4.160856194564828e-06, - "loss": 0.5731, - "step": 439 - }, - { - "epoch": 2.6993865030674846, - "grad_norm": 3.146968364715576, - "learning_rate": 4.157251708203728e-06, - "loss": 0.4429, - "step": 440 - }, - { - "epoch": 2.705521472392638, - "grad_norm": 3.7495830059051514, - "learning_rate": 4.153641065640402e-06, - "loss": 0.6361, - "step": 441 - }, - { - "epoch": 2.7116564417177913, - "grad_norm": 3.426499128341675, - "learning_rate": 4.150024280287327e-06, - "loss": 0.2418, - "step": 442 - }, - { - "epoch": 2.7177914110429446, - "grad_norm": 3.213719606399536, - "learning_rate": 4.146401365579795e-06, - "loss": 0.2549, - "step": 443 - }, - { - "epoch": 2.7239263803680984, - "grad_norm": 3.457742929458618, - "learning_rate": 4.142772334975868e-06, - "loss": 0.3822, - "step": 444 - }, - { - "epoch": 2.7300613496932513, - "grad_norm": 3.130410671234131, - "learning_rate": 4.139137201956324e-06, - "loss": 0.3107, - "step": 445 - }, - { - "epoch": 2.736196319018405, - "grad_norm": 2.7337112426757812, - "learning_rate": 4.1354959800246155e-06, - "loss": 0.2829, - "step": 446 - }, - { - "epoch": 2.7423312883435584, - "grad_norm": 3.427006483078003, - "learning_rate": 4.131848682706807e-06, - "loss": 0.3045, - "step": 447 - }, - { - "epoch": 2.7484662576687118, - "grad_norm": 3.3742318153381348, - "learning_rate": 4.128195323551536e-06, - "loss": 0.316, - "step": 448 - }, - { - "epoch": 2.754601226993865, - "grad_norm": 3.086738109588623, - "learning_rate": 4.1245359161299555e-06, - "loss": 0.5278, - "step": 449 - }, - { - "epoch": 2.7607361963190185, - "grad_norm": 3.4609954357147217, - "learning_rate": 4.120870474035687e-06, - "loss": 0.447, - "step": 450 - }, - { - "epoch": 2.766871165644172, - "grad_norm": 3.552663803100586, - "learning_rate": 4.1171990108847705e-06, - "loss": 0.6127, - "step": 451 - }, - { - "epoch": 2.773006134969325, - "grad_norm": 4.413427352905273, - "learning_rate": 4.113521540315609e-06, - "loss": 0.3304, - "step": 452 - }, - { - "epoch": 2.7791411042944785, - "grad_norm": 3.3408143520355225, - "learning_rate": 4.109838075988922e-06, - "loss": 0.5871, - "step": 453 - }, - { - "epoch": 2.785276073619632, - "grad_norm": 3.0659773349761963, - "learning_rate": 4.106148631587697e-06, - "loss": 0.3578, - "step": 454 - }, - { - "epoch": 2.791411042944785, - "grad_norm": 3.2854816913604736, - "learning_rate": 4.102453220817134e-06, - "loss": 0.4685, - "step": 455 - }, - { - "epoch": 2.7975460122699385, - "grad_norm": 3.4940855503082275, - "learning_rate": 4.098751857404595e-06, - "loss": 0.2818, - "step": 456 - }, - { - "epoch": 2.8036809815950923, - "grad_norm": 2.4630730152130127, - "learning_rate": 4.0950445550995566e-06, - "loss": 0.3497, - "step": 457 - }, - { - "epoch": 2.809815950920245, - "grad_norm": 3.3870959281921387, - "learning_rate": 4.091331327673554e-06, - "loss": 0.4954, - "step": 458 - }, - { - "epoch": 2.815950920245399, - "grad_norm": 2.3676836490631104, - "learning_rate": 4.087612188920135e-06, - "loss": 0.3884, - "step": 459 - }, - { - "epoch": 2.8220858895705523, - "grad_norm": 3.2477807998657227, - "learning_rate": 4.083887152654804e-06, - "loss": 0.375, - "step": 460 - }, - { - "epoch": 2.8282208588957056, - "grad_norm": 3.295673131942749, - "learning_rate": 4.080156232714976e-06, - "loss": 0.3272, - "step": 461 - }, - { - "epoch": 2.834355828220859, - "grad_norm": 2.800847291946411, - "learning_rate": 4.07641944295992e-06, - "loss": 0.2936, - "step": 462 - }, - { - "epoch": 2.8404907975460123, - "grad_norm": 3.443336009979248, - "learning_rate": 4.072676797270708e-06, - "loss": 0.2363, - "step": 463 - }, - { - "epoch": 2.8466257668711656, - "grad_norm": 3.1334242820739746, - "learning_rate": 4.0689283095501684e-06, - "loss": 0.4827, - "step": 464 - }, - { - "epoch": 2.852760736196319, - "grad_norm": 3.950672149658203, - "learning_rate": 4.06517399372283e-06, - "loss": 0.3163, - "step": 465 - }, - { - "epoch": 2.8588957055214723, - "grad_norm": 4.243579387664795, - "learning_rate": 4.061413863734869e-06, - "loss": 0.2827, - "step": 466 - }, - { - "epoch": 2.8650306748466257, - "grad_norm": 4.076017379760742, - "learning_rate": 4.057647933554063e-06, - "loss": 0.3466, - "step": 467 - }, - { - "epoch": 2.871165644171779, - "grad_norm": 2.846989631652832, - "learning_rate": 4.053876217169734e-06, - "loss": 0.4632, - "step": 468 - }, - { - "epoch": 2.8773006134969323, - "grad_norm": 2.74981689453125, - "learning_rate": 4.050098728592698e-06, - "loss": 0.2001, - "step": 469 - }, - { - "epoch": 2.883435582822086, - "grad_norm": 3.062068462371826, - "learning_rate": 4.046315481855211e-06, - "loss": 0.5425, - "step": 470 - }, - { - "epoch": 2.889570552147239, - "grad_norm": 2.8630964756011963, - "learning_rate": 4.0425264910109245e-06, - "loss": 0.424, - "step": 471 - }, - { - "epoch": 2.895705521472393, - "grad_norm": 3.537442922592163, - "learning_rate": 4.03873177013482e-06, - "loss": 0.2443, - "step": 472 - }, - { - "epoch": 2.901840490797546, - "grad_norm": 3.128535270690918, - "learning_rate": 4.034931333323173e-06, - "loss": 0.3734, - "step": 473 - }, - { - "epoch": 2.9079754601226995, - "grad_norm": 3.021897792816162, - "learning_rate": 4.031125194693484e-06, - "loss": 0.3762, - "step": 474 - }, - { - "epoch": 2.914110429447853, - "grad_norm": 3.0943546295166016, - "learning_rate": 4.0273133683844375e-06, - "loss": 0.3721, - "step": 475 - }, - { - "epoch": 2.920245398773006, - "grad_norm": 3.443448305130005, - "learning_rate": 4.023495868555848e-06, - "loss": 0.2868, - "step": 476 - }, - { - "epoch": 2.9263803680981595, - "grad_norm": 2.865227222442627, - "learning_rate": 4.0196727093886024e-06, - "loss": 0.5086, - "step": 477 - }, - { - "epoch": 2.932515337423313, - "grad_norm": 3.1272058486938477, - "learning_rate": 4.015843905084612e-06, - "loss": 0.4616, - "step": 478 - }, - { - "epoch": 2.938650306748466, - "grad_norm": 3.0584447383880615, - "learning_rate": 4.012009469866756e-06, - "loss": 0.403, - "step": 479 - }, - { - "epoch": 2.9447852760736195, - "grad_norm": 4.42616081237793, - "learning_rate": 4.008169417978836e-06, - "loss": 0.5801, - "step": 480 - }, - { - "epoch": 2.950920245398773, - "grad_norm": 2.8444535732269287, - "learning_rate": 4.004323763685511e-06, - "loss": 0.5808, - "step": 481 - }, - { - "epoch": 2.957055214723926, - "grad_norm": 2.591719627380371, - "learning_rate": 4.0004725212722565e-06, - "loss": 0.2584, - "step": 482 - }, - { - "epoch": 2.96319018404908, - "grad_norm": 2.5496113300323486, - "learning_rate": 3.996615705045302e-06, - "loss": 0.462, - "step": 483 - }, - { - "epoch": 2.969325153374233, - "grad_norm": 2.9932925701141357, - "learning_rate": 3.992753329331588e-06, - "loss": 0.3502, - "step": 484 - }, - { - "epoch": 2.9754601226993866, - "grad_norm": 3.136871337890625, - "learning_rate": 3.9888854084786995e-06, - "loss": 0.5989, - "step": 485 - }, - { - "epoch": 2.98159509202454, - "grad_norm": 3.6654274463653564, - "learning_rate": 3.985011956854826e-06, - "loss": 0.6772, - "step": 486 - }, - { - "epoch": 2.9877300613496933, - "grad_norm": 2.5398948192596436, - "learning_rate": 3.9811329888487004e-06, - "loss": 0.4192, - "step": 487 - }, - { - "epoch": 2.9938650306748467, - "grad_norm": 4.89943790435791, - "learning_rate": 3.977248518869545e-06, - "loss": 0.4031, - "step": 488 - }, - { - "epoch": 3.0, - "grad_norm": 3.4729995727539062, - "learning_rate": 3.973358561347024e-06, - "loss": 0.7764, - "step": 489 - }, - { - "epoch": 3.0061349693251533, - "grad_norm": 5.331607818603516, - "learning_rate": 3.969463130731183e-06, - "loss": 0.3267, - "step": 490 - }, - { - "epoch": 3.0122699386503067, - "grad_norm": 3.453650712966919, - "learning_rate": 3.965562241492401e-06, - "loss": 0.2719, - "step": 491 - }, - { - "epoch": 3.01840490797546, - "grad_norm": 3.232313632965088, - "learning_rate": 3.9616559081213335e-06, - "loss": 0.1825, - "step": 492 - }, - { - "epoch": 3.0245398773006134, - "grad_norm": 3.4860260486602783, - "learning_rate": 3.957744145128858e-06, - "loss": 0.1854, - "step": 493 - }, - { - "epoch": 3.0306748466257667, - "grad_norm": 3.4357805252075195, - "learning_rate": 3.953826967046021e-06, - "loss": 0.2224, - "step": 494 - }, - { - "epoch": 3.03680981595092, - "grad_norm": 4.557503700256348, - "learning_rate": 3.9499043884239894e-06, - "loss": 0.349, - "step": 495 - }, - { - "epoch": 3.042944785276074, - "grad_norm": 4.685214042663574, - "learning_rate": 3.945976423833987e-06, - "loss": 0.175, - "step": 496 - }, - { - "epoch": 3.049079754601227, - "grad_norm": 3.7430171966552734, - "learning_rate": 3.942043087867244e-06, - "loss": 0.2773, - "step": 497 - }, - { - "epoch": 3.0552147239263805, - "grad_norm": 3.756450653076172, - "learning_rate": 3.938104395134947e-06, - "loss": 0.4445, - "step": 498 - }, - { - "epoch": 3.061349693251534, - "grad_norm": 4.049175262451172, - "learning_rate": 3.9341603602681805e-06, - "loss": 0.3046, - "step": 499 - }, - { - "epoch": 3.067484662576687, - "grad_norm": 3.7689461708068848, - "learning_rate": 3.930210997917871e-06, - "loss": 0.2544, - "step": 500 - }, - { - "epoch": 3.0736196319018405, - "grad_norm": 4.027602195739746, - "learning_rate": 3.92625632275474e-06, - "loss": 0.3154, - "step": 501 - }, - { - "epoch": 3.079754601226994, - "grad_norm": 2.8449292182922363, - "learning_rate": 3.922296349469239e-06, - "loss": 0.2804, - "step": 502 - }, - { - "epoch": 3.085889570552147, - "grad_norm": 2.9555234909057617, - "learning_rate": 3.918331092771505e-06, - "loss": 0.2393, - "step": 503 - }, - { - "epoch": 3.0920245398773005, - "grad_norm": 2.621042013168335, - "learning_rate": 3.914360567391296e-06, - "loss": 0.1403, - "step": 504 - }, - { - "epoch": 3.098159509202454, - "grad_norm": 3.2348620891571045, - "learning_rate": 3.910384788077949e-06, - "loss": 0.1537, - "step": 505 - }, - { - "epoch": 3.104294478527607, - "grad_norm": 3.030179977416992, - "learning_rate": 3.906403769600311e-06, - "loss": 0.2921, - "step": 506 - }, - { - "epoch": 3.1104294478527605, - "grad_norm": 3.146428346633911, - "learning_rate": 3.902417526746694e-06, - "loss": 0.2036, - "step": 507 - }, - { - "epoch": 3.116564417177914, - "grad_norm": 3.6201512813568115, - "learning_rate": 3.898426074324818e-06, - "loss": 0.2655, - "step": 508 - }, - { - "epoch": 3.1226993865030677, - "grad_norm": 3.7674012184143066, - "learning_rate": 3.8944294271617524e-06, - "loss": 0.3938, - "step": 509 - }, - { - "epoch": 3.128834355828221, - "grad_norm": 4.54722785949707, - "learning_rate": 3.890427600103865e-06, - "loss": 0.3051, - "step": 510 - }, - { - "epoch": 3.1349693251533743, - "grad_norm": 4.228236675262451, - "learning_rate": 3.886420608016767e-06, - "loss": 0.3719, - "step": 511 - }, - { - "epoch": 3.1411042944785277, - "grad_norm": 4.355110168457031, - "learning_rate": 3.882408465785252e-06, - "loss": 0.1863, - "step": 512 - }, - { - "epoch": 3.147239263803681, - "grad_norm": 3.451460838317871, - "learning_rate": 3.878391188313249e-06, - "loss": 0.1479, - "step": 513 - }, - { - "epoch": 3.1533742331288344, - "grad_norm": 4.395524501800537, - "learning_rate": 3.87436879052376e-06, - "loss": 0.238, - "step": 514 - }, - { - "epoch": 3.1595092024539877, - "grad_norm": 2.940717935562134, - "learning_rate": 3.870341287358809e-06, - "loss": 0.2069, - "step": 515 - }, - { - "epoch": 3.165644171779141, - "grad_norm": 2.5817320346832275, - "learning_rate": 3.8663086937793845e-06, - "loss": 0.1189, - "step": 516 - }, - { - "epoch": 3.1717791411042944, - "grad_norm": 3.9863343238830566, - "learning_rate": 3.862271024765385e-06, - "loss": 0.3434, - "step": 517 - }, - { - "epoch": 3.1779141104294477, - "grad_norm": 3.609004259109497, - "learning_rate": 3.8582282953155626e-06, - "loss": 0.1602, - "step": 518 - }, - { - "epoch": 3.184049079754601, - "grad_norm": 3.207533121109009, - "learning_rate": 3.854180520447465e-06, - "loss": 0.3452, - "step": 519 - }, - { - "epoch": 3.190184049079755, - "grad_norm": 3.593388795852661, - "learning_rate": 3.850127715197387e-06, - "loss": 0.2832, - "step": 520 - }, - { - "epoch": 3.196319018404908, - "grad_norm": 3.409064531326294, - "learning_rate": 3.846069894620306e-06, - "loss": 0.1481, - "step": 521 - }, - { - "epoch": 3.2024539877300615, - "grad_norm": 3.461498737335205, - "learning_rate": 3.84200707378983e-06, - "loss": 0.1283, - "step": 522 - }, - { - "epoch": 3.208588957055215, - "grad_norm": 3.708467483520508, - "learning_rate": 3.8379392677981434e-06, - "loss": 0.2468, - "step": 523 - }, - { - "epoch": 3.214723926380368, - "grad_norm": 2.802381753921509, - "learning_rate": 3.833866491755947e-06, - "loss": 0.2685, - "step": 524 - }, - { - "epoch": 3.2208588957055215, - "grad_norm": 3.0787744522094727, - "learning_rate": 3.8297887607924044e-06, - "loss": 0.2595, - "step": 525 - }, - { - "epoch": 3.226993865030675, - "grad_norm": 3.3952548503875732, - "learning_rate": 3.825706090055088e-06, - "loss": 0.4099, - "step": 526 - }, - { - "epoch": 3.233128834355828, - "grad_norm": 3.3497085571289062, - "learning_rate": 3.821618494709916e-06, - "loss": 0.287, - "step": 527 - }, - { - "epoch": 3.2392638036809815, - "grad_norm": 4.050611972808838, - "learning_rate": 3.817525989941102e-06, - "loss": 0.2369, - "step": 528 - }, - { - "epoch": 3.245398773006135, - "grad_norm": 2.87642240524292, - "learning_rate": 3.8134285909510972e-06, - "loss": 0.2751, - "step": 529 - }, - { - "epoch": 3.2515337423312882, - "grad_norm": 3.821941614151001, - "learning_rate": 3.8093263129605305e-06, - "loss": 0.2363, - "step": 530 - }, - { - "epoch": 3.2576687116564416, - "grad_norm": 2.8066117763519287, - "learning_rate": 3.80521917120816e-06, - "loss": 0.094, - "step": 531 - }, - { - "epoch": 3.263803680981595, - "grad_norm": 3.849768877029419, - "learning_rate": 3.801107180950806e-06, - "loss": 0.4117, - "step": 532 - }, - { - "epoch": 3.2699386503067487, - "grad_norm": 2.4161250591278076, - "learning_rate": 3.7969903574633028e-06, - "loss": 0.1183, - "step": 533 - }, - { - "epoch": 3.276073619631902, - "grad_norm": 3.6743111610412598, - "learning_rate": 3.792868716038437e-06, - "loss": 0.2296, - "step": 534 - }, - { - "epoch": 3.2822085889570554, - "grad_norm": 4.378123760223389, - "learning_rate": 3.7887422719868937e-06, - "loss": 0.2678, - "step": 535 - }, - { - "epoch": 3.2883435582822087, - "grad_norm": 4.816481590270996, - "learning_rate": 3.784611040637198e-06, - "loss": 0.4887, - "step": 536 - }, - { - "epoch": 3.294478527607362, - "grad_norm": 3.5712430477142334, - "learning_rate": 3.7804750373356576e-06, - "loss": 0.3827, - "step": 537 - }, - { - "epoch": 3.3006134969325154, - "grad_norm": 3.6877355575561523, - "learning_rate": 3.776334277446307e-06, - "loss": 0.3233, - "step": 538 - }, - { - "epoch": 3.3067484662576687, - "grad_norm": 3.442706346511841, - "learning_rate": 3.7721887763508512e-06, - "loss": 0.1256, - "step": 539 - }, - { - "epoch": 3.312883435582822, - "grad_norm": 3.9265615940093994, - "learning_rate": 3.7680385494486053e-06, - "loss": 0.3845, - "step": 540 - }, - { - "epoch": 3.3190184049079754, - "grad_norm": 3.5030126571655273, - "learning_rate": 3.7638836121564414e-06, - "loss": 0.2905, - "step": 541 - }, - { - "epoch": 3.3251533742331287, - "grad_norm": 3.6685378551483154, - "learning_rate": 3.7597239799087283e-06, - "loss": 0.3561, - "step": 542 - }, - { - "epoch": 3.331288343558282, - "grad_norm": 3.8484046459198, - "learning_rate": 3.7555596681572736e-06, - "loss": 0.1157, - "step": 543 - }, - { - "epoch": 3.3374233128834354, - "grad_norm": 3.7977402210235596, - "learning_rate": 3.751390692371272e-06, - "loss": 0.3049, - "step": 544 - }, - { - "epoch": 3.3435582822085887, - "grad_norm": 3.4409852027893066, - "learning_rate": 3.7472170680372398e-06, - "loss": 0.1626, - "step": 545 - }, - { - "epoch": 3.3496932515337425, - "grad_norm": 3.801541328430176, - "learning_rate": 3.7430388106589632e-06, - "loss": 0.2414, - "step": 546 - }, - { - "epoch": 3.355828220858896, - "grad_norm": 4.025203704833984, - "learning_rate": 3.738855935757438e-06, - "loss": 0.3441, - "step": 547 - }, - { - "epoch": 3.361963190184049, - "grad_norm": 4.242798805236816, - "learning_rate": 3.7346684588708135e-06, - "loss": 0.5244, - "step": 548 - }, - { - "epoch": 3.3680981595092025, - "grad_norm": 3.0516819953918457, - "learning_rate": 3.7304763955543332e-06, - "loss": 0.1984, - "step": 549 - }, - { - "epoch": 3.374233128834356, - "grad_norm": 3.894667625427246, - "learning_rate": 3.726279761380279e-06, - "loss": 0.2715, - "step": 550 - }, - { - "epoch": 3.3803680981595092, - "grad_norm": 3.171208143234253, - "learning_rate": 3.72207857193791e-06, - "loss": 0.1537, - "step": 551 - }, - { - "epoch": 3.3865030674846626, - "grad_norm": 4.344860553741455, - "learning_rate": 3.7178728428334092e-06, - "loss": 0.2388, - "step": 552 - }, - { - "epoch": 3.392638036809816, - "grad_norm": 2.766317367553711, - "learning_rate": 3.7136625896898226e-06, - "loss": 0.1726, - "step": 553 - }, - { - "epoch": 3.3987730061349692, - "grad_norm": 3.550662040710449, - "learning_rate": 3.7094478281470003e-06, - "loss": 0.2942, - "step": 554 - }, - { - "epoch": 3.4049079754601226, - "grad_norm": 3.4576945304870605, - "learning_rate": 3.7052285738615412e-06, - "loss": 0.1665, - "step": 555 - }, - { - "epoch": 3.411042944785276, - "grad_norm": 4.026793003082275, - "learning_rate": 3.7010048425067317e-06, - "loss": 0.3954, - "step": 556 - }, - { - "epoch": 3.4171779141104293, - "grad_norm": 4.600133419036865, - "learning_rate": 3.696776649772492e-06, - "loss": 0.3207, - "step": 557 - }, - { - "epoch": 3.4233128834355826, - "grad_norm": 4.747331142425537, - "learning_rate": 3.692544011365312e-06, - "loss": 0.1325, - "step": 558 - }, - { - "epoch": 3.4294478527607364, - "grad_norm": 3.781464099884033, - "learning_rate": 3.6883069430081986e-06, - "loss": 0.1644, - "step": 559 - }, - { - "epoch": 3.4355828220858897, - "grad_norm": 2.905986785888672, - "learning_rate": 3.6840654604406135e-06, - "loss": 0.2469, - "step": 560 - }, - { - "epoch": 3.441717791411043, - "grad_norm": 2.3747711181640625, - "learning_rate": 3.679819579418414e-06, - "loss": 0.1146, - "step": 561 - }, - { - "epoch": 3.4478527607361964, - "grad_norm": 3.2683632373809814, - "learning_rate": 3.6755693157137995e-06, - "loss": 0.3236, - "step": 562 - }, - { - "epoch": 3.4539877300613497, - "grad_norm": 3.7750496864318848, - "learning_rate": 3.6713146851152487e-06, - "loss": 0.399, - "step": 563 - }, - { - "epoch": 3.460122699386503, - "grad_norm": 3.3912384510040283, - "learning_rate": 3.667055703427461e-06, - "loss": 0.1259, - "step": 564 - }, - { - "epoch": 3.4662576687116564, - "grad_norm": 3.0224430561065674, - "learning_rate": 3.6627923864713e-06, - "loss": 0.1835, - "step": 565 - }, - { - "epoch": 3.4723926380368098, - "grad_norm": 3.642258405685425, - "learning_rate": 3.658524750083733e-06, - "loss": 0.2763, - "step": 566 - }, - { - "epoch": 3.478527607361963, - "grad_norm": 3.409890651702881, - "learning_rate": 3.654252810117773e-06, - "loss": 0.2496, - "step": 567 - }, - { - "epoch": 3.4846625766871164, - "grad_norm": 3.0416476726531982, - "learning_rate": 3.6499765824424195e-06, - "loss": 0.1287, - "step": 568 - }, - { - "epoch": 3.4907975460122698, - "grad_norm": 3.1963987350463867, - "learning_rate": 3.6456960829425987e-06, - "loss": 0.1747, - "step": 569 - }, - { - "epoch": 3.4969325153374236, - "grad_norm": 3.198448657989502, - "learning_rate": 3.641411327519107e-06, - "loss": 0.1913, - "step": 570 - }, - { - "epoch": 3.5030674846625764, - "grad_norm": 3.7023441791534424, - "learning_rate": 3.6371223320885492e-06, - "loss": 0.3224, - "step": 571 - }, - { - "epoch": 3.5092024539877302, - "grad_norm": 4.54288387298584, - "learning_rate": 3.6328291125832803e-06, - "loss": 0.2364, - "step": 572 - }, - { - "epoch": 3.5153374233128836, - "grad_norm": 3.5064890384674072, - "learning_rate": 3.628531684951347e-06, - "loss": 0.2552, - "step": 573 - }, - { - "epoch": 3.521472392638037, - "grad_norm": 3.987583875656128, - "learning_rate": 3.6242300651564276e-06, - "loss": 0.3232, - "step": 574 - }, - { - "epoch": 3.5276073619631902, - "grad_norm": 3.179642915725708, - "learning_rate": 3.6199242691777745e-06, - "loss": 0.32, - "step": 575 - }, - { - "epoch": 3.5337423312883436, - "grad_norm": 3.3078157901763916, - "learning_rate": 3.6156143130101516e-06, - "loss": 0.2922, - "step": 576 - }, - { - "epoch": 3.539877300613497, - "grad_norm": 3.1628613471984863, - "learning_rate": 3.6113002126637765e-06, - "loss": 0.2005, - "step": 577 - }, - { - "epoch": 3.5460122699386503, - "grad_norm": 3.4515540599823, - "learning_rate": 3.606981984164263e-06, - "loss": 0.2138, - "step": 578 - }, - { - "epoch": 3.5521472392638036, - "grad_norm": 5.132473945617676, - "learning_rate": 3.6026596435525578e-06, - "loss": 0.4382, - "step": 579 - }, - { - "epoch": 3.558282208588957, - "grad_norm": 3.397614002227783, - "learning_rate": 3.5983332068848855e-06, - "loss": 0.3326, - "step": 580 - }, - { - "epoch": 3.5644171779141103, - "grad_norm": 4.79497766494751, - "learning_rate": 3.5940026902326825e-06, - "loss": 0.4748, - "step": 581 - }, - { - "epoch": 3.5705521472392636, - "grad_norm": 3.7675018310546875, - "learning_rate": 3.5896681096825446e-06, - "loss": 0.2692, - "step": 582 - }, - { - "epoch": 3.5766871165644174, - "grad_norm": 3.0637521743774414, - "learning_rate": 3.5853294813361614e-06, - "loss": 0.3658, - "step": 583 - }, - { - "epoch": 3.5828220858895703, - "grad_norm": 2.8949790000915527, - "learning_rate": 3.5809868213102623e-06, - "loss": 0.1661, - "step": 584 - }, - { - "epoch": 3.588957055214724, - "grad_norm": 3.163419246673584, - "learning_rate": 3.5766401457365485e-06, - "loss": 0.1233, - "step": 585 - }, - { - "epoch": 3.5950920245398774, - "grad_norm": 3.1787965297698975, - "learning_rate": 3.5722894707616417e-06, - "loss": 0.278, - "step": 586 - }, - { - "epoch": 3.6012269938650308, - "grad_norm": 2.9397857189178467, - "learning_rate": 3.5679348125470175e-06, - "loss": 0.1541, - "step": 587 - }, - { - "epoch": 3.607361963190184, - "grad_norm": 3.2690396308898926, - "learning_rate": 3.56357618726895e-06, - "loss": 0.1575, - "step": 588 - }, - { - "epoch": 3.6134969325153374, - "grad_norm": 5.444014072418213, - "learning_rate": 3.5592136111184483e-06, - "loss": 0.8079, - "step": 589 - }, - { - "epoch": 3.6196319018404908, - "grad_norm": 3.1688313484191895, - "learning_rate": 3.554847100301199e-06, - "loss": 0.341, - "step": 590 - }, - { - "epoch": 3.625766871165644, - "grad_norm": 2.469212532043457, - "learning_rate": 3.550476671037505e-06, - "loss": 0.1625, - "step": 591 - }, - { - "epoch": 3.6319018404907975, - "grad_norm": 3.3956527709960938, - "learning_rate": 3.546102339562223e-06, - "loss": 0.199, - "step": 592 - }, - { - "epoch": 3.638036809815951, - "grad_norm": 2.7287702560424805, - "learning_rate": 3.5417241221247078e-06, - "loss": 0.1493, - "step": 593 - }, - { - "epoch": 3.644171779141104, - "grad_norm": 3.5046865940093994, - "learning_rate": 3.5373420349887477e-06, - "loss": 0.2765, - "step": 594 - }, - { - "epoch": 3.6503067484662575, - "grad_norm": 3.121476650238037, - "learning_rate": 3.5329560944325065e-06, - "loss": 0.2833, - "step": 595 - }, - { - "epoch": 3.6564417177914113, - "grad_norm": 3.276463270187378, - "learning_rate": 3.528566316748462e-06, - "loss": 0.1237, - "step": 596 - }, - { - "epoch": 3.662576687116564, - "grad_norm": 3.382840633392334, - "learning_rate": 3.524172718243347e-06, - "loss": 0.1599, - "step": 597 - }, - { - "epoch": 3.668711656441718, - "grad_norm": 4.801311492919922, - "learning_rate": 3.5197753152380854e-06, - "loss": 0.2997, - "step": 598 - }, - { - "epoch": 3.6748466257668713, - "grad_norm": 4.117336273193359, - "learning_rate": 3.515374124067736e-06, - "loss": 0.2021, - "step": 599 - }, - { - "epoch": 3.6809815950920246, - "grad_norm": 3.611438035964966, - "learning_rate": 3.5109691610814263e-06, - "loss": 0.1726, - "step": 600 - }, - { - "epoch": 3.687116564417178, - "grad_norm": 4.5179972648620605, - "learning_rate": 3.5065604426422995e-06, - "loss": 0.1377, - "step": 601 - }, - { - "epoch": 3.6932515337423313, - "grad_norm": 3.561061382293701, - "learning_rate": 3.502147985127445e-06, - "loss": 0.1497, - "step": 602 - }, - { - "epoch": 3.6993865030674846, - "grad_norm": 3.3497917652130127, - "learning_rate": 3.4977318049278443e-06, - "loss": 0.1589, - "step": 603 - }, - { - "epoch": 3.705521472392638, - "grad_norm": 3.2725470066070557, - "learning_rate": 3.4933119184483065e-06, - "loss": 0.1364, - "step": 604 - }, - { - "epoch": 3.7116564417177913, - "grad_norm": 3.228956460952759, - "learning_rate": 3.4888883421074076e-06, - "loss": 0.177, - "step": 605 - }, - { - "epoch": 3.7177914110429446, - "grad_norm": 3.7648911476135254, - "learning_rate": 3.484461092337434e-06, - "loss": 0.122, - "step": 606 - }, - { - "epoch": 3.7239263803680984, - "grad_norm": 3.5322585105895996, - "learning_rate": 3.4800301855843137e-06, - "loss": 0.2664, - "step": 607 - }, - { - "epoch": 3.7300613496932513, - "grad_norm": 2.951073169708252, - "learning_rate": 3.4755956383075613e-06, - "loss": 0.12, - "step": 608 - }, - { - "epoch": 3.736196319018405, - "grad_norm": 3.0577664375305176, - "learning_rate": 3.471157466980214e-06, - "loss": 0.3926, - "step": 609 - }, - { - "epoch": 3.7423312883435584, - "grad_norm": 4.089846134185791, - "learning_rate": 3.466715688088772e-06, - "loss": 0.6233, - "step": 610 - }, - { - "epoch": 3.7484662576687118, - "grad_norm": 3.081340789794922, - "learning_rate": 3.462270318133136e-06, - "loss": 0.2456, - "step": 611 - }, - { - "epoch": 3.754601226993865, - "grad_norm": 3.034712553024292, - "learning_rate": 3.4578213736265474e-06, - "loss": 0.2683, - "step": 612 - }, - { - "epoch": 3.7607361963190185, - "grad_norm": 3.459815740585327, - "learning_rate": 3.4533688710955255e-06, - "loss": 0.3796, - "step": 613 - }, - { - "epoch": 3.766871165644172, - "grad_norm": 3.523737907409668, - "learning_rate": 3.448912827079805e-06, - "loss": 0.3326, - "step": 614 - }, - { - "epoch": 3.773006134969325, - "grad_norm": 3.333219289779663, - "learning_rate": 3.4444532581322793e-06, - "loss": 0.206, - "step": 615 - }, - { - "epoch": 3.7791411042944785, - "grad_norm": 3.582387685775757, - "learning_rate": 3.4399901808189327e-06, - "loss": 0.244, - "step": 616 - }, - { - "epoch": 3.785276073619632, - "grad_norm": 3.4887266159057617, - "learning_rate": 3.435523611718785e-06, - "loss": 0.1796, - "step": 617 - }, - { - "epoch": 3.791411042944785, - "grad_norm": 4.89408016204834, - "learning_rate": 3.4310535674238242e-06, - "loss": 0.188, - "step": 618 - }, - { - "epoch": 3.7975460122699385, - "grad_norm": 4.338910102844238, - "learning_rate": 3.42658006453895e-06, - "loss": 0.3039, - "step": 619 - }, - { - "epoch": 3.8036809815950923, - "grad_norm": 4.107708930969238, - "learning_rate": 3.4221031196819083e-06, - "loss": 0.3383, - "step": 620 - }, - { - "epoch": 3.809815950920245, - "grad_norm": 3.698777675628662, - "learning_rate": 3.4176227494832305e-06, - "loss": 0.1721, - "step": 621 - }, - { - "epoch": 3.815950920245399, - "grad_norm": 2.6659226417541504, - "learning_rate": 3.413138970586174e-06, - "loss": 0.2211, - "step": 622 - }, - { - "epoch": 3.8220858895705523, - "grad_norm": 3.2398436069488525, - "learning_rate": 3.4086517996466574e-06, - "loss": 0.1871, - "step": 623 - }, - { - "epoch": 3.8282208588957056, - "grad_norm": 4.9128804206848145, - "learning_rate": 3.404161253333199e-06, - "loss": 0.3874, - "step": 624 - }, - { - "epoch": 3.834355828220859, - "grad_norm": 3.508789300918579, - "learning_rate": 3.3996673483268573e-06, - "loss": 0.1739, - "step": 625 - }, - { - "epoch": 3.8404907975460123, - "grad_norm": 3.3016927242279053, - "learning_rate": 3.3951701013211665e-06, - "loss": 0.274, - "step": 626 - }, - { - "epoch": 3.8466257668711656, - "grad_norm": 3.8941333293914795, - "learning_rate": 3.3906695290220736e-06, - "loss": 0.3568, - "step": 627 - }, - { - "epoch": 3.852760736196319, - "grad_norm": 3.512354850769043, - "learning_rate": 3.3861656481478816e-06, - "loss": 0.157, - "step": 628 - }, - { - "epoch": 3.8588957055214723, - "grad_norm": 3.482649326324463, - "learning_rate": 3.3816584754291814e-06, - "loss": 0.1218, - "step": 629 - }, - { - "epoch": 3.8650306748466257, - "grad_norm": 3.1490275859832764, - "learning_rate": 3.377148027608793e-06, - "loss": 0.2234, - "step": 630 - }, - { - "epoch": 3.871165644171779, - "grad_norm": 3.2172653675079346, - "learning_rate": 3.3726343214417023e-06, - "loss": 0.3329, - "step": 631 - }, - { - "epoch": 3.8773006134969323, - "grad_norm": 4.167707443237305, - "learning_rate": 3.3681173736949984e-06, - "loss": 0.1384, - "step": 632 - }, - { - "epoch": 3.883435582822086, - "grad_norm": 3.4743919372558594, - "learning_rate": 3.3635972011478134e-06, - "loss": 0.3807, - "step": 633 - }, - { - "epoch": 3.889570552147239, - "grad_norm": 3.6892173290252686, - "learning_rate": 3.3590738205912566e-06, - "loss": 0.194, - "step": 634 - }, - { - "epoch": 3.895705521472393, - "grad_norm": 3.262967824935913, - "learning_rate": 3.354547248828356e-06, - "loss": 0.202, - "step": 635 - }, - { - "epoch": 3.901840490797546, - "grad_norm": 3.8871562480926514, - "learning_rate": 3.3500175026739916e-06, - "loss": 0.2471, - "step": 636 - }, - { - "epoch": 3.9079754601226995, - "grad_norm": 3.5097084045410156, - "learning_rate": 3.3454845989548385e-06, - "loss": 0.1112, - "step": 637 - }, - { - "epoch": 3.914110429447853, - "grad_norm": 4.163944721221924, - "learning_rate": 3.3409485545092995e-06, - "loss": 0.3368, - "step": 638 - }, - { - "epoch": 3.920245398773006, - "grad_norm": 3.6405045986175537, - "learning_rate": 3.336409386187444e-06, - "loss": 0.1863, - "step": 639 - }, - { - "epoch": 3.9263803680981595, - "grad_norm": 3.2477526664733887, - "learning_rate": 3.331867110850946e-06, - "loss": 0.1491, - "step": 640 - }, - { - "epoch": 3.932515337423313, - "grad_norm": 3.933753490447998, - "learning_rate": 3.327321745373021e-06, - "loss": 0.2484, - "step": 641 - }, - { - "epoch": 3.938650306748466, - "grad_norm": 3.2475059032440186, - "learning_rate": 3.322773306638364e-06, - "loss": 0.2126, - "step": 642 - }, - { - "epoch": 3.9447852760736195, - "grad_norm": 2.628467321395874, - "learning_rate": 3.318221811543086e-06, - "loss": 0.1649, - "step": 643 - }, - { - "epoch": 3.950920245398773, - "grad_norm": 3.2612411975860596, - "learning_rate": 3.313667276994651e-06, - "loss": 0.1442, - "step": 644 - }, - { - "epoch": 3.957055214723926, - "grad_norm": 3.8058395385742188, - "learning_rate": 3.309109719911814e-06, - "loss": 0.359, - "step": 645 - }, - { - "epoch": 3.96319018404908, - "grad_norm": 3.3450071811676025, - "learning_rate": 3.304549157224558e-06, - "loss": 0.4042, - "step": 646 - }, - { - "epoch": 3.969325153374233, - "grad_norm": 3.079601287841797, - "learning_rate": 3.299985605874031e-06, - "loss": 0.1699, - "step": 647 - }, - { - "epoch": 3.9754601226993866, - "grad_norm": 3.8963980674743652, - "learning_rate": 3.295419082812483e-06, - "loss": 0.1888, - "step": 648 - }, - { - "epoch": 3.98159509202454, - "grad_norm": 3.307405948638916, - "learning_rate": 3.2908496050032024e-06, - "loss": 0.2824, - "step": 649 - }, - { - "epoch": 3.9877300613496933, - "grad_norm": 3.227478265762329, - "learning_rate": 3.2862771894204544e-06, - "loss": 0.3038, - "step": 650 - }, - { - "epoch": 3.9938650306748467, - "grad_norm": 4.046506881713867, - "learning_rate": 3.2817018530494164e-06, - "loss": 0.3266, - "step": 651 - }, - { - "epoch": 4.0, - "grad_norm": 7.775874614715576, - "learning_rate": 3.277123612886116e-06, - "loss": 0.2998, - "step": 652 - }, - { - "epoch": 4.006134969325154, - "grad_norm": 3.146462917327881, - "learning_rate": 3.272542485937369e-06, - "loss": 0.2764, - "step": 653 - }, - { - "epoch": 4.012269938650307, - "grad_norm": 3.0539863109588623, - "learning_rate": 3.2679584892207118e-06, - "loss": 0.1157, - "step": 654 - }, - { - "epoch": 4.0184049079754605, - "grad_norm": 3.634021520614624, - "learning_rate": 3.263371639764343e-06, - "loss": 0.0707, - "step": 655 - }, - { - "epoch": 4.024539877300613, - "grad_norm": 3.3474650382995605, - "learning_rate": 3.2587819546070596e-06, - "loss": 0.1067, - "step": 656 - }, - { - "epoch": 4.030674846625767, - "grad_norm": 4.409244537353516, - "learning_rate": 3.254189450798189e-06, - "loss": 0.0564, - "step": 657 - }, - { - "epoch": 4.03680981595092, - "grad_norm": 3.0446252822875977, - "learning_rate": 3.2495941453975312e-06, - "loss": 0.0535, - "step": 658 - }, - { - "epoch": 4.042944785276074, - "grad_norm": 4.014753818511963, - "learning_rate": 3.2449960554752935e-06, - "loss": 0.1245, - "step": 659 - }, - { - "epoch": 4.049079754601227, - "grad_norm": 3.188062906265259, - "learning_rate": 3.240395198112026e-06, - "loss": 0.0626, - "step": 660 - }, - { - "epoch": 4.0552147239263805, - "grad_norm": 3.006086826324463, - "learning_rate": 3.2357915903985605e-06, - "loss": 0.1198, - "step": 661 - }, - { - "epoch": 4.061349693251533, - "grad_norm": 2.8865551948547363, - "learning_rate": 3.2311852494359423e-06, - "loss": 0.0454, - "step": 662 - }, - { - "epoch": 4.067484662576687, - "grad_norm": 4.2888007164001465, - "learning_rate": 3.226576192335373e-06, - "loss": 0.2064, - "step": 663 - }, - { - "epoch": 4.07361963190184, - "grad_norm": 3.1414525508880615, - "learning_rate": 3.2219644362181436e-06, - "loss": 0.2183, - "step": 664 - }, - { - "epoch": 4.079754601226994, - "grad_norm": 2.556277275085449, - "learning_rate": 3.21734999821557e-06, - "loss": 0.0516, - "step": 665 - }, - { - "epoch": 4.085889570552148, - "grad_norm": 2.698118209838867, - "learning_rate": 3.2127328954689307e-06, - "loss": 0.0613, - "step": 666 - }, - { - "epoch": 4.0920245398773005, - "grad_norm": 2.869919538497925, - "learning_rate": 3.2081131451294025e-06, - "loss": 0.0583, - "step": 667 - }, - { - "epoch": 4.098159509202454, - "grad_norm": 3.8786919116973877, - "learning_rate": 3.2034907643579988e-06, - "loss": 0.0766, - "step": 668 - }, - { - "epoch": 4.104294478527607, - "grad_norm": 4.224637031555176, - "learning_rate": 3.1988657703255043e-06, - "loss": 0.1099, - "step": 669 - }, - { - "epoch": 4.110429447852761, - "grad_norm": 4.671669006347656, - "learning_rate": 3.194238180212409e-06, - "loss": 0.1663, - "step": 670 - }, - { - "epoch": 4.116564417177914, - "grad_norm": 3.2484257221221924, - "learning_rate": 3.1896080112088477e-06, - "loss": 0.0587, - "step": 671 - }, - { - "epoch": 4.122699386503068, - "grad_norm": 2.4808075428009033, - "learning_rate": 3.184975280514536e-06, - "loss": 0.0579, - "step": 672 - }, - { - "epoch": 4.128834355828221, - "grad_norm": 3.7106919288635254, - "learning_rate": 3.1803400053387044e-06, - "loss": 0.1083, - "step": 673 - }, - { - "epoch": 4.134969325153374, - "grad_norm": 3.008970260620117, - "learning_rate": 3.175702202900036e-06, - "loss": 0.1355, - "step": 674 - }, - { - "epoch": 4.141104294478527, - "grad_norm": 3.2640793323516846, - "learning_rate": 3.1710618904266006e-06, - "loss": 0.092, - "step": 675 - }, - { - "epoch": 4.147239263803681, - "grad_norm": 3.08042049407959, - "learning_rate": 3.166419085155793e-06, - "loss": 0.0563, - "step": 676 - }, - { - "epoch": 4.153374233128835, - "grad_norm": 2.993530511856079, - "learning_rate": 3.1617738043342695e-06, - "loss": 0.1773, - "step": 677 - }, - { - "epoch": 4.159509202453988, - "grad_norm": 2.6218204498291016, - "learning_rate": 3.157126065217879e-06, - "loss": 0.0489, - "step": 678 - }, - { - "epoch": 4.1656441717791415, - "grad_norm": 4.3173723220825195, - "learning_rate": 3.152475885071606e-06, - "loss": 0.1333, - "step": 679 - }, - { - "epoch": 4.171779141104294, - "grad_norm": 3.659149408340454, - "learning_rate": 3.147823281169498e-06, - "loss": 0.1501, - "step": 680 - }, - { - "epoch": 4.177914110429448, - "grad_norm": 3.0953338146209717, - "learning_rate": 3.143168270794612e-06, - "loss": 0.1067, - "step": 681 - }, - { - "epoch": 4.184049079754601, - "grad_norm": 3.5693907737731934, - "learning_rate": 3.1385108712389394e-06, - "loss": 0.2499, - "step": 682 - }, - { - "epoch": 4.190184049079755, - "grad_norm": 3.3022868633270264, - "learning_rate": 3.1338510998033484e-06, - "loss": 0.1748, - "step": 683 - }, - { - "epoch": 4.196319018404908, - "grad_norm": 3.7468113899230957, - "learning_rate": 3.129188973797519e-06, - "loss": 0.201, - "step": 684 - }, - { - "epoch": 4.2024539877300615, - "grad_norm": 2.8381078243255615, - "learning_rate": 3.124524510539875e-06, - "loss": 0.0735, - "step": 685 - }, - { - "epoch": 4.208588957055214, - "grad_norm": 2.84706974029541, - "learning_rate": 3.119857727357527e-06, - "loss": 0.1806, - "step": 686 - }, - { - "epoch": 4.214723926380368, - "grad_norm": 3.8130292892456055, - "learning_rate": 3.1151886415861993e-06, - "loss": 0.1811, - "step": 687 - }, - { - "epoch": 4.220858895705521, - "grad_norm": 3.528895378112793, - "learning_rate": 3.1105172705701708e-06, - "loss": 0.1634, - "step": 688 - }, - { - "epoch": 4.226993865030675, - "grad_norm": 5.028727054595947, - "learning_rate": 3.1058436316622103e-06, - "loss": 0.1625, - "step": 689 - }, - { - "epoch": 4.233128834355828, - "grad_norm": 4.606889247894287, - "learning_rate": 3.1011677422235093e-06, - "loss": 0.1791, - "step": 690 - }, - { - "epoch": 4.2392638036809815, - "grad_norm": 3.3620636463165283, - "learning_rate": 3.0964896196236217e-06, - "loss": 0.2233, - "step": 691 - }, - { - "epoch": 4.245398773006135, - "grad_norm": 3.7845852375030518, - "learning_rate": 3.0918092812403954e-06, - "loss": 0.1142, - "step": 692 - }, - { - "epoch": 4.251533742331288, - "grad_norm": 3.1204118728637695, - "learning_rate": 3.0871267444599098e-06, - "loss": 0.096, - "step": 693 - }, - { - "epoch": 4.257668711656442, - "grad_norm": 3.686067819595337, - "learning_rate": 3.0824420266764093e-06, - "loss": 0.2749, - "step": 694 - }, - { - "epoch": 4.263803680981595, - "grad_norm": 3.1680829524993896, - "learning_rate": 3.077755145292243e-06, - "loss": 0.2504, - "step": 695 - }, - { - "epoch": 4.269938650306749, - "grad_norm": 3.3179469108581543, - "learning_rate": 3.0730661177177957e-06, - "loss": 0.1324, - "step": 696 - }, - { - "epoch": 4.276073619631902, - "grad_norm": 3.1186370849609375, - "learning_rate": 3.0683749613714238e-06, - "loss": 0.0691, - "step": 697 - }, - { - "epoch": 4.282208588957055, - "grad_norm": 3.086834192276001, - "learning_rate": 3.063681693679391e-06, - "loss": 0.1026, - "step": 698 - }, - { - "epoch": 4.288343558282208, - "grad_norm": 4.629584312438965, - "learning_rate": 3.0589863320758063e-06, - "loss": 0.2646, - "step": 699 - }, - { - "epoch": 4.294478527607362, - "grad_norm": 3.9641213417053223, - "learning_rate": 3.0542888940025562e-06, - "loss": 0.1711, - "step": 700 - }, - { - "epoch": 4.300613496932515, - "grad_norm": 3.75014328956604, - "learning_rate": 3.0495893969092395e-06, - "loss": 0.0589, - "step": 701 - }, - { - "epoch": 4.306748466257669, - "grad_norm": 3.603290319442749, - "learning_rate": 3.044887858253105e-06, - "loss": 0.2244, - "step": 702 - }, - { - "epoch": 4.3128834355828225, - "grad_norm": 3.79404616355896, - "learning_rate": 3.040184295498984e-06, - "loss": 0.1506, - "step": 703 - }, - { - "epoch": 4.319018404907975, - "grad_norm": 3.0890021324157715, - "learning_rate": 3.035478726119228e-06, - "loss": 0.2343, - "step": 704 - }, - { - "epoch": 4.325153374233129, - "grad_norm": 3.6688191890716553, - "learning_rate": 3.0307711675936426e-06, - "loss": 0.0518, - "step": 705 - }, - { - "epoch": 4.331288343558282, - "grad_norm": 5.1836700439453125, - "learning_rate": 3.0260616374094208e-06, - "loss": 0.2363, - "step": 706 - }, - { - "epoch": 4.337423312883436, - "grad_norm": 2.7123284339904785, - "learning_rate": 3.0213501530610807e-06, - "loss": 0.0848, - "step": 707 - }, - { - "epoch": 4.343558282208589, - "grad_norm": 3.5661890506744385, - "learning_rate": 3.0166367320504005e-06, - "loss": 0.149, - "step": 708 - }, - { - "epoch": 4.3496932515337425, - "grad_norm": 3.6454737186431885, - "learning_rate": 3.0119213918863515e-06, - "loss": 0.1133, - "step": 709 - }, - { - "epoch": 4.355828220858895, - "grad_norm": 3.7534968852996826, - "learning_rate": 3.0072041500850343e-06, - "loss": 0.1358, - "step": 710 - }, - { - "epoch": 4.361963190184049, - "grad_norm": 3.40387225151062, - "learning_rate": 3.0024850241696128e-06, - "loss": 0.0706, - "step": 711 - }, - { - "epoch": 4.368098159509202, - "grad_norm": 3.250471591949463, - "learning_rate": 2.9977640316702512e-06, - "loss": 0.1977, - "step": 712 - }, - { - "epoch": 4.374233128834356, - "grad_norm": 3.417781352996826, - "learning_rate": 2.993041190124047e-06, - "loss": 0.2622, - "step": 713 - }, - { - "epoch": 4.38036809815951, - "grad_norm": 2.628434181213379, - "learning_rate": 2.9883165170749657e-06, - "loss": 0.1487, - "step": 714 - }, - { - "epoch": 4.386503067484663, - "grad_norm": 3.240264892578125, - "learning_rate": 2.9835900300737763e-06, - "loss": 0.0822, - "step": 715 - }, - { - "epoch": 4.392638036809816, - "grad_norm": 6.575517177581787, - "learning_rate": 2.9788617466779884e-06, - "loss": 0.3668, - "step": 716 - }, - { - "epoch": 4.398773006134969, - "grad_norm": 4.699089050292969, - "learning_rate": 2.974131684451781e-06, - "loss": 0.2432, - "step": 717 - }, - { - "epoch": 4.404907975460123, - "grad_norm": 2.9815752506256104, - "learning_rate": 2.9693998609659443e-06, - "loss": 0.0689, - "step": 718 - }, - { - "epoch": 4.411042944785276, - "grad_norm": 4.192755222320557, - "learning_rate": 2.9646662937978082e-06, - "loss": 0.1897, - "step": 719 - }, - { - "epoch": 4.41717791411043, - "grad_norm": 2.9729068279266357, - "learning_rate": 2.9599310005311824e-06, - "loss": 0.0457, - "step": 720 - }, - { - "epoch": 4.423312883435583, - "grad_norm": 4.234438896179199, - "learning_rate": 2.9551939987562866e-06, - "loss": 0.2307, - "step": 721 - }, - { - "epoch": 4.429447852760736, - "grad_norm": 3.3982434272766113, - "learning_rate": 2.950455306069688e-06, - "loss": 0.0637, - "step": 722 - }, - { - "epoch": 4.435582822085889, - "grad_norm": 4.539764404296875, - "learning_rate": 2.9457149400742357e-06, - "loss": 0.1924, - "step": 723 - }, - { - "epoch": 4.441717791411043, - "grad_norm": 4.039684772491455, - "learning_rate": 2.940972918378993e-06, - "loss": 0.1275, - "step": 724 - }, - { - "epoch": 4.447852760736196, - "grad_norm": 4.340360641479492, - "learning_rate": 2.936229258599174e-06, - "loss": 0.123, - "step": 725 - }, - { - "epoch": 4.45398773006135, - "grad_norm": 2.8720109462738037, - "learning_rate": 2.93148397835608e-06, - "loss": 0.0555, - "step": 726 - }, - { - "epoch": 4.460122699386503, - "grad_norm": 4.227811336517334, - "learning_rate": 2.926737095277029e-06, - "loss": 0.0991, - "step": 727 - }, - { - "epoch": 4.466257668711656, - "grad_norm": 2.8079142570495605, - "learning_rate": 2.921988626995295e-06, - "loss": 0.0628, - "step": 728 - }, - { - "epoch": 4.47239263803681, - "grad_norm": 4.195122241973877, - "learning_rate": 2.9172385911500385e-06, - "loss": 0.2333, - "step": 729 - }, - { - "epoch": 4.478527607361963, - "grad_norm": 3.223794460296631, - "learning_rate": 2.9124870053862447e-06, - "loss": 0.1317, - "step": 730 - }, - { - "epoch": 4.484662576687117, - "grad_norm": 3.5533759593963623, - "learning_rate": 2.907733887354657e-06, - "loss": 0.2285, - "step": 731 - }, - { - "epoch": 4.49079754601227, - "grad_norm": 3.535673141479492, - "learning_rate": 2.9029792547117088e-06, - "loss": 0.096, - "step": 732 - }, - { - "epoch": 4.4969325153374236, - "grad_norm": 4.031703948974609, - "learning_rate": 2.898223125119461e-06, - "loss": 0.1505, - "step": 733 - }, - { - "epoch": 4.5030674846625764, - "grad_norm": 2.823413610458374, - "learning_rate": 2.893465516245534e-06, - "loss": 0.0327, - "step": 734 - }, - { - "epoch": 4.50920245398773, - "grad_norm": 3.516738176345825, - "learning_rate": 2.8887064457630453e-06, - "loss": 0.0743, - "step": 735 - }, - { - "epoch": 4.515337423312883, - "grad_norm": 3.5523500442504883, - "learning_rate": 2.8839459313505407e-06, - "loss": 0.1768, - "step": 736 - }, - { - "epoch": 4.521472392638037, - "grad_norm": 3.2433223724365234, - "learning_rate": 2.879183990691929e-06, - "loss": 0.1598, - "step": 737 - }, - { - "epoch": 4.52760736196319, - "grad_norm": 3.0156848430633545, - "learning_rate": 2.8744206414764185e-06, - "loss": 0.0829, - "step": 738 - }, - { - "epoch": 4.533742331288344, - "grad_norm": 4.359529495239258, - "learning_rate": 2.8696559013984488e-06, - "loss": 0.1169, - "step": 739 - }, - { - "epoch": 4.539877300613497, - "grad_norm": 2.3862433433532715, - "learning_rate": 2.8648897881576274e-06, - "loss": 0.0962, - "step": 740 - }, - { - "epoch": 4.54601226993865, - "grad_norm": 2.7100136280059814, - "learning_rate": 2.8601223194586613e-06, - "loss": 0.1204, - "step": 741 - }, - { - "epoch": 4.552147239263804, - "grad_norm": 3.8116140365600586, - "learning_rate": 2.8553535130112935e-06, - "loss": 0.0685, - "step": 742 - }, - { - "epoch": 4.558282208588957, - "grad_norm": 2.9640142917633057, - "learning_rate": 2.850583386530235e-06, - "loss": 0.0692, - "step": 743 - }, - { - "epoch": 4.564417177914111, - "grad_norm": 3.264592170715332, - "learning_rate": 2.8458119577351035e-06, - "loss": 0.2128, - "step": 744 - }, - { - "epoch": 4.570552147239264, - "grad_norm": 3.230497360229492, - "learning_rate": 2.841039244350351e-06, - "loss": 0.2409, - "step": 745 - }, - { - "epoch": 4.576687116564417, - "grad_norm": 4.41513204574585, - "learning_rate": 2.8362652641052024e-06, - "loss": 0.1878, - "step": 746 - }, - { - "epoch": 4.58282208588957, - "grad_norm": 3.047248601913452, - "learning_rate": 2.83149003473359e-06, - "loss": 0.1303, - "step": 747 - }, - { - "epoch": 4.588957055214724, - "grad_norm": 2.399754047393799, - "learning_rate": 2.8267135739740836e-06, - "loss": 0.0577, - "step": 748 - }, - { - "epoch": 4.595092024539877, - "grad_norm": 4.608038425445557, - "learning_rate": 2.8219358995698307e-06, - "loss": 0.2329, - "step": 749 - }, - { - "epoch": 4.601226993865031, - "grad_norm": 3.537644147872925, - "learning_rate": 2.8171570292684846e-06, - "loss": 0.1329, - "step": 750 - }, - { - "epoch": 4.6073619631901845, - "grad_norm": 2.8099827766418457, - "learning_rate": 2.8123769808221407e-06, - "loss": 0.1512, - "step": 751 - }, - { - "epoch": 4.613496932515337, - "grad_norm": 3.3169758319854736, - "learning_rate": 2.8075957719872724e-06, - "loss": 0.1267, - "step": 752 - }, - { - "epoch": 4.61963190184049, - "grad_norm": 3.578435182571411, - "learning_rate": 2.8028134205246633e-06, - "loss": 0.147, - "step": 753 - }, - { - "epoch": 4.625766871165644, - "grad_norm": 3.544437885284424, - "learning_rate": 2.7980299441993415e-06, - "loss": 0.0947, - "step": 754 - }, - { - "epoch": 4.631901840490798, - "grad_norm": 3.798776388168335, - "learning_rate": 2.793245360780512e-06, - "loss": 0.1498, - "step": 755 - }, - { - "epoch": 4.638036809815951, - "grad_norm": 3.634991407394409, - "learning_rate": 2.788459688041495e-06, - "loss": 0.2504, - "step": 756 - }, - { - "epoch": 4.644171779141105, - "grad_norm": 20.123680114746094, - "learning_rate": 2.783672943759655e-06, - "loss": 0.2091, - "step": 757 - }, - { - "epoch": 4.6503067484662575, - "grad_norm": 3.9357221126556396, - "learning_rate": 2.778885145716339e-06, - "loss": 0.2045, - "step": 758 - }, - { - "epoch": 4.656441717791411, - "grad_norm": 3.3035309314727783, - "learning_rate": 2.7740963116968063e-06, - "loss": 0.1416, - "step": 759 - }, - { - "epoch": 4.662576687116564, - "grad_norm": 3.096985101699829, - "learning_rate": 2.7693064594901646e-06, - "loss": 0.0455, - "step": 760 - }, - { - "epoch": 4.668711656441718, - "grad_norm": 2.9855458736419678, - "learning_rate": 2.7645156068893075e-06, - "loss": 0.1496, - "step": 761 - }, - { - "epoch": 4.674846625766871, - "grad_norm": 3.9140093326568604, - "learning_rate": 2.759723771690839e-06, - "loss": 0.2061, - "step": 762 - }, - { - "epoch": 4.680981595092025, - "grad_norm": 3.590569496154785, - "learning_rate": 2.754930971695019e-06, - "loss": 0.1017, - "step": 763 - }, - { - "epoch": 4.6871165644171775, - "grad_norm": 3.527254581451416, - "learning_rate": 2.750137224705687e-06, - "loss": 0.1979, - "step": 764 - }, - { - "epoch": 4.693251533742331, - "grad_norm": 4.198459148406982, - "learning_rate": 2.745342548530202e-06, - "loss": 0.1667, - "step": 765 - }, - { - "epoch": 4.699386503067485, - "grad_norm": 2.0246167182922363, - "learning_rate": 2.7405469609793746e-06, - "loss": 0.0346, - "step": 766 - }, - { - "epoch": 4.705521472392638, - "grad_norm": 3.2045300006866455, - "learning_rate": 2.7357504798674004e-06, - "loss": 0.0596, - "step": 767 - }, - { - "epoch": 4.711656441717792, - "grad_norm": 2.736985921859741, - "learning_rate": 2.730953123011796e-06, - "loss": 0.0384, - "step": 768 - }, - { - "epoch": 4.717791411042945, - "grad_norm": 3.0621395111083984, - "learning_rate": 2.726154908233328e-06, - "loss": 0.0558, - "step": 769 - }, - { - "epoch": 4.723926380368098, - "grad_norm": 3.2280497550964355, - "learning_rate": 2.721355853355953e-06, - "loss": 0.2272, - "step": 770 - }, - { - "epoch": 4.730061349693251, - "grad_norm": 3.342226028442383, - "learning_rate": 2.716555976206748e-06, - "loss": 0.074, - "step": 771 - }, - { - "epoch": 4.736196319018405, - "grad_norm": 4.328624248504639, - "learning_rate": 2.7117552946158415e-06, - "loss": 0.1034, - "step": 772 - }, - { - "epoch": 4.742331288343558, - "grad_norm": 2.980215311050415, - "learning_rate": 2.706953826416353e-06, - "loss": 0.1199, - "step": 773 - }, - { - "epoch": 4.748466257668712, - "grad_norm": 2.622478485107422, - "learning_rate": 2.702151589444324e-06, - "loss": 0.0467, - "step": 774 - }, - { - "epoch": 4.754601226993865, - "grad_norm": 2.9958693981170654, - "learning_rate": 2.6973486015386507e-06, - "loss": 0.143, - "step": 775 - }, - { - "epoch": 4.7607361963190185, - "grad_norm": 4.548511505126953, - "learning_rate": 2.6925448805410197e-06, - "loss": 0.3594, - "step": 776 - }, - { - "epoch": 4.766871165644172, - "grad_norm": 3.3429481983184814, - "learning_rate": 2.6877404442958393e-06, - "loss": 0.1397, - "step": 777 - }, - { - "epoch": 4.773006134969325, - "grad_norm": 2.5820136070251465, - "learning_rate": 2.682935310650177e-06, - "loss": 0.054, - "step": 778 - }, - { - "epoch": 4.779141104294479, - "grad_norm": 4.047626495361328, - "learning_rate": 2.6781294974536886e-06, - "loss": 0.1284, - "step": 779 - }, - { - "epoch": 4.785276073619632, - "grad_norm": 3.0227510929107666, - "learning_rate": 2.673323022558557e-06, - "loss": 0.1441, - "step": 780 - }, - { - "epoch": 4.791411042944786, - "grad_norm": 4.731313705444336, - "learning_rate": 2.6685159038194202e-06, - "loss": 0.2859, - "step": 781 - }, - { - "epoch": 4.7975460122699385, - "grad_norm": 3.880655288696289, - "learning_rate": 2.6637081590933096e-06, - "loss": 0.1524, - "step": 782 - }, - { - "epoch": 4.803680981595092, - "grad_norm": 2.375474452972412, - "learning_rate": 2.6588998062395803e-06, - "loss": 0.0338, - "step": 783 - }, - { - "epoch": 4.809815950920245, - "grad_norm": 3.3587446212768555, - "learning_rate": 2.6540908631198498e-06, - "loss": 0.0755, - "step": 784 - }, - { - "epoch": 4.815950920245399, - "grad_norm": 2.767686367034912, - "learning_rate": 2.6492813475979243e-06, - "loss": 0.0631, - "step": 785 - }, - { - "epoch": 4.822085889570552, - "grad_norm": 3.88670015335083, - "learning_rate": 2.6444712775397397e-06, - "loss": 0.0853, - "step": 786 - }, - { - "epoch": 4.828220858895706, - "grad_norm": 3.543276309967041, - "learning_rate": 2.639660670813288e-06, - "loss": 0.1895, - "step": 787 - }, - { - "epoch": 4.8343558282208585, - "grad_norm": 3.659323215484619, - "learning_rate": 2.6348495452885598e-06, - "loss": 0.1745, - "step": 788 - }, - { - "epoch": 4.840490797546012, - "grad_norm": 3.0955021381378174, - "learning_rate": 2.630037918837468e-06, - "loss": 0.0846, - "step": 789 - }, - { - "epoch": 4.846625766871165, - "grad_norm": 3.4473249912261963, - "learning_rate": 2.6252258093337892e-06, - "loss": 0.0808, - "step": 790 - }, - { - "epoch": 4.852760736196319, - "grad_norm": 3.937120199203491, - "learning_rate": 2.6204132346530936e-06, - "loss": 0.2054, - "step": 791 - }, - { - "epoch": 4.858895705521473, - "grad_norm": 4.052806854248047, - "learning_rate": 2.6156002126726788e-06, - "loss": 0.1679, - "step": 792 - }, - { - "epoch": 4.865030674846626, - "grad_norm": 2.6694889068603516, - "learning_rate": 2.6107867612715043e-06, - "loss": 0.0534, - "step": 793 - }, - { - "epoch": 4.871165644171779, - "grad_norm": 3.594649076461792, - "learning_rate": 2.6059728983301267e-06, - "loss": 0.0899, - "step": 794 - }, - { - "epoch": 4.877300613496932, - "grad_norm": 2.7796030044555664, - "learning_rate": 2.601158641730629e-06, - "loss": 0.0596, - "step": 795 - }, - { - "epoch": 4.883435582822086, - "grad_norm": 4.618961334228516, - "learning_rate": 2.5963440093565567e-06, - "loss": 0.3858, - "step": 796 - }, - { - "epoch": 4.889570552147239, - "grad_norm": 3.0783939361572266, - "learning_rate": 2.5915290190928518e-06, - "loss": 0.12, - "step": 797 - }, - { - "epoch": 4.895705521472393, - "grad_norm": 4.078456878662109, - "learning_rate": 2.586713688825786e-06, - "loss": 0.1278, - "step": 798 - }, - { - "epoch": 4.901840490797546, - "grad_norm": 2.9439120292663574, - "learning_rate": 2.5818980364428935e-06, - "loss": 0.0847, - "step": 799 - }, - { - "epoch": 4.9079754601226995, - "grad_norm": 5.140681743621826, - "learning_rate": 2.5770820798329055e-06, - "loss": 0.1718, - "step": 800 - }, - { - "epoch": 4.914110429447852, - "grad_norm": 3.450190305709839, - "learning_rate": 2.572265836885682e-06, - "loss": 0.0895, - "step": 801 - }, - { - "epoch": 4.920245398773006, - "grad_norm": 3.1145224571228027, - "learning_rate": 2.567449325492149e-06, - "loss": 0.0652, - "step": 802 - }, - { - "epoch": 4.92638036809816, - "grad_norm": 2.851768732070923, - "learning_rate": 2.5626325635442283e-06, - "loss": 0.0877, - "step": 803 - }, - { - "epoch": 4.932515337423313, - "grad_norm": 3.3392980098724365, - "learning_rate": 2.5578155689347716e-06, - "loss": 0.2028, - "step": 804 - }, - { - "epoch": 4.938650306748467, - "grad_norm": 3.012439250946045, - "learning_rate": 2.5529983595574964e-06, - "loss": 0.031, - "step": 805 - }, - { - "epoch": 4.9447852760736195, - "grad_norm": 2.7732717990875244, - "learning_rate": 2.548180953306918e-06, - "loss": 0.0415, - "step": 806 - }, - { - "epoch": 4.950920245398773, - "grad_norm": 3.0423903465270996, - "learning_rate": 2.5433633680782817e-06, - "loss": 0.1188, - "step": 807 - }, - { - "epoch": 4.957055214723926, - "grad_norm": 5.056387901306152, - "learning_rate": 2.538545621767498e-06, - "loss": 0.1703, - "step": 808 - }, - { - "epoch": 4.96319018404908, - "grad_norm": 4.052585124969482, - "learning_rate": 2.533727732271077e-06, - "loss": 0.1455, - "step": 809 - }, - { - "epoch": 4.969325153374233, - "grad_norm": 3.4507904052734375, - "learning_rate": 2.5289097174860593e-06, - "loss": 0.0617, - "step": 810 - }, - { - "epoch": 4.975460122699387, - "grad_norm": 2.908266305923462, - "learning_rate": 2.524091595309952e-06, - "loss": 0.1173, - "step": 811 - }, - { - "epoch": 4.9815950920245395, - "grad_norm": 2.5857458114624023, - "learning_rate": 2.519273383640661e-06, - "loss": 0.0538, - "step": 812 - }, - { - "epoch": 4.987730061349693, - "grad_norm": 3.3518428802490234, - "learning_rate": 2.5144551003764227e-06, - "loss": 0.211, - "step": 813 - }, - { - "epoch": 4.993865030674847, - "grad_norm": 3.137981653213501, - "learning_rate": 2.509636763415742e-06, - "loss": 0.0944, - "step": 814 - }, - { - "epoch": 5.0, - "grad_norm": 2.8854241371154785, - "learning_rate": 2.5048183906573227e-06, - "loss": 0.098, - "step": 815 - }, - { - "epoch": 5.006134969325154, - "grad_norm": 3.508527994155884, - "learning_rate": 2.5e-06, - "loss": 0.1102, - "step": 816 - }, - { - "epoch": 5.012269938650307, - "grad_norm": 2.448152542114258, - "learning_rate": 2.495181609342678e-06, - "loss": 0.0712, - "step": 817 - }, - { - "epoch": 5.0184049079754605, - "grad_norm": 3.105818748474121, - "learning_rate": 2.4903632365842587e-06, - "loss": 0.0414, - "step": 818 - }, - { - "epoch": 5.024539877300613, - "grad_norm": 3.8048601150512695, - "learning_rate": 2.4855448996235777e-06, - "loss": 0.0894, - "step": 819 - }, - { - "epoch": 5.030674846625767, - "grad_norm": 3.259834051132202, - "learning_rate": 2.48072661635934e-06, - "loss": 0.0796, - "step": 820 - }, - { - "epoch": 5.03680981595092, - "grad_norm": 2.822364568710327, - "learning_rate": 2.475908404690049e-06, - "loss": 0.0349, - "step": 821 - }, - { - "epoch": 5.042944785276074, - "grad_norm": 4.78808069229126, - "learning_rate": 2.4710902825139415e-06, - "loss": 0.2529, - "step": 822 - }, - { - "epoch": 5.049079754601227, - "grad_norm": 3.5420572757720947, - "learning_rate": 2.466272267728924e-06, - "loss": 0.1405, - "step": 823 - }, - { - "epoch": 5.0552147239263805, - "grad_norm": 2.500713348388672, - "learning_rate": 2.461454378232503e-06, - "loss": 0.0408, - "step": 824 - }, - { - "epoch": 5.061349693251533, - "grad_norm": 3.266291618347168, - "learning_rate": 2.4566366319217196e-06, - "loss": 0.0338, - "step": 825 - }, - { - "epoch": 5.067484662576687, - "grad_norm": 4.071012020111084, - "learning_rate": 2.4518190466930837e-06, - "loss": 0.06, - "step": 826 - }, - { - "epoch": 5.07361963190184, - "grad_norm": 4.3747172355651855, - "learning_rate": 2.4470016404425045e-06, - "loss": 0.1184, - "step": 827 - }, - { - "epoch": 5.079754601226994, - "grad_norm": 3.92030668258667, - "learning_rate": 2.4421844310652296e-06, - "loss": 0.1369, - "step": 828 - }, - { - "epoch": 5.085889570552148, - "grad_norm": 3.3482303619384766, - "learning_rate": 2.437367436455773e-06, - "loss": 0.1166, - "step": 829 - }, - { - "epoch": 5.0920245398773005, - "grad_norm": 3.429368019104004, - "learning_rate": 2.4325506745078524e-06, - "loss": 0.1214, - "step": 830 - }, - { - "epoch": 5.098159509202454, - "grad_norm": 3.4915647506713867, - "learning_rate": 2.427734163114319e-06, - "loss": 0.0454, - "step": 831 - }, - { - "epoch": 5.104294478527607, - "grad_norm": 3.1721251010894775, - "learning_rate": 2.4229179201670954e-06, - "loss": 0.0431, - "step": 832 - }, - { - "epoch": 5.110429447852761, - "grad_norm": 2.552578926086426, - "learning_rate": 2.418101963557107e-06, - "loss": 0.0347, - "step": 833 - }, - { - "epoch": 5.116564417177914, - "grad_norm": 3.518169403076172, - "learning_rate": 2.413286311174214e-06, - "loss": 0.1555, - "step": 834 - }, - { - "epoch": 5.122699386503068, - "grad_norm": 2.4452908039093018, - "learning_rate": 2.4084709809071487e-06, - "loss": 0.035, - "step": 835 - }, - { - "epoch": 5.128834355828221, - "grad_norm": 3.5366528034210205, - "learning_rate": 2.403655990643444e-06, - "loss": 0.0798, - "step": 836 - }, - { - "epoch": 5.134969325153374, - "grad_norm": 2.300065040588379, - "learning_rate": 2.398841358269371e-06, - "loss": 0.0178, - "step": 837 - }, - { - "epoch": 5.141104294478527, - "grad_norm": 2.851393699645996, - "learning_rate": 2.3940271016698733e-06, - "loss": 0.0447, - "step": 838 - }, - { - "epoch": 5.147239263803681, - "grad_norm": 4.085958957672119, - "learning_rate": 2.3892132387284956e-06, - "loss": 0.1626, - "step": 839 - }, - { - "epoch": 5.153374233128835, - "grad_norm": 3.4240522384643555, - "learning_rate": 2.384399787327322e-06, - "loss": 0.0914, - "step": 840 - }, - { - "epoch": 5.159509202453988, - "grad_norm": 4.111586570739746, - "learning_rate": 2.3795867653469072e-06, - "loss": 0.0784, - "step": 841 - }, - { - "epoch": 5.1656441717791415, - "grad_norm": 2.3306312561035156, - "learning_rate": 2.374774190666211e-06, - "loss": 0.0216, - "step": 842 - }, - { - "epoch": 5.171779141104294, - "grad_norm": 2.5006275177001953, - "learning_rate": 2.3699620811625327e-06, - "loss": 0.0516, - "step": 843 - }, - { - "epoch": 5.177914110429448, - "grad_norm": 3.1680967807769775, - "learning_rate": 2.365150454711441e-06, - "loss": 0.0517, - "step": 844 - }, - { - "epoch": 5.184049079754601, - "grad_norm": 1.817044734954834, - "learning_rate": 2.3603393291867122e-06, - "loss": 0.0264, - "step": 845 - }, - { - "epoch": 5.190184049079755, - "grad_norm": 4.445211887359619, - "learning_rate": 2.355528722460261e-06, - "loss": 0.1079, - "step": 846 - }, - { - "epoch": 5.196319018404908, - "grad_norm": 2.918304681777954, - "learning_rate": 2.350718652402076e-06, - "loss": 0.0633, - "step": 847 - }, - { - "epoch": 5.2024539877300615, - "grad_norm": 3.6307432651519775, - "learning_rate": 2.345909136880151e-06, - "loss": 0.1013, - "step": 848 - }, - { - "epoch": 5.208588957055214, - "grad_norm": 3.5696842670440674, - "learning_rate": 2.34110019376042e-06, - "loss": 0.0199, - "step": 849 - }, - { - "epoch": 5.214723926380368, - "grad_norm": 2.2214856147766113, - "learning_rate": 2.336291840906691e-06, - "loss": 0.0288, - "step": 850 - }, - { - "epoch": 5.220858895705521, - "grad_norm": 2.5375778675079346, - "learning_rate": 2.3314840961805806e-06, - "loss": 0.0142, - "step": 851 - }, - { - "epoch": 5.226993865030675, - "grad_norm": 3.0093517303466797, - "learning_rate": 2.326676977441444e-06, - "loss": 0.0911, - "step": 852 - }, - { - "epoch": 5.233128834355828, - "grad_norm": 2.7067151069641113, - "learning_rate": 2.3218705025463118e-06, - "loss": 0.0315, - "step": 853 - }, - { - "epoch": 5.2392638036809815, - "grad_norm": 3.1892940998077393, - "learning_rate": 2.3170646893498237e-06, - "loss": 0.1344, - "step": 854 - }, - { - "epoch": 5.245398773006135, - "grad_norm": 2.8909313678741455, - "learning_rate": 2.312259555704161e-06, - "loss": 0.034, - "step": 855 - }, - { - "epoch": 5.251533742331288, - "grad_norm": 5.097650051116943, - "learning_rate": 2.3074551194589816e-06, - "loss": 0.1889, - "step": 856 - }, - { - "epoch": 5.257668711656442, - "grad_norm": 3.8511006832122803, - "learning_rate": 2.3026513984613506e-06, - "loss": 0.0794, - "step": 857 - }, - { - "epoch": 5.263803680981595, - "grad_norm": 2.2874133586883545, - "learning_rate": 2.297848410555677e-06, - "loss": 0.0238, - "step": 858 - }, - { - "epoch": 5.269938650306749, - "grad_norm": 3.504723310470581, - "learning_rate": 2.293046173583648e-06, - "loss": 0.0369, - "step": 859 - }, - { - "epoch": 5.276073619631902, - "grad_norm": 3.2108154296875, - "learning_rate": 2.28824470538416e-06, - "loss": 0.0677, - "step": 860 - }, - { - "epoch": 5.282208588957055, - "grad_norm": 2.2249386310577393, - "learning_rate": 2.2834440237932537e-06, - "loss": 0.0244, - "step": 861 - }, - { - "epoch": 5.288343558282208, - "grad_norm": 3.141784191131592, - "learning_rate": 2.2786441466440474e-06, - "loss": 0.0628, - "step": 862 - }, - { - "epoch": 5.294478527607362, - "grad_norm": 3.5597352981567383, - "learning_rate": 2.2738450917666727e-06, - "loss": 0.0914, - "step": 863 - }, - { - "epoch": 5.300613496932515, - "grad_norm": 2.991966962814331, - "learning_rate": 2.269046876988204e-06, - "loss": 0.0546, - "step": 864 - }, - { - "epoch": 5.306748466257669, - "grad_norm": 3.100776195526123, - "learning_rate": 2.2642495201325995e-06, - "loss": 0.0473, - "step": 865 - }, - { - "epoch": 5.3128834355828225, - "grad_norm": 2.541754722595215, - "learning_rate": 2.259453039020626e-06, - "loss": 0.0613, - "step": 866 - }, - { - "epoch": 5.319018404907975, - "grad_norm": 2.8117194175720215, - "learning_rate": 2.2546574514697985e-06, - "loss": 0.0533, - "step": 867 - }, - { - "epoch": 5.325153374233129, - "grad_norm": 2.5676379203796387, - "learning_rate": 2.249862775294313e-06, - "loss": 0.018, - "step": 868 - }, - { - "epoch": 5.331288343558282, - "grad_norm": 2.5297701358795166, - "learning_rate": 2.245069028304981e-06, - "loss": 0.0246, - "step": 869 - }, - { - "epoch": 5.337423312883436, - "grad_norm": 2.199498176574707, - "learning_rate": 2.240276228309161e-06, - "loss": 0.0551, - "step": 870 - }, - { - "epoch": 5.343558282208589, - "grad_norm": 2.5793557167053223, - "learning_rate": 2.2354843931106933e-06, - "loss": 0.0258, - "step": 871 - }, - { - "epoch": 5.3496932515337425, - "grad_norm": 3.352058172225952, - "learning_rate": 2.230693540509836e-06, - "loss": 0.0228, - "step": 872 - }, - { - "epoch": 5.355828220858895, - "grad_norm": 2.900599956512451, - "learning_rate": 2.225903688303195e-06, - "loss": 0.0586, - "step": 873 - }, - { - "epoch": 5.361963190184049, - "grad_norm": 3.3317267894744873, - "learning_rate": 2.221114854283662e-06, - "loss": 0.0733, - "step": 874 - }, - { - "epoch": 5.368098159509202, - "grad_norm": 2.79304575920105, - "learning_rate": 2.2163270562403453e-06, - "loss": 0.0251, - "step": 875 - }, - { - "epoch": 5.374233128834356, - "grad_norm": 3.8596227169036865, - "learning_rate": 2.211540311958506e-06, - "loss": 0.0957, - "step": 876 - }, - { - "epoch": 5.38036809815951, - "grad_norm": 2.7464358806610107, - "learning_rate": 2.2067546392194888e-06, - "loss": 0.0457, - "step": 877 - }, - { - "epoch": 5.386503067484663, - "grad_norm": 2.3359906673431396, - "learning_rate": 2.2019700558006598e-06, - "loss": 0.0218, - "step": 878 - }, - { - "epoch": 5.392638036809816, - "grad_norm": 3.2412452697753906, - "learning_rate": 2.197186579475337e-06, - "loss": 0.0494, - "step": 879 - }, - { - "epoch": 5.398773006134969, - "grad_norm": 3.930197238922119, - "learning_rate": 2.1924042280127284e-06, - "loss": 0.0803, - "step": 880 - }, - { - "epoch": 5.404907975460123, - "grad_norm": 2.5752930641174316, - "learning_rate": 2.1876230191778598e-06, - "loss": 0.0356, - "step": 881 - }, - { - "epoch": 5.411042944785276, - "grad_norm": 5.507393836975098, - "learning_rate": 2.182842970731516e-06, - "loss": 0.1245, - "step": 882 - }, - { - "epoch": 5.41717791411043, - "grad_norm": 2.416719436645508, - "learning_rate": 2.17806410043017e-06, - "loss": 0.0224, - "step": 883 - }, - { - "epoch": 5.423312883435583, - "grad_norm": 2.500429630279541, - "learning_rate": 2.173286426025917e-06, - "loss": 0.0499, - "step": 884 - }, - { - "epoch": 5.429447852760736, - "grad_norm": 2.8843860626220703, - "learning_rate": 2.168509965266411e-06, - "loss": 0.075, - "step": 885 - }, - { - "epoch": 5.435582822085889, - "grad_norm": 2.3187198638916016, - "learning_rate": 2.1637347358947984e-06, - "loss": 0.065, - "step": 886 - }, - { - "epoch": 5.441717791411043, - "grad_norm": 2.7135889530181885, - "learning_rate": 2.15896075564965e-06, - "loss": 0.0848, - "step": 887 - }, - { - "epoch": 5.447852760736196, - "grad_norm": 1.751846194267273, - "learning_rate": 2.1541880422648978e-06, - "loss": 0.0112, - "step": 888 - }, - { - "epoch": 5.45398773006135, - "grad_norm": 3.113271713256836, - "learning_rate": 2.1494166134697655e-06, - "loss": 0.077, - "step": 889 - }, - { - "epoch": 5.460122699386503, - "grad_norm": 2.711318016052246, - "learning_rate": 2.1446464869887077e-06, - "loss": 0.03, - "step": 890 - }, - { - "epoch": 5.466257668711656, - "grad_norm": 1.8012003898620605, - "learning_rate": 2.13987768054134e-06, - "loss": 0.0141, - "step": 891 - }, - { - "epoch": 5.47239263803681, - "grad_norm": 2.0968120098114014, - "learning_rate": 2.135110211842374e-06, - "loss": 0.0147, - "step": 892 - }, - { - "epoch": 5.478527607361963, - "grad_norm": 3.1689956188201904, - "learning_rate": 2.1303440986015525e-06, - "loss": 0.1123, - "step": 893 - }, - { - "epoch": 5.484662576687117, - "grad_norm": 4.512697219848633, - "learning_rate": 2.1255793585235827e-06, - "loss": 0.0359, - "step": 894 - }, - { - "epoch": 5.49079754601227, - "grad_norm": 3.5739688873291016, - "learning_rate": 2.120816009308071e-06, - "loss": 0.0635, - "step": 895 - }, - { - "epoch": 5.4969325153374236, - "grad_norm": 4.556554317474365, - "learning_rate": 2.1160540686494597e-06, - "loss": 0.1104, - "step": 896 - }, - { - "epoch": 5.5030674846625764, - "grad_norm": 2.2047064304351807, - "learning_rate": 2.1112935542369546e-06, - "loss": 0.0187, - "step": 897 - }, - { - "epoch": 5.50920245398773, - "grad_norm": 3.0289857387542725, - "learning_rate": 2.106534483754466e-06, - "loss": 0.0874, - "step": 898 - }, - { - "epoch": 5.515337423312883, - "grad_norm": 2.7090444564819336, - "learning_rate": 2.1017768748805396e-06, - "loss": 0.0301, - "step": 899 - }, - { - "epoch": 5.521472392638037, - "grad_norm": 3.0662643909454346, - "learning_rate": 2.0970207452882917e-06, - "loss": 0.1192, - "step": 900 - }, - { - "epoch": 5.52760736196319, - "grad_norm": 2.869401454925537, - "learning_rate": 2.0922661126453436e-06, - "loss": 0.0803, - "step": 901 - }, - { - "epoch": 5.533742331288344, - "grad_norm": 2.229947328567505, - "learning_rate": 2.0875129946137557e-06, - "loss": 0.0186, - "step": 902 - }, - { - "epoch": 5.539877300613497, - "grad_norm": 3.3460421562194824, - "learning_rate": 2.0827614088499624e-06, - "loss": 0.0499, - "step": 903 - }, - { - "epoch": 5.54601226993865, - "grad_norm": 1.9324007034301758, - "learning_rate": 2.0780113730047056e-06, - "loss": 0.0322, - "step": 904 - }, - { - "epoch": 5.552147239263804, - "grad_norm": 2.761482000350952, - "learning_rate": 2.0732629047229712e-06, - "loss": 0.0265, - "step": 905 - }, - { - "epoch": 5.558282208588957, - "grad_norm": 2.4173266887664795, - "learning_rate": 2.0685160216439205e-06, - "loss": 0.0229, - "step": 906 - }, - { - "epoch": 5.564417177914111, - "grad_norm": 2.503661632537842, - "learning_rate": 2.0637707414008267e-06, - "loss": 0.0266, - "step": 907 - }, - { - "epoch": 5.570552147239264, - "grad_norm": 2.312236785888672, - "learning_rate": 2.0590270816210077e-06, - "loss": 0.018, - "step": 908 - }, - { - "epoch": 5.576687116564417, - "grad_norm": 2.569575548171997, - "learning_rate": 2.0542850599257647e-06, - "loss": 0.0377, - "step": 909 - }, - { - "epoch": 5.58282208588957, - "grad_norm": 3.520341157913208, - "learning_rate": 2.0495446939303122e-06, - "loss": 0.1224, - "step": 910 - }, - { - "epoch": 5.588957055214724, - "grad_norm": 3.231363296508789, - "learning_rate": 2.044806001243714e-06, - "loss": 0.1457, - "step": 911 - }, - { - "epoch": 5.595092024539877, - "grad_norm": 3.3211300373077393, - "learning_rate": 2.040068999468818e-06, - "loss": 0.0429, - "step": 912 - }, - { - "epoch": 5.601226993865031, - "grad_norm": 3.3712961673736572, - "learning_rate": 2.035333706202192e-06, - "loss": 0.0634, - "step": 913 - }, - { - "epoch": 5.6073619631901845, - "grad_norm": 2.480177402496338, - "learning_rate": 2.0306001390340565e-06, - "loss": 0.0178, - "step": 914 - }, - { - "epoch": 5.613496932515337, - "grad_norm": 2.9777421951293945, - "learning_rate": 2.02586831554822e-06, - "loss": 0.037, - "step": 915 - }, - { - "epoch": 5.61963190184049, - "grad_norm": 2.9129085540771484, - "learning_rate": 2.021138253322012e-06, - "loss": 0.125, - "step": 916 - }, - { - "epoch": 5.625766871165644, - "grad_norm": 4.041767597198486, - "learning_rate": 2.016409969926224e-06, - "loss": 0.1897, - "step": 917 - }, - { - "epoch": 5.631901840490798, - "grad_norm": 4.088902950286865, - "learning_rate": 2.0116834829250355e-06, - "loss": 0.0546, - "step": 918 - }, - { - "epoch": 5.638036809815951, - "grad_norm": 3.8629167079925537, - "learning_rate": 2.0069588098759545e-06, - "loss": 0.0911, - "step": 919 - }, - { - "epoch": 5.644171779141105, - "grad_norm": 2.616830825805664, - "learning_rate": 2.00223596832975e-06, - "loss": 0.0527, - "step": 920 - }, - { - "epoch": 5.6503067484662575, - "grad_norm": 1.9370782375335693, - "learning_rate": 1.9975149758303885e-06, - "loss": 0.0384, - "step": 921 - }, - { - "epoch": 5.656441717791411, - "grad_norm": 3.7839455604553223, - "learning_rate": 1.992795849914967e-06, - "loss": 0.1033, - "step": 922 - }, - { - "epoch": 5.662576687116564, - "grad_norm": 3.870729923248291, - "learning_rate": 1.9880786081136498e-06, - "loss": 0.08, - "step": 923 - }, - { - "epoch": 5.668711656441718, - "grad_norm": 3.4394288063049316, - "learning_rate": 1.9833632679496008e-06, - "loss": 0.0819, - "step": 924 - }, - { - "epoch": 5.674846625766871, - "grad_norm": 3.1659159660339355, - "learning_rate": 1.97864984693892e-06, - "loss": 0.117, - "step": 925 - }, - { - "epoch": 5.680981595092025, - "grad_norm": 2.2375190258026123, - "learning_rate": 1.97393836259058e-06, - "loss": 0.0215, - "step": 926 - }, - { - "epoch": 5.6871165644171775, - "grad_norm": 3.9375314712524414, - "learning_rate": 1.969228832406358e-06, - "loss": 0.1422, - "step": 927 - }, - { - "epoch": 5.693251533742331, - "grad_norm": 3.1969058513641357, - "learning_rate": 1.964521273880772e-06, - "loss": 0.0538, - "step": 928 - }, - { - "epoch": 5.699386503067485, - "grad_norm": 3.5990066528320312, - "learning_rate": 1.9598157045010162e-06, - "loss": 0.114, - "step": 929 - }, - { - "epoch": 5.705521472392638, - "grad_norm": 3.1764235496520996, - "learning_rate": 1.9551121417468955e-06, - "loss": 0.053, - "step": 930 - }, - { - "epoch": 5.711656441717792, - "grad_norm": 4.1162309646606445, - "learning_rate": 1.9504106030907605e-06, - "loss": 0.0866, - "step": 931 - }, - { - "epoch": 5.717791411042945, - "grad_norm": 3.543071985244751, - "learning_rate": 1.945711105997444e-06, - "loss": 0.0908, - "step": 932 - }, - { - "epoch": 5.723926380368098, - "grad_norm": 4.136870384216309, - "learning_rate": 1.941013667924194e-06, - "loss": 0.0612, - "step": 933 - }, - { - "epoch": 5.730061349693251, - "grad_norm": 1.7658357620239258, - "learning_rate": 1.9363183063206097e-06, - "loss": 0.0283, - "step": 934 - }, - { - "epoch": 5.736196319018405, - "grad_norm": 3.9701411724090576, - "learning_rate": 1.931625038628577e-06, - "loss": 0.0948, - "step": 935 - }, - { - "epoch": 5.742331288343558, - "grad_norm": 3.0636157989501953, - "learning_rate": 1.9269338822822047e-06, - "loss": 0.0769, - "step": 936 - }, - { - "epoch": 5.748466257668712, - "grad_norm": 3.3671388626098633, - "learning_rate": 1.9222448547077573e-06, - "loss": 0.098, - "step": 937 - }, - { - "epoch": 5.754601226993865, - "grad_norm": 3.0725975036621094, - "learning_rate": 1.917557973323591e-06, - "loss": 0.0363, - "step": 938 - }, - { - "epoch": 5.7607361963190185, - "grad_norm": 2.5592041015625, - "learning_rate": 1.9128732555400915e-06, - "loss": 0.0205, - "step": 939 - }, - { - "epoch": 5.766871165644172, - "grad_norm": 2.835740804672241, - "learning_rate": 1.9081907187596054e-06, - "loss": 0.0548, - "step": 940 - }, - { - "epoch": 5.773006134969325, - "grad_norm": 3.3596746921539307, - "learning_rate": 1.9035103803763793e-06, - "loss": 0.0454, - "step": 941 - }, - { - "epoch": 5.779141104294479, - "grad_norm": 3.226579427719116, - "learning_rate": 1.8988322577764918e-06, - "loss": 0.0514, - "step": 942 - }, - { - "epoch": 5.785276073619632, - "grad_norm": 3.2044687271118164, - "learning_rate": 1.8941563683377905e-06, - "loss": 0.1361, - "step": 943 - }, - { - "epoch": 5.791411042944786, - "grad_norm": 1.8300527334213257, - "learning_rate": 1.8894827294298296e-06, - "loss": 0.0139, - "step": 944 - }, - { - "epoch": 5.7975460122699385, - "grad_norm": 2.503735303878784, - "learning_rate": 1.884811358413801e-06, - "loss": 0.0311, - "step": 945 - }, - { - "epoch": 5.803680981595092, - "grad_norm": 2.171309471130371, - "learning_rate": 1.8801422726424735e-06, - "loss": 0.0227, - "step": 946 - }, - { - "epoch": 5.809815950920245, - "grad_norm": 1.8116636276245117, - "learning_rate": 1.8754754894601252e-06, - "loss": 0.0157, - "step": 947 - }, - { - "epoch": 5.815950920245399, - "grad_norm": 3.1412570476531982, - "learning_rate": 1.870811026202482e-06, - "loss": 0.1093, - "step": 948 - }, - { - "epoch": 5.822085889570552, - "grad_norm": 2.3962290287017822, - "learning_rate": 1.8661489001966526e-06, - "loss": 0.021, - "step": 949 - }, - { - "epoch": 5.828220858895706, - "grad_norm": 4.169166564941406, - "learning_rate": 1.8614891287610621e-06, - "loss": 0.0663, - "step": 950 - }, - { - "epoch": 5.8343558282208585, - "grad_norm": 3.1181528568267822, - "learning_rate": 1.8568317292053894e-06, - "loss": 0.1008, - "step": 951 - }, - { - "epoch": 5.840490797546012, - "grad_norm": 3.5155029296875, - "learning_rate": 1.8521767188305023e-06, - "loss": 0.0451, - "step": 952 - }, - { - "epoch": 5.846625766871165, - "grad_norm": 2.975693702697754, - "learning_rate": 1.8475241149283957e-06, - "loss": 0.0561, - "step": 953 - }, - { - "epoch": 5.852760736196319, - "grad_norm": 2.1581289768218994, - "learning_rate": 1.842873934782122e-06, - "loss": 0.0265, - "step": 954 - }, - { - "epoch": 5.858895705521473, - "grad_norm": 2.6281228065490723, - "learning_rate": 1.8382261956657318e-06, - "loss": 0.1196, - "step": 955 - }, - { - "epoch": 5.865030674846626, - "grad_norm": 2.9569528102874756, - "learning_rate": 1.8335809148442074e-06, - "loss": 0.1356, - "step": 956 - }, - { - "epoch": 5.871165644171779, - "grad_norm": 2.450949192047119, - "learning_rate": 1.8289381095734005e-06, - "loss": 0.0444, - "step": 957 - }, - { - "epoch": 5.877300613496932, - "grad_norm": 2.1737027168273926, - "learning_rate": 1.8242977970999643e-06, - "loss": 0.0622, - "step": 958 - }, - { - "epoch": 5.883435582822086, - "grad_norm": 3.350647211074829, - "learning_rate": 1.8196599946612956e-06, - "loss": 0.0762, - "step": 959 - }, - { - "epoch": 5.889570552147239, - "grad_norm": 2.5031936168670654, - "learning_rate": 1.8150247194854642e-06, - "loss": 0.0207, - "step": 960 - }, - { - "epoch": 5.895705521472393, - "grad_norm": 3.7103707790374756, - "learning_rate": 1.8103919887911525e-06, - "loss": 0.1122, - "step": 961 - }, - { - "epoch": 5.901840490797546, - "grad_norm": 2.485322952270508, - "learning_rate": 1.8057618197875914e-06, - "loss": 0.0284, - "step": 962 - }, - { - "epoch": 5.9079754601226995, - "grad_norm": 1.903212547302246, - "learning_rate": 1.8011342296744961e-06, - "loss": 0.0239, - "step": 963 - }, - { - "epoch": 5.914110429447852, - "grad_norm": 3.015552520751953, - "learning_rate": 1.796509235642001e-06, - "loss": 0.0425, - "step": 964 - }, - { - "epoch": 5.920245398773006, - "grad_norm": 4.806198596954346, - "learning_rate": 1.7918868548705982e-06, - "loss": 0.2094, - "step": 965 - }, - { - "epoch": 5.92638036809816, - "grad_norm": 2.949596643447876, - "learning_rate": 1.7872671045310703e-06, - "loss": 0.0632, - "step": 966 - }, - { - "epoch": 5.932515337423313, - "grad_norm": 4.153099536895752, - "learning_rate": 1.782650001784431e-06, - "loss": 0.1411, - "step": 967 - }, - { - "epoch": 5.938650306748467, - "grad_norm": 3.4117565155029297, - "learning_rate": 1.7780355637818568e-06, - "loss": 0.0965, - "step": 968 - }, - { - "epoch": 5.9447852760736195, - "grad_norm": 2.533405303955078, - "learning_rate": 1.7734238076646277e-06, - "loss": 0.0568, - "step": 969 - }, - { - "epoch": 5.950920245398773, - "grad_norm": 2.3604726791381836, - "learning_rate": 1.7688147505640581e-06, - "loss": 0.0182, - "step": 970 - }, - { - "epoch": 5.957055214723926, - "grad_norm": 3.807424306869507, - "learning_rate": 1.7642084096014405e-06, - "loss": 0.0547, - "step": 971 - }, - { - "epoch": 5.96319018404908, - "grad_norm": 2.5735342502593994, - "learning_rate": 1.759604801887974e-06, - "loss": 0.0775, - "step": 972 - }, - { - "epoch": 5.969325153374233, - "grad_norm": 2.9217734336853027, - "learning_rate": 1.7550039445247069e-06, - "loss": 0.0541, - "step": 973 - }, - { - "epoch": 5.975460122699387, - "grad_norm": 2.793104410171509, - "learning_rate": 1.7504058546024694e-06, - "loss": 0.0257, - "step": 974 - }, - { - "epoch": 5.9815950920245395, - "grad_norm": 3.5610134601593018, - "learning_rate": 1.7458105492018114e-06, - "loss": 0.0767, - "step": 975 - }, - { - "epoch": 5.987730061349693, - "grad_norm": 2.0738015174865723, - "learning_rate": 1.7412180453929412e-06, - "loss": 0.025, - "step": 976 - }, - { - "epoch": 5.993865030674847, - "grad_norm": 2.1248421669006348, - "learning_rate": 1.736628360235657e-06, - "loss": 0.0183, - "step": 977 - }, - { - "epoch": 6.0, - "grad_norm": 2.901273727416992, - "learning_rate": 1.7320415107792893e-06, - "loss": 0.1369, - "step": 978 - } - ], - "logging_steps": 1, - "max_steps": 1630, - "num_input_tokens_seen": 0, - "num_train_epochs": 10, - "save_steps": 206, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 2.4215941729694515e+17, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/metallama3_8b/limo_filtered_correct/config.json b/metallama3_8b/limo_filtered_correct/config.json deleted file mode 100644 index ec5612543540085e09eed37e81b17ae51d1a6973..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 128000, - "eos_token_id": 128009, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 8192, - "mlp_bias": false, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 500000.0, - "tie_word_embeddings": false, - "torch_dtype": "float32", - "transformers_version": "4.55.0", - "use_cache": false, - "vocab_size": 128256 -} diff --git a/metallama3_8b/limo_filtered_correct/generation_config.json b/metallama3_8b/limo_filtered_correct/generation_config.json deleted file mode 100644 index f53ccb516e57388491adda6b9950bcfa872e93ae..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/generation_config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 128000, - "eos_token_id": 128009, - "transformers_version": "4.55.0", - "use_cache": false -} diff --git a/metallama3_8b/limo_filtered_correct/model-00001-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/model-00001-of-00007.safetensors deleted file mode 100644 index ef6a28c1e9e227311a947d238a462b8f1aea5688..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/model-00001-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:34d9387733813d5a6f8cf9e9e1500319b6f18f7239cd826153c357461c42879f -size 4886466168 diff --git a/metallama3_8b/limo_filtered_correct/model-00002-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/model-00002-of-00007.safetensors deleted file mode 100644 index 5e86287f4f41b8de773a003247237b94722cb296..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/model-00002-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:69e312ac446f71fe286935413a92e19b0f116e972bb4ecca7ba373d35234e258 -size 4832007448 diff --git a/metallama3_8b/limo_filtered_correct/model-00003-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/model-00003-of-00007.safetensors deleted file mode 100644 index d26dcf2f537cfdd35b71e7e5f3e5f77ed2ae968a..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/model-00003-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8397232e2e465a66601fe6839f1b4887250275646947034ff16ec493807ab154 -size 4999813112 diff --git a/metallama3_8b/limo_filtered_correct/model-00004-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/model-00004-of-00007.safetensors deleted file mode 100644 index 327b5dccda8ad855f23f2ac1355d727fd228d9d9..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/model-00004-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:474e541c2f4198dbf42cb93fb0a0cdc9e27156ed753bacd7af9c6fca13cb52b4 -size 4999813128 diff --git a/metallama3_8b/limo_filtered_correct/model-00005-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/model-00005-of-00007.safetensors deleted file mode 100644 index 76ac6d2a48b10e807ac6e61f11e719484922b8f3..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/model-00005-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f7bb3488785efc6e3418aead91bb86553e6fcfc4d713e959fdd2d62433567995 -size 4832007496 diff --git a/metallama3_8b/limo_filtered_correct/model-00006-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/model-00006-of-00007.safetensors deleted file mode 100644 index 1d8c9f53c70a09307b6e0d46bb887ffdd6af8bb2..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/model-00006-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:811f0ee64a0ba18e4205bfa0e9846dd8210c4e626fb40e748bce749337209456 -size 4999813120 diff --git a/metallama3_8b/limo_filtered_correct/model-00007-of-00007.safetensors b/metallama3_8b/limo_filtered_correct/model-00007-of-00007.safetensors deleted file mode 100644 index b86d062c36a8e520ca6dbb5e05275e007796a4f5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/model-00007-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cfff6d639879b21feabc96e90bb89213b0247c933b0769c3810d4ffeb63ff91f -size 2571158184 diff --git a/metallama3_8b/limo_filtered_correct/model.safetensors.index.json b/metallama3_8b/limo_filtered_correct/model.safetensors.index.json deleted file mode 100644 index 30d31d54f352f0c71ad48745af612a088822fa48..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/model.safetensors.index.json +++ /dev/null @@ -1,299 +0,0 @@ -{ - "metadata": { - "total_parameters": 2007565312, - "total_size": 32121044992 - }, - "weight_map": { - "lm_head.weight": "model-00007-of-00007.safetensors", - "model.embed_tokens.weight": "model-00001-of-00007.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.norm.weight": "model-00007-of-00007.safetensors" - } -} diff --git a/metallama3_8b/limo_filtered_correct/special_tokens_map.json b/metallama3_8b/limo_filtered_correct/special_tokens_map.json deleted file mode 100644 index 14daf4588e61b4e4983af0fccaba4d5500c0977c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/special_tokens_map.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "additional_special_tokens": [ - { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } - ], - "bos_token": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": "<|eot_id|>" -} diff --git a/metallama3_8b/limo_filtered_correct/tokenizer.json b/metallama3_8b/limo_filtered_correct/tokenizer.json deleted file mode 100644 index 172311123ab62378f1f6d90f3068a676b7d939ed..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c1dcab308e7cf5970ea38815e0a62887d705c5b436f869ca27a5dcdd40c36a6 -size 17210148 diff --git a/metallama3_8b/limo_filtered_correct/tokenizer_config.json b/metallama3_8b/limo_filtered_correct/tokenizer_config.json deleted file mode 100644 index 6739fcd129e717b71b64001dcb25a03c143d66f5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/tokenizer_config.json +++ /dev/null @@ -1,2076 +0,0 @@ -{ - "added_tokens_decoder": { - "128000": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128001": { - "content": "<|end_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128002": { - "content": "<|reserved_special_token_0|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128003": { - "content": "<|reserved_special_token_1|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128004": { - "content": "<|reserved_special_token_2|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128005": { - "content": "<|reserved_special_token_3|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128006": { - "content": "<|start_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128007": { - "content": "<|end_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128008": { - "content": "<|reserved_special_token_4|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128009": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128010": { - "content": "<|reserved_special_token_5|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128011": { - "content": "<|reserved_special_token_6|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128012": { - "content": "<|reserved_special_token_7|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128013": { - "content": "<|reserved_special_token_8|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128014": { - "content": "<|reserved_special_token_9|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128015": { - "content": "<|reserved_special_token_10|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128016": { - "content": "<|reserved_special_token_11|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128017": { - "content": "<|reserved_special_token_12|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128018": { - "content": "<|reserved_special_token_13|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128019": { - "content": "<|reserved_special_token_14|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128020": { - "content": "<|reserved_special_token_15|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128021": { - "content": "<|reserved_special_token_16|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128022": { - "content": "<|reserved_special_token_17|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128023": { - "content": "<|reserved_special_token_18|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128024": { - "content": "<|reserved_special_token_19|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128025": { - "content": "<|reserved_special_token_20|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128026": { - "content": "<|reserved_special_token_21|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128027": { - "content": "<|reserved_special_token_22|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128028": { - "content": "<|reserved_special_token_23|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128029": { - "content": "<|reserved_special_token_24|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128030": { - "content": "<|reserved_special_token_25|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128031": { - "content": "<|reserved_special_token_26|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128032": { - "content": "<|reserved_special_token_27|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128033": { - "content": "<|reserved_special_token_28|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128034": { - "content": "<|reserved_special_token_29|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128035": { - "content": "<|reserved_special_token_30|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128036": { - "content": "<|reserved_special_token_31|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128037": { - "content": "<|reserved_special_token_32|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128038": { - "content": "<|reserved_special_token_33|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128039": { - "content": "<|reserved_special_token_34|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128040": { - "content": "<|reserved_special_token_35|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128041": { - "content": "<|reserved_special_token_36|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128042": { - "content": "<|reserved_special_token_37|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128043": { - "content": "<|reserved_special_token_38|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128044": { - "content": "<|reserved_special_token_39|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128045": { - "content": "<|reserved_special_token_40|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128046": { - "content": "<|reserved_special_token_41|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128047": { - "content": "<|reserved_special_token_42|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128048": { - "content": "<|reserved_special_token_43|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128049": { - "content": "<|reserved_special_token_44|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128050": { - "content": "<|reserved_special_token_45|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128051": { - "content": "<|reserved_special_token_46|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128052": { - "content": "<|reserved_special_token_47|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128053": { - "content": "<|reserved_special_token_48|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128054": { - "content": "<|reserved_special_token_49|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128055": { - "content": "<|reserved_special_token_50|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128056": { - "content": "<|reserved_special_token_51|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128057": { - "content": "<|reserved_special_token_52|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128058": { - "content": "<|reserved_special_token_53|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128059": { - "content": "<|reserved_special_token_54|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128060": { - "content": "<|reserved_special_token_55|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128061": { - "content": "<|reserved_special_token_56|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128062": { - "content": "<|reserved_special_token_57|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128063": { - "content": "<|reserved_special_token_58|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128064": { - "content": "<|reserved_special_token_59|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128065": { - "content": "<|reserved_special_token_60|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128066": { - "content": "<|reserved_special_token_61|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128067": { - "content": "<|reserved_special_token_62|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128068": { - "content": "<|reserved_special_token_63|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128069": { - "content": "<|reserved_special_token_64|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128070": { - "content": "<|reserved_special_token_65|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128071": { - "content": "<|reserved_special_token_66|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128072": { - "content": "<|reserved_special_token_67|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128073": { - "content": "<|reserved_special_token_68|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128074": { - "content": "<|reserved_special_token_69|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128075": { - "content": "<|reserved_special_token_70|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128076": { - "content": "<|reserved_special_token_71|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128077": { - "content": "<|reserved_special_token_72|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128078": { - "content": "<|reserved_special_token_73|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128079": { - "content": "<|reserved_special_token_74|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128080": { - "content": "<|reserved_special_token_75|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128081": { - "content": "<|reserved_special_token_76|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128082": { - "content": "<|reserved_special_token_77|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128083": { - "content": "<|reserved_special_token_78|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128084": { - "content": "<|reserved_special_token_79|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128085": { - "content": "<|reserved_special_token_80|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128086": { - "content": "<|reserved_special_token_81|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128087": { - "content": "<|reserved_special_token_82|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128088": { - "content": "<|reserved_special_token_83|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128089": { - "content": "<|reserved_special_token_84|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128090": { - "content": "<|reserved_special_token_85|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128091": { - "content": "<|reserved_special_token_86|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128092": { - "content": "<|reserved_special_token_87|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128093": { - "content": "<|reserved_special_token_88|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128094": { - "content": "<|reserved_special_token_89|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128095": { - "content": "<|reserved_special_token_90|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128096": { - "content": "<|reserved_special_token_91|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128097": { - "content": "<|reserved_special_token_92|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128098": { - "content": "<|reserved_special_token_93|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128099": { - "content": "<|reserved_special_token_94|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128100": { - "content": "<|reserved_special_token_95|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128101": { - "content": "<|reserved_special_token_96|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128102": { - "content": "<|reserved_special_token_97|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128103": { - "content": "<|reserved_special_token_98|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128104": { - "content": "<|reserved_special_token_99|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128105": { - "content": "<|reserved_special_token_100|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128106": { - "content": "<|reserved_special_token_101|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128107": { - "content": "<|reserved_special_token_102|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128108": { - "content": "<|reserved_special_token_103|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128109": { - "content": "<|reserved_special_token_104|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128110": { - "content": "<|reserved_special_token_105|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128111": { - "content": "<|reserved_special_token_106|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128112": { - "content": "<|reserved_special_token_107|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128113": { - "content": "<|reserved_special_token_108|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128114": { - "content": "<|reserved_special_token_109|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128115": { - "content": "<|reserved_special_token_110|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128116": { - "content": "<|reserved_special_token_111|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128117": { - "content": "<|reserved_special_token_112|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128118": { - "content": "<|reserved_special_token_113|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128119": { - "content": "<|reserved_special_token_114|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128120": { - "content": "<|reserved_special_token_115|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128121": { - "content": "<|reserved_special_token_116|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128122": { - "content": "<|reserved_special_token_117|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128123": { - "content": "<|reserved_special_token_118|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128124": { - "content": "<|reserved_special_token_119|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128125": { - "content": "<|reserved_special_token_120|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128126": { - "content": "<|reserved_special_token_121|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128127": { - "content": "<|reserved_special_token_122|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128128": { - "content": "<|reserved_special_token_123|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128129": { - "content": "<|reserved_special_token_124|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128130": { - "content": "<|reserved_special_token_125|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128131": { - "content": "<|reserved_special_token_126|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128132": { - "content": "<|reserved_special_token_127|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128133": { - "content": "<|reserved_special_token_128|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128134": { - "content": "<|reserved_special_token_129|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128135": { - "content": "<|reserved_special_token_130|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128136": { - "content": "<|reserved_special_token_131|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128137": { - "content": "<|reserved_special_token_132|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128138": { - "content": "<|reserved_special_token_133|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128139": { - "content": "<|reserved_special_token_134|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128140": { - "content": "<|reserved_special_token_135|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128141": { - "content": "<|reserved_special_token_136|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128142": { - "content": "<|reserved_special_token_137|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128143": { - "content": "<|reserved_special_token_138|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128144": { - "content": "<|reserved_special_token_139|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128145": { - "content": "<|reserved_special_token_140|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128146": { - "content": "<|reserved_special_token_141|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128147": { - "content": "<|reserved_special_token_142|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128148": { - "content": "<|reserved_special_token_143|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128149": { - "content": "<|reserved_special_token_144|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128150": { - "content": "<|reserved_special_token_145|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128151": { - "content": "<|reserved_special_token_146|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128152": { - "content": "<|reserved_special_token_147|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128153": { - "content": "<|reserved_special_token_148|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128154": { - "content": "<|reserved_special_token_149|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128155": { - "content": "<|reserved_special_token_150|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128156": { - "content": "<|reserved_special_token_151|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128157": { - "content": "<|reserved_special_token_152|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128158": { - "content": "<|reserved_special_token_153|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128159": { - "content": "<|reserved_special_token_154|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128160": { - "content": "<|reserved_special_token_155|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128161": { - "content": "<|reserved_special_token_156|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128162": { - "content": "<|reserved_special_token_157|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128163": { - "content": "<|reserved_special_token_158|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128164": { - "content": "<|reserved_special_token_159|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128165": { - "content": "<|reserved_special_token_160|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128166": { - "content": "<|reserved_special_token_161|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128167": { - "content": "<|reserved_special_token_162|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128168": { - "content": "<|reserved_special_token_163|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128169": { - "content": "<|reserved_special_token_164|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128170": { - "content": "<|reserved_special_token_165|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128171": { - "content": "<|reserved_special_token_166|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128172": { - "content": "<|reserved_special_token_167|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128173": { - "content": "<|reserved_special_token_168|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128174": { - "content": "<|reserved_special_token_169|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128175": { - "content": "<|reserved_special_token_170|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128176": { - "content": "<|reserved_special_token_171|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128177": { - "content": "<|reserved_special_token_172|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128178": { - "content": "<|reserved_special_token_173|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128179": { - "content": "<|reserved_special_token_174|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128180": { - "content": "<|reserved_special_token_175|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128181": { - "content": "<|reserved_special_token_176|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128182": { - "content": "<|reserved_special_token_177|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128183": { - "content": "<|reserved_special_token_178|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128184": { - "content": "<|reserved_special_token_179|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128185": { - "content": "<|reserved_special_token_180|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128186": { - "content": "<|reserved_special_token_181|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128187": { - "content": "<|reserved_special_token_182|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128188": { - "content": "<|reserved_special_token_183|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128189": { - "content": "<|reserved_special_token_184|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128190": { - "content": "<|reserved_special_token_185|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128191": { - "content": "<|reserved_special_token_186|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128192": { - "content": "<|reserved_special_token_187|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128193": { - "content": "<|reserved_special_token_188|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128194": { - "content": "<|reserved_special_token_189|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128195": { - "content": "<|reserved_special_token_190|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128196": { - "content": "<|reserved_special_token_191|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128197": { - "content": "<|reserved_special_token_192|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128198": { - "content": "<|reserved_special_token_193|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128199": { - "content": "<|reserved_special_token_194|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128200": { - "content": "<|reserved_special_token_195|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128201": { - "content": "<|reserved_special_token_196|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128202": { - "content": "<|reserved_special_token_197|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128203": { - "content": "<|reserved_special_token_198|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128204": { - "content": "<|reserved_special_token_199|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128205": { - "content": "<|reserved_special_token_200|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128206": { - "content": "<|reserved_special_token_201|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128207": { - "content": "<|reserved_special_token_202|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128208": { - "content": "<|reserved_special_token_203|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128209": { - "content": "<|reserved_special_token_204|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128210": { - "content": "<|reserved_special_token_205|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128211": { - "content": "<|reserved_special_token_206|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128212": { - "content": "<|reserved_special_token_207|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128213": { - "content": "<|reserved_special_token_208|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128214": { - "content": "<|reserved_special_token_209|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128215": { - "content": "<|reserved_special_token_210|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128216": { - "content": "<|reserved_special_token_211|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128217": { - "content": "<|reserved_special_token_212|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128218": { - "content": "<|reserved_special_token_213|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128219": { - "content": "<|reserved_special_token_214|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128220": { - "content": "<|reserved_special_token_215|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128221": { - "content": "<|reserved_special_token_216|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128222": { - "content": "<|reserved_special_token_217|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128223": { - "content": "<|reserved_special_token_218|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128224": { - "content": "<|reserved_special_token_219|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128225": { - "content": "<|reserved_special_token_220|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128226": { - "content": "<|reserved_special_token_221|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128227": { - "content": "<|reserved_special_token_222|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128228": { - "content": "<|reserved_special_token_223|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128229": { - "content": "<|reserved_special_token_224|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128230": { - "content": "<|reserved_special_token_225|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128231": { - "content": "<|reserved_special_token_226|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128232": { - "content": "<|reserved_special_token_227|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128233": { - "content": "<|reserved_special_token_228|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128234": { - "content": "<|reserved_special_token_229|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128235": { - "content": "<|reserved_special_token_230|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128236": { - "content": "<|reserved_special_token_231|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128237": { - "content": "<|reserved_special_token_232|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128238": { - "content": "<|reserved_special_token_233|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128239": { - "content": "<|reserved_special_token_234|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128240": { - "content": "<|reserved_special_token_235|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128241": { - "content": "<|reserved_special_token_236|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128242": { - "content": "<|reserved_special_token_237|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128243": { - "content": "<|reserved_special_token_238|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128244": { - "content": "<|reserved_special_token_239|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128245": { - "content": "<|reserved_special_token_240|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128246": { - "content": "<|reserved_special_token_241|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128247": { - "content": "<|reserved_special_token_242|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128248": { - "content": "<|reserved_special_token_243|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128249": { - "content": "<|reserved_special_token_244|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128250": { - "content": "<|reserved_special_token_245|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128251": { - "content": "<|reserved_special_token_246|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128252": { - "content": "<|reserved_special_token_247|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128253": { - "content": "<|reserved_special_token_248|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128254": { - "content": "<|reserved_special_token_249|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128255": { - "content": "<|reserved_special_token_250|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128256": { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - } - }, - "additional_special_tokens": [ - "<|eom_id|>" - ], - "bos_token": "<|begin_of_text|>", - "clean_up_tokenization_spaces": true, - "eos_token": "<|eot_id|>", - "extra_special_tokens": {}, - "model_input_names": [ - "input_ids", - "attention_mask" - ], - "model_max_length": 1000000000000000019884624838656, - "pad_token": "<|eot_id|>", - "padding_side": "right", - "split_special_tokens": false, - "tokenizer_class": "PreTrainedTokenizerFast" -} diff --git a/metallama3_8b/limo_filtered_correct/train_results.json b/metallama3_8b/limo_filtered_correct/train_results.json deleted file mode 100644 index e1a6b96221a02b32dc2712f312419265a6f74078..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/train_results.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "epoch": 10.0, - "total_flos": 4.036761107572982e+17, - "train_loss": 0.23243108529037226, - "train_runtime": 7070.6548, - "train_samples_per_second": 0.921, - "train_steps_per_second": 0.231 -} \ No newline at end of file diff --git a/metallama3_8b/limo_filtered_correct/trainer_log.jsonl b/metallama3_8b/limo_filtered_correct/trainer_log.jsonl deleted file mode 100644 index fa78ecd8784f422f84547b207567732ae9c96053..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/trainer_log.jsonl +++ /dev/null @@ -1,1631 +0,0 @@ -{"current_steps": 1, "total_steps": 1630, "loss": 0.9606, "lr": 5e-06, "epoch": 0.006134969325153374, "percentage": 0.06, "elapsed_time": "0:00:03", "remaining_time": "1:22:45"} -{"current_steps": 2, "total_steps": 1630, "loss": 0.8609, "lr": 4.999995356617983e-06, "epoch": 0.012269938650306749, "percentage": 0.12, "elapsed_time": "0:00:04", "remaining_time": "0:55:42"} -{"current_steps": 3, "total_steps": 1630, "loss": 1.3543, "lr": 4.999981426489179e-06, "epoch": 0.018404907975460124, "percentage": 0.18, "elapsed_time": "0:00:06", "remaining_time": "0:58:14"} -{"current_steps": 4, "total_steps": 1630, "loss": 0.787, "lr": 4.999958209665336e-06, "epoch": 0.024539877300613498, "percentage": 0.25, "elapsed_time": "0:00:07", "remaining_time": "0:53:21"} -{"current_steps": 5, "total_steps": 1630, "loss": 1.7786, "lr": 4.999925706232695e-06, "epoch": 0.03067484662576687, "percentage": 0.31, "elapsed_time": "0:00:13", "remaining_time": "1:15:19"} -{"current_steps": 6, "total_steps": 1630, "loss": 1.2175, "lr": 4.999883916312e-06, "epoch": 0.03680981595092025, "percentage": 0.37, "elapsed_time": "0:00:17", "remaining_time": "1:17:16"} -{"current_steps": 7, "total_steps": 1630, "loss": 0.8998, "lr": 4.9998328400584864e-06, "epoch": 0.04294478527607362, "percentage": 0.43, "elapsed_time": "0:00:20", "remaining_time": "1:17:45"} -{"current_steps": 8, "total_steps": 1630, "loss": 0.8419, "lr": 4.999772477661888e-06, "epoch": 0.049079754601226995, "percentage": 0.49, "elapsed_time": "0:00:21", "remaining_time": "1:11:22"} -{"current_steps": 9, "total_steps": 1630, "loss": 1.7948, "lr": 4.999702829346432e-06, "epoch": 0.05521472392638037, "percentage": 0.55, "elapsed_time": "0:00:27", "remaining_time": "1:21:08"} -{"current_steps": 10, "total_steps": 1630, "loss": 1.0461, "lr": 4.999623895370843e-06, "epoch": 0.06134969325153374, "percentage": 0.61, "elapsed_time": "0:00:28", "remaining_time": "1:17:52"} -{"current_steps": 11, "total_steps": 1630, "loss": 1.0, "lr": 4.999535676028338e-06, "epoch": 0.06748466257668712, "percentage": 0.67, "elapsed_time": "0:00:32", "remaining_time": "1:20:04"} -{"current_steps": 12, "total_steps": 1630, "loss": 0.9475, "lr": 4.999438171646624e-06, "epoch": 0.0736196319018405, "percentage": 0.74, "elapsed_time": "0:00:33", "remaining_time": "1:16:03"} -{"current_steps": 13, "total_steps": 1630, "loss": 0.8654, "lr": 4.999331382587901e-06, "epoch": 0.07975460122699386, "percentage": 0.8, "elapsed_time": "0:00:35", "remaining_time": "1:13:44"} -{"current_steps": 14, "total_steps": 1630, "loss": 1.2042, "lr": 4.999215309248861e-06, "epoch": 0.08588957055214724, "percentage": 0.86, "elapsed_time": "0:00:39", "remaining_time": "1:16:11"} -{"current_steps": 15, "total_steps": 1630, "loss": 0.8846, "lr": 4.999089952060681e-06, "epoch": 0.09202453987730061, "percentage": 0.92, "elapsed_time": "0:00:41", "remaining_time": "1:14:45"} -{"current_steps": 16, "total_steps": 1630, "loss": 0.8805, "lr": 4.998955311489025e-06, "epoch": 0.09815950920245399, "percentage": 0.98, "elapsed_time": "0:00:43", "remaining_time": "1:13:55"} -{"current_steps": 17, "total_steps": 1630, "loss": 1.5882, "lr": 4.998811388034046e-06, "epoch": 0.10429447852760736, "percentage": 1.04, "elapsed_time": "0:00:47", "remaining_time": "1:14:59"} -{"current_steps": 18, "total_steps": 1630, "loss": 0.9222, "lr": 4.9986581822303746e-06, "epoch": 0.11042944785276074, "percentage": 1.1, "elapsed_time": "0:00:51", "remaining_time": "1:16:33"} -{"current_steps": 19, "total_steps": 1630, "loss": 1.4088, "lr": 4.998495694647127e-06, "epoch": 0.1165644171779141, "percentage": 1.17, "elapsed_time": "0:00:54", "remaining_time": "1:17:24"} -{"current_steps": 20, "total_steps": 1630, "loss": 1.454, "lr": 4.998323925887895e-06, "epoch": 0.12269938650306748, "percentage": 1.23, "elapsed_time": "0:00:56", "remaining_time": "1:15:57"} -{"current_steps": 21, "total_steps": 1630, "loss": 0.6335, "lr": 4.998142876590749e-06, "epoch": 0.12883435582822086, "percentage": 1.29, "elapsed_time": "0:00:57", "remaining_time": "1:13:38"} -{"current_steps": 22, "total_steps": 1630, "loss": 0.6725, "lr": 4.997952547428236e-06, "epoch": 0.13496932515337423, "percentage": 1.35, "elapsed_time": "0:01:01", "remaining_time": "1:14:35"} -{"current_steps": 23, "total_steps": 1630, "loss": 0.7814, "lr": 4.997752939107372e-06, "epoch": 0.1411042944785276, "percentage": 1.41, "elapsed_time": "0:01:05", "remaining_time": "1:15:49"} -{"current_steps": 24, "total_steps": 1630, "loss": 0.9683, "lr": 4.997544052369642e-06, "epoch": 0.147239263803681, "percentage": 1.47, "elapsed_time": "0:01:08", "remaining_time": "1:16:29"} -{"current_steps": 25, "total_steps": 1630, "loss": 0.9414, "lr": 4.997325887990999e-06, "epoch": 0.15337423312883436, "percentage": 1.53, "elapsed_time": "0:01:11", "remaining_time": "1:16:28"} -{"current_steps": 26, "total_steps": 1630, "loss": 0.8894, "lr": 4.997098446781861e-06, "epoch": 0.15950920245398773, "percentage": 1.6, "elapsed_time": "0:01:15", "remaining_time": "1:17:50"} -{"current_steps": 27, "total_steps": 1630, "loss": 0.7708, "lr": 4.996861729587103e-06, "epoch": 0.1656441717791411, "percentage": 1.66, "elapsed_time": "0:01:17", "remaining_time": "1:16:46"} -{"current_steps": 28, "total_steps": 1630, "loss": 0.6995, "lr": 4.996615737286061e-06, "epoch": 0.17177914110429449, "percentage": 1.72, "elapsed_time": "0:01:21", "remaining_time": "1:17:19"} -{"current_steps": 29, "total_steps": 1630, "loss": 1.2563, "lr": 4.996360470792524e-06, "epoch": 0.17791411042944785, "percentage": 1.78, "elapsed_time": "0:01:24", "remaining_time": "1:17:43"} -{"current_steps": 30, "total_steps": 1630, "loss": 0.7266, "lr": 4.996095931054731e-06, "epoch": 0.18404907975460122, "percentage": 1.84, "elapsed_time": "0:01:25", "remaining_time": "1:16:07"} -{"current_steps": 31, "total_steps": 1630, "loss": 0.9227, "lr": 4.9958221190553705e-06, "epoch": 0.1901840490797546, "percentage": 1.9, "elapsed_time": "0:01:31", "remaining_time": "1:18:27"} -{"current_steps": 32, "total_steps": 1630, "loss": 0.701, "lr": 4.995539035811572e-06, "epoch": 0.19631901840490798, "percentage": 1.96, "elapsed_time": "0:01:32", "remaining_time": "1:17:05"} -{"current_steps": 33, "total_steps": 1630, "loss": 0.6491, "lr": 4.9952466823749076e-06, "epoch": 0.20245398773006135, "percentage": 2.02, "elapsed_time": "0:01:34", "remaining_time": "1:16:10"} -{"current_steps": 34, "total_steps": 1630, "loss": 0.8029, "lr": 4.9949450598313835e-06, "epoch": 0.2085889570552147, "percentage": 2.09, "elapsed_time": "0:01:36", "remaining_time": "1:15:12"} -{"current_steps": 35, "total_steps": 1630, "loss": 0.8785, "lr": 4.994634169301439e-06, "epoch": 0.2147239263803681, "percentage": 2.15, "elapsed_time": "0:01:39", "remaining_time": "1:15:36"} -{"current_steps": 36, "total_steps": 1630, "loss": 1.034, "lr": 4.994314011939941e-06, "epoch": 0.22085889570552147, "percentage": 2.21, "elapsed_time": "0:01:41", "remaining_time": "1:15:02"} -{"current_steps": 37, "total_steps": 1630, "loss": 0.8557, "lr": 4.99398458893618e-06, "epoch": 0.22699386503067484, "percentage": 2.27, "elapsed_time": "0:01:43", "remaining_time": "1:14:15"} -{"current_steps": 38, "total_steps": 1630, "loss": 1.1904, "lr": 4.993645901513865e-06, "epoch": 0.2331288343558282, "percentage": 2.33, "elapsed_time": "0:01:48", "remaining_time": "1:15:59"} -{"current_steps": 39, "total_steps": 1630, "loss": 0.7668, "lr": 4.993297950931121e-06, "epoch": 0.2392638036809816, "percentage": 2.39, "elapsed_time": "0:01:50", "remaining_time": "1:15:13"} -{"current_steps": 40, "total_steps": 1630, "loss": 0.8812, "lr": 4.9929407384804806e-06, "epoch": 0.24539877300613497, "percentage": 2.45, "elapsed_time": "0:01:52", "remaining_time": "1:14:47"} -{"current_steps": 41, "total_steps": 1630, "loss": 0.8878, "lr": 4.992574265488883e-06, "epoch": 0.25153374233128833, "percentage": 2.52, "elapsed_time": "0:01:57", "remaining_time": "1:15:42"} -{"current_steps": 42, "total_steps": 1630, "loss": 0.7251, "lr": 4.9921985333176694e-06, "epoch": 0.25766871165644173, "percentage": 2.58, "elapsed_time": "0:01:59", "remaining_time": "1:15:17"} -{"current_steps": 43, "total_steps": 1630, "loss": 0.6638, "lr": 4.991813543362572e-06, "epoch": 0.26380368098159507, "percentage": 2.64, "elapsed_time": "0:02:01", "remaining_time": "1:14:30"} -{"current_steps": 44, "total_steps": 1630, "loss": 1.0725, "lr": 4.991419297053716e-06, "epoch": 0.26993865030674846, "percentage": 2.7, "elapsed_time": "0:02:07", "remaining_time": "1:16:26"} -{"current_steps": 45, "total_steps": 1630, "loss": 0.7238, "lr": 4.991015795855611e-06, "epoch": 0.27607361963190186, "percentage": 2.76, "elapsed_time": "0:02:09", "remaining_time": "1:15:54"} -{"current_steps": 46, "total_steps": 1630, "loss": 0.9188, "lr": 4.990603041267144e-06, "epoch": 0.2822085889570552, "percentage": 2.82, "elapsed_time": "0:02:13", "remaining_time": "1:16:53"} -{"current_steps": 47, "total_steps": 1630, "loss": 0.6158, "lr": 4.990181034821578e-06, "epoch": 0.2883435582822086, "percentage": 2.88, "elapsed_time": "0:02:17", "remaining_time": "1:17:00"} -{"current_steps": 48, "total_steps": 1630, "loss": 0.7165, "lr": 4.98974977808654e-06, "epoch": 0.294478527607362, "percentage": 2.94, "elapsed_time": "0:02:19", "remaining_time": "1:16:36"} -{"current_steps": 49, "total_steps": 1630, "loss": 0.7277, "lr": 4.989309272664026e-06, "epoch": 0.3006134969325153, "percentage": 3.01, "elapsed_time": "0:02:21", "remaining_time": "1:16:06"} -{"current_steps": 50, "total_steps": 1630, "loss": 0.9793, "lr": 4.988859520190381e-06, "epoch": 0.3067484662576687, "percentage": 3.07, "elapsed_time": "0:02:22", "remaining_time": "1:15:12"} -{"current_steps": 51, "total_steps": 1630, "loss": 0.8966, "lr": 4.988400522336304e-06, "epoch": 0.3128834355828221, "percentage": 3.13, "elapsed_time": "0:02:24", "remaining_time": "1:14:23"} -{"current_steps": 52, "total_steps": 1630, "loss": 0.8191, "lr": 4.9879322808068365e-06, "epoch": 0.31901840490797545, "percentage": 3.19, "elapsed_time": "0:02:26", "remaining_time": "1:13:58"} -{"current_steps": 53, "total_steps": 1630, "loss": 0.6308, "lr": 4.987454797341358e-06, "epoch": 0.32515337423312884, "percentage": 3.25, "elapsed_time": "0:02:29", "remaining_time": "1:14:12"} -{"current_steps": 54, "total_steps": 1630, "loss": 0.8226, "lr": 4.98696807371358e-06, "epoch": 0.3312883435582822, "percentage": 3.31, "elapsed_time": "0:02:30", "remaining_time": "1:13:25"} -{"current_steps": 55, "total_steps": 1630, "loss": 0.9184, "lr": 4.986472111731536e-06, "epoch": 0.3374233128834356, "percentage": 3.37, "elapsed_time": "0:02:34", "remaining_time": "1:13:53"} -{"current_steps": 56, "total_steps": 1630, "loss": 0.6593, "lr": 4.985966913237581e-06, "epoch": 0.34355828220858897, "percentage": 3.44, "elapsed_time": "0:02:36", "remaining_time": "1:13:19"} -{"current_steps": 57, "total_steps": 1630, "loss": 0.6994, "lr": 4.985452480108376e-06, "epoch": 0.3496932515337423, "percentage": 3.5, "elapsed_time": "0:02:38", "remaining_time": "1:13:04"} -{"current_steps": 58, "total_steps": 1630, "loss": 1.1374, "lr": 4.984928814254889e-06, "epoch": 0.3558282208588957, "percentage": 3.56, "elapsed_time": "0:02:46", "remaining_time": "1:15:02"} -{"current_steps": 59, "total_steps": 1630, "loss": 0.8097, "lr": 4.984395917622387e-06, "epoch": 0.3619631901840491, "percentage": 3.62, "elapsed_time": "0:02:48", "remaining_time": "1:14:49"} -{"current_steps": 60, "total_steps": 1630, "loss": 0.8511, "lr": 4.9838537921904206e-06, "epoch": 0.36809815950920244, "percentage": 3.68, "elapsed_time": "0:02:51", "remaining_time": "1:14:49"} -{"current_steps": 61, "total_steps": 1630, "loss": 0.898, "lr": 4.9833024399728295e-06, "epoch": 0.37423312883435583, "percentage": 3.74, "elapsed_time": "0:02:53", "remaining_time": "1:14:29"} -{"current_steps": 62, "total_steps": 1630, "loss": 0.6671, "lr": 4.982741863017722e-06, "epoch": 0.3803680981595092, "percentage": 3.8, "elapsed_time": "0:02:56", "remaining_time": "1:14:11"} -{"current_steps": 63, "total_steps": 1630, "loss": 1.0559, "lr": 4.982172063407479e-06, "epoch": 0.38650306748466257, "percentage": 3.87, "elapsed_time": "0:03:01", "remaining_time": "1:15:16"} -{"current_steps": 64, "total_steps": 1630, "loss": 0.6663, "lr": 4.9815930432587365e-06, "epoch": 0.39263803680981596, "percentage": 3.93, "elapsed_time": "0:03:02", "remaining_time": "1:14:35"} -{"current_steps": 65, "total_steps": 1630, "loss": 0.6895, "lr": 4.981004804722384e-06, "epoch": 0.3987730061349693, "percentage": 3.99, "elapsed_time": "0:03:05", "remaining_time": "1:14:37"} -{"current_steps": 66, "total_steps": 1630, "loss": 0.7982, "lr": 4.980407349983556e-06, "epoch": 0.4049079754601227, "percentage": 4.05, "elapsed_time": "0:03:10", "remaining_time": "1:15:09"} -{"current_steps": 67, "total_steps": 1630, "loss": 0.6808, "lr": 4.979800681261619e-06, "epoch": 0.4110429447852761, "percentage": 4.11, "elapsed_time": "0:03:12", "remaining_time": "1:14:57"} -{"current_steps": 68, "total_steps": 1630, "loss": 0.567, "lr": 4.9791848008101705e-06, "epoch": 0.4171779141104294, "percentage": 4.17, "elapsed_time": "0:03:14", "remaining_time": "1:14:32"} -{"current_steps": 69, "total_steps": 1630, "loss": 0.7745, "lr": 4.978559710917024e-06, "epoch": 0.4233128834355828, "percentage": 4.23, "elapsed_time": "0:03:18", "remaining_time": "1:14:53"} -{"current_steps": 70, "total_steps": 1630, "loss": 0.9815, "lr": 4.977925413904205e-06, "epoch": 0.4294478527607362, "percentage": 4.29, "elapsed_time": "0:03:19", "remaining_time": "1:14:08"} -{"current_steps": 71, "total_steps": 1630, "loss": 1.164, "lr": 4.9772819121279395e-06, "epoch": 0.43558282208588955, "percentage": 4.36, "elapsed_time": "0:03:26", "remaining_time": "1:15:36"} -{"current_steps": 72, "total_steps": 1630, "loss": 0.7587, "lr": 4.976629207978648e-06, "epoch": 0.44171779141104295, "percentage": 4.42, "elapsed_time": "0:03:28", "remaining_time": "1:15:05"} -{"current_steps": 73, "total_steps": 1630, "loss": 0.58, "lr": 4.975967303880933e-06, "epoch": 0.44785276073619634, "percentage": 4.48, "elapsed_time": "0:03:29", "remaining_time": "1:14:22"} -{"current_steps": 74, "total_steps": 1630, "loss": 0.7253, "lr": 4.975296202293575e-06, "epoch": 0.4539877300613497, "percentage": 4.54, "elapsed_time": "0:03:32", "remaining_time": "1:14:21"} -{"current_steps": 75, "total_steps": 1630, "loss": 0.7352, "lr": 4.974615905709518e-06, "epoch": 0.4601226993865031, "percentage": 4.6, "elapsed_time": "0:03:34", "remaining_time": "1:14:05"} -{"current_steps": 76, "total_steps": 1630, "loss": 1.0643, "lr": 4.973926416655863e-06, "epoch": 0.4662576687116564, "percentage": 4.66, "elapsed_time": "0:03:37", "remaining_time": "1:14:11"} -{"current_steps": 77, "total_steps": 1630, "loss": 0.6699, "lr": 4.973227737693858e-06, "epoch": 0.4723926380368098, "percentage": 4.72, "elapsed_time": "0:03:40", "remaining_time": "1:14:16"} -{"current_steps": 78, "total_steps": 1630, "loss": 1.0315, "lr": 4.972519871418894e-06, "epoch": 0.4785276073619632, "percentage": 4.79, "elapsed_time": "0:03:44", "remaining_time": "1:14:21"} -{"current_steps": 79, "total_steps": 1630, "loss": 0.7003, "lr": 4.971802820460481e-06, "epoch": 0.48466257668711654, "percentage": 4.85, "elapsed_time": "0:03:49", "remaining_time": "1:14:57"} -{"current_steps": 80, "total_steps": 1630, "loss": 0.6776, "lr": 4.971076587482254e-06, "epoch": 0.49079754601226994, "percentage": 4.91, "elapsed_time": "0:03:52", "remaining_time": "1:15:06"} -{"current_steps": 81, "total_steps": 1630, "loss": 0.7422, "lr": 4.970341175181957e-06, "epoch": 0.49693251533742333, "percentage": 4.97, "elapsed_time": "0:03:54", "remaining_time": "1:14:35"} -{"current_steps": 82, "total_steps": 1630, "loss": 0.7471, "lr": 4.969596586291425e-06, "epoch": 0.5030674846625767, "percentage": 5.03, "elapsed_time": "0:03:56", "remaining_time": "1:14:31"} -{"current_steps": 83, "total_steps": 1630, "loss": 0.8111, "lr": 4.968842823576592e-06, "epoch": 0.50920245398773, "percentage": 5.09, "elapsed_time": "0:03:58", "remaining_time": "1:13:57"} -{"current_steps": 84, "total_steps": 1630, "loss": 0.9965, "lr": 4.968079889837461e-06, "epoch": 0.5153374233128835, "percentage": 5.15, "elapsed_time": "0:04:01", "remaining_time": "1:14:01"} -{"current_steps": 85, "total_steps": 1630, "loss": 0.6833, "lr": 4.967307787908108e-06, "epoch": 0.5214723926380368, "percentage": 5.21, "elapsed_time": "0:04:05", "remaining_time": "1:14:14"} -{"current_steps": 86, "total_steps": 1630, "loss": 0.8373, "lr": 4.966526520656663e-06, "epoch": 0.5276073619631901, "percentage": 5.28, "elapsed_time": "0:04:07", "remaining_time": "1:14:02"} -{"current_steps": 87, "total_steps": 1630, "loss": 0.8529, "lr": 4.965736090985305e-06, "epoch": 0.5337423312883436, "percentage": 5.34, "elapsed_time": "0:04:09", "remaining_time": "1:13:49"} -{"current_steps": 88, "total_steps": 1630, "loss": 0.6577, "lr": 4.964936501830246e-06, "epoch": 0.5398773006134969, "percentage": 5.4, "elapsed_time": "0:04:12", "remaining_time": "1:13:49"} -{"current_steps": 89, "total_steps": 1630, "loss": 1.1184, "lr": 4.964127756161727e-06, "epoch": 0.5460122699386503, "percentage": 5.46, "elapsed_time": "0:04:16", "remaining_time": "1:13:54"} -{"current_steps": 90, "total_steps": 1630, "loss": 0.7906, "lr": 4.963309856983998e-06, "epoch": 0.5521472392638037, "percentage": 5.52, "elapsed_time": "0:04:18", "remaining_time": "1:13:34"} -{"current_steps": 91, "total_steps": 1630, "loss": 0.8107, "lr": 4.9624828073353144e-06, "epoch": 0.558282208588957, "percentage": 5.58, "elapsed_time": "0:04:20", "remaining_time": "1:13:18"} -{"current_steps": 92, "total_steps": 1630, "loss": 0.7421, "lr": 4.961646610287922e-06, "epoch": 0.5644171779141104, "percentage": 5.64, "elapsed_time": "0:04:23", "remaining_time": "1:13:17"} -{"current_steps": 93, "total_steps": 1630, "loss": 0.7134, "lr": 4.960801268948047e-06, "epoch": 0.5705521472392638, "percentage": 5.71, "elapsed_time": "0:04:24", "remaining_time": "1:12:55"} -{"current_steps": 94, "total_steps": 1630, "loss": 0.5875, "lr": 4.959946786455882e-06, "epoch": 0.5766871165644172, "percentage": 5.77, "elapsed_time": "0:04:26", "remaining_time": "1:12:30"} -{"current_steps": 95, "total_steps": 1630, "loss": 0.6595, "lr": 4.959083165985581e-06, "epoch": 0.5828220858895705, "percentage": 5.83, "elapsed_time": "0:04:28", "remaining_time": "1:12:18"} -{"current_steps": 96, "total_steps": 1630, "loss": 0.793, "lr": 4.958210410745237e-06, "epoch": 0.588957055214724, "percentage": 5.89, "elapsed_time": "0:04:30", "remaining_time": "1:12:04"} -{"current_steps": 97, "total_steps": 1630, "loss": 0.5896, "lr": 4.957328523976879e-06, "epoch": 0.5950920245398773, "percentage": 5.95, "elapsed_time": "0:04:33", "remaining_time": "1:11:56"} -{"current_steps": 98, "total_steps": 1630, "loss": 0.8658, "lr": 4.956437508956458e-06, "epoch": 0.6012269938650306, "percentage": 6.01, "elapsed_time": "0:04:39", "remaining_time": "1:12:46"} -{"current_steps": 99, "total_steps": 1630, "loss": 0.8316, "lr": 4.9555373689938325e-06, "epoch": 0.6073619631901841, "percentage": 6.07, "elapsed_time": "0:04:45", "remaining_time": "1:13:28"} -{"current_steps": 100, "total_steps": 1630, "loss": 1.0613, "lr": 4.954628107432757e-06, "epoch": 0.6134969325153374, "percentage": 6.13, "elapsed_time": "0:04:51", "remaining_time": "1:14:21"} -{"current_steps": 101, "total_steps": 1630, "loss": 0.7194, "lr": 4.95370972765087e-06, "epoch": 0.6196319018404908, "percentage": 6.2, "elapsed_time": "0:04:53", "remaining_time": "1:13:57"} -{"current_steps": 102, "total_steps": 1630, "loss": 0.5927, "lr": 4.952782233059683e-06, "epoch": 0.6257668711656442, "percentage": 6.26, "elapsed_time": "0:04:55", "remaining_time": "1:13:45"} -{"current_steps": 103, "total_steps": 1630, "loss": 0.8505, "lr": 4.951845627104565e-06, "epoch": 0.6319018404907976, "percentage": 6.32, "elapsed_time": "0:04:57", "remaining_time": "1:13:31"} -{"current_steps": 104, "total_steps": 1630, "loss": 0.8682, "lr": 4.95089991326473e-06, "epoch": 0.6380368098159509, "percentage": 6.38, "elapsed_time": "0:05:02", "remaining_time": "1:13:54"} -{"current_steps": 105, "total_steps": 1630, "loss": 0.8735, "lr": 4.9499450950532305e-06, "epoch": 0.6441717791411042, "percentage": 6.44, "elapsed_time": "0:05:04", "remaining_time": "1:13:46"} -{"current_steps": 106, "total_steps": 1630, "loss": 1.0571, "lr": 4.94898117601693e-06, "epoch": 0.6503067484662577, "percentage": 6.5, "elapsed_time": "0:05:08", "remaining_time": "1:13:58"} -{"current_steps": 107, "total_steps": 1630, "loss": 0.7831, "lr": 4.948008159736507e-06, "epoch": 0.656441717791411, "percentage": 6.56, "elapsed_time": "0:05:14", "remaining_time": "1:14:37"} -{"current_steps": 108, "total_steps": 1630, "loss": 0.5968, "lr": 4.94702604982643e-06, "epoch": 0.6625766871165644, "percentage": 6.63, "elapsed_time": "0:05:16", "remaining_time": "1:14:19"} -{"current_steps": 109, "total_steps": 1630, "loss": 0.7504, "lr": 4.9460348499349485e-06, "epoch": 0.6687116564417178, "percentage": 6.69, "elapsed_time": "0:05:19", "remaining_time": "1:14:14"} -{"current_steps": 110, "total_steps": 1630, "loss": 0.6728, "lr": 4.945034563744077e-06, "epoch": 0.6748466257668712, "percentage": 6.75, "elapsed_time": "0:05:20", "remaining_time": "1:13:53"} -{"current_steps": 111, "total_steps": 1630, "loss": 0.609, "lr": 4.944025194969586e-06, "epoch": 0.6809815950920245, "percentage": 6.81, "elapsed_time": "0:05:22", "remaining_time": "1:13:39"} -{"current_steps": 112, "total_steps": 1630, "loss": 0.8713, "lr": 4.9430067473609825e-06, "epoch": 0.6871165644171779, "percentage": 6.87, "elapsed_time": "0:05:24", "remaining_time": "1:13:18"} -{"current_steps": 113, "total_steps": 1630, "loss": 0.8035, "lr": 4.941979224701499e-06, "epoch": 0.6932515337423313, "percentage": 6.93, "elapsed_time": "0:05:27", "remaining_time": "1:13:09"} -{"current_steps": 114, "total_steps": 1630, "loss": 0.9341, "lr": 4.94094263080808e-06, "epoch": 0.6993865030674846, "percentage": 6.99, "elapsed_time": "0:05:32", "remaining_time": "1:13:44"} -{"current_steps": 115, "total_steps": 1630, "loss": 1.1066, "lr": 4.939896969531367e-06, "epoch": 0.7055214723926381, "percentage": 7.06, "elapsed_time": "0:05:36", "remaining_time": "1:13:48"} -{"current_steps": 116, "total_steps": 1630, "loss": 0.853, "lr": 4.938842244755683e-06, "epoch": 0.7116564417177914, "percentage": 7.12, "elapsed_time": "0:05:38", "remaining_time": "1:13:40"} -{"current_steps": 117, "total_steps": 1630, "loss": 0.9116, "lr": 4.937778460399022e-06, "epoch": 0.7177914110429447, "percentage": 7.18, "elapsed_time": "0:05:41", "remaining_time": "1:13:39"} -{"current_steps": 118, "total_steps": 1630, "loss": 0.5888, "lr": 4.936705620413028e-06, "epoch": 0.7239263803680982, "percentage": 7.24, "elapsed_time": "0:05:42", "remaining_time": "1:13:09"} -{"current_steps": 119, "total_steps": 1630, "loss": 0.592, "lr": 4.935623728782986e-06, "epoch": 0.7300613496932515, "percentage": 7.3, "elapsed_time": "0:05:45", "remaining_time": "1:13:12"} -{"current_steps": 120, "total_steps": 1630, "loss": 0.8713, "lr": 4.934532789527805e-06, "epoch": 0.7361963190184049, "percentage": 7.36, "elapsed_time": "0:05:50", "remaining_time": "1:13:29"} -{"current_steps": 121, "total_steps": 1630, "loss": 0.6791, "lr": 4.933432806700004e-06, "epoch": 0.7423312883435583, "percentage": 7.42, "elapsed_time": "0:05:56", "remaining_time": "1:14:05"} -{"current_steps": 122, "total_steps": 1630, "loss": 0.7531, "lr": 4.932323784385693e-06, "epoch": 0.7484662576687117, "percentage": 7.48, "elapsed_time": "0:06:00", "remaining_time": "1:14:18"} -{"current_steps": 123, "total_steps": 1630, "loss": 0.7547, "lr": 4.931205726704566e-06, "epoch": 0.754601226993865, "percentage": 7.55, "elapsed_time": "0:06:04", "remaining_time": "1:14:19"} -{"current_steps": 124, "total_steps": 1630, "loss": 0.7849, "lr": 4.930078637809878e-06, "epoch": 0.7607361963190185, "percentage": 7.61, "elapsed_time": "0:06:06", "remaining_time": "1:14:06"} -{"current_steps": 125, "total_steps": 1630, "loss": 0.7015, "lr": 4.928942521888431e-06, "epoch": 0.7668711656441718, "percentage": 7.67, "elapsed_time": "0:06:08", "remaining_time": "1:13:56"} -{"current_steps": 126, "total_steps": 1630, "loss": 1.0028, "lr": 4.927797383160561e-06, "epoch": 0.7730061349693251, "percentage": 7.73, "elapsed_time": "0:06:10", "remaining_time": "1:13:41"} -{"current_steps": 127, "total_steps": 1630, "loss": 0.602, "lr": 4.926643225880123e-06, "epoch": 0.7791411042944786, "percentage": 7.79, "elapsed_time": "0:06:12", "remaining_time": "1:13:25"} -{"current_steps": 128, "total_steps": 1630, "loss": 0.7473, "lr": 4.925480054334471e-06, "epoch": 0.7852760736196319, "percentage": 7.85, "elapsed_time": "0:06:17", "remaining_time": "1:13:51"} -{"current_steps": 129, "total_steps": 1630, "loss": 1.0573, "lr": 4.924307872844444e-06, "epoch": 0.7914110429447853, "percentage": 7.91, "elapsed_time": "0:06:19", "remaining_time": "1:13:40"} -{"current_steps": 130, "total_steps": 1630, "loss": 0.7399, "lr": 4.923126685764351e-06, "epoch": 0.7975460122699386, "percentage": 7.98, "elapsed_time": "0:06:22", "remaining_time": "1:13:37"} -{"current_steps": 131, "total_steps": 1630, "loss": 0.9548, "lr": 4.921936497481956e-06, "epoch": 0.803680981595092, "percentage": 8.04, "elapsed_time": "0:06:30", "remaining_time": "1:14:24"} -{"current_steps": 132, "total_steps": 1630, "loss": 0.6748, "lr": 4.920737312418456e-06, "epoch": 0.8098159509202454, "percentage": 8.1, "elapsed_time": "0:06:32", "remaining_time": "1:14:13"} -{"current_steps": 133, "total_steps": 1630, "loss": 0.8431, "lr": 4.919529135028473e-06, "epoch": 0.8159509202453987, "percentage": 8.16, "elapsed_time": "0:06:38", "remaining_time": "1:14:49"} -{"current_steps": 134, "total_steps": 1630, "loss": 0.7243, "lr": 4.918311969800027e-06, "epoch": 0.8220858895705522, "percentage": 8.22, "elapsed_time": "0:06:40", "remaining_time": "1:14:29"} -{"current_steps": 135, "total_steps": 1630, "loss": 0.7845, "lr": 4.917085821254532e-06, "epoch": 0.8282208588957055, "percentage": 8.28, "elapsed_time": "0:06:43", "remaining_time": "1:14:23"} -{"current_steps": 136, "total_steps": 1630, "loss": 0.4891, "lr": 4.915850693946766e-06, "epoch": 0.8343558282208589, "percentage": 8.34, "elapsed_time": "0:06:45", "remaining_time": "1:14:19"} -{"current_steps": 137, "total_steps": 1630, "loss": 0.7917, "lr": 4.914606592464865e-06, "epoch": 0.8404907975460123, "percentage": 8.4, "elapsed_time": "0:06:47", "remaining_time": "1:13:58"} -{"current_steps": 138, "total_steps": 1630, "loss": 0.9681, "lr": 4.9133535214303e-06, "epoch": 0.8466257668711656, "percentage": 8.47, "elapsed_time": "0:06:48", "remaining_time": "1:13:32"} -{"current_steps": 139, "total_steps": 1630, "loss": 0.9275, "lr": 4.91209148549786e-06, "epoch": 0.852760736196319, "percentage": 8.53, "elapsed_time": "0:06:51", "remaining_time": "1:13:39"} -{"current_steps": 140, "total_steps": 1630, "loss": 0.7259, "lr": 4.910820489355637e-06, "epoch": 0.8588957055214724, "percentage": 8.59, "elapsed_time": "0:06:55", "remaining_time": "1:13:38"} -{"current_steps": 141, "total_steps": 1630, "loss": 0.6061, "lr": 4.909540537725007e-06, "epoch": 0.8650306748466258, "percentage": 8.65, "elapsed_time": "0:06:56", "remaining_time": "1:13:20"} -{"current_steps": 142, "total_steps": 1630, "loss": 1.0559, "lr": 4.908251635360616e-06, "epoch": 0.8711656441717791, "percentage": 8.71, "elapsed_time": "0:06:58", "remaining_time": "1:13:04"} -{"current_steps": 143, "total_steps": 1630, "loss": 0.7372, "lr": 4.906953787050354e-06, "epoch": 0.8773006134969326, "percentage": 8.77, "elapsed_time": "0:07:00", "remaining_time": "1:12:50"} -{"current_steps": 144, "total_steps": 1630, "loss": 0.6234, "lr": 4.905646997615347e-06, "epoch": 0.8834355828220859, "percentage": 8.83, "elapsed_time": "0:07:02", "remaining_time": "1:12:36"} -{"current_steps": 145, "total_steps": 1630, "loss": 0.8066, "lr": 4.904331271909932e-06, "epoch": 0.8895705521472392, "percentage": 8.9, "elapsed_time": "0:07:06", "remaining_time": "1:12:49"} -{"current_steps": 146, "total_steps": 1630, "loss": 0.6861, "lr": 4.903006614821645e-06, "epoch": 0.8957055214723927, "percentage": 8.96, "elapsed_time": "0:07:07", "remaining_time": "1:12:30"} -{"current_steps": 147, "total_steps": 1630, "loss": 0.6112, "lr": 4.901673031271194e-06, "epoch": 0.901840490797546, "percentage": 9.02, "elapsed_time": "0:07:10", "remaining_time": "1:12:22"} -{"current_steps": 148, "total_steps": 1630, "loss": 0.6314, "lr": 4.900330526212451e-06, "epoch": 0.9079754601226994, "percentage": 9.08, "elapsed_time": "0:07:11", "remaining_time": "1:12:00"} -{"current_steps": 149, "total_steps": 1630, "loss": 0.889, "lr": 4.898979104632427e-06, "epoch": 0.9141104294478528, "percentage": 9.14, "elapsed_time": "0:07:13", "remaining_time": "1:11:47"} -{"current_steps": 150, "total_steps": 1630, "loss": 0.6406, "lr": 4.897618771551255e-06, "epoch": 0.9202453987730062, "percentage": 9.2, "elapsed_time": "0:07:15", "remaining_time": "1:11:32"} -{"current_steps": 151, "total_steps": 1630, "loss": 0.6368, "lr": 4.8962495320221714e-06, "epoch": 0.9263803680981595, "percentage": 9.26, "elapsed_time": "0:07:18", "remaining_time": "1:11:38"} -{"current_steps": 152, "total_steps": 1630, "loss": 0.8642, "lr": 4.8948713911315e-06, "epoch": 0.9325153374233128, "percentage": 9.33, "elapsed_time": "0:07:20", "remaining_time": "1:11:27"} -{"current_steps": 153, "total_steps": 1630, "loss": 0.714, "lr": 4.8934843539986266e-06, "epoch": 0.9386503067484663, "percentage": 9.39, "elapsed_time": "0:07:23", "remaining_time": "1:11:17"} -{"current_steps": 154, "total_steps": 1630, "loss": 0.8365, "lr": 4.892088425775986e-06, "epoch": 0.9447852760736196, "percentage": 9.45, "elapsed_time": "0:07:25", "remaining_time": "1:11:07"} -{"current_steps": 155, "total_steps": 1630, "loss": 0.7937, "lr": 4.890683611649041e-06, "epoch": 0.950920245398773, "percentage": 9.51, "elapsed_time": "0:07:26", "remaining_time": "1:10:52"} -{"current_steps": 156, "total_steps": 1630, "loss": 0.7485, "lr": 4.8892699168362626e-06, "epoch": 0.9570552147239264, "percentage": 9.57, "elapsed_time": "0:07:29", "remaining_time": "1:10:49"} -{"current_steps": 157, "total_steps": 1630, "loss": 0.6467, "lr": 4.887847346589111e-06, "epoch": 0.9631901840490797, "percentage": 9.63, "elapsed_time": "0:07:31", "remaining_time": "1:10:36"} -{"current_steps": 158, "total_steps": 1630, "loss": 0.4651, "lr": 4.886415906192015e-06, "epoch": 0.9693251533742331, "percentage": 9.69, "elapsed_time": "0:07:33", "remaining_time": "1:10:23"} -{"current_steps": 159, "total_steps": 1630, "loss": 0.8756, "lr": 4.884975600962355e-06, "epoch": 0.9754601226993865, "percentage": 9.75, "elapsed_time": "0:07:34", "remaining_time": "1:10:08"} -{"current_steps": 160, "total_steps": 1630, "loss": 0.7339, "lr": 4.883526436250441e-06, "epoch": 0.9815950920245399, "percentage": 9.82, "elapsed_time": "0:07:35", "remaining_time": "1:09:48"} -{"current_steps": 161, "total_steps": 1630, "loss": 0.7808, "lr": 4.8820684174394935e-06, "epoch": 0.9877300613496932, "percentage": 9.88, "elapsed_time": "0:07:37", "remaining_time": "1:09:33"} -{"current_steps": 162, "total_steps": 1630, "loss": 0.96, "lr": 4.880601549945622e-06, "epoch": 0.9938650306748467, "percentage": 9.94, "elapsed_time": "0:07:41", "remaining_time": "1:09:39"} -{"current_steps": 163, "total_steps": 1630, "loss": 0.8122, "lr": 4.879125839217808e-06, "epoch": 1.0, "percentage": 10.0, "elapsed_time": "0:07:44", "remaining_time": "1:09:37"} -{"current_steps": 164, "total_steps": 1630, "loss": 0.7307, "lr": 4.8776412907378845e-06, "epoch": 1.0061349693251533, "percentage": 10.06, "elapsed_time": "0:12:54", "remaining_time": "1:55:22"} -{"current_steps": 165, "total_steps": 1630, "loss": 0.7554, "lr": 4.8761479100205085e-06, "epoch": 1.0122699386503067, "percentage": 10.12, "elapsed_time": "0:12:58", "remaining_time": "1:55:10"} -{"current_steps": 166, "total_steps": 1630, "loss": 0.4372, "lr": 4.874645702613152e-06, "epoch": 1.01840490797546, "percentage": 10.18, "elapsed_time": "0:13:01", "remaining_time": "1:54:52"} -{"current_steps": 167, "total_steps": 1630, "loss": 0.3597, "lr": 4.873134674096072e-06, "epoch": 1.0245398773006136, "percentage": 10.25, "elapsed_time": "0:13:03", "remaining_time": "1:54:22"} -{"current_steps": 168, "total_steps": 1630, "loss": 0.5854, "lr": 4.871614830082297e-06, "epoch": 1.030674846625767, "percentage": 10.31, "elapsed_time": "0:13:06", "remaining_time": "1:54:02"} -{"current_steps": 169, "total_steps": 1630, "loss": 0.7978, "lr": 4.870086176217597e-06, "epoch": 1.0368098159509203, "percentage": 10.37, "elapsed_time": "0:13:07", "remaining_time": "1:53:27"} -{"current_steps": 170, "total_steps": 1630, "loss": 0.5593, "lr": 4.868548718180473e-06, "epoch": 1.0429447852760736, "percentage": 10.43, "elapsed_time": "0:13:10", "remaining_time": "1:53:10"} -{"current_steps": 171, "total_steps": 1630, "loss": 0.4083, "lr": 4.867002461682129e-06, "epoch": 1.049079754601227, "percentage": 10.49, "elapsed_time": "0:13:11", "remaining_time": "1:52:37"} -{"current_steps": 172, "total_steps": 1630, "loss": 0.4752, "lr": 4.8654474124664505e-06, "epoch": 1.0552147239263803, "percentage": 10.55, "elapsed_time": "0:13:15", "remaining_time": "1:52:26"} -{"current_steps": 173, "total_steps": 1630, "loss": 0.7435, "lr": 4.863883576309991e-06, "epoch": 1.0613496932515338, "percentage": 10.61, "elapsed_time": "0:13:23", "remaining_time": "1:52:44"} -{"current_steps": 174, "total_steps": 1630, "loss": 0.4612, "lr": 4.8623109590219395e-06, "epoch": 1.0674846625766872, "percentage": 10.67, "elapsed_time": "0:13:24", "remaining_time": "1:52:14"} -{"current_steps": 175, "total_steps": 1630, "loss": 0.4644, "lr": 4.860729566444106e-06, "epoch": 1.0736196319018405, "percentage": 10.74, "elapsed_time": "0:13:27", "remaining_time": "1:51:54"} -{"current_steps": 176, "total_steps": 1630, "loss": 0.4852, "lr": 4.8591394044508985e-06, "epoch": 1.0797546012269938, "percentage": 10.8, "elapsed_time": "0:13:29", "remaining_time": "1:51:26"} -{"current_steps": 177, "total_steps": 1630, "loss": 0.4574, "lr": 4.857540478949302e-06, "epoch": 1.0858895705521472, "percentage": 10.86, "elapsed_time": "0:13:31", "remaining_time": "1:51:02"} -{"current_steps": 178, "total_steps": 1630, "loss": 0.8095, "lr": 4.855932795878852e-06, "epoch": 1.0920245398773005, "percentage": 10.92, "elapsed_time": "0:13:33", "remaining_time": "1:50:36"} -{"current_steps": 179, "total_steps": 1630, "loss": 0.4578, "lr": 4.854316361211619e-06, "epoch": 1.098159509202454, "percentage": 10.98, "elapsed_time": "0:13:35", "remaining_time": "1:50:10"} -{"current_steps": 180, "total_steps": 1630, "loss": 0.5473, "lr": 4.852691180952183e-06, "epoch": 1.1042944785276074, "percentage": 11.04, "elapsed_time": "0:13:37", "remaining_time": "1:49:46"} -{"current_steps": 181, "total_steps": 1630, "loss": 0.4313, "lr": 4.851057261137608e-06, "epoch": 1.1104294478527608, "percentage": 11.1, "elapsed_time": "0:13:38", "remaining_time": "1:49:12"} -{"current_steps": 182, "total_steps": 1630, "loss": 0.4197, "lr": 4.8494146078374274e-06, "epoch": 1.116564417177914, "percentage": 11.17, "elapsed_time": "0:13:40", "remaining_time": "1:48:44"} -{"current_steps": 183, "total_steps": 1630, "loss": 0.5865, "lr": 4.847763227153612e-06, "epoch": 1.1226993865030674, "percentage": 11.23, "elapsed_time": "0:13:45", "remaining_time": "1:48:47"} -{"current_steps": 184, "total_steps": 1630, "loss": 0.3866, "lr": 4.846103125220557e-06, "epoch": 1.1288343558282208, "percentage": 11.29, "elapsed_time": "0:13:49", "remaining_time": "1:48:34"} -{"current_steps": 185, "total_steps": 1630, "loss": 0.5357, "lr": 4.844434308205052e-06, "epoch": 1.1349693251533743, "percentage": 11.35, "elapsed_time": "0:13:50", "remaining_time": "1:48:09"} -{"current_steps": 186, "total_steps": 1630, "loss": 0.4783, "lr": 4.842756782306261e-06, "epoch": 1.1411042944785277, "percentage": 11.41, "elapsed_time": "0:13:54", "remaining_time": "1:47:55"} -{"current_steps": 187, "total_steps": 1630, "loss": 0.3733, "lr": 4.841070553755697e-06, "epoch": 1.147239263803681, "percentage": 11.47, "elapsed_time": "0:13:57", "remaining_time": "1:47:42"} -{"current_steps": 188, "total_steps": 1630, "loss": 0.6039, "lr": 4.839375628817205e-06, "epoch": 1.1533742331288344, "percentage": 11.53, "elapsed_time": "0:14:00", "remaining_time": "1:47:23"} -{"current_steps": 189, "total_steps": 1630, "loss": 0.5372, "lr": 4.837672013786931e-06, "epoch": 1.1595092024539877, "percentage": 11.6, "elapsed_time": "0:14:03", "remaining_time": "1:47:13"} -{"current_steps": 190, "total_steps": 1630, "loss": 0.5162, "lr": 4.835959714993305e-06, "epoch": 1.165644171779141, "percentage": 11.66, "elapsed_time": "0:14:06", "remaining_time": "1:46:53"} -{"current_steps": 191, "total_steps": 1630, "loss": 0.4537, "lr": 4.8342387387970105e-06, "epoch": 1.1717791411042944, "percentage": 11.72, "elapsed_time": "0:14:07", "remaining_time": "1:46:27"} -{"current_steps": 192, "total_steps": 1630, "loss": 0.6165, "lr": 4.832509091590968e-06, "epoch": 1.177914110429448, "percentage": 11.78, "elapsed_time": "0:14:09", "remaining_time": "1:46:02"} -{"current_steps": 193, "total_steps": 1630, "loss": 0.7475, "lr": 4.830770779800309e-06, "epoch": 1.1840490797546013, "percentage": 11.84, "elapsed_time": "0:14:11", "remaining_time": "1:45:38"} -{"current_steps": 194, "total_steps": 1630, "loss": 0.4629, "lr": 4.829023809882349e-06, "epoch": 1.1901840490797546, "percentage": 11.9, "elapsed_time": "0:14:13", "remaining_time": "1:45:20"} -{"current_steps": 195, "total_steps": 1630, "loss": 0.5208, "lr": 4.827268188326567e-06, "epoch": 1.196319018404908, "percentage": 11.96, "elapsed_time": "0:14:16", "remaining_time": "1:45:05"} -{"current_steps": 196, "total_steps": 1630, "loss": 0.6521, "lr": 4.825503921654582e-06, "epoch": 1.2024539877300613, "percentage": 12.02, "elapsed_time": "0:14:19", "remaining_time": "1:44:45"} -{"current_steps": 197, "total_steps": 1630, "loss": 0.7491, "lr": 4.823731016420122e-06, "epoch": 1.2085889570552146, "percentage": 12.09, "elapsed_time": "0:14:20", "remaining_time": "1:44:19"} -{"current_steps": 198, "total_steps": 1630, "loss": 0.3866, "lr": 4.821949479209011e-06, "epoch": 1.2147239263803682, "percentage": 12.15, "elapsed_time": "0:14:22", "remaining_time": "1:43:58"} -{"current_steps": 199, "total_steps": 1630, "loss": 0.499, "lr": 4.820159316639133e-06, "epoch": 1.2208588957055215, "percentage": 12.21, "elapsed_time": "0:14:25", "remaining_time": "1:43:46"} -{"current_steps": 200, "total_steps": 1630, "loss": 0.556, "lr": 4.818360535360418e-06, "epoch": 1.2269938650306749, "percentage": 12.27, "elapsed_time": "0:14:29", "remaining_time": "1:43:39"} -{"current_steps": 201, "total_steps": 1630, "loss": 0.3433, "lr": 4.816553142054806e-06, "epoch": 1.2331288343558282, "percentage": 12.33, "elapsed_time": "0:14:32", "remaining_time": "1:43:20"} -{"current_steps": 202, "total_steps": 1630, "loss": 0.8808, "lr": 4.814737143436232e-06, "epoch": 1.2392638036809815, "percentage": 12.39, "elapsed_time": "0:14:35", "remaining_time": "1:43:12"} -{"current_steps": 203, "total_steps": 1630, "loss": 0.5718, "lr": 4.812912546250595e-06, "epoch": 1.2453987730061349, "percentage": 12.45, "elapsed_time": "0:14:38", "remaining_time": "1:42:53"} -{"current_steps": 204, "total_steps": 1630, "loss": 0.9743, "lr": 4.81107935727574e-06, "epoch": 1.2515337423312882, "percentage": 12.52, "elapsed_time": "0:14:39", "remaining_time": "1:42:29"} -{"current_steps": 205, "total_steps": 1630, "loss": 0.2821, "lr": 4.809237583321421e-06, "epoch": 1.2576687116564418, "percentage": 12.58, "elapsed_time": "0:14:42", "remaining_time": "1:42:14"} -{"current_steps": 206, "total_steps": 1630, "loss": 0.7524, "lr": 4.807387231229287e-06, "epoch": 1.2638036809815951, "percentage": 12.64, "elapsed_time": "0:14:43", "remaining_time": "1:41:49"} -{"current_steps": 207, "total_steps": 1630, "loss": 0.4304, "lr": 4.8055283078728525e-06, "epoch": 1.2699386503067485, "percentage": 12.7, "elapsed_time": "0:14:45", "remaining_time": "1:41:25"} -{"current_steps": 208, "total_steps": 1630, "loss": 0.6986, "lr": 4.803660820157468e-06, "epoch": 1.2760736196319018, "percentage": 12.76, "elapsed_time": "0:14:48", "remaining_time": "1:41:11"} -{"current_steps": 209, "total_steps": 1630, "loss": 0.7112, "lr": 4.801784775020303e-06, "epoch": 1.2822085889570551, "percentage": 12.82, "elapsed_time": "0:14:51", "remaining_time": "1:41:03"} -{"current_steps": 210, "total_steps": 1630, "loss": 0.4125, "lr": 4.799900179430312e-06, "epoch": 1.2883435582822087, "percentage": 12.88, "elapsed_time": "0:14:53", "remaining_time": "1:40:44"} -{"current_steps": 211, "total_steps": 1630, "loss": 0.7057, "lr": 4.798007040388212e-06, "epoch": 1.294478527607362, "percentage": 12.94, "elapsed_time": "0:14:55", "remaining_time": "1:40:22"} -{"current_steps": 212, "total_steps": 1630, "loss": 0.708, "lr": 4.7961053649264585e-06, "epoch": 1.3006134969325154, "percentage": 13.01, "elapsed_time": "0:14:59", "remaining_time": "1:40:15"} -{"current_steps": 213, "total_steps": 1630, "loss": 0.7608, "lr": 4.794195160109215e-06, "epoch": 1.3067484662576687, "percentage": 13.07, "elapsed_time": "0:15:02", "remaining_time": "1:40:06"} -{"current_steps": 214, "total_steps": 1630, "loss": 0.4779, "lr": 4.7922764330323315e-06, "epoch": 1.312883435582822, "percentage": 13.13, "elapsed_time": "0:15:05", "remaining_time": "1:39:54"} -{"current_steps": 215, "total_steps": 1630, "loss": 0.5464, "lr": 4.790349190823313e-06, "epoch": 1.3190184049079754, "percentage": 13.19, "elapsed_time": "0:15:07", "remaining_time": "1:39:30"} -{"current_steps": 216, "total_steps": 1630, "loss": 0.6198, "lr": 4.788413440641297e-06, "epoch": 1.3251533742331287, "percentage": 13.25, "elapsed_time": "0:15:09", "remaining_time": "1:39:11"} -{"current_steps": 217, "total_steps": 1630, "loss": 0.6695, "lr": 4.786469189677026e-06, "epoch": 1.331288343558282, "percentage": 13.31, "elapsed_time": "0:15:10", "remaining_time": "1:38:50"} -{"current_steps": 218, "total_steps": 1630, "loss": 0.4902, "lr": 4.784516445152821e-06, "epoch": 1.3374233128834356, "percentage": 13.37, "elapsed_time": "0:15:12", "remaining_time": "1:38:28"} -{"current_steps": 219, "total_steps": 1630, "loss": 0.7411, "lr": 4.78255521432255e-06, "epoch": 1.343558282208589, "percentage": 13.44, "elapsed_time": "0:15:18", "remaining_time": "1:38:35"} -{"current_steps": 220, "total_steps": 1630, "loss": 0.8767, "lr": 4.780585504471612e-06, "epoch": 1.3496932515337423, "percentage": 13.5, "elapsed_time": "0:15:23", "remaining_time": "1:38:40"} -{"current_steps": 221, "total_steps": 1630, "loss": 0.4266, "lr": 4.778607322916896e-06, "epoch": 1.3558282208588956, "percentage": 13.56, "elapsed_time": "0:15:26", "remaining_time": "1:38:23"} -{"current_steps": 222, "total_steps": 1630, "loss": 0.4982, "lr": 4.776620677006766e-06, "epoch": 1.3619631901840492, "percentage": 13.62, "elapsed_time": "0:15:28", "remaining_time": "1:38:06"} -{"current_steps": 223, "total_steps": 1630, "loss": 0.6012, "lr": 4.7746255741210256e-06, "epoch": 1.3680981595092025, "percentage": 13.68, "elapsed_time": "0:15:32", "remaining_time": "1:38:03"} -{"current_steps": 224, "total_steps": 1630, "loss": 0.7585, "lr": 4.772622021670897e-06, "epoch": 1.3742331288343559, "percentage": 13.74, "elapsed_time": "0:15:39", "remaining_time": "1:38:16"} -{"current_steps": 225, "total_steps": 1630, "loss": 0.5266, "lr": 4.770610027098983e-06, "epoch": 1.3803680981595092, "percentage": 13.8, "elapsed_time": "0:15:40", "remaining_time": "1:37:54"} -{"current_steps": 226, "total_steps": 1630, "loss": 0.6261, "lr": 4.7685895978792564e-06, "epoch": 1.3865030674846626, "percentage": 13.87, "elapsed_time": "0:15:47", "remaining_time": "1:38:05"} -{"current_steps": 227, "total_steps": 1630, "loss": 0.7081, "lr": 4.766560741517014e-06, "epoch": 1.392638036809816, "percentage": 13.93, "elapsed_time": "0:15:49", "remaining_time": "1:37:49"} -{"current_steps": 228, "total_steps": 1630, "loss": 0.5041, "lr": 4.76452346554886e-06, "epoch": 1.3987730061349692, "percentage": 13.99, "elapsed_time": "0:15:51", "remaining_time": "1:37:31"} -{"current_steps": 229, "total_steps": 1630, "loss": 0.49, "lr": 4.762477777542676e-06, "epoch": 1.4049079754601226, "percentage": 14.05, "elapsed_time": "0:15:53", "remaining_time": "1:37:15"} -{"current_steps": 230, "total_steps": 1630, "loss": 0.7056, "lr": 4.7604236850975905e-06, "epoch": 1.4110429447852761, "percentage": 14.11, "elapsed_time": "0:15:59", "remaining_time": "1:37:22"} -{"current_steps": 231, "total_steps": 1630, "loss": 0.7762, "lr": 4.7583611958439514e-06, "epoch": 1.4171779141104295, "percentage": 14.17, "elapsed_time": "0:16:05", "remaining_time": "1:37:28"} -{"current_steps": 232, "total_steps": 1630, "loss": 0.5347, "lr": 4.7562903174433e-06, "epoch": 1.4233128834355828, "percentage": 14.23, "elapsed_time": "0:16:07", "remaining_time": "1:37:07"} -{"current_steps": 233, "total_steps": 1630, "loss": 0.503, "lr": 4.75421105758834e-06, "epoch": 1.4294478527607362, "percentage": 14.29, "elapsed_time": "0:16:11", "remaining_time": "1:37:02"} -{"current_steps": 234, "total_steps": 1630, "loss": 0.5081, "lr": 4.752123424002908e-06, "epoch": 1.4355828220858895, "percentage": 14.36, "elapsed_time": "0:16:13", "remaining_time": "1:36:48"} -{"current_steps": 235, "total_steps": 1630, "loss": 0.7523, "lr": 4.750027424441949e-06, "epoch": 1.441717791411043, "percentage": 14.42, "elapsed_time": "0:16:16", "remaining_time": "1:36:36"} -{"current_steps": 236, "total_steps": 1630, "loss": 0.5575, "lr": 4.747923066691487e-06, "epoch": 1.4478527607361964, "percentage": 14.48, "elapsed_time": "0:16:18", "remaining_time": "1:36:21"} -{"current_steps": 237, "total_steps": 1630, "loss": 0.7264, "lr": 4.745810358568588e-06, "epoch": 1.4539877300613497, "percentage": 14.54, "elapsed_time": "0:16:20", "remaining_time": "1:36:04"} -{"current_steps": 238, "total_steps": 1630, "loss": 0.4545, "lr": 4.743689307921342e-06, "epoch": 1.460122699386503, "percentage": 14.6, "elapsed_time": "0:16:25", "remaining_time": "1:36:04"} -{"current_steps": 239, "total_steps": 1630, "loss": 0.5429, "lr": 4.741559922628828e-06, "epoch": 1.4662576687116564, "percentage": 14.66, "elapsed_time": "0:16:27", "remaining_time": "1:35:45"} -{"current_steps": 240, "total_steps": 1630, "loss": 0.5086, "lr": 4.739422210601085e-06, "epoch": 1.4723926380368098, "percentage": 14.72, "elapsed_time": "0:16:28", "remaining_time": "1:35:26"} -{"current_steps": 241, "total_steps": 1630, "loss": 0.6109, "lr": 4.7372761797790836e-06, "epoch": 1.478527607361963, "percentage": 14.79, "elapsed_time": "0:16:34", "remaining_time": "1:35:33"} -{"current_steps": 242, "total_steps": 1630, "loss": 0.4317, "lr": 4.735121838134697e-06, "epoch": 1.4846625766871164, "percentage": 14.85, "elapsed_time": "0:16:37", "remaining_time": "1:35:20"} -{"current_steps": 243, "total_steps": 1630, "loss": 0.6414, "lr": 4.732959193670672e-06, "epoch": 1.49079754601227, "percentage": 14.91, "elapsed_time": "0:16:40", "remaining_time": "1:35:11"} -{"current_steps": 244, "total_steps": 1630, "loss": 0.5166, "lr": 4.730788254420593e-06, "epoch": 1.4969325153374233, "percentage": 14.97, "elapsed_time": "0:16:43", "remaining_time": "1:34:59"} -{"current_steps": 245, "total_steps": 1630, "loss": 0.4982, "lr": 4.728609028448862e-06, "epoch": 1.5030674846625767, "percentage": 15.03, "elapsed_time": "0:16:47", "remaining_time": "1:34:53"} -{"current_steps": 246, "total_steps": 1630, "loss": 0.7552, "lr": 4.726421523850662e-06, "epoch": 1.50920245398773, "percentage": 15.09, "elapsed_time": "0:16:51", "remaining_time": "1:34:48"} -{"current_steps": 247, "total_steps": 1630, "loss": 0.4365, "lr": 4.7242257487519275e-06, "epoch": 1.5153374233128836, "percentage": 15.15, "elapsed_time": "0:16:53", "remaining_time": "1:34:34"} -{"current_steps": 248, "total_steps": 1630, "loss": 0.6002, "lr": 4.722021711309317e-06, "epoch": 1.521472392638037, "percentage": 15.21, "elapsed_time": "0:16:55", "remaining_time": "1:34:16"} -{"current_steps": 249, "total_steps": 1630, "loss": 0.4993, "lr": 4.7198094197101826e-06, "epoch": 1.5276073619631902, "percentage": 15.28, "elapsed_time": "0:16:58", "remaining_time": "1:34:08"} -{"current_steps": 250, "total_steps": 1630, "loss": 0.4637, "lr": 4.7175888821725335e-06, "epoch": 1.5337423312883436, "percentage": 15.34, "elapsed_time": "0:17:03", "remaining_time": "1:34:07"} -{"current_steps": 251, "total_steps": 1630, "loss": 0.9711, "lr": 4.715360106945015e-06, "epoch": 1.539877300613497, "percentage": 15.4, "elapsed_time": "0:17:05", "remaining_time": "1:33:55"} -{"current_steps": 252, "total_steps": 1630, "loss": 0.5452, "lr": 4.713123102306869e-06, "epoch": 1.5460122699386503, "percentage": 15.46, "elapsed_time": "0:17:09", "remaining_time": "1:33:47"} -{"current_steps": 253, "total_steps": 1630, "loss": 0.5034, "lr": 4.710877876567912e-06, "epoch": 1.5521472392638036, "percentage": 15.52, "elapsed_time": "0:17:10", "remaining_time": "1:33:30"} -{"current_steps": 254, "total_steps": 1630, "loss": 0.4236, "lr": 4.708624438068494e-06, "epoch": 1.558282208588957, "percentage": 15.58, "elapsed_time": "0:17:13", "remaining_time": "1:33:16"} -{"current_steps": 255, "total_steps": 1630, "loss": 0.6095, "lr": 4.706362795179476e-06, "epoch": 1.5644171779141103, "percentage": 15.64, "elapsed_time": "0:17:17", "remaining_time": "1:33:14"} -{"current_steps": 256, "total_steps": 1630, "loss": 0.738, "lr": 4.7040929563021975e-06, "epoch": 1.5705521472392638, "percentage": 15.71, "elapsed_time": "0:17:20", "remaining_time": "1:33:06"} -{"current_steps": 257, "total_steps": 1630, "loss": 0.6726, "lr": 4.70181492986844e-06, "epoch": 1.5766871165644172, "percentage": 15.77, "elapsed_time": "0:17:26", "remaining_time": "1:33:13"} -{"current_steps": 258, "total_steps": 1630, "loss": 0.4862, "lr": 4.699528724340401e-06, "epoch": 1.5828220858895705, "percentage": 15.83, "elapsed_time": "0:17:29", "remaining_time": "1:32:59"} -{"current_steps": 259, "total_steps": 1630, "loss": 0.5003, "lr": 4.6972343482106615e-06, "epoch": 1.588957055214724, "percentage": 15.89, "elapsed_time": "0:17:31", "remaining_time": "1:32:46"} -{"current_steps": 260, "total_steps": 1630, "loss": 0.6734, "lr": 4.6949318100021546e-06, "epoch": 1.5950920245398774, "percentage": 15.95, "elapsed_time": "0:17:33", "remaining_time": "1:32:31"} -{"current_steps": 261, "total_steps": 1630, "loss": 0.5639, "lr": 4.6926211182681295e-06, "epoch": 1.6012269938650308, "percentage": 16.01, "elapsed_time": "0:17:34", "remaining_time": "1:32:12"} -{"current_steps": 262, "total_steps": 1630, "loss": 0.7032, "lr": 4.690302281592128e-06, "epoch": 1.607361963190184, "percentage": 16.07, "elapsed_time": "0:17:37", "remaining_time": "1:32:03"} -{"current_steps": 263, "total_steps": 1630, "loss": 0.4937, "lr": 4.687975308587944e-06, "epoch": 1.6134969325153374, "percentage": 16.13, "elapsed_time": "0:17:39", "remaining_time": "1:31:44"} -{"current_steps": 264, "total_steps": 1630, "loss": 0.5829, "lr": 4.685640207899598e-06, "epoch": 1.6196319018404908, "percentage": 16.2, "elapsed_time": "0:17:45", "remaining_time": "1:31:53"} -{"current_steps": 265, "total_steps": 1630, "loss": 0.3805, "lr": 4.683296988201301e-06, "epoch": 1.6257668711656441, "percentage": 16.26, "elapsed_time": "0:17:47", "remaining_time": "1:31:39"} -{"current_steps": 266, "total_steps": 1630, "loss": 0.7939, "lr": 4.680945658197425e-06, "epoch": 1.6319018404907975, "percentage": 16.32, "elapsed_time": "0:17:50", "remaining_time": "1:31:30"} -{"current_steps": 267, "total_steps": 1630, "loss": 0.7511, "lr": 4.6785862266224695e-06, "epoch": 1.6380368098159508, "percentage": 16.38, "elapsed_time": "0:17:54", "remaining_time": "1:31:23"} -{"current_steps": 268, "total_steps": 1630, "loss": 0.8984, "lr": 4.676218702241026e-06, "epoch": 1.6441717791411041, "percentage": 16.44, "elapsed_time": "0:17:56", "remaining_time": "1:31:11"} -{"current_steps": 269, "total_steps": 1630, "loss": 0.5473, "lr": 4.673843093847753e-06, "epoch": 1.6503067484662577, "percentage": 16.5, "elapsed_time": "0:18:00", "remaining_time": "1:31:07"} -{"current_steps": 270, "total_steps": 1630, "loss": 0.6626, "lr": 4.6714594102673355e-06, "epoch": 1.656441717791411, "percentage": 16.56, "elapsed_time": "0:18:03", "remaining_time": "1:31:00"} -{"current_steps": 271, "total_steps": 1630, "loss": 0.5015, "lr": 4.669067660354456e-06, "epoch": 1.6625766871165644, "percentage": 16.63, "elapsed_time": "0:18:04", "remaining_time": "1:30:40"} -{"current_steps": 272, "total_steps": 1630, "loss": 0.5384, "lr": 4.666667852993761e-06, "epoch": 1.668711656441718, "percentage": 16.69, "elapsed_time": "0:18:06", "remaining_time": "1:30:25"} -{"current_steps": 273, "total_steps": 1630, "loss": 0.7491, "lr": 4.664259997099829e-06, "epoch": 1.6748466257668713, "percentage": 16.75, "elapsed_time": "0:18:13", "remaining_time": "1:30:36"} -{"current_steps": 274, "total_steps": 1630, "loss": 0.6451, "lr": 4.661844101617135e-06, "epoch": 1.6809815950920246, "percentage": 16.81, "elapsed_time": "0:18:18", "remaining_time": "1:30:35"} -{"current_steps": 275, "total_steps": 1630, "loss": 0.6299, "lr": 4.6594201755200205e-06, "epoch": 1.687116564417178, "percentage": 16.87, "elapsed_time": "0:18:24", "remaining_time": "1:30:41"} -{"current_steps": 276, "total_steps": 1630, "loss": 0.4477, "lr": 4.656988227812658e-06, "epoch": 1.6932515337423313, "percentage": 16.93, "elapsed_time": "0:18:27", "remaining_time": "1:30:31"} -{"current_steps": 277, "total_steps": 1630, "loss": 0.5473, "lr": 4.654548267529015e-06, "epoch": 1.6993865030674846, "percentage": 16.99, "elapsed_time": "0:18:28", "remaining_time": "1:30:16"} -{"current_steps": 278, "total_steps": 1630, "loss": 0.496, "lr": 4.652100303732827e-06, "epoch": 1.705521472392638, "percentage": 17.06, "elapsed_time": "0:18:31", "remaining_time": "1:30:04"} -{"current_steps": 279, "total_steps": 1630, "loss": 0.932, "lr": 4.64964434551756e-06, "epoch": 1.7116564417177913, "percentage": 17.12, "elapsed_time": "0:18:34", "remaining_time": "1:29:54"} -{"current_steps": 280, "total_steps": 1630, "loss": 0.4648, "lr": 4.647180402006372e-06, "epoch": 1.7177914110429446, "percentage": 17.18, "elapsed_time": "0:18:37", "remaining_time": "1:29:49"} -{"current_steps": 281, "total_steps": 1630, "loss": 0.7237, "lr": 4.644708482352093e-06, "epoch": 1.7239263803680982, "percentage": 17.24, "elapsed_time": "0:18:39", "remaining_time": "1:29:34"} -{"current_steps": 282, "total_steps": 1630, "loss": 0.5531, "lr": 4.6422285957371735e-06, "epoch": 1.7300613496932515, "percentage": 17.3, "elapsed_time": "0:18:41", "remaining_time": "1:29:21"} -{"current_steps": 283, "total_steps": 1630, "loss": 0.6706, "lr": 4.639740751373663e-06, "epoch": 1.7361963190184049, "percentage": 17.36, "elapsed_time": "0:18:45", "remaining_time": "1:29:15"} -{"current_steps": 284, "total_steps": 1630, "loss": 0.56, "lr": 4.63724495850317e-06, "epoch": 1.7423312883435584, "percentage": 17.42, "elapsed_time": "0:18:48", "remaining_time": "1:29:06"} -{"current_steps": 285, "total_steps": 1630, "loss": 0.6138, "lr": 4.634741226396832e-06, "epoch": 1.7484662576687118, "percentage": 17.48, "elapsed_time": "0:18:50", "remaining_time": "1:28:54"} -{"current_steps": 286, "total_steps": 1630, "loss": 0.4908, "lr": 4.632229564355275e-06, "epoch": 1.7546012269938651, "percentage": 17.55, "elapsed_time": "0:18:51", "remaining_time": "1:28:39"} -{"current_steps": 287, "total_steps": 1630, "loss": 0.8181, "lr": 4.629709981708586e-06, "epoch": 1.7607361963190185, "percentage": 17.61, "elapsed_time": "0:18:53", "remaining_time": "1:28:24"} -{"current_steps": 288, "total_steps": 1630, "loss": 0.5625, "lr": 4.6271824878162704e-06, "epoch": 1.7668711656441718, "percentage": 17.67, "elapsed_time": "0:18:54", "remaining_time": "1:28:07"} -{"current_steps": 289, "total_steps": 1630, "loss": 0.3416, "lr": 4.624647092067226e-06, "epoch": 1.7730061349693251, "percentage": 17.73, "elapsed_time": "0:18:57", "remaining_time": "1:27:55"} -{"current_steps": 290, "total_steps": 1630, "loss": 0.3889, "lr": 4.622103803879702e-06, "epoch": 1.7791411042944785, "percentage": 17.79, "elapsed_time": "0:18:59", "remaining_time": "1:27:45"} -{"current_steps": 291, "total_steps": 1630, "loss": 0.611, "lr": 4.619552632701263e-06, "epoch": 1.7852760736196318, "percentage": 17.85, "elapsed_time": "0:19:00", "remaining_time": "1:27:30"} -{"current_steps": 292, "total_steps": 1630, "loss": 0.7219, "lr": 4.61699358800876e-06, "epoch": 1.7914110429447851, "percentage": 17.91, "elapsed_time": "0:19:02", "remaining_time": "1:27:16"} -{"current_steps": 293, "total_steps": 1630, "loss": 0.6402, "lr": 4.614426679308291e-06, "epoch": 1.7975460122699385, "percentage": 17.98, "elapsed_time": "0:19:05", "remaining_time": "1:27:08"} -{"current_steps": 294, "total_steps": 1630, "loss": 0.509, "lr": 4.611851916135166e-06, "epoch": 1.803680981595092, "percentage": 18.04, "elapsed_time": "0:19:07", "remaining_time": "1:26:55"} -{"current_steps": 295, "total_steps": 1630, "loss": 0.6167, "lr": 4.609269308053872e-06, "epoch": 1.8098159509202454, "percentage": 18.1, "elapsed_time": "0:19:12", "remaining_time": "1:26:55"} -{"current_steps": 296, "total_steps": 1630, "loss": 0.8039, "lr": 4.606678864658039e-06, "epoch": 1.8159509202453987, "percentage": 18.16, "elapsed_time": "0:19:15", "remaining_time": "1:26:48"} -{"current_steps": 297, "total_steps": 1630, "loss": 0.5754, "lr": 4.604080595570399e-06, "epoch": 1.8220858895705523, "percentage": 18.22, "elapsed_time": "0:19:18", "remaining_time": "1:26:37"} -{"current_steps": 298, "total_steps": 1630, "loss": 0.4432, "lr": 4.601474510442759e-06, "epoch": 1.8282208588957056, "percentage": 18.28, "elapsed_time": "0:19:20", "remaining_time": "1:26:25"} -{"current_steps": 299, "total_steps": 1630, "loss": 0.6541, "lr": 4.598860618955957e-06, "epoch": 1.834355828220859, "percentage": 18.34, "elapsed_time": "0:19:25", "remaining_time": "1:26:28"} -{"current_steps": 300, "total_steps": 1630, "loss": 0.5824, "lr": 4.596238930819832e-06, "epoch": 1.8404907975460123, "percentage": 18.4, "elapsed_time": "0:19:30", "remaining_time": "1:26:30"} -{"current_steps": 301, "total_steps": 1630, "loss": 0.6976, "lr": 4.5936094557731815e-06, "epoch": 1.8466257668711656, "percentage": 18.47, "elapsed_time": "0:19:32", "remaining_time": "1:26:15"} -{"current_steps": 302, "total_steps": 1630, "loss": 0.7105, "lr": 4.590972203583732e-06, "epoch": 1.852760736196319, "percentage": 18.53, "elapsed_time": "0:19:35", "remaining_time": "1:26:09"} -{"current_steps": 303, "total_steps": 1630, "loss": 0.7446, "lr": 4.588327184048099e-06, "epoch": 1.8588957055214723, "percentage": 18.59, "elapsed_time": "0:19:38", "remaining_time": "1:26:01"} -{"current_steps": 304, "total_steps": 1630, "loss": 0.4926, "lr": 4.585674406991752e-06, "epoch": 1.8650306748466257, "percentage": 18.65, "elapsed_time": "0:19:42", "remaining_time": "1:25:59"} -{"current_steps": 305, "total_steps": 1630, "loss": 0.7368, "lr": 4.5830138822689755e-06, "epoch": 1.871165644171779, "percentage": 18.71, "elapsed_time": "0:19:48", "remaining_time": "1:26:02"} -{"current_steps": 306, "total_steps": 1630, "loss": 0.4678, "lr": 4.5803456197628374e-06, "epoch": 1.8773006134969326, "percentage": 18.77, "elapsed_time": "0:19:51", "remaining_time": "1:25:53"} -{"current_steps": 307, "total_steps": 1630, "loss": 0.4241, "lr": 4.577669629385145e-06, "epoch": 1.883435582822086, "percentage": 18.83, "elapsed_time": "0:19:56", "remaining_time": "1:25:58"} -{"current_steps": 308, "total_steps": 1630, "loss": 0.5327, "lr": 4.574985921076418e-06, "epoch": 1.8895705521472392, "percentage": 18.9, "elapsed_time": "0:19:59", "remaining_time": "1:25:46"} -{"current_steps": 309, "total_steps": 1630, "loss": 0.7504, "lr": 4.572294504805841e-06, "epoch": 1.8957055214723928, "percentage": 18.96, "elapsed_time": "0:20:00", "remaining_time": "1:25:33"} -{"current_steps": 310, "total_steps": 1630, "loss": 0.5194, "lr": 4.569595390571232e-06, "epoch": 1.9018404907975461, "percentage": 19.02, "elapsed_time": "0:20:02", "remaining_time": "1:25:20"} -{"current_steps": 311, "total_steps": 1630, "loss": 0.6862, "lr": 4.566888588399007e-06, "epoch": 1.9079754601226995, "percentage": 19.08, "elapsed_time": "0:20:04", "remaining_time": "1:25:06"} -{"current_steps": 312, "total_steps": 1630, "loss": 0.6867, "lr": 4.564174108344139e-06, "epoch": 1.9141104294478528, "percentage": 19.14, "elapsed_time": "0:20:07", "remaining_time": "1:25:00"} -{"current_steps": 313, "total_steps": 1630, "loss": 0.6942, "lr": 4.561451960490123e-06, "epoch": 1.9202453987730062, "percentage": 19.2, "elapsed_time": "0:20:11", "remaining_time": "1:24:56"} -{"current_steps": 314, "total_steps": 1630, "loss": 0.6346, "lr": 4.558722154948937e-06, "epoch": 1.9263803680981595, "percentage": 19.26, "elapsed_time": "0:20:12", "remaining_time": "1:24:42"} -{"current_steps": 315, "total_steps": 1630, "loss": 0.464, "lr": 4.5559847018610034e-06, "epoch": 1.9325153374233128, "percentage": 19.33, "elapsed_time": "0:20:14", "remaining_time": "1:24:31"} -{"current_steps": 316, "total_steps": 1630, "loss": 0.6334, "lr": 4.553239611395156e-06, "epoch": 1.9386503067484662, "percentage": 19.39, "elapsed_time": "0:20:16", "remaining_time": "1:24:18"} -{"current_steps": 317, "total_steps": 1630, "loss": 0.4227, "lr": 4.550486893748596e-06, "epoch": 1.9447852760736195, "percentage": 19.45, "elapsed_time": "0:20:18", "remaining_time": "1:24:07"} -{"current_steps": 318, "total_steps": 1630, "loss": 0.3719, "lr": 4.547726559146862e-06, "epoch": 1.9509202453987728, "percentage": 19.51, "elapsed_time": "0:20:20", "remaining_time": "1:23:55"} -{"current_steps": 319, "total_steps": 1630, "loss": 0.3331, "lr": 4.544958617843782e-06, "epoch": 1.9570552147239264, "percentage": 19.57, "elapsed_time": "0:20:23", "remaining_time": "1:23:47"} -{"current_steps": 320, "total_steps": 1630, "loss": 0.6931, "lr": 4.542183080121444e-06, "epoch": 1.9631901840490797, "percentage": 19.63, "elapsed_time": "0:20:24", "remaining_time": "1:23:32"} -{"current_steps": 321, "total_steps": 1630, "loss": 0.6578, "lr": 4.539399956290152e-06, "epoch": 1.969325153374233, "percentage": 19.69, "elapsed_time": "0:20:26", "remaining_time": "1:23:21"} -{"current_steps": 322, "total_steps": 1630, "loss": 0.5748, "lr": 4.536609256688396e-06, "epoch": 1.9754601226993866, "percentage": 19.75, "elapsed_time": "0:20:29", "remaining_time": "1:23:15"} -{"current_steps": 323, "total_steps": 1630, "loss": 0.5249, "lr": 4.533810991682799e-06, "epoch": 1.98159509202454, "percentage": 19.82, "elapsed_time": "0:20:30", "remaining_time": "1:22:59"} -{"current_steps": 324, "total_steps": 1630, "loss": 0.3065, "lr": 4.531005171668093e-06, "epoch": 1.9877300613496933, "percentage": 19.88, "elapsed_time": "0:20:31", "remaining_time": "1:22:43"} -{"current_steps": 325, "total_steps": 1630, "loss": 0.5523, "lr": 4.528191807067074e-06, "epoch": 1.9938650306748467, "percentage": 19.94, "elapsed_time": "0:20:34", "remaining_time": "1:22:35"} -{"current_steps": 326, "total_steps": 1630, "loss": 0.4157, "lr": 4.525370908330564e-06, "epoch": 2.0, "percentage": 20.0, "elapsed_time": "0:20:36", "remaining_time": "1:22:25"} -{"current_steps": 327, "total_steps": 1630, "loss": 0.4243, "lr": 4.522542485937369e-06, "epoch": 2.0061349693251533, "percentage": 20.06, "elapsed_time": "0:24:48", "remaining_time": "1:38:51"} -{"current_steps": 328, "total_steps": 1630, "loss": 0.4137, "lr": 4.519706550394248e-06, "epoch": 2.0122699386503067, "percentage": 20.12, "elapsed_time": "0:24:50", "remaining_time": "1:38:37"} -{"current_steps": 329, "total_steps": 1630, "loss": 0.5389, "lr": 4.516863112235864e-06, "epoch": 2.01840490797546, "percentage": 20.18, "elapsed_time": "0:24:52", "remaining_time": "1:38:23"} -{"current_steps": 330, "total_steps": 1630, "loss": 0.285, "lr": 4.514012182024756e-06, "epoch": 2.0245398773006134, "percentage": 20.25, "elapsed_time": "0:24:55", "remaining_time": "1:38:11"} -{"current_steps": 331, "total_steps": 1630, "loss": 0.4877, "lr": 4.511153770351288e-06, "epoch": 2.0306748466257667, "percentage": 20.31, "elapsed_time": "0:24:58", "remaining_time": "1:38:01"} -{"current_steps": 332, "total_steps": 1630, "loss": 0.5168, "lr": 4.508287887833619e-06, "epoch": 2.03680981595092, "percentage": 20.37, "elapsed_time": "0:25:01", "remaining_time": "1:37:50"} -{"current_steps": 333, "total_steps": 1630, "loss": 0.4791, "lr": 4.505414545117658e-06, "epoch": 2.042944785276074, "percentage": 20.43, "elapsed_time": "0:25:02", "remaining_time": "1:37:34"} -{"current_steps": 334, "total_steps": 1630, "loss": 0.3014, "lr": 4.502533752877028e-06, "epoch": 2.049079754601227, "percentage": 20.49, "elapsed_time": "0:25:04", "remaining_time": "1:37:18"} -{"current_steps": 335, "total_steps": 1630, "loss": 0.4313, "lr": 4.499645521813024e-06, "epoch": 2.0552147239263805, "percentage": 20.55, "elapsed_time": "0:25:09", "remaining_time": "1:37:13"} -{"current_steps": 336, "total_steps": 1630, "loss": 0.4807, "lr": 4.496749862654574e-06, "epoch": 2.061349693251534, "percentage": 20.61, "elapsed_time": "0:25:10", "remaining_time": "1:36:56"} -{"current_steps": 337, "total_steps": 1630, "loss": 0.4002, "lr": 4.4938467861582e-06, "epoch": 2.067484662576687, "percentage": 20.67, "elapsed_time": "0:25:11", "remaining_time": "1:36:38"} -{"current_steps": 338, "total_steps": 1630, "loss": 0.618, "lr": 4.490936303107975e-06, "epoch": 2.0736196319018405, "percentage": 20.74, "elapsed_time": "0:25:13", "remaining_time": "1:36:24"} -{"current_steps": 339, "total_steps": 1630, "loss": 0.4203, "lr": 4.488018424315488e-06, "epoch": 2.079754601226994, "percentage": 20.8, "elapsed_time": "0:25:15", "remaining_time": "1:36:13"} -{"current_steps": 340, "total_steps": 1630, "loss": 0.3618, "lr": 4.4850931606198e-06, "epoch": 2.085889570552147, "percentage": 20.86, "elapsed_time": "0:25:17", "remaining_time": "1:35:58"} -{"current_steps": 341, "total_steps": 1630, "loss": 0.4571, "lr": 4.482160522887404e-06, "epoch": 2.0920245398773005, "percentage": 20.92, "elapsed_time": "0:25:24", "remaining_time": "1:36:01"} -{"current_steps": 342, "total_steps": 1630, "loss": 0.2674, "lr": 4.479220522012185e-06, "epoch": 2.098159509202454, "percentage": 20.98, "elapsed_time": "0:25:26", "remaining_time": "1:35:48"} -{"current_steps": 343, "total_steps": 1630, "loss": 0.5479, "lr": 4.476273168915382e-06, "epoch": 2.104294478527607, "percentage": 21.04, "elapsed_time": "0:25:28", "remaining_time": "1:35:35"} -{"current_steps": 344, "total_steps": 1630, "loss": 0.3654, "lr": 4.473318474545544e-06, "epoch": 2.1104294478527605, "percentage": 21.1, "elapsed_time": "0:25:30", "remaining_time": "1:35:19"} -{"current_steps": 345, "total_steps": 1630, "loss": 0.2704, "lr": 4.470356449878489e-06, "epoch": 2.116564417177914, "percentage": 21.17, "elapsed_time": "0:25:32", "remaining_time": "1:35:08"} -{"current_steps": 346, "total_steps": 1630, "loss": 0.3029, "lr": 4.467387105917269e-06, "epoch": 2.1226993865030677, "percentage": 21.23, "elapsed_time": "0:25:33", "remaining_time": "1:34:51"} -{"current_steps": 347, "total_steps": 1630, "loss": 0.6536, "lr": 4.464410453692122e-06, "epoch": 2.128834355828221, "percentage": 21.29, "elapsed_time": "0:25:39", "remaining_time": "1:34:52"} -{"current_steps": 348, "total_steps": 1630, "loss": 0.3806, "lr": 4.461426504260434e-06, "epoch": 2.1349693251533743, "percentage": 21.35, "elapsed_time": "0:25:40", "remaining_time": "1:34:36"} -{"current_steps": 349, "total_steps": 1630, "loss": 0.4019, "lr": 4.458435268706699e-06, "epoch": 2.1411042944785277, "percentage": 21.41, "elapsed_time": "0:25:44", "remaining_time": "1:34:30"} -{"current_steps": 350, "total_steps": 1630, "loss": 0.2348, "lr": 4.455436758142477e-06, "epoch": 2.147239263803681, "percentage": 21.47, "elapsed_time": "0:25:48", "remaining_time": "1:34:23"} -{"current_steps": 351, "total_steps": 1630, "loss": 0.505, "lr": 4.452430983706351e-06, "epoch": 2.1533742331288344, "percentage": 21.53, "elapsed_time": "0:25:54", "remaining_time": "1:34:23"} -{"current_steps": 352, "total_steps": 1630, "loss": 0.399, "lr": 4.44941795656389e-06, "epoch": 2.1595092024539877, "percentage": 21.6, "elapsed_time": "0:26:00", "remaining_time": "1:34:24"} -{"current_steps": 353, "total_steps": 1630, "loss": 0.5664, "lr": 4.446397687907601e-06, "epoch": 2.165644171779141, "percentage": 21.66, "elapsed_time": "0:26:01", "remaining_time": "1:34:09"} -{"current_steps": 354, "total_steps": 1630, "loss": 0.2128, "lr": 4.4433701889568935e-06, "epoch": 2.1717791411042944, "percentage": 21.72, "elapsed_time": "0:26:03", "remaining_time": "1:33:54"} -{"current_steps": 355, "total_steps": 1630, "loss": 0.3138, "lr": 4.440335470958035e-06, "epoch": 2.1779141104294477, "percentage": 21.78, "elapsed_time": "0:26:04", "remaining_time": "1:33:39"} -{"current_steps": 356, "total_steps": 1630, "loss": 0.349, "lr": 4.437293545184111e-06, "epoch": 2.184049079754601, "percentage": 21.84, "elapsed_time": "0:26:06", "remaining_time": "1:33:25"} -{"current_steps": 357, "total_steps": 1630, "loss": 0.343, "lr": 4.434244422934976e-06, "epoch": 2.190184049079755, "percentage": 21.9, "elapsed_time": "0:26:08", "remaining_time": "1:33:12"} -{"current_steps": 358, "total_steps": 1630, "loss": 0.5656, "lr": 4.431188115537226e-06, "epoch": 2.196319018404908, "percentage": 21.96, "elapsed_time": "0:26:12", "remaining_time": "1:33:08"} -{"current_steps": 359, "total_steps": 1630, "loss": 0.2335, "lr": 4.428124634344141e-06, "epoch": 2.2024539877300615, "percentage": 22.02, "elapsed_time": "0:26:13", "remaining_time": "1:32:51"} -{"current_steps": 360, "total_steps": 1630, "loss": 0.2173, "lr": 4.425053990735653e-06, "epoch": 2.208588957055215, "percentage": 22.09, "elapsed_time": "0:26:15", "remaining_time": "1:32:38"} -{"current_steps": 361, "total_steps": 1630, "loss": 0.5071, "lr": 4.421976196118297e-06, "epoch": 2.214723926380368, "percentage": 22.15, "elapsed_time": "0:26:17", "remaining_time": "1:32:25"} -{"current_steps": 362, "total_steps": 1630, "loss": 0.384, "lr": 4.4188912619251765e-06, "epoch": 2.2208588957055215, "percentage": 22.21, "elapsed_time": "0:26:19", "remaining_time": "1:32:13"} -{"current_steps": 363, "total_steps": 1630, "loss": 0.3133, "lr": 4.415799199615912e-06, "epoch": 2.226993865030675, "percentage": 22.27, "elapsed_time": "0:26:21", "remaining_time": "1:31:59"} -{"current_steps": 364, "total_steps": 1630, "loss": 0.3847, "lr": 4.4127000206766055e-06, "epoch": 2.233128834355828, "percentage": 22.33, "elapsed_time": "0:26:23", "remaining_time": "1:31:48"} -{"current_steps": 365, "total_steps": 1630, "loss": 0.3539, "lr": 4.409593736619795e-06, "epoch": 2.2392638036809815, "percentage": 22.39, "elapsed_time": "0:26:27", "remaining_time": "1:31:43"} -{"current_steps": 366, "total_steps": 1630, "loss": 0.3664, "lr": 4.40648035898441e-06, "epoch": 2.245398773006135, "percentage": 22.45, "elapsed_time": "0:26:29", "remaining_time": "1:31:29"} -{"current_steps": 367, "total_steps": 1630, "loss": 0.4606, "lr": 4.403359899335732e-06, "epoch": 2.2515337423312882, "percentage": 22.52, "elapsed_time": "0:26:33", "remaining_time": "1:31:23"} -{"current_steps": 368, "total_steps": 1630, "loss": 0.2931, "lr": 4.400232369265351e-06, "epoch": 2.2576687116564416, "percentage": 22.58, "elapsed_time": "0:26:35", "remaining_time": "1:31:12"} -{"current_steps": 369, "total_steps": 1630, "loss": 0.3393, "lr": 4.39709778039112e-06, "epoch": 2.263803680981595, "percentage": 22.64, "elapsed_time": "0:26:37", "remaining_time": "1:31:00"} -{"current_steps": 370, "total_steps": 1630, "loss": 0.65, "lr": 4.393956144357113e-06, "epoch": 2.2699386503067487, "percentage": 22.7, "elapsed_time": "0:26:40", "remaining_time": "1:30:51"} -{"current_steps": 371, "total_steps": 1630, "loss": 0.372, "lr": 4.390807472833585e-06, "epoch": 2.276073619631902, "percentage": 22.76, "elapsed_time": "0:26:43", "remaining_time": "1:30:43"} -{"current_steps": 372, "total_steps": 1630, "loss": 0.2802, "lr": 4.3876517775169216e-06, "epoch": 2.2822085889570554, "percentage": 22.82, "elapsed_time": "0:26:47", "remaining_time": "1:30:37"} -{"current_steps": 373, "total_steps": 1630, "loss": 0.1964, "lr": 4.384489070129604e-06, "epoch": 2.2883435582822087, "percentage": 22.88, "elapsed_time": "0:26:49", "remaining_time": "1:30:22"} -{"current_steps": 374, "total_steps": 1630, "loss": 0.4272, "lr": 4.381319362420158e-06, "epoch": 2.294478527607362, "percentage": 22.94, "elapsed_time": "0:26:50", "remaining_time": "1:30:09"} -{"current_steps": 375, "total_steps": 1630, "loss": 0.4513, "lr": 4.378142666163114e-06, "epoch": 2.3006134969325154, "percentage": 23.01, "elapsed_time": "0:26:54", "remaining_time": "1:30:02"} -{"current_steps": 376, "total_steps": 1630, "loss": 0.6087, "lr": 4.374958993158965e-06, "epoch": 2.3067484662576687, "percentage": 23.07, "elapsed_time": "0:26:58", "remaining_time": "1:29:56"} -{"current_steps": 377, "total_steps": 1630, "loss": 0.2206, "lr": 4.371768355234116e-06, "epoch": 2.312883435582822, "percentage": 23.13, "elapsed_time": "0:27:02", "remaining_time": "1:29:53"} -{"current_steps": 378, "total_steps": 1630, "loss": 0.6055, "lr": 4.368570764240852e-06, "epoch": 2.3190184049079754, "percentage": 23.19, "elapsed_time": "0:27:04", "remaining_time": "1:29:41"} -{"current_steps": 379, "total_steps": 1630, "loss": 0.6286, "lr": 4.365366232057279e-06, "epoch": 2.3251533742331287, "percentage": 23.25, "elapsed_time": "0:27:07", "remaining_time": "1:29:32"} -{"current_steps": 380, "total_steps": 1630, "loss": 0.2622, "lr": 4.3621547705872915e-06, "epoch": 2.331288343558282, "percentage": 23.31, "elapsed_time": "0:27:11", "remaining_time": "1:29:26"} -{"current_steps": 381, "total_steps": 1630, "loss": 0.3439, "lr": 4.358936391760524e-06, "epoch": 2.3374233128834354, "percentage": 23.37, "elapsed_time": "0:27:13", "remaining_time": "1:29:13"} -{"current_steps": 382, "total_steps": 1630, "loss": 0.3677, "lr": 4.355711107532305e-06, "epoch": 2.3435582822085887, "percentage": 23.44, "elapsed_time": "0:27:16", "remaining_time": "1:29:05"} -{"current_steps": 383, "total_steps": 1630, "loss": 0.3068, "lr": 4.3524789298836175e-06, "epoch": 2.3496932515337425, "percentage": 23.5, "elapsed_time": "0:27:18", "remaining_time": "1:28:54"} -{"current_steps": 384, "total_steps": 1630, "loss": 0.3737, "lr": 4.349239870821049e-06, "epoch": 2.355828220858896, "percentage": 23.56, "elapsed_time": "0:27:20", "remaining_time": "1:28:44"} -{"current_steps": 385, "total_steps": 1630, "loss": 0.2837, "lr": 4.345993942376752e-06, "epoch": 2.361963190184049, "percentage": 23.62, "elapsed_time": "0:27:22", "remaining_time": "1:28:32"} -{"current_steps": 386, "total_steps": 1630, "loss": 0.6162, "lr": 4.342741156608392e-06, "epoch": 2.3680981595092025, "percentage": 23.68, "elapsed_time": "0:27:25", "remaining_time": "1:28:21"} -{"current_steps": 387, "total_steps": 1630, "loss": 0.2986, "lr": 4.3394815255991135e-06, "epoch": 2.374233128834356, "percentage": 23.74, "elapsed_time": "0:27:29", "remaining_time": "1:28:18"} -{"current_steps": 388, "total_steps": 1630, "loss": 0.5383, "lr": 4.336215061457485e-06, "epoch": 2.3803680981595092, "percentage": 23.8, "elapsed_time": "0:27:33", "remaining_time": "1:28:14"} -{"current_steps": 389, "total_steps": 1630, "loss": 0.4179, "lr": 4.332941776317458e-06, "epoch": 2.3865030674846626, "percentage": 23.87, "elapsed_time": "0:27:35", "remaining_time": "1:28:01"} -{"current_steps": 390, "total_steps": 1630, "loss": 0.3938, "lr": 4.329661682338325e-06, "epoch": 2.392638036809816, "percentage": 23.93, "elapsed_time": "0:27:40", "remaining_time": "1:27:58"} -{"current_steps": 391, "total_steps": 1630, "loss": 0.3349, "lr": 4.32637479170467e-06, "epoch": 2.3987730061349692, "percentage": 23.99, "elapsed_time": "0:27:41", "remaining_time": "1:27:43"} -{"current_steps": 392, "total_steps": 1630, "loss": 0.3336, "lr": 4.323081116626322e-06, "epoch": 2.4049079754601226, "percentage": 24.05, "elapsed_time": "0:27:42", "remaining_time": "1:27:30"} -{"current_steps": 393, "total_steps": 1630, "loss": 0.2983, "lr": 4.319780669338316e-06, "epoch": 2.411042944785276, "percentage": 24.11, "elapsed_time": "0:27:44", "remaining_time": "1:27:20"} -{"current_steps": 394, "total_steps": 1630, "loss": 0.8401, "lr": 4.31647346210084e-06, "epoch": 2.4171779141104293, "percentage": 24.17, "elapsed_time": "0:27:46", "remaining_time": "1:27:06"} -{"current_steps": 395, "total_steps": 1630, "loss": 0.2583, "lr": 4.313159507199197e-06, "epoch": 2.4233128834355826, "percentage": 24.23, "elapsed_time": "0:27:47", "remaining_time": "1:26:54"} -{"current_steps": 396, "total_steps": 1630, "loss": 0.2861, "lr": 4.309838816943755e-06, "epoch": 2.4294478527607364, "percentage": 24.29, "elapsed_time": "0:27:50", "remaining_time": "1:26:46"} -{"current_steps": 397, "total_steps": 1630, "loss": 0.2956, "lr": 4.306511403669897e-06, "epoch": 2.4355828220858897, "percentage": 24.36, "elapsed_time": "0:27:54", "remaining_time": "1:26:39"} -{"current_steps": 398, "total_steps": 1630, "loss": 0.5378, "lr": 4.303177279737988e-06, "epoch": 2.441717791411043, "percentage": 24.42, "elapsed_time": "0:27:55", "remaining_time": "1:26:25"} -{"current_steps": 399, "total_steps": 1630, "loss": 0.3423, "lr": 4.299836457533313e-06, "epoch": 2.4478527607361964, "percentage": 24.48, "elapsed_time": "0:27:58", "remaining_time": "1:26:19"} -{"current_steps": 400, "total_steps": 1630, "loss": 0.5608, "lr": 4.296488949466046e-06, "epoch": 2.4539877300613497, "percentage": 24.54, "elapsed_time": "0:28:01", "remaining_time": "1:26:12"} -{"current_steps": 401, "total_steps": 1630, "loss": 0.3214, "lr": 4.293134767971193e-06, "epoch": 2.460122699386503, "percentage": 24.6, "elapsed_time": "0:28:03", "remaining_time": "1:26:00"} -{"current_steps": 402, "total_steps": 1630, "loss": 0.5117, "lr": 4.28977392550855e-06, "epoch": 2.4662576687116564, "percentage": 24.66, "elapsed_time": "0:28:06", "remaining_time": "1:25:53"} -{"current_steps": 403, "total_steps": 1630, "loss": 0.2666, "lr": 4.286406434562659e-06, "epoch": 2.4723926380368098, "percentage": 24.72, "elapsed_time": "0:28:07", "remaining_time": "1:25:38"} -{"current_steps": 404, "total_steps": 1630, "loss": 0.2878, "lr": 4.283032307642756e-06, "epoch": 2.478527607361963, "percentage": 24.79, "elapsed_time": "0:28:11", "remaining_time": "1:25:34"} -{"current_steps": 405, "total_steps": 1630, "loss": 0.2619, "lr": 4.2796515572827305e-06, "epoch": 2.4846625766871164, "percentage": 24.85, "elapsed_time": "0:28:13", "remaining_time": "1:25:22"} -{"current_steps": 406, "total_steps": 1630, "loss": 0.1735, "lr": 4.276264196041074e-06, "epoch": 2.4907975460122698, "percentage": 24.91, "elapsed_time": "0:28:15", "remaining_time": "1:25:11"} -{"current_steps": 407, "total_steps": 1630, "loss": 0.4741, "lr": 4.2728702365008356e-06, "epoch": 2.4969325153374236, "percentage": 24.97, "elapsed_time": "0:28:17", "remaining_time": "1:25:00"} -{"current_steps": 408, "total_steps": 1630, "loss": 0.3713, "lr": 4.269469691269577e-06, "epoch": 2.5030674846625764, "percentage": 25.03, "elapsed_time": "0:28:20", "remaining_time": "1:24:53"} -{"current_steps": 409, "total_steps": 1630, "loss": 0.5189, "lr": 4.266062572979323e-06, "epoch": 2.5092024539877302, "percentage": 25.09, "elapsed_time": "0:28:21", "remaining_time": "1:24:40"} -{"current_steps": 410, "total_steps": 1630, "loss": 0.2461, "lr": 4.262648894286515e-06, "epoch": 2.5153374233128836, "percentage": 25.15, "elapsed_time": "0:28:23", "remaining_time": "1:24:30"} -{"current_steps": 411, "total_steps": 1630, "loss": 0.3013, "lr": 4.259228667871963e-06, "epoch": 2.521472392638037, "percentage": 25.21, "elapsed_time": "0:28:27", "remaining_time": "1:24:23"} -{"current_steps": 412, "total_steps": 1630, "loss": 0.2784, "lr": 4.255801906440803e-06, "epoch": 2.5276073619631902, "percentage": 25.28, "elapsed_time": "0:28:30", "remaining_time": "1:24:16"} -{"current_steps": 413, "total_steps": 1630, "loss": 0.457, "lr": 4.252368622722443e-06, "epoch": 2.5337423312883436, "percentage": 25.34, "elapsed_time": "0:28:35", "remaining_time": "1:24:16"} -{"current_steps": 414, "total_steps": 1630, "loss": 0.487, "lr": 4.248928829470522e-06, "epoch": 2.539877300613497, "percentage": 25.4, "elapsed_time": "0:28:43", "remaining_time": "1:24:21"} -{"current_steps": 415, "total_steps": 1630, "loss": 0.6118, "lr": 4.245482539462861e-06, "epoch": 2.5460122699386503, "percentage": 25.46, "elapsed_time": "0:28:50", "remaining_time": "1:24:25"} -{"current_steps": 416, "total_steps": 1630, "loss": 0.6131, "lr": 4.242029765501411e-06, "epoch": 2.5521472392638036, "percentage": 25.52, "elapsed_time": "0:28:53", "remaining_time": "1:24:19"} -{"current_steps": 417, "total_steps": 1630, "loss": 0.4209, "lr": 4.2385705204122104e-06, "epoch": 2.558282208588957, "percentage": 25.58, "elapsed_time": "0:28:59", "remaining_time": "1:24:19"} -{"current_steps": 418, "total_steps": 1630, "loss": 0.5375, "lr": 4.235104817045338e-06, "epoch": 2.5644171779141103, "percentage": 25.64, "elapsed_time": "0:29:01", "remaining_time": "1:24:09"} -{"current_steps": 419, "total_steps": 1630, "loss": 0.4682, "lr": 4.231632668274861e-06, "epoch": 2.5705521472392636, "percentage": 25.71, "elapsed_time": "0:29:05", "remaining_time": "1:24:06"} -{"current_steps": 420, "total_steps": 1630, "loss": 0.2522, "lr": 4.22815408699879e-06, "epoch": 2.5766871165644174, "percentage": 25.77, "elapsed_time": "0:29:08", "remaining_time": "1:23:58"} -{"current_steps": 421, "total_steps": 1630, "loss": 0.4776, "lr": 4.22466908613903e-06, "epoch": 2.5828220858895703, "percentage": 25.83, "elapsed_time": "0:29:12", "remaining_time": "1:23:51"} -{"current_steps": 422, "total_steps": 1630, "loss": 0.3067, "lr": 4.221177678641333e-06, "epoch": 2.588957055214724, "percentage": 25.89, "elapsed_time": "0:29:13", "remaining_time": "1:23:40"} -{"current_steps": 423, "total_steps": 1630, "loss": 0.2402, "lr": 4.217679877475251e-06, "epoch": 2.5950920245398774, "percentage": 25.95, "elapsed_time": "0:29:16", "remaining_time": "1:23:31"} -{"current_steps": 424, "total_steps": 1630, "loss": 0.2608, "lr": 4.214175695634084e-06, "epoch": 2.6012269938650308, "percentage": 26.01, "elapsed_time": "0:29:17", "remaining_time": "1:23:18"} -{"current_steps": 425, "total_steps": 1630, "loss": 0.2801, "lr": 4.210665146134838e-06, "epoch": 2.607361963190184, "percentage": 26.07, "elapsed_time": "0:29:23", "remaining_time": "1:23:19"} -{"current_steps": 426, "total_steps": 1630, "loss": 0.2027, "lr": 4.20714824201817e-06, "epoch": 2.6134969325153374, "percentage": 26.13, "elapsed_time": "0:29:25", "remaining_time": "1:23:10"} -{"current_steps": 427, "total_steps": 1630, "loss": 0.4253, "lr": 4.203624996348343e-06, "epoch": 2.6196319018404908, "percentage": 26.2, "elapsed_time": "0:29:27", "remaining_time": "1:22:59"} -{"current_steps": 428, "total_steps": 1630, "loss": 0.3014, "lr": 4.200095422213177e-06, "epoch": 2.625766871165644, "percentage": 26.26, "elapsed_time": "0:29:28", "remaining_time": "1:22:47"} -{"current_steps": 429, "total_steps": 1630, "loss": 0.6526, "lr": 4.196559532724004e-06, "epoch": 2.6319018404907975, "percentage": 26.32, "elapsed_time": "0:29:35", "remaining_time": "1:22:49"} -{"current_steps": 430, "total_steps": 1630, "loss": 0.4487, "lr": 4.193017341015608e-06, "epoch": 2.638036809815951, "percentage": 26.38, "elapsed_time": "0:29:37", "remaining_time": "1:22:39"} -{"current_steps": 431, "total_steps": 1630, "loss": 0.5386, "lr": 4.189468860246192e-06, "epoch": 2.644171779141104, "percentage": 26.44, "elapsed_time": "0:29:43", "remaining_time": "1:22:41"} -{"current_steps": 432, "total_steps": 1630, "loss": 0.3034, "lr": 4.185914103597316e-06, "epoch": 2.6503067484662575, "percentage": 26.5, "elapsed_time": "0:29:47", "remaining_time": "1:22:37"} -{"current_steps": 433, "total_steps": 1630, "loss": 0.5862, "lr": 4.182353084273855e-06, "epoch": 2.6564417177914113, "percentage": 26.56, "elapsed_time": "0:29:51", "remaining_time": "1:22:33"} -{"current_steps": 434, "total_steps": 1630, "loss": 0.3954, "lr": 4.178785815503946e-06, "epoch": 2.662576687116564, "percentage": 26.63, "elapsed_time": "0:29:53", "remaining_time": "1:22:21"} -{"current_steps": 435, "total_steps": 1630, "loss": 0.4367, "lr": 4.1752123105389444e-06, "epoch": 2.668711656441718, "percentage": 26.69, "elapsed_time": "0:29:58", "remaining_time": "1:22:20"} -{"current_steps": 436, "total_steps": 1630, "loss": 0.2997, "lr": 4.171632582653368e-06, "epoch": 2.6748466257668713, "percentage": 26.75, "elapsed_time": "0:30:00", "remaining_time": "1:22:09"} -{"current_steps": 437, "total_steps": 1630, "loss": 0.3354, "lr": 4.168046645144851e-06, "epoch": 2.6809815950920246, "percentage": 26.81, "elapsed_time": "0:30:01", "remaining_time": "1:21:57"} -{"current_steps": 438, "total_steps": 1630, "loss": 0.5538, "lr": 4.164454511334098e-06, "epoch": 2.687116564417178, "percentage": 26.87, "elapsed_time": "0:30:04", "remaining_time": "1:21:50"} -{"current_steps": 439, "total_steps": 1630, "loss": 0.5731, "lr": 4.160856194564828e-06, "epoch": 2.6932515337423313, "percentage": 26.93, "elapsed_time": "0:30:06", "remaining_time": "1:21:41"} -{"current_steps": 440, "total_steps": 1630, "loss": 0.4429, "lr": 4.157251708203728e-06, "epoch": 2.6993865030674846, "percentage": 26.99, "elapsed_time": "0:30:08", "remaining_time": "1:21:32"} -{"current_steps": 441, "total_steps": 1630, "loss": 0.6361, "lr": 4.153641065640402e-06, "epoch": 2.705521472392638, "percentage": 27.06, "elapsed_time": "0:30:10", "remaining_time": "1:21:21"} -{"current_steps": 442, "total_steps": 1630, "loss": 0.2418, "lr": 4.150024280287327e-06, "epoch": 2.7116564417177913, "percentage": 27.12, "elapsed_time": "0:30:11", "remaining_time": "1:21:08"} -{"current_steps": 443, "total_steps": 1630, "loss": 0.2549, "lr": 4.146401365579795e-06, "epoch": 2.7177914110429446, "percentage": 27.18, "elapsed_time": "0:30:13", "remaining_time": "1:20:59"} -{"current_steps": 444, "total_steps": 1630, "loss": 0.3822, "lr": 4.142772334975868e-06, "epoch": 2.7239263803680984, "percentage": 27.24, "elapsed_time": "0:30:16", "remaining_time": "1:20:52"} -{"current_steps": 445, "total_steps": 1630, "loss": 0.3107, "lr": 4.139137201956324e-06, "epoch": 2.7300613496932513, "percentage": 27.3, "elapsed_time": "0:30:18", "remaining_time": "1:20:41"} -{"current_steps": 446, "total_steps": 1630, "loss": 0.2829, "lr": 4.1354959800246155e-06, "epoch": 2.736196319018405, "percentage": 27.36, "elapsed_time": "0:30:20", "remaining_time": "1:20:31"} -{"current_steps": 447, "total_steps": 1630, "loss": 0.3045, "lr": 4.131848682706807e-06, "epoch": 2.7423312883435584, "percentage": 27.42, "elapsed_time": "0:30:21", "remaining_time": "1:20:21"} -{"current_steps": 448, "total_steps": 1630, "loss": 0.316, "lr": 4.128195323551536e-06, "epoch": 2.7484662576687118, "percentage": 27.48, "elapsed_time": "0:30:23", "remaining_time": "1:20:11"} -{"current_steps": 449, "total_steps": 1630, "loss": 0.5278, "lr": 4.1245359161299555e-06, "epoch": 2.754601226993865, "percentage": 27.55, "elapsed_time": "0:30:27", "remaining_time": "1:20:06"} -{"current_steps": 450, "total_steps": 1630, "loss": 0.447, "lr": 4.120870474035687e-06, "epoch": 2.7607361963190185, "percentage": 27.61, "elapsed_time": "0:30:28", "remaining_time": "1:19:55"} -{"current_steps": 451, "total_steps": 1630, "loss": 0.6127, "lr": 4.1171990108847705e-06, "epoch": 2.766871165644172, "percentage": 27.67, "elapsed_time": "0:30:31", "remaining_time": "1:19:48"} -{"current_steps": 452, "total_steps": 1630, "loss": 0.3304, "lr": 4.113521540315609e-06, "epoch": 2.773006134969325, "percentage": 27.73, "elapsed_time": "0:30:32", "remaining_time": "1:19:37"} -{"current_steps": 453, "total_steps": 1630, "loss": 0.5871, "lr": 4.109838075988922e-06, "epoch": 2.7791411042944785, "percentage": 27.79, "elapsed_time": "0:30:34", "remaining_time": "1:19:26"} -{"current_steps": 454, "total_steps": 1630, "loss": 0.3578, "lr": 4.106148631587697e-06, "epoch": 2.785276073619632, "percentage": 27.85, "elapsed_time": "0:30:36", "remaining_time": "1:19:18"} -{"current_steps": 455, "total_steps": 1630, "loss": 0.4685, "lr": 4.102453220817134e-06, "epoch": 2.791411042944785, "percentage": 27.91, "elapsed_time": "0:30:39", "remaining_time": "1:19:09"} -{"current_steps": 456, "total_steps": 1630, "loss": 0.2818, "lr": 4.098751857404595e-06, "epoch": 2.7975460122699385, "percentage": 27.98, "elapsed_time": "0:30:40", "remaining_time": "1:18:57"} -{"current_steps": 457, "total_steps": 1630, "loss": 0.3497, "lr": 4.0950445550995566e-06, "epoch": 2.8036809815950923, "percentage": 28.04, "elapsed_time": "0:30:44", "remaining_time": "1:18:53"} -{"current_steps": 458, "total_steps": 1630, "loss": 0.4954, "lr": 4.091331327673554e-06, "epoch": 2.809815950920245, "percentage": 28.1, "elapsed_time": "0:30:47", "remaining_time": "1:18:47"} -{"current_steps": 459, "total_steps": 1630, "loss": 0.3884, "lr": 4.087612188920135e-06, "epoch": 2.815950920245399, "percentage": 28.16, "elapsed_time": "0:30:50", "remaining_time": "1:18:39"} -{"current_steps": 460, "total_steps": 1630, "loss": 0.375, "lr": 4.083887152654804e-06, "epoch": 2.8220858895705523, "percentage": 28.22, "elapsed_time": "0:30:54", "remaining_time": "1:18:36"} -{"current_steps": 461, "total_steps": 1630, "loss": 0.3272, "lr": 4.080156232714976e-06, "epoch": 2.8282208588957056, "percentage": 28.28, "elapsed_time": "0:30:56", "remaining_time": "1:18:27"} -{"current_steps": 462, "total_steps": 1630, "loss": 0.2936, "lr": 4.07641944295992e-06, "epoch": 2.834355828220859, "percentage": 28.34, "elapsed_time": "0:31:00", "remaining_time": "1:18:23"} -{"current_steps": 463, "total_steps": 1630, "loss": 0.2363, "lr": 4.072676797270708e-06, "epoch": 2.8404907975460123, "percentage": 28.4, "elapsed_time": "0:31:02", "remaining_time": "1:18:13"} -{"current_steps": 464, "total_steps": 1630, "loss": 0.4827, "lr": 4.0689283095501684e-06, "epoch": 2.8466257668711656, "percentage": 28.47, "elapsed_time": "0:31:05", "remaining_time": "1:18:07"} -{"current_steps": 465, "total_steps": 1630, "loss": 0.3163, "lr": 4.06517399372283e-06, "epoch": 2.852760736196319, "percentage": 28.53, "elapsed_time": "0:31:06", "remaining_time": "1:17:56"} -{"current_steps": 466, "total_steps": 1630, "loss": 0.2827, "lr": 4.061413863734869e-06, "epoch": 2.8588957055214723, "percentage": 28.59, "elapsed_time": "0:31:08", "remaining_time": "1:17:46"} -{"current_steps": 467, "total_steps": 1630, "loss": 0.3466, "lr": 4.057647933554063e-06, "epoch": 2.8650306748466257, "percentage": 28.65, "elapsed_time": "0:31:09", "remaining_time": "1:17:36"} -{"current_steps": 468, "total_steps": 1630, "loss": 0.4632, "lr": 4.053876217169734e-06, "epoch": 2.871165644171779, "percentage": 28.71, "elapsed_time": "0:31:13", "remaining_time": "1:17:31"} -{"current_steps": 469, "total_steps": 1630, "loss": 0.2001, "lr": 4.050098728592698e-06, "epoch": 2.8773006134969323, "percentage": 28.77, "elapsed_time": "0:31:16", "remaining_time": "1:17:24"} -{"current_steps": 470, "total_steps": 1630, "loss": 0.5425, "lr": 4.046315481855211e-06, "epoch": 2.883435582822086, "percentage": 28.83, "elapsed_time": "0:31:18", "remaining_time": "1:17:16"} -{"current_steps": 471, "total_steps": 1630, "loss": 0.424, "lr": 4.0425264910109245e-06, "epoch": 2.889570552147239, "percentage": 28.9, "elapsed_time": "0:31:20", "remaining_time": "1:17:07"} -{"current_steps": 472, "total_steps": 1630, "loss": 0.2443, "lr": 4.03873177013482e-06, "epoch": 2.895705521472393, "percentage": 28.96, "elapsed_time": "0:31:21", "remaining_time": "1:16:55"} -{"current_steps": 473, "total_steps": 1630, "loss": 0.3734, "lr": 4.034931333323173e-06, "epoch": 2.901840490797546, "percentage": 29.02, "elapsed_time": "0:31:24", "remaining_time": "1:16:50"} -{"current_steps": 474, "total_steps": 1630, "loss": 0.3762, "lr": 4.031125194693484e-06, "epoch": 2.9079754601226995, "percentage": 29.08, "elapsed_time": "0:31:30", "remaining_time": "1:16:49"} -{"current_steps": 475, "total_steps": 1630, "loss": 0.3721, "lr": 4.0273133683844375e-06, "epoch": 2.914110429447853, "percentage": 29.14, "elapsed_time": "0:31:33", "remaining_time": "1:16:43"} -{"current_steps": 476, "total_steps": 1630, "loss": 0.2868, "lr": 4.023495868555848e-06, "epoch": 2.920245398773006, "percentage": 29.2, "elapsed_time": "0:31:36", "remaining_time": "1:16:37"} -{"current_steps": 477, "total_steps": 1630, "loss": 0.5086, "lr": 4.0196727093886024e-06, "epoch": 2.9263803680981595, "percentage": 29.26, "elapsed_time": "0:31:38", "remaining_time": "1:16:29"} -{"current_steps": 478, "total_steps": 1630, "loss": 0.4616, "lr": 4.015843905084612e-06, "epoch": 2.932515337423313, "percentage": 29.33, "elapsed_time": "0:31:41", "remaining_time": "1:16:22"} -{"current_steps": 479, "total_steps": 1630, "loss": 0.403, "lr": 4.012009469866756e-06, "epoch": 2.938650306748466, "percentage": 29.39, "elapsed_time": "0:31:43", "remaining_time": "1:16:13"} -{"current_steps": 480, "total_steps": 1630, "loss": 0.5801, "lr": 4.008169417978836e-06, "epoch": 2.9447852760736195, "percentage": 29.45, "elapsed_time": "0:31:45", "remaining_time": "1:16:04"} -{"current_steps": 481, "total_steps": 1630, "loss": 0.5808, "lr": 4.004323763685511e-06, "epoch": 2.950920245398773, "percentage": 29.51, "elapsed_time": "0:31:51", "remaining_time": "1:16:05"} -{"current_steps": 482, "total_steps": 1630, "loss": 0.2584, "lr": 4.0004725212722565e-06, "epoch": 2.957055214723926, "percentage": 29.57, "elapsed_time": "0:31:53", "remaining_time": "1:15:57"} -{"current_steps": 483, "total_steps": 1630, "loss": 0.462, "lr": 3.996615705045302e-06, "epoch": 2.96319018404908, "percentage": 29.63, "elapsed_time": "0:32:00", "remaining_time": "1:16:01"} -{"current_steps": 484, "total_steps": 1630, "loss": 0.3502, "lr": 3.992753329331588e-06, "epoch": 2.969325153374233, "percentage": 29.69, "elapsed_time": "0:32:03", "remaining_time": "1:15:54"} -{"current_steps": 485, "total_steps": 1630, "loss": 0.5989, "lr": 3.9888854084786995e-06, "epoch": 2.9754601226993866, "percentage": 29.75, "elapsed_time": "0:32:05", "remaining_time": "1:15:45"} -{"current_steps": 486, "total_steps": 1630, "loss": 0.6772, "lr": 3.985011956854826e-06, "epoch": 2.98159509202454, "percentage": 29.82, "elapsed_time": "0:32:07", "remaining_time": "1:15:36"} -{"current_steps": 487, "total_steps": 1630, "loss": 0.4192, "lr": 3.9811329888487004e-06, "epoch": 2.9877300613496933, "percentage": 29.88, "elapsed_time": "0:32:10", "remaining_time": "1:15:31"} -{"current_steps": 488, "total_steps": 1630, "loss": 0.4031, "lr": 3.977248518869545e-06, "epoch": 2.9938650306748467, "percentage": 29.94, "elapsed_time": "0:32:13", "remaining_time": "1:15:25"} -{"current_steps": 489, "total_steps": 1630, "loss": 0.7764, "lr": 3.973358561347024e-06, "epoch": 3.0, "percentage": 30.0, "elapsed_time": "0:32:17", "remaining_time": "1:15:20"} -{"current_steps": 490, "total_steps": 1630, "loss": 0.3267, "lr": 3.969463130731183e-06, "epoch": 3.0061349693251533, "percentage": 30.06, "elapsed_time": "0:36:02", "remaining_time": "1:23:51"} -{"current_steps": 491, "total_steps": 1630, "loss": 0.2719, "lr": 3.965562241492401e-06, "epoch": 3.0122699386503067, "percentage": 30.12, "elapsed_time": "0:36:04", "remaining_time": "1:23:41"} -{"current_steps": 492, "total_steps": 1630, "loss": 0.1825, "lr": 3.9616559081213335e-06, "epoch": 3.01840490797546, "percentage": 30.18, "elapsed_time": "0:36:06", "remaining_time": "1:23:31"} -{"current_steps": 493, "total_steps": 1630, "loss": 0.1854, "lr": 3.957744145128858e-06, "epoch": 3.0245398773006134, "percentage": 30.25, "elapsed_time": "0:36:10", "remaining_time": "1:23:24"} -{"current_steps": 494, "total_steps": 1630, "loss": 0.2224, "lr": 3.953826967046021e-06, "epoch": 3.0306748466257667, "percentage": 30.31, "elapsed_time": "0:36:16", "remaining_time": "1:23:25"} -{"current_steps": 495, "total_steps": 1630, "loss": 0.349, "lr": 3.9499043884239894e-06, "epoch": 3.03680981595092, "percentage": 30.37, "elapsed_time": "0:36:18", "remaining_time": "1:23:14"} -{"current_steps": 496, "total_steps": 1630, "loss": 0.175, "lr": 3.945976423833987e-06, "epoch": 3.042944785276074, "percentage": 30.43, "elapsed_time": "0:36:19", "remaining_time": "1:23:03"} -{"current_steps": 497, "total_steps": 1630, "loss": 0.2773, "lr": 3.942043087867244e-06, "epoch": 3.049079754601227, "percentage": 30.49, "elapsed_time": "0:36:23", "remaining_time": "1:22:57"} -{"current_steps": 498, "total_steps": 1630, "loss": 0.4445, "lr": 3.938104395134947e-06, "epoch": 3.0552147239263805, "percentage": 30.55, "elapsed_time": "0:36:26", "remaining_time": "1:22:50"} -{"current_steps": 499, "total_steps": 1630, "loss": 0.3046, "lr": 3.9341603602681805e-06, "epoch": 3.061349693251534, "percentage": 30.61, "elapsed_time": "0:36:27", "remaining_time": "1:22:38"} -{"current_steps": 500, "total_steps": 1630, "loss": 0.2544, "lr": 3.930210997917871e-06, "epoch": 3.067484662576687, "percentage": 30.67, "elapsed_time": "0:36:29", "remaining_time": "1:22:29"} -{"current_steps": 501, "total_steps": 1630, "loss": 0.3154, "lr": 3.92625632275474e-06, "epoch": 3.0736196319018405, "percentage": 30.74, "elapsed_time": "0:36:33", "remaining_time": "1:22:22"} -{"current_steps": 502, "total_steps": 1630, "loss": 0.2804, "lr": 3.922296349469239e-06, "epoch": 3.079754601226994, "percentage": 30.8, "elapsed_time": "0:36:36", "remaining_time": "1:22:15"} -{"current_steps": 503, "total_steps": 1630, "loss": 0.2393, "lr": 3.918331092771505e-06, "epoch": 3.085889570552147, "percentage": 30.86, "elapsed_time": "0:36:42", "remaining_time": "1:22:14"} -{"current_steps": 504, "total_steps": 1630, "loss": 0.1403, "lr": 3.914360567391296e-06, "epoch": 3.0920245398773005, "percentage": 30.92, "elapsed_time": "0:36:46", "remaining_time": "1:22:08"} -{"current_steps": 505, "total_steps": 1630, "loss": 0.1537, "lr": 3.910384788077949e-06, "epoch": 3.098159509202454, "percentage": 30.98, "elapsed_time": "0:36:47", "remaining_time": "1:21:58"} -{"current_steps": 506, "total_steps": 1630, "loss": 0.2921, "lr": 3.906403769600311e-06, "epoch": 3.104294478527607, "percentage": 31.04, "elapsed_time": "0:36:51", "remaining_time": "1:21:52"} -{"current_steps": 507, "total_steps": 1630, "loss": 0.2036, "lr": 3.902417526746694e-06, "epoch": 3.1104294478527605, "percentage": 31.1, "elapsed_time": "0:36:54", "remaining_time": "1:21:44"} -{"current_steps": 508, "total_steps": 1630, "loss": 0.2655, "lr": 3.898426074324818e-06, "epoch": 3.116564417177914, "percentage": 31.17, "elapsed_time": "0:36:55", "remaining_time": "1:21:33"} -{"current_steps": 509, "total_steps": 1630, "loss": 0.3938, "lr": 3.8944294271617524e-06, "epoch": 3.1226993865030677, "percentage": 31.23, "elapsed_time": "0:36:58", "remaining_time": "1:21:26"} -{"current_steps": 510, "total_steps": 1630, "loss": 0.3051, "lr": 3.890427600103865e-06, "epoch": 3.128834355828221, "percentage": 31.29, "elapsed_time": "0:37:00", "remaining_time": "1:21:17"} -{"current_steps": 511, "total_steps": 1630, "loss": 0.3719, "lr": 3.886420608016767e-06, "epoch": 3.1349693251533743, "percentage": 31.35, "elapsed_time": "0:37:03", "remaining_time": "1:21:08"} -{"current_steps": 512, "total_steps": 1630, "loss": 0.1863, "lr": 3.882408465785252e-06, "epoch": 3.1411042944785277, "percentage": 31.41, "elapsed_time": "0:37:04", "remaining_time": "1:20:56"} -{"current_steps": 513, "total_steps": 1630, "loss": 0.1479, "lr": 3.878391188313249e-06, "epoch": 3.147239263803681, "percentage": 31.47, "elapsed_time": "0:37:07", "remaining_time": "1:20:49"} -{"current_steps": 514, "total_steps": 1630, "loss": 0.238, "lr": 3.87436879052376e-06, "epoch": 3.1533742331288344, "percentage": 31.53, "elapsed_time": "0:37:08", "remaining_time": "1:20:38"} -{"current_steps": 515, "total_steps": 1630, "loss": 0.2069, "lr": 3.870341287358809e-06, "epoch": 3.1595092024539877, "percentage": 31.6, "elapsed_time": "0:37:12", "remaining_time": "1:20:33"} -{"current_steps": 516, "total_steps": 1630, "loss": 0.1189, "lr": 3.8663086937793845e-06, "epoch": 3.165644171779141, "percentage": 31.66, "elapsed_time": "0:37:15", "remaining_time": "1:20:26"} -{"current_steps": 517, "total_steps": 1630, "loss": 0.3434, "lr": 3.862271024765385e-06, "epoch": 3.1717791411042944, "percentage": 31.72, "elapsed_time": "0:37:16", "remaining_time": "1:20:15"} -{"current_steps": 518, "total_steps": 1630, "loss": 0.1602, "lr": 3.8582282953155626e-06, "epoch": 3.1779141104294477, "percentage": 31.78, "elapsed_time": "0:37:18", "remaining_time": "1:20:05"} -{"current_steps": 519, "total_steps": 1630, "loss": 0.3452, "lr": 3.854180520447465e-06, "epoch": 3.184049079754601, "percentage": 31.84, "elapsed_time": "0:37:21", "remaining_time": "1:19:57"} -{"current_steps": 520, "total_steps": 1630, "loss": 0.2832, "lr": 3.850127715197387e-06, "epoch": 3.190184049079755, "percentage": 31.9, "elapsed_time": "0:37:22", "remaining_time": "1:19:47"} -{"current_steps": 521, "total_steps": 1630, "loss": 0.1481, "lr": 3.846069894620306e-06, "epoch": 3.196319018404908, "percentage": 31.96, "elapsed_time": "0:37:25", "remaining_time": "1:19:39"} -{"current_steps": 522, "total_steps": 1630, "loss": 0.1283, "lr": 3.84200707378983e-06, "epoch": 3.2024539877300615, "percentage": 32.02, "elapsed_time": "0:37:28", "remaining_time": "1:19:31"} -{"current_steps": 523, "total_steps": 1630, "loss": 0.2468, "lr": 3.8379392677981434e-06, "epoch": 3.208588957055215, "percentage": 32.09, "elapsed_time": "0:37:32", "remaining_time": "1:19:27"} -{"current_steps": 524, "total_steps": 1630, "loss": 0.2685, "lr": 3.833866491755947e-06, "epoch": 3.214723926380368, "percentage": 32.15, "elapsed_time": "0:37:38", "remaining_time": "1:19:26"} -{"current_steps": 525, "total_steps": 1630, "loss": 0.2595, "lr": 3.8297887607924044e-06, "epoch": 3.2208588957055215, "percentage": 32.21, "elapsed_time": "0:37:40", "remaining_time": "1:19:17"} -{"current_steps": 526, "total_steps": 1630, "loss": 0.4099, "lr": 3.825706090055088e-06, "epoch": 3.226993865030675, "percentage": 32.27, "elapsed_time": "0:37:46", "remaining_time": "1:19:17"} -{"current_steps": 527, "total_steps": 1630, "loss": 0.287, "lr": 3.821618494709916e-06, "epoch": 3.233128834355828, "percentage": 32.33, "elapsed_time": "0:37:48", "remaining_time": "1:19:08"} -{"current_steps": 528, "total_steps": 1630, "loss": 0.2369, "lr": 3.817525989941102e-06, "epoch": 3.2392638036809815, "percentage": 32.39, "elapsed_time": "0:37:49", "remaining_time": "1:18:56"} -{"current_steps": 529, "total_steps": 1630, "loss": 0.2751, "lr": 3.8134285909510972e-06, "epoch": 3.245398773006135, "percentage": 32.45, "elapsed_time": "0:37:55", "remaining_time": "1:18:55"} -{"current_steps": 530, "total_steps": 1630, "loss": 0.2363, "lr": 3.8093263129605305e-06, "epoch": 3.2515337423312882, "percentage": 32.52, "elapsed_time": "0:37:56", "remaining_time": "1:18:45"} -{"current_steps": 531, "total_steps": 1630, "loss": 0.094, "lr": 3.80521917120816e-06, "epoch": 3.2576687116564416, "percentage": 32.58, "elapsed_time": "0:37:59", "remaining_time": "1:18:37"} -{"current_steps": 532, "total_steps": 1630, "loss": 0.4117, "lr": 3.801107180950806e-06, "epoch": 3.263803680981595, "percentage": 32.64, "elapsed_time": "0:38:05", "remaining_time": "1:18:36"} -{"current_steps": 533, "total_steps": 1630, "loss": 0.1183, "lr": 3.7969903574633028e-06, "epoch": 3.2699386503067487, "percentage": 32.7, "elapsed_time": "0:38:07", "remaining_time": "1:18:29"} -{"current_steps": 534, "total_steps": 1630, "loss": 0.2296, "lr": 3.792868716038437e-06, "epoch": 3.276073619631902, "percentage": 32.76, "elapsed_time": "0:38:10", "remaining_time": "1:18:22"} -{"current_steps": 535, "total_steps": 1630, "loss": 0.2678, "lr": 3.7887422719868937e-06, "epoch": 3.2822085889570554, "percentage": 32.82, "elapsed_time": "0:38:12", "remaining_time": "1:18:11"} -{"current_steps": 536, "total_steps": 1630, "loss": 0.4887, "lr": 3.784611040637198e-06, "epoch": 3.2883435582822087, "percentage": 32.88, "elapsed_time": "0:38:14", "remaining_time": "1:18:02"} -{"current_steps": 537, "total_steps": 1630, "loss": 0.3827, "lr": 3.7804750373356576e-06, "epoch": 3.294478527607362, "percentage": 32.94, "elapsed_time": "0:38:19", "remaining_time": "1:18:01"} -{"current_steps": 538, "total_steps": 1630, "loss": 0.3233, "lr": 3.776334277446307e-06, "epoch": 3.3006134969325154, "percentage": 33.01, "elapsed_time": "0:38:21", "remaining_time": "1:17:51"} -{"current_steps": 539, "total_steps": 1630, "loss": 0.1256, "lr": 3.7721887763508512e-06, "epoch": 3.3067484662576687, "percentage": 33.07, "elapsed_time": "0:38:23", "remaining_time": "1:17:41"} -{"current_steps": 540, "total_steps": 1630, "loss": 0.3845, "lr": 3.7680385494486053e-06, "epoch": 3.312883435582822, "percentage": 33.13, "elapsed_time": "0:38:24", "remaining_time": "1:17:31"} -{"current_steps": 541, "total_steps": 1630, "loss": 0.2905, "lr": 3.7638836121564414e-06, "epoch": 3.3190184049079754, "percentage": 33.19, "elapsed_time": "0:38:30", "remaining_time": "1:17:30"} -{"current_steps": 542, "total_steps": 1630, "loss": 0.3561, "lr": 3.7597239799087283e-06, "epoch": 3.3251533742331287, "percentage": 33.25, "elapsed_time": "0:38:31", "remaining_time": "1:17:20"} -{"current_steps": 543, "total_steps": 1630, "loss": 0.1157, "lr": 3.7555596681572736e-06, "epoch": 3.331288343558282, "percentage": 33.31, "elapsed_time": "0:38:32", "remaining_time": "1:17:10"} -{"current_steps": 544, "total_steps": 1630, "loss": 0.3049, "lr": 3.751390692371272e-06, "epoch": 3.3374233128834354, "percentage": 33.37, "elapsed_time": "0:38:35", "remaining_time": "1:17:03"} -{"current_steps": 545, "total_steps": 1630, "loss": 0.1626, "lr": 3.7472170680372398e-06, "epoch": 3.3435582822085887, "percentage": 33.44, "elapsed_time": "0:38:38", "remaining_time": "1:16:56"} -{"current_steps": 546, "total_steps": 1630, "loss": 0.2414, "lr": 3.7430388106589632e-06, "epoch": 3.3496932515337425, "percentage": 33.5, "elapsed_time": "0:38:41", "remaining_time": "1:16:49"} -{"current_steps": 547, "total_steps": 1630, "loss": 0.3441, "lr": 3.738855935757438e-06, "epoch": 3.355828220858896, "percentage": 33.56, "elapsed_time": "0:38:46", "remaining_time": "1:16:45"} -{"current_steps": 548, "total_steps": 1630, "loss": 0.5244, "lr": 3.7346684588708135e-06, "epoch": 3.361963190184049, "percentage": 33.62, "elapsed_time": "0:38:47", "remaining_time": "1:16:36"} -{"current_steps": 549, "total_steps": 1630, "loss": 0.1984, "lr": 3.7304763955543332e-06, "epoch": 3.3680981595092025, "percentage": 33.68, "elapsed_time": "0:38:49", "remaining_time": "1:16:27"} -{"current_steps": 550, "total_steps": 1630, "loss": 0.2715, "lr": 3.726279761380279e-06, "epoch": 3.374233128834356, "percentage": 33.74, "elapsed_time": "0:38:51", "remaining_time": "1:16:18"} -{"current_steps": 551, "total_steps": 1630, "loss": 0.1537, "lr": 3.72207857193791e-06, "epoch": 3.3803680981595092, "percentage": 33.8, "elapsed_time": "0:38:53", "remaining_time": "1:16:10"} -{"current_steps": 552, "total_steps": 1630, "loss": 0.2388, "lr": 3.7178728428334092e-06, "epoch": 3.3865030674846626, "percentage": 33.87, "elapsed_time": "0:38:55", "remaining_time": "1:16:01"} -{"current_steps": 553, "total_steps": 1630, "loss": 0.1726, "lr": 3.7136625896898226e-06, "epoch": 3.392638036809816, "percentage": 33.93, "elapsed_time": "0:38:57", "remaining_time": "1:15:52"} -{"current_steps": 554, "total_steps": 1630, "loss": 0.2942, "lr": 3.7094478281470003e-06, "epoch": 3.3987730061349692, "percentage": 33.99, "elapsed_time": "0:38:59", "remaining_time": "1:15:43"} -{"current_steps": 555, "total_steps": 1630, "loss": 0.1665, "lr": 3.7052285738615412e-06, "epoch": 3.4049079754601226, "percentage": 34.05, "elapsed_time": "0:39:02", "remaining_time": "1:15:37"} -{"current_steps": 556, "total_steps": 1630, "loss": 0.3954, "lr": 3.7010048425067317e-06, "epoch": 3.411042944785276, "percentage": 34.11, "elapsed_time": "0:39:06", "remaining_time": "1:15:31"} -{"current_steps": 557, "total_steps": 1630, "loss": 0.3207, "lr": 3.696776649772492e-06, "epoch": 3.4171779141104293, "percentage": 34.17, "elapsed_time": "0:39:12", "remaining_time": "1:15:31"} -{"current_steps": 558, "total_steps": 1630, "loss": 0.1325, "lr": 3.692544011365312e-06, "epoch": 3.4233128834355826, "percentage": 34.23, "elapsed_time": "0:39:13", "remaining_time": "1:15:21"} -{"current_steps": 559, "total_steps": 1630, "loss": 0.1644, "lr": 3.6883069430081986e-06, "epoch": 3.4294478527607364, "percentage": 34.29, "elapsed_time": "0:39:16", "remaining_time": "1:15:14"} -{"current_steps": 560, "total_steps": 1630, "loss": 0.2469, "lr": 3.6840654604406135e-06, "epoch": 3.4355828220858897, "percentage": 34.36, "elapsed_time": "0:39:20", "remaining_time": "1:15:09"} -{"current_steps": 561, "total_steps": 1630, "loss": 0.1146, "lr": 3.679819579418414e-06, "epoch": 3.441717791411043, "percentage": 34.42, "elapsed_time": "0:39:22", "remaining_time": "1:15:01"} -{"current_steps": 562, "total_steps": 1630, "loss": 0.3236, "lr": 3.6755693157137995e-06, "epoch": 3.4478527607361964, "percentage": 34.48, "elapsed_time": "0:39:25", "remaining_time": "1:14:54"} -{"current_steps": 563, "total_steps": 1630, "loss": 0.399, "lr": 3.6713146851152487e-06, "epoch": 3.4539877300613497, "percentage": 34.54, "elapsed_time": "0:39:28", "remaining_time": "1:14:48"} -{"current_steps": 564, "total_steps": 1630, "loss": 0.1259, "lr": 3.667055703427461e-06, "epoch": 3.460122699386503, "percentage": 34.6, "elapsed_time": "0:39:30", "remaining_time": "1:14:39"} -{"current_steps": 565, "total_steps": 1630, "loss": 0.1835, "lr": 3.6627923864713e-06, "epoch": 3.4662576687116564, "percentage": 34.66, "elapsed_time": "0:39:33", "remaining_time": "1:14:33"} -{"current_steps": 566, "total_steps": 1630, "loss": 0.2763, "lr": 3.658524750083733e-06, "epoch": 3.4723926380368098, "percentage": 34.72, "elapsed_time": "0:39:34", "remaining_time": "1:14:24"} -{"current_steps": 567, "total_steps": 1630, "loss": 0.2496, "lr": 3.654252810117773e-06, "epoch": 3.478527607361963, "percentage": 34.79, "elapsed_time": "0:39:39", "remaining_time": "1:14:20"} -{"current_steps": 568, "total_steps": 1630, "loss": 0.1287, "lr": 3.6499765824424195e-06, "epoch": 3.4846625766871164, "percentage": 34.85, "elapsed_time": "0:39:41", "remaining_time": "1:14:13"} -{"current_steps": 569, "total_steps": 1630, "loss": 0.1747, "lr": 3.6456960829425987e-06, "epoch": 3.4907975460122698, "percentage": 34.91, "elapsed_time": "0:39:44", "remaining_time": "1:14:05"} -{"current_steps": 570, "total_steps": 1630, "loss": 0.1913, "lr": 3.641411327519107e-06, "epoch": 3.4969325153374236, "percentage": 34.97, "elapsed_time": "0:39:48", "remaining_time": "1:14:02"} -{"current_steps": 571, "total_steps": 1630, "loss": 0.3224, "lr": 3.6371223320885492e-06, "epoch": 3.5030674846625764, "percentage": 35.03, "elapsed_time": "0:39:51", "remaining_time": "1:13:54"} -{"current_steps": 572, "total_steps": 1630, "loss": 0.2364, "lr": 3.6328291125832803e-06, "epoch": 3.5092024539877302, "percentage": 35.09, "elapsed_time": "0:39:52", "remaining_time": "1:13:45"} -{"current_steps": 573, "total_steps": 1630, "loss": 0.2552, "lr": 3.628531684951347e-06, "epoch": 3.5153374233128836, "percentage": 35.15, "elapsed_time": "0:39:54", "remaining_time": "1:13:37"} -{"current_steps": 574, "total_steps": 1630, "loss": 0.3232, "lr": 3.6242300651564276e-06, "epoch": 3.521472392638037, "percentage": 35.21, "elapsed_time": "0:39:56", "remaining_time": "1:13:29"} -{"current_steps": 575, "total_steps": 1630, "loss": 0.32, "lr": 3.6199242691777745e-06, "epoch": 3.5276073619631902, "percentage": 35.28, "elapsed_time": "0:40:04", "remaining_time": "1:13:31"} -{"current_steps": 576, "total_steps": 1630, "loss": 0.2922, "lr": 3.6156143130101516e-06, "epoch": 3.5337423312883436, "percentage": 35.34, "elapsed_time": "0:40:06", "remaining_time": "1:13:23"} -{"current_steps": 577, "total_steps": 1630, "loss": 0.2005, "lr": 3.6113002126637765e-06, "epoch": 3.539877300613497, "percentage": 35.4, "elapsed_time": "0:40:09", "remaining_time": "1:13:17"} -{"current_steps": 578, "total_steps": 1630, "loss": 0.2138, "lr": 3.606981984164263e-06, "epoch": 3.5460122699386503, "percentage": 35.46, "elapsed_time": "0:40:14", "remaining_time": "1:13:13"} -{"current_steps": 579, "total_steps": 1630, "loss": 0.4382, "lr": 3.6026596435525578e-06, "epoch": 3.5521472392638036, "percentage": 35.52, "elapsed_time": "0:40:16", "remaining_time": "1:13:06"} -{"current_steps": 580, "total_steps": 1630, "loss": 0.3326, "lr": 3.5983332068848855e-06, "epoch": 3.558282208588957, "percentage": 35.58, "elapsed_time": "0:40:21", "remaining_time": "1:13:04"} -{"current_steps": 581, "total_steps": 1630, "loss": 0.4748, "lr": 3.5940026902326825e-06, "epoch": 3.5644171779141103, "percentage": 35.64, "elapsed_time": "0:40:23", "remaining_time": "1:12:56"} -{"current_steps": 582, "total_steps": 1630, "loss": 0.2692, "lr": 3.5896681096825446e-06, "epoch": 3.5705521472392636, "percentage": 35.71, "elapsed_time": "0:40:26", "remaining_time": "1:12:49"} -{"current_steps": 583, "total_steps": 1630, "loss": 0.3658, "lr": 3.5853294813361614e-06, "epoch": 3.5766871165644174, "percentage": 35.77, "elapsed_time": "0:40:30", "remaining_time": "1:12:45"} -{"current_steps": 584, "total_steps": 1630, "loss": 0.1661, "lr": 3.5809868213102623e-06, "epoch": 3.5828220858895703, "percentage": 35.83, "elapsed_time": "0:40:33", "remaining_time": "1:12:38"} -{"current_steps": 585, "total_steps": 1630, "loss": 0.1233, "lr": 3.5766401457365485e-06, "epoch": 3.588957055214724, "percentage": 35.89, "elapsed_time": "0:40:34", "remaining_time": "1:12:28"} -{"current_steps": 586, "total_steps": 1630, "loss": 0.278, "lr": 3.5722894707616417e-06, "epoch": 3.5950920245398774, "percentage": 35.95, "elapsed_time": "0:40:37", "remaining_time": "1:12:22"} -{"current_steps": 587, "total_steps": 1630, "loss": 0.1541, "lr": 3.5679348125470175e-06, "epoch": 3.6012269938650308, "percentage": 36.01, "elapsed_time": "0:40:39", "remaining_time": "1:12:15"} -{"current_steps": 588, "total_steps": 1630, "loss": 0.1575, "lr": 3.56357618726895e-06, "epoch": 3.607361963190184, "percentage": 36.07, "elapsed_time": "0:40:43", "remaining_time": "1:12:10"} -{"current_steps": 589, "total_steps": 1630, "loss": 0.8079, "lr": 3.5592136111184483e-06, "epoch": 3.6134969325153374, "percentage": 36.13, "elapsed_time": "0:40:46", "remaining_time": "1:12:03"} -{"current_steps": 590, "total_steps": 1630, "loss": 0.341, "lr": 3.554847100301199e-06, "epoch": 3.6196319018404908, "percentage": 36.2, "elapsed_time": "0:40:49", "remaining_time": "1:11:57"} -{"current_steps": 591, "total_steps": 1630, "loss": 0.1625, "lr": 3.550476671037505e-06, "epoch": 3.625766871165644, "percentage": 36.26, "elapsed_time": "0:40:53", "remaining_time": "1:11:53"} -{"current_steps": 592, "total_steps": 1630, "loss": 0.199, "lr": 3.546102339562223e-06, "epoch": 3.6319018404907975, "percentage": 36.32, "elapsed_time": "0:40:55", "remaining_time": "1:11:45"} -{"current_steps": 593, "total_steps": 1630, "loss": 0.1493, "lr": 3.5417241221247078e-06, "epoch": 3.638036809815951, "percentage": 36.38, "elapsed_time": "0:40:58", "remaining_time": "1:11:39"} -{"current_steps": 594, "total_steps": 1630, "loss": 0.2765, "lr": 3.5373420349887477e-06, "epoch": 3.644171779141104, "percentage": 36.44, "elapsed_time": "0:41:00", "remaining_time": "1:11:30"} -{"current_steps": 595, "total_steps": 1630, "loss": 0.2833, "lr": 3.5329560944325065e-06, "epoch": 3.6503067484662575, "percentage": 36.5, "elapsed_time": "0:41:07", "remaining_time": "1:11:31"} -{"current_steps": 596, "total_steps": 1630, "loss": 0.1237, "lr": 3.528566316748462e-06, "epoch": 3.6564417177914113, "percentage": 36.56, "elapsed_time": "0:41:11", "remaining_time": "1:11:27"} -{"current_steps": 597, "total_steps": 1630, "loss": 0.1599, "lr": 3.524172718243347e-06, "epoch": 3.662576687116564, "percentage": 36.63, "elapsed_time": "0:41:14", "remaining_time": "1:11:21"} -{"current_steps": 598, "total_steps": 1630, "loss": 0.2997, "lr": 3.5197753152380854e-06, "epoch": 3.668711656441718, "percentage": 36.69, "elapsed_time": "0:41:16", "remaining_time": "1:11:14"} -{"current_steps": 599, "total_steps": 1630, "loss": 0.2021, "lr": 3.515374124067736e-06, "epoch": 3.6748466257668713, "percentage": 36.75, "elapsed_time": "0:41:18", "remaining_time": "1:11:05"} -{"current_steps": 600, "total_steps": 1630, "loss": 0.1726, "lr": 3.5109691610814263e-06, "epoch": 3.6809815950920246, "percentage": 36.81, "elapsed_time": "0:41:20", "remaining_time": "1:10:57"} -{"current_steps": 601, "total_steps": 1630, "loss": 0.1377, "lr": 3.5065604426422995e-06, "epoch": 3.687116564417178, "percentage": 36.87, "elapsed_time": "0:41:21", "remaining_time": "1:10:48"} -{"current_steps": 602, "total_steps": 1630, "loss": 0.1497, "lr": 3.502147985127445e-06, "epoch": 3.6932515337423313, "percentage": 36.93, "elapsed_time": "0:41:23", "remaining_time": "1:10:40"} -{"current_steps": 603, "total_steps": 1630, "loss": 0.1589, "lr": 3.4977318049278443e-06, "epoch": 3.6993865030674846, "percentage": 36.99, "elapsed_time": "0:41:27", "remaining_time": "1:10:36"} -{"current_steps": 604, "total_steps": 1630, "loss": 0.1364, "lr": 3.4933119184483065e-06, "epoch": 3.705521472392638, "percentage": 37.06, "elapsed_time": "0:41:29", "remaining_time": "1:10:29"} -{"current_steps": 605, "total_steps": 1630, "loss": 0.177, "lr": 3.4888883421074076e-06, "epoch": 3.7116564417177913, "percentage": 37.12, "elapsed_time": "0:41:32", "remaining_time": "1:10:22"} -{"current_steps": 606, "total_steps": 1630, "loss": 0.122, "lr": 3.484461092337434e-06, "epoch": 3.7177914110429446, "percentage": 37.18, "elapsed_time": "0:41:33", "remaining_time": "1:10:12"} -{"current_steps": 607, "total_steps": 1630, "loss": 0.2664, "lr": 3.4800301855843137e-06, "epoch": 3.7239263803680984, "percentage": 37.24, "elapsed_time": "0:41:35", "remaining_time": "1:10:06"} -{"current_steps": 608, "total_steps": 1630, "loss": 0.12, "lr": 3.4755956383075613e-06, "epoch": 3.7300613496932513, "percentage": 37.3, "elapsed_time": "0:41:38", "remaining_time": "1:10:00"} -{"current_steps": 609, "total_steps": 1630, "loss": 0.3926, "lr": 3.471157466980214e-06, "epoch": 3.736196319018405, "percentage": 37.36, "elapsed_time": "0:41:43", "remaining_time": "1:09:56"} -{"current_steps": 610, "total_steps": 1630, "loss": 0.6233, "lr": 3.466715688088772e-06, "epoch": 3.7423312883435584, "percentage": 37.42, "elapsed_time": "0:41:45", "remaining_time": "1:09:49"} -{"current_steps": 611, "total_steps": 1630, "loss": 0.2456, "lr": 3.462270318133136e-06, "epoch": 3.7484662576687118, "percentage": 37.48, "elapsed_time": "0:41:47", "remaining_time": "1:09:42"} -{"current_steps": 612, "total_steps": 1630, "loss": 0.2683, "lr": 3.4578213736265474e-06, "epoch": 3.754601226993865, "percentage": 37.55, "elapsed_time": "0:41:49", "remaining_time": "1:09:33"} -{"current_steps": 613, "total_steps": 1630, "loss": 0.3796, "lr": 3.4533688710955255e-06, "epoch": 3.7607361963190185, "percentage": 37.61, "elapsed_time": "0:41:52", "remaining_time": "1:09:28"} -{"current_steps": 614, "total_steps": 1630, "loss": 0.3326, "lr": 3.448912827079805e-06, "epoch": 3.766871165644172, "percentage": 37.67, "elapsed_time": "0:41:55", "remaining_time": "1:09:23"} -{"current_steps": 615, "total_steps": 1630, "loss": 0.206, "lr": 3.4444532581322793e-06, "epoch": 3.773006134969325, "percentage": 37.73, "elapsed_time": "0:42:01", "remaining_time": "1:09:20"} -{"current_steps": 616, "total_steps": 1630, "loss": 0.244, "lr": 3.4399901808189327e-06, "epoch": 3.7791411042944785, "percentage": 37.79, "elapsed_time": "0:42:03", "remaining_time": "1:09:13"} -{"current_steps": 617, "total_steps": 1630, "loss": 0.1796, "lr": 3.435523611718785e-06, "epoch": 3.785276073619632, "percentage": 37.85, "elapsed_time": "0:42:06", "remaining_time": "1:09:07"} -{"current_steps": 618, "total_steps": 1630, "loss": 0.188, "lr": 3.4310535674238242e-06, "epoch": 3.791411042944785, "percentage": 37.91, "elapsed_time": "0:42:07", "remaining_time": "1:08:58"} -{"current_steps": 619, "total_steps": 1630, "loss": 0.3039, "lr": 3.42658006453895e-06, "epoch": 3.7975460122699385, "percentage": 37.98, "elapsed_time": "0:42:10", "remaining_time": "1:08:53"} -{"current_steps": 620, "total_steps": 1630, "loss": 0.3383, "lr": 3.4221031196819083e-06, "epoch": 3.8036809815950923, "percentage": 38.04, "elapsed_time": "0:42:12", "remaining_time": "1:08:45"} -{"current_steps": 621, "total_steps": 1630, "loss": 0.1721, "lr": 3.4176227494832305e-06, "epoch": 3.809815950920245, "percentage": 38.1, "elapsed_time": "0:42:14", "remaining_time": "1:08:38"} -{"current_steps": 622, "total_steps": 1630, "loss": 0.2211, "lr": 3.413138970586174e-06, "epoch": 3.815950920245399, "percentage": 38.16, "elapsed_time": "0:42:19", "remaining_time": "1:08:35"} -{"current_steps": 623, "total_steps": 1630, "loss": 0.1871, "lr": 3.4086517996466574e-06, "epoch": 3.8220858895705523, "percentage": 38.22, "elapsed_time": "0:42:21", "remaining_time": "1:08:27"} -{"current_steps": 624, "total_steps": 1630, "loss": 0.3874, "lr": 3.404161253333199e-06, "epoch": 3.8282208588957056, "percentage": 38.28, "elapsed_time": "0:42:23", "remaining_time": "1:08:19"} -{"current_steps": 625, "total_steps": 1630, "loss": 0.1739, "lr": 3.3996673483268573e-06, "epoch": 3.834355828220859, "percentage": 38.34, "elapsed_time": "0:42:24", "remaining_time": "1:08:11"} -{"current_steps": 626, "total_steps": 1630, "loss": 0.274, "lr": 3.3951701013211665e-06, "epoch": 3.8404907975460123, "percentage": 38.4, "elapsed_time": "0:42:29", "remaining_time": "1:08:08"} -{"current_steps": 627, "total_steps": 1630, "loss": 0.3568, "lr": 3.3906695290220736e-06, "epoch": 3.8466257668711656, "percentage": 38.47, "elapsed_time": "0:42:30", "remaining_time": "1:08:00"} -{"current_steps": 628, "total_steps": 1630, "loss": 0.157, "lr": 3.3861656481478816e-06, "epoch": 3.852760736196319, "percentage": 38.53, "elapsed_time": "0:42:32", "remaining_time": "1:07:53"} -{"current_steps": 629, "total_steps": 1630, "loss": 0.1218, "lr": 3.3816584754291814e-06, "epoch": 3.8588957055214723, "percentage": 38.59, "elapsed_time": "0:42:34", "remaining_time": "1:07:44"} -{"current_steps": 630, "total_steps": 1630, "loss": 0.2234, "lr": 3.377148027608793e-06, "epoch": 3.8650306748466257, "percentage": 38.65, "elapsed_time": "0:42:36", "remaining_time": "1:07:37"} -{"current_steps": 631, "total_steps": 1630, "loss": 0.3329, "lr": 3.3726343214417023e-06, "epoch": 3.871165644171779, "percentage": 38.71, "elapsed_time": "0:42:43", "remaining_time": "1:07:38"} -{"current_steps": 632, "total_steps": 1630, "loss": 0.1384, "lr": 3.3681173736949984e-06, "epoch": 3.8773006134969323, "percentage": 38.77, "elapsed_time": "0:42:45", "remaining_time": "1:07:30"} -{"current_steps": 633, "total_steps": 1630, "loss": 0.3807, "lr": 3.3635972011478134e-06, "epoch": 3.883435582822086, "percentage": 38.83, "elapsed_time": "0:42:50", "remaining_time": "1:07:28"} -{"current_steps": 634, "total_steps": 1630, "loss": 0.194, "lr": 3.3590738205912566e-06, "epoch": 3.889570552147239, "percentage": 38.9, "elapsed_time": "0:42:51", "remaining_time": "1:07:20"} -{"current_steps": 635, "total_steps": 1630, "loss": 0.202, "lr": 3.354547248828356e-06, "epoch": 3.895705521472393, "percentage": 38.96, "elapsed_time": "0:42:55", "remaining_time": "1:07:16"} -{"current_steps": 636, "total_steps": 1630, "loss": 0.2471, "lr": 3.3500175026739916e-06, "epoch": 3.901840490797546, "percentage": 39.02, "elapsed_time": "0:42:57", "remaining_time": "1:07:07"} -{"current_steps": 637, "total_steps": 1630, "loss": 0.1112, "lr": 3.3454845989548385e-06, "epoch": 3.9079754601226995, "percentage": 39.08, "elapsed_time": "0:42:58", "remaining_time": "1:06:59"} -{"current_steps": 638, "total_steps": 1630, "loss": 0.3368, "lr": 3.3409485545092995e-06, "epoch": 3.914110429447853, "percentage": 39.14, "elapsed_time": "0:43:04", "remaining_time": "1:06:57"} -{"current_steps": 639, "total_steps": 1630, "loss": 0.1863, "lr": 3.336409386187444e-06, "epoch": 3.920245398773006, "percentage": 39.2, "elapsed_time": "0:43:06", "remaining_time": "1:06:50"} -{"current_steps": 640, "total_steps": 1630, "loss": 0.1491, "lr": 3.331867110850946e-06, "epoch": 3.9263803680981595, "percentage": 39.26, "elapsed_time": "0:43:10", "remaining_time": "1:06:46"} -{"current_steps": 641, "total_steps": 1630, "loss": 0.2484, "lr": 3.327321745373021e-06, "epoch": 3.932515337423313, "percentage": 39.33, "elapsed_time": "0:43:11", "remaining_time": "1:06:38"} -{"current_steps": 642, "total_steps": 1630, "loss": 0.2126, "lr": 3.322773306638364e-06, "epoch": 3.938650306748466, "percentage": 39.39, "elapsed_time": "0:43:15", "remaining_time": "1:06:33"} -{"current_steps": 643, "total_steps": 1630, "loss": 0.1649, "lr": 3.318221811543086e-06, "epoch": 3.9447852760736195, "percentage": 39.45, "elapsed_time": "0:43:18", "remaining_time": "1:06:29"} -{"current_steps": 644, "total_steps": 1630, "loss": 0.1442, "lr": 3.313667276994651e-06, "epoch": 3.950920245398773, "percentage": 39.51, "elapsed_time": "0:43:20", "remaining_time": "1:06:22"} -{"current_steps": 645, "total_steps": 1630, "loss": 0.359, "lr": 3.309109719911814e-06, "epoch": 3.957055214723926, "percentage": 39.57, "elapsed_time": "0:43:24", "remaining_time": "1:06:16"} -{"current_steps": 646, "total_steps": 1630, "loss": 0.4042, "lr": 3.304549157224558e-06, "epoch": 3.96319018404908, "percentage": 39.63, "elapsed_time": "0:43:26", "remaining_time": "1:06:09"} -{"current_steps": 647, "total_steps": 1630, "loss": 0.1699, "lr": 3.299985605874031e-06, "epoch": 3.969325153374233, "percentage": 39.69, "elapsed_time": "0:43:29", "remaining_time": "1:06:03"} -{"current_steps": 648, "total_steps": 1630, "loss": 0.1888, "lr": 3.295419082812483e-06, "epoch": 3.9754601226993866, "percentage": 39.75, "elapsed_time": "0:43:30", "remaining_time": "1:05:56"} -{"current_steps": 649, "total_steps": 1630, "loss": 0.2824, "lr": 3.2908496050032024e-06, "epoch": 3.98159509202454, "percentage": 39.82, "elapsed_time": "0:43:36", "remaining_time": "1:05:54"} -{"current_steps": 650, "total_steps": 1630, "loss": 0.3038, "lr": 3.2862771894204544e-06, "epoch": 3.9877300613496933, "percentage": 39.88, "elapsed_time": "0:43:38", "remaining_time": "1:05:48"} -{"current_steps": 651, "total_steps": 1630, "loss": 0.3266, "lr": 3.2817018530494164e-06, "epoch": 3.9938650306748467, "percentage": 39.94, "elapsed_time": "0:43:41", "remaining_time": "1:05:41"} -{"current_steps": 652, "total_steps": 1630, "loss": 0.2998, "lr": 3.277123612886116e-06, "epoch": 4.0, "percentage": 40.0, "elapsed_time": "0:43:43", "remaining_time": "1:05:34"} -{"current_steps": 653, "total_steps": 1630, "loss": 0.2764, "lr": 3.272542485937369e-06, "epoch": 4.006134969325154, "percentage": 40.06, "elapsed_time": "0:47:10", "remaining_time": "1:10:34"} -{"current_steps": 654, "total_steps": 1630, "loss": 0.1157, "lr": 3.2679584892207118e-06, "epoch": 4.012269938650307, "percentage": 40.12, "elapsed_time": "0:47:12", "remaining_time": "1:10:27"} -{"current_steps": 655, "total_steps": 1630, "loss": 0.0707, "lr": 3.263371639764343e-06, "epoch": 4.0184049079754605, "percentage": 40.18, "elapsed_time": "0:47:14", "remaining_time": "1:10:18"} -{"current_steps": 656, "total_steps": 1630, "loss": 0.1067, "lr": 3.2587819546070596e-06, "epoch": 4.024539877300613, "percentage": 40.25, "elapsed_time": "0:47:17", "remaining_time": "1:10:13"} -{"current_steps": 657, "total_steps": 1630, "loss": 0.0564, "lr": 3.254189450798189e-06, "epoch": 4.030674846625767, "percentage": 40.31, "elapsed_time": "0:47:18", "remaining_time": "1:10:03"} -{"current_steps": 658, "total_steps": 1630, "loss": 0.0535, "lr": 3.2495941453975312e-06, "epoch": 4.03680981595092, "percentage": 40.37, "elapsed_time": "0:47:22", "remaining_time": "1:09:59"} -{"current_steps": 659, "total_steps": 1630, "loss": 0.1245, "lr": 3.2449960554752935e-06, "epoch": 4.042944785276074, "percentage": 40.43, "elapsed_time": "0:47:24", "remaining_time": "1:09:50"} -{"current_steps": 660, "total_steps": 1630, "loss": 0.0626, "lr": 3.240395198112026e-06, "epoch": 4.049079754601227, "percentage": 40.49, "elapsed_time": "0:47:27", "remaining_time": "1:09:45"} -{"current_steps": 661, "total_steps": 1630, "loss": 0.1198, "lr": 3.2357915903985605e-06, "epoch": 4.0552147239263805, "percentage": 40.55, "elapsed_time": "0:47:30", "remaining_time": "1:09:38"} -{"current_steps": 662, "total_steps": 1630, "loss": 0.0454, "lr": 3.2311852494359423e-06, "epoch": 4.061349693251533, "percentage": 40.61, "elapsed_time": "0:47:32", "remaining_time": "1:09:31"} -{"current_steps": 663, "total_steps": 1630, "loss": 0.2064, "lr": 3.226576192335373e-06, "epoch": 4.067484662576687, "percentage": 40.67, "elapsed_time": "0:47:36", "remaining_time": "1:09:26"} -{"current_steps": 664, "total_steps": 1630, "loss": 0.2183, "lr": 3.2219644362181436e-06, "epoch": 4.07361963190184, "percentage": 40.74, "elapsed_time": "0:47:39", "remaining_time": "1:09:20"} -{"current_steps": 665, "total_steps": 1630, "loss": 0.0516, "lr": 3.21734999821557e-06, "epoch": 4.079754601226994, "percentage": 40.8, "elapsed_time": "0:47:42", "remaining_time": "1:09:13"} -{"current_steps": 666, "total_steps": 1630, "loss": 0.0613, "lr": 3.2127328954689307e-06, "epoch": 4.085889570552148, "percentage": 40.86, "elapsed_time": "0:47:44", "remaining_time": "1:09:06"} -{"current_steps": 667, "total_steps": 1630, "loss": 0.0583, "lr": 3.2081131451294025e-06, "epoch": 4.0920245398773005, "percentage": 40.92, "elapsed_time": "0:47:45", "remaining_time": "1:08:57"} -{"current_steps": 668, "total_steps": 1630, "loss": 0.0766, "lr": 3.2034907643579988e-06, "epoch": 4.098159509202454, "percentage": 40.98, "elapsed_time": "0:47:47", "remaining_time": "1:08:49"} -{"current_steps": 669, "total_steps": 1630, "loss": 0.1099, "lr": 3.1988657703255043e-06, "epoch": 4.104294478527607, "percentage": 41.04, "elapsed_time": "0:47:48", "remaining_time": "1:08:40"} -{"current_steps": 670, "total_steps": 1630, "loss": 0.1663, "lr": 3.194238180212409e-06, "epoch": 4.110429447852761, "percentage": 41.1, "elapsed_time": "0:47:50", "remaining_time": "1:08:33"} -{"current_steps": 671, "total_steps": 1630, "loss": 0.0587, "lr": 3.1896080112088477e-06, "epoch": 4.116564417177914, "percentage": 41.17, "elapsed_time": "0:47:53", "remaining_time": "1:08:27"} -{"current_steps": 672, "total_steps": 1630, "loss": 0.0579, "lr": 3.184975280514536e-06, "epoch": 4.122699386503068, "percentage": 41.23, "elapsed_time": "0:47:55", "remaining_time": "1:08:19"} -{"current_steps": 673, "total_steps": 1630, "loss": 0.1083, "lr": 3.1803400053387044e-06, "epoch": 4.128834355828221, "percentage": 41.29, "elapsed_time": "0:47:58", "remaining_time": "1:08:13"} -{"current_steps": 674, "total_steps": 1630, "loss": 0.1355, "lr": 3.175702202900036e-06, "epoch": 4.134969325153374, "percentage": 41.35, "elapsed_time": "0:48:04", "remaining_time": "1:08:11"} -{"current_steps": 675, "total_steps": 1630, "loss": 0.092, "lr": 3.1710618904266006e-06, "epoch": 4.141104294478527, "percentage": 41.41, "elapsed_time": "0:48:07", "remaining_time": "1:08:05"} -{"current_steps": 676, "total_steps": 1630, "loss": 0.0563, "lr": 3.166419085155793e-06, "epoch": 4.147239263803681, "percentage": 41.47, "elapsed_time": "0:48:08", "remaining_time": "1:07:56"} -{"current_steps": 677, "total_steps": 1630, "loss": 0.1773, "lr": 3.1617738043342695e-06, "epoch": 4.153374233128835, "percentage": 41.53, "elapsed_time": "0:48:13", "remaining_time": "1:07:52"} -{"current_steps": 678, "total_steps": 1630, "loss": 0.0489, "lr": 3.157126065217879e-06, "epoch": 4.159509202453988, "percentage": 41.6, "elapsed_time": "0:48:15", "remaining_time": "1:07:45"} -{"current_steps": 679, "total_steps": 1630, "loss": 0.1333, "lr": 3.152475885071606e-06, "epoch": 4.1656441717791415, "percentage": 41.66, "elapsed_time": "0:48:19", "remaining_time": "1:07:41"} -{"current_steps": 680, "total_steps": 1630, "loss": 0.1501, "lr": 3.147823281169498e-06, "epoch": 4.171779141104294, "percentage": 41.72, "elapsed_time": "0:48:24", "remaining_time": "1:07:37"} -{"current_steps": 681, "total_steps": 1630, "loss": 0.1067, "lr": 3.143168270794612e-06, "epoch": 4.177914110429448, "percentage": 41.78, "elapsed_time": "0:48:28", "remaining_time": "1:07:33"} -{"current_steps": 682, "total_steps": 1630, "loss": 0.2499, "lr": 3.1385108712389394e-06, "epoch": 4.184049079754601, "percentage": 41.84, "elapsed_time": "0:48:31", "remaining_time": "1:07:26"} -{"current_steps": 683, "total_steps": 1630, "loss": 0.1748, "lr": 3.1338510998033484e-06, "epoch": 4.190184049079755, "percentage": 41.9, "elapsed_time": "0:48:34", "remaining_time": "1:07:20"} -{"current_steps": 684, "total_steps": 1630, "loss": 0.201, "lr": 3.129188973797519e-06, "epoch": 4.196319018404908, "percentage": 41.96, "elapsed_time": "0:48:36", "remaining_time": "1:07:13"} -{"current_steps": 685, "total_steps": 1630, "loss": 0.0735, "lr": 3.124524510539875e-06, "epoch": 4.2024539877300615, "percentage": 42.02, "elapsed_time": "0:48:40", "remaining_time": "1:07:08"} -{"current_steps": 686, "total_steps": 1630, "loss": 0.1806, "lr": 3.119857727357527e-06, "epoch": 4.208588957055214, "percentage": 42.09, "elapsed_time": "0:48:46", "remaining_time": "1:07:07"} -{"current_steps": 687, "total_steps": 1630, "loss": 0.1811, "lr": 3.1151886415861993e-06, "epoch": 4.214723926380368, "percentage": 42.15, "elapsed_time": "0:48:48", "remaining_time": "1:06:59"} -{"current_steps": 688, "total_steps": 1630, "loss": 0.1634, "lr": 3.1105172705701708e-06, "epoch": 4.220858895705521, "percentage": 42.21, "elapsed_time": "0:48:49", "remaining_time": "1:06:51"} -{"current_steps": 689, "total_steps": 1630, "loss": 0.1625, "lr": 3.1058436316622103e-06, "epoch": 4.226993865030675, "percentage": 42.27, "elapsed_time": "0:48:50", "remaining_time": "1:06:42"} -{"current_steps": 690, "total_steps": 1630, "loss": 0.1791, "lr": 3.1011677422235093e-06, "epoch": 4.233128834355828, "percentage": 42.33, "elapsed_time": "0:48:52", "remaining_time": "1:06:35"} -{"current_steps": 691, "total_steps": 1630, "loss": 0.2233, "lr": 3.0964896196236217e-06, "epoch": 4.2392638036809815, "percentage": 42.39, "elapsed_time": "0:48:55", "remaining_time": "1:06:29"} -{"current_steps": 692, "total_steps": 1630, "loss": 0.1142, "lr": 3.0918092812403954e-06, "epoch": 4.245398773006135, "percentage": 42.45, "elapsed_time": "0:48:59", "remaining_time": "1:06:23"} -{"current_steps": 693, "total_steps": 1630, "loss": 0.096, "lr": 3.0871267444599098e-06, "epoch": 4.251533742331288, "percentage": 42.52, "elapsed_time": "0:49:01", "remaining_time": "1:06:17"} -{"current_steps": 694, "total_steps": 1630, "loss": 0.2749, "lr": 3.0824420266764093e-06, "epoch": 4.257668711656442, "percentage": 42.58, "elapsed_time": "0:49:06", "remaining_time": "1:06:13"} -{"current_steps": 695, "total_steps": 1630, "loss": 0.2504, "lr": 3.077755145292243e-06, "epoch": 4.263803680981595, "percentage": 42.64, "elapsed_time": "0:49:09", "remaining_time": "1:06:08"} -{"current_steps": 696, "total_steps": 1630, "loss": 0.1324, "lr": 3.0730661177177957e-06, "epoch": 4.269938650306749, "percentage": 42.7, "elapsed_time": "0:49:12", "remaining_time": "1:06:02"} -{"current_steps": 697, "total_steps": 1630, "loss": 0.0691, "lr": 3.0683749613714238e-06, "epoch": 4.276073619631902, "percentage": 42.76, "elapsed_time": "0:49:15", "remaining_time": "1:05:55"} -{"current_steps": 698, "total_steps": 1630, "loss": 0.1026, "lr": 3.063681693679391e-06, "epoch": 4.282208588957055, "percentage": 42.82, "elapsed_time": "0:49:18", "remaining_time": "1:05:49"} -{"current_steps": 699, "total_steps": 1630, "loss": 0.2646, "lr": 3.0589863320758063e-06, "epoch": 4.288343558282208, "percentage": 42.88, "elapsed_time": "0:49:19", "remaining_time": "1:05:41"} -{"current_steps": 700, "total_steps": 1630, "loss": 0.1711, "lr": 3.0542888940025562e-06, "epoch": 4.294478527607362, "percentage": 42.94, "elapsed_time": "0:49:21", "remaining_time": "1:05:34"} -{"current_steps": 701, "total_steps": 1630, "loss": 0.0589, "lr": 3.0495893969092395e-06, "epoch": 4.300613496932515, "percentage": 43.01, "elapsed_time": "0:49:23", "remaining_time": "1:05:27"} -{"current_steps": 702, "total_steps": 1630, "loss": 0.2244, "lr": 3.044887858253105e-06, "epoch": 4.306748466257669, "percentage": 43.07, "elapsed_time": "0:49:29", "remaining_time": "1:05:25"} -{"current_steps": 703, "total_steps": 1630, "loss": 0.1506, "lr": 3.040184295498984e-06, "epoch": 4.3128834355828225, "percentage": 43.13, "elapsed_time": "0:49:32", "remaining_time": "1:05:19"} -{"current_steps": 704, "total_steps": 1630, "loss": 0.2343, "lr": 3.035478726119228e-06, "epoch": 4.319018404907975, "percentage": 43.19, "elapsed_time": "0:49:35", "remaining_time": "1:05:13"} -{"current_steps": 705, "total_steps": 1630, "loss": 0.0518, "lr": 3.0307711675936426e-06, "epoch": 4.325153374233129, "percentage": 43.25, "elapsed_time": "0:49:37", "remaining_time": "1:05:06"} -{"current_steps": 706, "total_steps": 1630, "loss": 0.2363, "lr": 3.0260616374094208e-06, "epoch": 4.331288343558282, "percentage": 43.31, "elapsed_time": "0:49:39", "remaining_time": "1:04:58"} -{"current_steps": 707, "total_steps": 1630, "loss": 0.0848, "lr": 3.0213501530610807e-06, "epoch": 4.337423312883436, "percentage": 43.37, "elapsed_time": "0:49:43", "remaining_time": "1:04:55"} -{"current_steps": 708, "total_steps": 1630, "loss": 0.149, "lr": 3.0166367320504005e-06, "epoch": 4.343558282208589, "percentage": 43.44, "elapsed_time": "0:49:46", "remaining_time": "1:04:48"} -{"current_steps": 709, "total_steps": 1630, "loss": 0.1133, "lr": 3.0119213918863515e-06, "epoch": 4.3496932515337425, "percentage": 43.5, "elapsed_time": "0:49:49", "remaining_time": "1:04:43"} -{"current_steps": 710, "total_steps": 1630, "loss": 0.1358, "lr": 3.0072041500850343e-06, "epoch": 4.355828220858895, "percentage": 43.56, "elapsed_time": "0:49:52", "remaining_time": "1:04:37"} -{"current_steps": 711, "total_steps": 1630, "loss": 0.0706, "lr": 3.0024850241696128e-06, "epoch": 4.361963190184049, "percentage": 43.62, "elapsed_time": "0:49:55", "remaining_time": "1:04:31"} -{"current_steps": 712, "total_steps": 1630, "loss": 0.1977, "lr": 2.9977640316702512e-06, "epoch": 4.368098159509202, "percentage": 43.68, "elapsed_time": "0:50:02", "remaining_time": "1:04:30"} -{"current_steps": 713, "total_steps": 1630, "loss": 0.2622, "lr": 2.993041190124047e-06, "epoch": 4.374233128834356, "percentage": 43.74, "elapsed_time": "0:50:07", "remaining_time": "1:04:28"} -{"current_steps": 714, "total_steps": 1630, "loss": 0.1487, "lr": 2.9883165170749657e-06, "epoch": 4.38036809815951, "percentage": 43.8, "elapsed_time": "0:50:12", "remaining_time": "1:04:24"} -{"current_steps": 715, "total_steps": 1630, "loss": 0.0822, "lr": 2.9835900300737763e-06, "epoch": 4.386503067484663, "percentage": 43.87, "elapsed_time": "0:50:14", "remaining_time": "1:04:17"} -{"current_steps": 716, "total_steps": 1630, "loss": 0.3668, "lr": 2.9788617466779884e-06, "epoch": 4.392638036809816, "percentage": 43.93, "elapsed_time": "0:50:16", "remaining_time": "1:04:10"} -{"current_steps": 717, "total_steps": 1630, "loss": 0.2432, "lr": 2.974131684451781e-06, "epoch": 4.398773006134969, "percentage": 43.99, "elapsed_time": "0:50:17", "remaining_time": "1:04:02"} -{"current_steps": 718, "total_steps": 1630, "loss": 0.0689, "lr": 2.9693998609659443e-06, "epoch": 4.404907975460123, "percentage": 44.05, "elapsed_time": "0:50:20", "remaining_time": "1:03:56"} -{"current_steps": 719, "total_steps": 1630, "loss": 0.1897, "lr": 2.9646662937978082e-06, "epoch": 4.411042944785276, "percentage": 44.11, "elapsed_time": "0:50:22", "remaining_time": "1:03:49"} -{"current_steps": 720, "total_steps": 1630, "loss": 0.0457, "lr": 2.9599310005311824e-06, "epoch": 4.41717791411043, "percentage": 44.17, "elapsed_time": "0:50:25", "remaining_time": "1:03:43"} -{"current_steps": 721, "total_steps": 1630, "loss": 0.2307, "lr": 2.9551939987562866e-06, "epoch": 4.423312883435583, "percentage": 44.23, "elapsed_time": "0:50:26", "remaining_time": "1:03:35"} -{"current_steps": 722, "total_steps": 1630, "loss": 0.0637, "lr": 2.950455306069688e-06, "epoch": 4.429447852760736, "percentage": 44.29, "elapsed_time": "0:50:28", "remaining_time": "1:03:28"} -{"current_steps": 723, "total_steps": 1630, "loss": 0.1924, "lr": 2.9457149400742357e-06, "epoch": 4.435582822085889, "percentage": 44.36, "elapsed_time": "0:50:29", "remaining_time": "1:03:21"} -{"current_steps": 724, "total_steps": 1630, "loss": 0.1275, "lr": 2.940972918378993e-06, "epoch": 4.441717791411043, "percentage": 44.42, "elapsed_time": "0:50:32", "remaining_time": "1:03:14"} -{"current_steps": 725, "total_steps": 1630, "loss": 0.123, "lr": 2.936229258599174e-06, "epoch": 4.447852760736196, "percentage": 44.48, "elapsed_time": "0:50:33", "remaining_time": "1:03:06"} -{"current_steps": 726, "total_steps": 1630, "loss": 0.0555, "lr": 2.93148397835608e-06, "epoch": 4.45398773006135, "percentage": 44.54, "elapsed_time": "0:50:34", "remaining_time": "1:02:59"} -{"current_steps": 727, "total_steps": 1630, "loss": 0.0991, "lr": 2.926737095277029e-06, "epoch": 4.460122699386503, "percentage": 44.6, "elapsed_time": "0:50:36", "remaining_time": "1:02:51"} -{"current_steps": 728, "total_steps": 1630, "loss": 0.0628, "lr": 2.921988626995295e-06, "epoch": 4.466257668711656, "percentage": 44.66, "elapsed_time": "0:50:38", "remaining_time": "1:02:45"} -{"current_steps": 729, "total_steps": 1630, "loss": 0.2333, "lr": 2.9172385911500385e-06, "epoch": 4.47239263803681, "percentage": 44.72, "elapsed_time": "0:50:40", "remaining_time": "1:02:38"} -{"current_steps": 730, "total_steps": 1630, "loss": 0.1317, "lr": 2.9124870053862447e-06, "epoch": 4.478527607361963, "percentage": 44.79, "elapsed_time": "0:50:43", "remaining_time": "1:02:32"} -{"current_steps": 731, "total_steps": 1630, "loss": 0.2285, "lr": 2.907733887354657e-06, "epoch": 4.484662576687117, "percentage": 44.85, "elapsed_time": "0:50:46", "remaining_time": "1:02:26"} -{"current_steps": 732, "total_steps": 1630, "loss": 0.096, "lr": 2.9029792547117088e-06, "epoch": 4.49079754601227, "percentage": 44.91, "elapsed_time": "0:50:49", "remaining_time": "1:02:20"} -{"current_steps": 733, "total_steps": 1630, "loss": 0.1505, "lr": 2.898223125119461e-06, "epoch": 4.4969325153374236, "percentage": 44.97, "elapsed_time": "0:50:51", "remaining_time": "1:02:14"} -{"current_steps": 734, "total_steps": 1630, "loss": 0.0327, "lr": 2.893465516245534e-06, "epoch": 4.5030674846625764, "percentage": 45.03, "elapsed_time": "0:50:52", "remaining_time": "1:02:06"} -{"current_steps": 735, "total_steps": 1630, "loss": 0.0743, "lr": 2.8887064457630453e-06, "epoch": 4.50920245398773, "percentage": 45.09, "elapsed_time": "0:50:55", "remaining_time": "1:02:00"} -{"current_steps": 736, "total_steps": 1630, "loss": 0.1768, "lr": 2.8839459313505407e-06, "epoch": 4.515337423312883, "percentage": 45.15, "elapsed_time": "0:50:58", "remaining_time": "1:01:55"} -{"current_steps": 737, "total_steps": 1630, "loss": 0.1598, "lr": 2.879183990691929e-06, "epoch": 4.521472392638037, "percentage": 45.21, "elapsed_time": "0:51:01", "remaining_time": "1:01:48"} -{"current_steps": 738, "total_steps": 1630, "loss": 0.0829, "lr": 2.8744206414764185e-06, "epoch": 4.52760736196319, "percentage": 45.28, "elapsed_time": "0:51:04", "remaining_time": "1:01:43"} -{"current_steps": 739, "total_steps": 1630, "loss": 0.1169, "lr": 2.8696559013984488e-06, "epoch": 4.533742331288344, "percentage": 45.34, "elapsed_time": "0:51:05", "remaining_time": "1:01:36"} -{"current_steps": 740, "total_steps": 1630, "loss": 0.0962, "lr": 2.8648897881576274e-06, "epoch": 4.539877300613497, "percentage": 45.4, "elapsed_time": "0:51:10", "remaining_time": "1:01:32"} -{"current_steps": 741, "total_steps": 1630, "loss": 0.1204, "lr": 2.8601223194586613e-06, "epoch": 4.54601226993865, "percentage": 45.46, "elapsed_time": "0:51:15", "remaining_time": "1:01:30"} -{"current_steps": 742, "total_steps": 1630, "loss": 0.0685, "lr": 2.8553535130112935e-06, "epoch": 4.552147239263804, "percentage": 45.52, "elapsed_time": "0:51:16", "remaining_time": "1:01:22"} -{"current_steps": 743, "total_steps": 1630, "loss": 0.0692, "lr": 2.850583386530235e-06, "epoch": 4.558282208588957, "percentage": 45.58, "elapsed_time": "0:51:19", "remaining_time": "1:01:16"} -{"current_steps": 744, "total_steps": 1630, "loss": 0.2128, "lr": 2.8458119577351035e-06, "epoch": 4.564417177914111, "percentage": 45.64, "elapsed_time": "0:51:25", "remaining_time": "1:01:14"} -{"current_steps": 745, "total_steps": 1630, "loss": 0.2409, "lr": 2.841039244350351e-06, "epoch": 4.570552147239264, "percentage": 45.71, "elapsed_time": "0:51:32", "remaining_time": "1:01:13"} -{"current_steps": 746, "total_steps": 1630, "loss": 0.1878, "lr": 2.8362652641052024e-06, "epoch": 4.576687116564417, "percentage": 45.77, "elapsed_time": "0:51:33", "remaining_time": "1:01:06"} -{"current_steps": 747, "total_steps": 1630, "loss": 0.1303, "lr": 2.83149003473359e-06, "epoch": 4.58282208588957, "percentage": 45.83, "elapsed_time": "0:51:39", "remaining_time": "1:01:04"} -{"current_steps": 748, "total_steps": 1630, "loss": 0.0577, "lr": 2.8267135739740836e-06, "epoch": 4.588957055214724, "percentage": 45.89, "elapsed_time": "0:51:43", "remaining_time": "1:00:59"} -{"current_steps": 749, "total_steps": 1630, "loss": 0.2329, "lr": 2.8219358995698307e-06, "epoch": 4.595092024539877, "percentage": 45.95, "elapsed_time": "0:51:47", "remaining_time": "1:00:55"} -{"current_steps": 750, "total_steps": 1630, "loss": 0.1329, "lr": 2.8171570292684846e-06, "epoch": 4.601226993865031, "percentage": 46.01, "elapsed_time": "0:51:50", "remaining_time": "1:00:50"} -{"current_steps": 751, "total_steps": 1630, "loss": 0.1512, "lr": 2.8123769808221407e-06, "epoch": 4.6073619631901845, "percentage": 46.07, "elapsed_time": "0:51:56", "remaining_time": "1:00:48"} -{"current_steps": 752, "total_steps": 1630, "loss": 0.1267, "lr": 2.8075957719872724e-06, "epoch": 4.613496932515337, "percentage": 46.13, "elapsed_time": "0:51:58", "remaining_time": "1:00:41"} -{"current_steps": 753, "total_steps": 1630, "loss": 0.147, "lr": 2.8028134205246633e-06, "epoch": 4.61963190184049, "percentage": 46.2, "elapsed_time": "0:52:00", "remaining_time": "1:00:34"} -{"current_steps": 754, "total_steps": 1630, "loss": 0.0947, "lr": 2.7980299441993415e-06, "epoch": 4.625766871165644, "percentage": 46.26, "elapsed_time": "0:52:01", "remaining_time": "1:00:27"} -{"current_steps": 755, "total_steps": 1630, "loss": 0.1498, "lr": 2.793245360780512e-06, "epoch": 4.631901840490798, "percentage": 46.32, "elapsed_time": "0:52:03", "remaining_time": "1:00:19"} -{"current_steps": 756, "total_steps": 1630, "loss": 0.2504, "lr": 2.788459688041495e-06, "epoch": 4.638036809815951, "percentage": 46.38, "elapsed_time": "0:52:06", "remaining_time": "1:00:14"} -{"current_steps": 757, "total_steps": 1630, "loss": 0.2091, "lr": 2.783672943759655e-06, "epoch": 4.644171779141105, "percentage": 46.44, "elapsed_time": "0:52:13", "remaining_time": "1:00:13"} -{"current_steps": 758, "total_steps": 1630, "loss": 0.2045, "lr": 2.778885145716339e-06, "epoch": 4.6503067484662575, "percentage": 46.5, "elapsed_time": "0:52:15", "remaining_time": "1:00:06"} -{"current_steps": 759, "total_steps": 1630, "loss": 0.1416, "lr": 2.7740963116968063e-06, "epoch": 4.656441717791411, "percentage": 46.56, "elapsed_time": "0:52:20", "remaining_time": "1:00:04"} -{"current_steps": 760, "total_steps": 1630, "loss": 0.0455, "lr": 2.7693064594901646e-06, "epoch": 4.662576687116564, "percentage": 46.63, "elapsed_time": "0:52:22", "remaining_time": "0:59:56"} -{"current_steps": 761, "total_steps": 1630, "loss": 0.1496, "lr": 2.7645156068893075e-06, "epoch": 4.668711656441718, "percentage": 46.69, "elapsed_time": "0:52:26", "remaining_time": "0:59:53"} -{"current_steps": 762, "total_steps": 1630, "loss": 0.2061, "lr": 2.759723771690839e-06, "epoch": 4.674846625766871, "percentage": 46.75, "elapsed_time": "0:52:29", "remaining_time": "0:59:47"} -{"current_steps": 763, "total_steps": 1630, "loss": 0.1017, "lr": 2.754930971695019e-06, "epoch": 4.680981595092025, "percentage": 46.81, "elapsed_time": "0:52:30", "remaining_time": "0:59:40"} -{"current_steps": 764, "total_steps": 1630, "loss": 0.1979, "lr": 2.750137224705687e-06, "epoch": 4.6871165644171775, "percentage": 46.87, "elapsed_time": "0:52:34", "remaining_time": "0:59:35"} -{"current_steps": 765, "total_steps": 1630, "loss": 0.1667, "lr": 2.745342548530202e-06, "epoch": 4.693251533742331, "percentage": 46.93, "elapsed_time": "0:52:35", "remaining_time": "0:59:28"} -{"current_steps": 766, "total_steps": 1630, "loss": 0.0346, "lr": 2.7405469609793746e-06, "epoch": 4.699386503067485, "percentage": 46.99, "elapsed_time": "0:52:37", "remaining_time": "0:59:21"} -{"current_steps": 767, "total_steps": 1630, "loss": 0.0596, "lr": 2.7357504798674004e-06, "epoch": 4.705521472392638, "percentage": 47.06, "elapsed_time": "0:52:41", "remaining_time": "0:59:17"} -{"current_steps": 768, "total_steps": 1630, "loss": 0.0384, "lr": 2.730953123011796e-06, "epoch": 4.711656441717792, "percentage": 47.12, "elapsed_time": "0:52:43", "remaining_time": "0:59:10"} -{"current_steps": 769, "total_steps": 1630, "loss": 0.0558, "lr": 2.726154908233328e-06, "epoch": 4.717791411042945, "percentage": 47.18, "elapsed_time": "0:52:46", "remaining_time": "0:59:05"} -{"current_steps": 770, "total_steps": 1630, "loss": 0.2272, "lr": 2.721355853355953e-06, "epoch": 4.723926380368098, "percentage": 47.24, "elapsed_time": "0:52:51", "remaining_time": "0:59:02"} -{"current_steps": 771, "total_steps": 1630, "loss": 0.074, "lr": 2.716555976206748e-06, "epoch": 4.730061349693251, "percentage": 47.3, "elapsed_time": "0:52:54", "remaining_time": "0:58:57"} -{"current_steps": 772, "total_steps": 1630, "loss": 0.1034, "lr": 2.7117552946158415e-06, "epoch": 4.736196319018405, "percentage": 47.36, "elapsed_time": "0:52:55", "remaining_time": "0:58:49"} -{"current_steps": 773, "total_steps": 1630, "loss": 0.1199, "lr": 2.706953826416353e-06, "epoch": 4.742331288343558, "percentage": 47.42, "elapsed_time": "0:52:58", "remaining_time": "0:58:43"} -{"current_steps": 774, "total_steps": 1630, "loss": 0.0467, "lr": 2.702151589444324e-06, "epoch": 4.748466257668712, "percentage": 47.48, "elapsed_time": "0:53:00", "remaining_time": "0:58:37"} -{"current_steps": 775, "total_steps": 1630, "loss": 0.143, "lr": 2.6973486015386507e-06, "epoch": 4.754601226993865, "percentage": 47.55, "elapsed_time": "0:53:02", "remaining_time": "0:58:30"} -{"current_steps": 776, "total_steps": 1630, "loss": 0.3594, "lr": 2.6925448805410197e-06, "epoch": 4.7607361963190185, "percentage": 47.61, "elapsed_time": "0:53:05", "remaining_time": "0:58:25"} -{"current_steps": 777, "total_steps": 1630, "loss": 0.1397, "lr": 2.6877404442958393e-06, "epoch": 4.766871165644172, "percentage": 47.67, "elapsed_time": "0:53:06", "remaining_time": "0:58:18"} -{"current_steps": 778, "total_steps": 1630, "loss": 0.054, "lr": 2.682935310650177e-06, "epoch": 4.773006134969325, "percentage": 47.73, "elapsed_time": "0:53:10", "remaining_time": "0:58:14"} -{"current_steps": 779, "total_steps": 1630, "loss": 0.1284, "lr": 2.6781294974536886e-06, "epoch": 4.779141104294479, "percentage": 47.79, "elapsed_time": "0:53:12", "remaining_time": "0:58:07"} -{"current_steps": 780, "total_steps": 1630, "loss": 0.1441, "lr": 2.673323022558557e-06, "epoch": 4.785276073619632, "percentage": 47.85, "elapsed_time": "0:53:18", "remaining_time": "0:58:05"} -{"current_steps": 781, "total_steps": 1630, "loss": 0.2859, "lr": 2.6685159038194202e-06, "epoch": 4.791411042944786, "percentage": 47.91, "elapsed_time": "0:53:20", "remaining_time": "0:57:59"} -{"current_steps": 782, "total_steps": 1630, "loss": 0.1524, "lr": 2.6637081590933096e-06, "epoch": 4.7975460122699385, "percentage": 47.98, "elapsed_time": "0:53:22", "remaining_time": "0:57:52"} -{"current_steps": 783, "total_steps": 1630, "loss": 0.0338, "lr": 2.6588998062395803e-06, "epoch": 4.803680981595092, "percentage": 48.04, "elapsed_time": "0:53:24", "remaining_time": "0:57:46"} -{"current_steps": 784, "total_steps": 1630, "loss": 0.0755, "lr": 2.6540908631198498e-06, "epoch": 4.809815950920245, "percentage": 48.1, "elapsed_time": "0:53:25", "remaining_time": "0:57:39"} -{"current_steps": 785, "total_steps": 1630, "loss": 0.0631, "lr": 2.6492813475979243e-06, "epoch": 4.815950920245399, "percentage": 48.16, "elapsed_time": "0:53:28", "remaining_time": "0:57:33"} -{"current_steps": 786, "total_steps": 1630, "loss": 0.0853, "lr": 2.6444712775397397e-06, "epoch": 4.822085889570552, "percentage": 48.22, "elapsed_time": "0:53:30", "remaining_time": "0:57:26"} -{"current_steps": 787, "total_steps": 1630, "loss": 0.1895, "lr": 2.639660670813288e-06, "epoch": 4.828220858895706, "percentage": 48.28, "elapsed_time": "0:53:34", "remaining_time": "0:57:22"} -{"current_steps": 788, "total_steps": 1630, "loss": 0.1745, "lr": 2.6348495452885598e-06, "epoch": 4.8343558282208585, "percentage": 48.34, "elapsed_time": "0:53:36", "remaining_time": "0:57:16"} -{"current_steps": 789, "total_steps": 1630, "loss": 0.0846, "lr": 2.630037918837468e-06, "epoch": 4.840490797546012, "percentage": 48.4, "elapsed_time": "0:53:38", "remaining_time": "0:57:10"} -{"current_steps": 790, "total_steps": 1630, "loss": 0.0808, "lr": 2.6252258093337892e-06, "epoch": 4.846625766871165, "percentage": 48.47, "elapsed_time": "0:53:39", "remaining_time": "0:57:03"} -{"current_steps": 791, "total_steps": 1630, "loss": 0.2054, "lr": 2.6204132346530936e-06, "epoch": 4.852760736196319, "percentage": 48.53, "elapsed_time": "0:53:41", "remaining_time": "0:56:56"} -{"current_steps": 792, "total_steps": 1630, "loss": 0.1679, "lr": 2.6156002126726788e-06, "epoch": 4.858895705521473, "percentage": 48.59, "elapsed_time": "0:53:42", "remaining_time": "0:56:49"} -{"current_steps": 793, "total_steps": 1630, "loss": 0.0534, "lr": 2.6107867612715043e-06, "epoch": 4.865030674846626, "percentage": 48.65, "elapsed_time": "0:53:45", "remaining_time": "0:56:44"} -{"current_steps": 794, "total_steps": 1630, "loss": 0.0899, "lr": 2.6059728983301267e-06, "epoch": 4.871165644171779, "percentage": 48.71, "elapsed_time": "0:53:47", "remaining_time": "0:56:37"} -{"current_steps": 795, "total_steps": 1630, "loss": 0.0596, "lr": 2.601158641730629e-06, "epoch": 4.877300613496932, "percentage": 48.77, "elapsed_time": "0:53:51", "remaining_time": "0:56:33"} -{"current_steps": 796, "total_steps": 1630, "loss": 0.3858, "lr": 2.5963440093565567e-06, "epoch": 4.883435582822086, "percentage": 48.83, "elapsed_time": "0:53:52", "remaining_time": "0:56:26"} -{"current_steps": 797, "total_steps": 1630, "loss": 0.12, "lr": 2.5915290190928518e-06, "epoch": 4.889570552147239, "percentage": 48.9, "elapsed_time": "0:53:55", "remaining_time": "0:56:22"} -{"current_steps": 798, "total_steps": 1630, "loss": 0.1278, "lr": 2.586713688825786e-06, "epoch": 4.895705521472393, "percentage": 48.96, "elapsed_time": "0:53:58", "remaining_time": "0:56:16"} -{"current_steps": 799, "total_steps": 1630, "loss": 0.0847, "lr": 2.5818980364428935e-06, "epoch": 4.901840490797546, "percentage": 49.02, "elapsed_time": "0:54:01", "remaining_time": "0:56:11"} -{"current_steps": 800, "total_steps": 1630, "loss": 0.1718, "lr": 2.5770820798329055e-06, "epoch": 4.9079754601226995, "percentage": 49.08, "elapsed_time": "0:54:02", "remaining_time": "0:56:03"} -{"current_steps": 801, "total_steps": 1630, "loss": 0.0895, "lr": 2.572265836885682e-06, "epoch": 4.914110429447852, "percentage": 49.14, "elapsed_time": "0:54:05", "remaining_time": "0:55:59"} -{"current_steps": 802, "total_steps": 1630, "loss": 0.0652, "lr": 2.567449325492149e-06, "epoch": 4.920245398773006, "percentage": 49.2, "elapsed_time": "0:54:07", "remaining_time": "0:55:52"} -{"current_steps": 803, "total_steps": 1630, "loss": 0.0877, "lr": 2.5626325635442283e-06, "epoch": 4.92638036809816, "percentage": 49.26, "elapsed_time": "0:54:11", "remaining_time": "0:55:48"} -{"current_steps": 804, "total_steps": 1630, "loss": 0.2028, "lr": 2.5578155689347716e-06, "epoch": 4.932515337423313, "percentage": 49.33, "elapsed_time": "0:54:19", "remaining_time": "0:55:48"} -{"current_steps": 805, "total_steps": 1630, "loss": 0.031, "lr": 2.5529983595574964e-06, "epoch": 4.938650306748467, "percentage": 49.39, "elapsed_time": "0:54:20", "remaining_time": "0:55:41"} -{"current_steps": 806, "total_steps": 1630, "loss": 0.0415, "lr": 2.548180953306918e-06, "epoch": 4.9447852760736195, "percentage": 49.45, "elapsed_time": "0:54:23", "remaining_time": "0:55:36"} -{"current_steps": 807, "total_steps": 1630, "loss": 0.1188, "lr": 2.5433633680782817e-06, "epoch": 4.950920245398773, "percentage": 49.51, "elapsed_time": "0:54:25", "remaining_time": "0:55:30"} -{"current_steps": 808, "total_steps": 1630, "loss": 0.1703, "lr": 2.538545621767498e-06, "epoch": 4.957055214723926, "percentage": 49.57, "elapsed_time": "0:54:27", "remaining_time": "0:55:23"} -{"current_steps": 809, "total_steps": 1630, "loss": 0.1455, "lr": 2.533727732271077e-06, "epoch": 4.96319018404908, "percentage": 49.63, "elapsed_time": "0:54:29", "remaining_time": "0:55:18"} -{"current_steps": 810, "total_steps": 1630, "loss": 0.0617, "lr": 2.5289097174860593e-06, "epoch": 4.969325153374233, "percentage": 49.69, "elapsed_time": "0:54:31", "remaining_time": "0:55:11"} -{"current_steps": 811, "total_steps": 1630, "loss": 0.1173, "lr": 2.524091595309952e-06, "epoch": 4.975460122699387, "percentage": 49.75, "elapsed_time": "0:54:34", "remaining_time": "0:55:07"} -{"current_steps": 812, "total_steps": 1630, "loss": 0.0538, "lr": 2.519273383640661e-06, "epoch": 4.9815950920245395, "percentage": 49.82, "elapsed_time": "0:54:36", "remaining_time": "0:55:01"} -{"current_steps": 813, "total_steps": 1630, "loss": 0.211, "lr": 2.5144551003764227e-06, "epoch": 4.987730061349693, "percentage": 49.88, "elapsed_time": "0:54:40", "remaining_time": "0:54:56"} -{"current_steps": 814, "total_steps": 1630, "loss": 0.0944, "lr": 2.509636763415742e-06, "epoch": 4.993865030674847, "percentage": 49.94, "elapsed_time": "0:54:44", "remaining_time": "0:54:52"} -{"current_steps": 815, "total_steps": 1630, "loss": 0.098, "lr": 2.5048183906573227e-06, "epoch": 5.0, "percentage": 50.0, "elapsed_time": "0:54:46", "remaining_time": "0:54:46"} -{"current_steps": 816, "total_steps": 1630, "loss": 0.1102, "lr": 2.5e-06, "epoch": 5.006134969325154, "percentage": 50.06, "elapsed_time": "0:58:24", "remaining_time": "0:58:15"} -{"current_steps": 817, "total_steps": 1630, "loss": 0.0712, "lr": 2.495181609342678e-06, "epoch": 5.012269938650307, "percentage": 50.12, "elapsed_time": "0:58:26", "remaining_time": "0:58:09"} -{"current_steps": 818, "total_steps": 1630, "loss": 0.0414, "lr": 2.4903632365842587e-06, "epoch": 5.0184049079754605, "percentage": 50.18, "elapsed_time": "0:58:27", "remaining_time": "0:58:02"} -{"current_steps": 819, "total_steps": 1630, "loss": 0.0894, "lr": 2.4855448996235777e-06, "epoch": 5.024539877300613, "percentage": 50.25, "elapsed_time": "0:58:29", "remaining_time": "0:57:55"} -{"current_steps": 820, "total_steps": 1630, "loss": 0.0796, "lr": 2.48072661635934e-06, "epoch": 5.030674846625767, "percentage": 50.31, "elapsed_time": "0:58:35", "remaining_time": "0:57:52"} -{"current_steps": 821, "total_steps": 1630, "loss": 0.0349, "lr": 2.475908404690049e-06, "epoch": 5.03680981595092, "percentage": 50.37, "elapsed_time": "0:58:39", "remaining_time": "0:57:47"} -{"current_steps": 822, "total_steps": 1630, "loss": 0.2529, "lr": 2.4710902825139415e-06, "epoch": 5.042944785276074, "percentage": 50.43, "elapsed_time": "0:58:42", "remaining_time": "0:57:42"} -{"current_steps": 823, "total_steps": 1630, "loss": 0.1405, "lr": 2.466272267728924e-06, "epoch": 5.049079754601227, "percentage": 50.49, "elapsed_time": "0:58:45", "remaining_time": "0:57:37"} -{"current_steps": 824, "total_steps": 1630, "loss": 0.0408, "lr": 2.461454378232503e-06, "epoch": 5.0552147239263805, "percentage": 50.55, "elapsed_time": "0:58:50", "remaining_time": "0:57:33"} -{"current_steps": 825, "total_steps": 1630, "loss": 0.0338, "lr": 2.4566366319217196e-06, "epoch": 5.061349693251533, "percentage": 50.61, "elapsed_time": "0:58:53", "remaining_time": "0:57:27"} -{"current_steps": 826, "total_steps": 1630, "loss": 0.06, "lr": 2.4518190466930837e-06, "epoch": 5.067484662576687, "percentage": 50.67, "elapsed_time": "0:58:54", "remaining_time": "0:57:20"} -{"current_steps": 827, "total_steps": 1630, "loss": 0.1184, "lr": 2.4470016404425045e-06, "epoch": 5.07361963190184, "percentage": 50.74, "elapsed_time": "0:59:00", "remaining_time": "0:57:17"} -{"current_steps": 828, "total_steps": 1630, "loss": 0.1369, "lr": 2.4421844310652296e-06, "epoch": 5.079754601226994, "percentage": 50.8, "elapsed_time": "0:59:06", "remaining_time": "0:57:14"} -{"current_steps": 829, "total_steps": 1630, "loss": 0.1166, "lr": 2.437367436455773e-06, "epoch": 5.085889570552148, "percentage": 50.86, "elapsed_time": "0:59:09", "remaining_time": "0:57:09"} -{"current_steps": 830, "total_steps": 1630, "loss": 0.1214, "lr": 2.4325506745078524e-06, "epoch": 5.0920245398773005, "percentage": 50.92, "elapsed_time": "0:59:16", "remaining_time": "0:57:08"} -{"current_steps": 831, "total_steps": 1630, "loss": 0.0454, "lr": 2.427734163114319e-06, "epoch": 5.098159509202454, "percentage": 50.98, "elapsed_time": "0:59:17", "remaining_time": "0:57:00"} -{"current_steps": 832, "total_steps": 1630, "loss": 0.0431, "lr": 2.4229179201670954e-06, "epoch": 5.104294478527607, "percentage": 51.04, "elapsed_time": "0:59:19", "remaining_time": "0:56:54"} -{"current_steps": 833, "total_steps": 1630, "loss": 0.0347, "lr": 2.418101963557107e-06, "epoch": 5.110429447852761, "percentage": 51.1, "elapsed_time": "0:59:21", "remaining_time": "0:56:47"} -{"current_steps": 834, "total_steps": 1630, "loss": 0.1555, "lr": 2.413286311174214e-06, "epoch": 5.116564417177914, "percentage": 51.17, "elapsed_time": "0:59:24", "remaining_time": "0:56:42"} -{"current_steps": 835, "total_steps": 1630, "loss": 0.035, "lr": 2.4084709809071487e-06, "epoch": 5.122699386503068, "percentage": 51.23, "elapsed_time": "0:59:27", "remaining_time": "0:56:36"} -{"current_steps": 836, "total_steps": 1630, "loss": 0.0798, "lr": 2.403655990643444e-06, "epoch": 5.128834355828221, "percentage": 51.29, "elapsed_time": "0:59:29", "remaining_time": "0:56:30"} -{"current_steps": 837, "total_steps": 1630, "loss": 0.0178, "lr": 2.398841358269371e-06, "epoch": 5.134969325153374, "percentage": 51.35, "elapsed_time": "0:59:30", "remaining_time": "0:56:22"} -{"current_steps": 838, "total_steps": 1630, "loss": 0.0447, "lr": 2.3940271016698733e-06, "epoch": 5.141104294478527, "percentage": 51.41, "elapsed_time": "0:59:32", "remaining_time": "0:56:16"} -{"current_steps": 839, "total_steps": 1630, "loss": 0.1626, "lr": 2.3892132387284956e-06, "epoch": 5.147239263803681, "percentage": 51.47, "elapsed_time": "0:59:37", "remaining_time": "0:56:13"} -{"current_steps": 840, "total_steps": 1630, "loss": 0.0914, "lr": 2.384399787327322e-06, "epoch": 5.153374233128835, "percentage": 51.53, "elapsed_time": "0:59:39", "remaining_time": "0:56:06"} -{"current_steps": 841, "total_steps": 1630, "loss": 0.0784, "lr": 2.3795867653469072e-06, "epoch": 5.159509202453988, "percentage": 51.6, "elapsed_time": "0:59:40", "remaining_time": "0:55:59"} -{"current_steps": 842, "total_steps": 1630, "loss": 0.0216, "lr": 2.374774190666211e-06, "epoch": 5.1656441717791415, "percentage": 51.66, "elapsed_time": "0:59:43", "remaining_time": "0:55:53"} -{"current_steps": 843, "total_steps": 1630, "loss": 0.0516, "lr": 2.3699620811625327e-06, "epoch": 5.171779141104294, "percentage": 51.72, "elapsed_time": "0:59:47", "remaining_time": "0:55:49"} -{"current_steps": 844, "total_steps": 1630, "loss": 0.0517, "lr": 2.365150454711441e-06, "epoch": 5.177914110429448, "percentage": 51.78, "elapsed_time": "0:59:48", "remaining_time": "0:55:42"} -{"current_steps": 845, "total_steps": 1630, "loss": 0.0264, "lr": 2.3603393291867122e-06, "epoch": 5.184049079754601, "percentage": 51.84, "elapsed_time": "0:59:52", "remaining_time": "0:55:37"} -{"current_steps": 846, "total_steps": 1630, "loss": 0.1079, "lr": 2.355528722460261e-06, "epoch": 5.190184049079755, "percentage": 51.9, "elapsed_time": "0:59:53", "remaining_time": "0:55:30"} -{"current_steps": 847, "total_steps": 1630, "loss": 0.0633, "lr": 2.350718652402076e-06, "epoch": 5.196319018404908, "percentage": 51.96, "elapsed_time": "0:59:57", "remaining_time": "0:55:25"} -{"current_steps": 848, "total_steps": 1630, "loss": 0.1013, "lr": 2.345909136880151e-06, "epoch": 5.2024539877300615, "percentage": 52.02, "elapsed_time": "0:59:59", "remaining_time": "0:55:19"} -{"current_steps": 849, "total_steps": 1630, "loss": 0.0199, "lr": 2.34110019376042e-06, "epoch": 5.208588957055214, "percentage": 52.09, "elapsed_time": "1:00:00", "remaining_time": "0:55:12"} -{"current_steps": 850, "total_steps": 1630, "loss": 0.0288, "lr": 2.336291840906691e-06, "epoch": 5.214723926380368, "percentage": 52.15, "elapsed_time": "1:00:03", "remaining_time": "0:55:06"} -{"current_steps": 851, "total_steps": 1630, "loss": 0.0142, "lr": 2.3314840961805806e-06, "epoch": 5.220858895705521, "percentage": 52.21, "elapsed_time": "1:00:04", "remaining_time": "0:54:59"} -{"current_steps": 852, "total_steps": 1630, "loss": 0.0911, "lr": 2.326676977441444e-06, "epoch": 5.226993865030675, "percentage": 52.27, "elapsed_time": "1:00:11", "remaining_time": "0:54:57"} -{"current_steps": 853, "total_steps": 1630, "loss": 0.0315, "lr": 2.3218705025463118e-06, "epoch": 5.233128834355828, "percentage": 52.33, "elapsed_time": "1:00:13", "remaining_time": "0:54:51"} -{"current_steps": 854, "total_steps": 1630, "loss": 0.1344, "lr": 2.3170646893498237e-06, "epoch": 5.2392638036809815, "percentage": 52.39, "elapsed_time": "1:00:18", "remaining_time": "0:54:48"} -{"current_steps": 855, "total_steps": 1630, "loss": 0.034, "lr": 2.312259555704161e-06, "epoch": 5.245398773006135, "percentage": 52.45, "elapsed_time": "1:00:20", "remaining_time": "0:54:41"} -{"current_steps": 856, "total_steps": 1630, "loss": 0.1889, "lr": 2.3074551194589816e-06, "epoch": 5.251533742331288, "percentage": 52.52, "elapsed_time": "1:00:21", "remaining_time": "0:54:34"} -{"current_steps": 857, "total_steps": 1630, "loss": 0.0794, "lr": 2.3026513984613506e-06, "epoch": 5.257668711656442, "percentage": 52.58, "elapsed_time": "1:00:23", "remaining_time": "0:54:28"} -{"current_steps": 858, "total_steps": 1630, "loss": 0.0238, "lr": 2.297848410555677e-06, "epoch": 5.263803680981595, "percentage": 52.64, "elapsed_time": "1:00:25", "remaining_time": "0:54:21"} -{"current_steps": 859, "total_steps": 1630, "loss": 0.0369, "lr": 2.293046173583648e-06, "epoch": 5.269938650306749, "percentage": 52.7, "elapsed_time": "1:00:26", "remaining_time": "0:54:14"} -{"current_steps": 860, "total_steps": 1630, "loss": 0.0677, "lr": 2.28824470538416e-06, "epoch": 5.276073619631902, "percentage": 52.76, "elapsed_time": "1:00:27", "remaining_time": "0:54:08"} -{"current_steps": 861, "total_steps": 1630, "loss": 0.0244, "lr": 2.2834440237932537e-06, "epoch": 5.282208588957055, "percentage": 52.82, "elapsed_time": "1:00:30", "remaining_time": "0:54:02"} -{"current_steps": 862, "total_steps": 1630, "loss": 0.0628, "lr": 2.2786441466440474e-06, "epoch": 5.288343558282208, "percentage": 52.88, "elapsed_time": "1:00:33", "remaining_time": "0:53:57"} -{"current_steps": 863, "total_steps": 1630, "loss": 0.0914, "lr": 2.2738450917666727e-06, "epoch": 5.294478527607362, "percentage": 52.94, "elapsed_time": "1:00:35", "remaining_time": "0:53:51"} -{"current_steps": 864, "total_steps": 1630, "loss": 0.0546, "lr": 2.269046876988204e-06, "epoch": 5.300613496932515, "percentage": 53.01, "elapsed_time": "1:00:36", "remaining_time": "0:53:44"} -{"current_steps": 865, "total_steps": 1630, "loss": 0.0473, "lr": 2.2642495201325995e-06, "epoch": 5.306748466257669, "percentage": 53.07, "elapsed_time": "1:00:38", "remaining_time": "0:53:37"} -{"current_steps": 866, "total_steps": 1630, "loss": 0.0613, "lr": 2.259453039020626e-06, "epoch": 5.3128834355828225, "percentage": 53.13, "elapsed_time": "1:00:42", "remaining_time": "0:53:33"} -{"current_steps": 867, "total_steps": 1630, "loss": 0.0533, "lr": 2.2546574514697985e-06, "epoch": 5.319018404907975, "percentage": 53.19, "elapsed_time": "1:00:46", "remaining_time": "0:53:29"} -{"current_steps": 868, "total_steps": 1630, "loss": 0.018, "lr": 2.249862775294313e-06, "epoch": 5.325153374233129, "percentage": 53.25, "elapsed_time": "1:00:47", "remaining_time": "0:53:22"} -{"current_steps": 869, "total_steps": 1630, "loss": 0.0246, "lr": 2.245069028304981e-06, "epoch": 5.331288343558282, "percentage": 53.31, "elapsed_time": "1:00:49", "remaining_time": "0:53:16"} -{"current_steps": 870, "total_steps": 1630, "loss": 0.0551, "lr": 2.240276228309161e-06, "epoch": 5.337423312883436, "percentage": 53.37, "elapsed_time": "1:00:55", "remaining_time": "0:53:12"} -{"current_steps": 871, "total_steps": 1630, "loss": 0.0258, "lr": 2.2354843931106933e-06, "epoch": 5.343558282208589, "percentage": 53.44, "elapsed_time": "1:00:56", "remaining_time": "0:53:06"} -{"current_steps": 872, "total_steps": 1630, "loss": 0.0228, "lr": 2.230693540509836e-06, "epoch": 5.3496932515337425, "percentage": 53.5, "elapsed_time": "1:00:57", "remaining_time": "0:52:59"} -{"current_steps": 873, "total_steps": 1630, "loss": 0.0586, "lr": 2.225903688303195e-06, "epoch": 5.355828220858895, "percentage": 53.56, "elapsed_time": "1:01:02", "remaining_time": "0:52:55"} -{"current_steps": 874, "total_steps": 1630, "loss": 0.0733, "lr": 2.221114854283662e-06, "epoch": 5.361963190184049, "percentage": 53.62, "elapsed_time": "1:01:04", "remaining_time": "0:52:49"} -{"current_steps": 875, "total_steps": 1630, "loss": 0.0251, "lr": 2.2163270562403453e-06, "epoch": 5.368098159509202, "percentage": 53.68, "elapsed_time": "1:01:07", "remaining_time": "0:52:44"} -{"current_steps": 876, "total_steps": 1630, "loss": 0.0957, "lr": 2.211540311958506e-06, "epoch": 5.374233128834356, "percentage": 53.74, "elapsed_time": "1:01:09", "remaining_time": "0:52:38"} -{"current_steps": 877, "total_steps": 1630, "loss": 0.0457, "lr": 2.2067546392194888e-06, "epoch": 5.38036809815951, "percentage": 53.8, "elapsed_time": "1:01:11", "remaining_time": "0:52:32"} -{"current_steps": 878, "total_steps": 1630, "loss": 0.0218, "lr": 2.2019700558006598e-06, "epoch": 5.386503067484663, "percentage": 53.87, "elapsed_time": "1:01:14", "remaining_time": "0:52:26"} -{"current_steps": 879, "total_steps": 1630, "loss": 0.0494, "lr": 2.197186579475337e-06, "epoch": 5.392638036809816, "percentage": 53.93, "elapsed_time": "1:01:17", "remaining_time": "0:52:22"} -{"current_steps": 880, "total_steps": 1630, "loss": 0.0803, "lr": 2.1924042280127284e-06, "epoch": 5.398773006134969, "percentage": 53.99, "elapsed_time": "1:01:19", "remaining_time": "0:52:16"} -{"current_steps": 881, "total_steps": 1630, "loss": 0.0356, "lr": 2.1876230191778598e-06, "epoch": 5.404907975460123, "percentage": 54.05, "elapsed_time": "1:01:21", "remaining_time": "0:52:10"} -{"current_steps": 882, "total_steps": 1630, "loss": 0.1245, "lr": 2.182842970731516e-06, "epoch": 5.411042944785276, "percentage": 54.11, "elapsed_time": "1:01:23", "remaining_time": "0:52:03"} -{"current_steps": 883, "total_steps": 1630, "loss": 0.0224, "lr": 2.17806410043017e-06, "epoch": 5.41717791411043, "percentage": 54.17, "elapsed_time": "1:01:25", "remaining_time": "0:51:57"} -{"current_steps": 884, "total_steps": 1630, "loss": 0.0499, "lr": 2.173286426025917e-06, "epoch": 5.423312883435583, "percentage": 54.23, "elapsed_time": "1:01:28", "remaining_time": "0:51:52"} -{"current_steps": 885, "total_steps": 1630, "loss": 0.075, "lr": 2.168509965266411e-06, "epoch": 5.429447852760736, "percentage": 54.29, "elapsed_time": "1:01:31", "remaining_time": "0:51:47"} -{"current_steps": 886, "total_steps": 1630, "loss": 0.065, "lr": 2.1637347358947984e-06, "epoch": 5.435582822085889, "percentage": 54.36, "elapsed_time": "1:01:35", "remaining_time": "0:51:42"} -{"current_steps": 887, "total_steps": 1630, "loss": 0.0848, "lr": 2.15896075564965e-06, "epoch": 5.441717791411043, "percentage": 54.42, "elapsed_time": "1:01:41", "remaining_time": "0:51:40"} -{"current_steps": 888, "total_steps": 1630, "loss": 0.0112, "lr": 2.1541880422648978e-06, "epoch": 5.447852760736196, "percentage": 54.48, "elapsed_time": "1:01:43", "remaining_time": "0:51:34"} -{"current_steps": 889, "total_steps": 1630, "loss": 0.077, "lr": 2.1494166134697655e-06, "epoch": 5.45398773006135, "percentage": 54.54, "elapsed_time": "1:01:45", "remaining_time": "0:51:28"} -{"current_steps": 890, "total_steps": 1630, "loss": 0.03, "lr": 2.1446464869887077e-06, "epoch": 5.460122699386503, "percentage": 54.6, "elapsed_time": "1:01:48", "remaining_time": "0:51:23"} -{"current_steps": 891, "total_steps": 1630, "loss": 0.0141, "lr": 2.13987768054134e-06, "epoch": 5.466257668711656, "percentage": 54.66, "elapsed_time": "1:01:50", "remaining_time": "0:51:17"} -{"current_steps": 892, "total_steps": 1630, "loss": 0.0147, "lr": 2.135110211842374e-06, "epoch": 5.47239263803681, "percentage": 54.72, "elapsed_time": "1:01:52", "remaining_time": "0:51:11"} -{"current_steps": 893, "total_steps": 1630, "loss": 0.1123, "lr": 2.1303440986015525e-06, "epoch": 5.478527607361963, "percentage": 54.79, "elapsed_time": "1:01:59", "remaining_time": "0:51:09"} -{"current_steps": 894, "total_steps": 1630, "loss": 0.0359, "lr": 2.1255793585235827e-06, "epoch": 5.484662576687117, "percentage": 54.85, "elapsed_time": "1:02:00", "remaining_time": "0:51:03"} -{"current_steps": 895, "total_steps": 1630, "loss": 0.0635, "lr": 2.120816009308071e-06, "epoch": 5.49079754601227, "percentage": 54.91, "elapsed_time": "1:02:02", "remaining_time": "0:50:57"} -{"current_steps": 896, "total_steps": 1630, "loss": 0.1104, "lr": 2.1160540686494597e-06, "epoch": 5.4969325153374236, "percentage": 54.97, "elapsed_time": "1:02:04", "remaining_time": "0:50:50"} -{"current_steps": 897, "total_steps": 1630, "loss": 0.0187, "lr": 2.1112935542369546e-06, "epoch": 5.5030674846625764, "percentage": 55.03, "elapsed_time": "1:02:07", "remaining_time": "0:50:45"} -{"current_steps": 898, "total_steps": 1630, "loss": 0.0874, "lr": 2.106534483754466e-06, "epoch": 5.50920245398773, "percentage": 55.09, "elapsed_time": "1:02:11", "remaining_time": "0:50:41"} -{"current_steps": 899, "total_steps": 1630, "loss": 0.0301, "lr": 2.1017768748805396e-06, "epoch": 5.515337423312883, "percentage": 55.15, "elapsed_time": "1:02:13", "remaining_time": "0:50:35"} -{"current_steps": 900, "total_steps": 1630, "loss": 0.1192, "lr": 2.0970207452882917e-06, "epoch": 5.521472392638037, "percentage": 55.21, "elapsed_time": "1:02:16", "remaining_time": "0:50:30"} -{"current_steps": 901, "total_steps": 1630, "loss": 0.0803, "lr": 2.0922661126453436e-06, "epoch": 5.52760736196319, "percentage": 55.28, "elapsed_time": "1:02:19", "remaining_time": "0:50:25"} -{"current_steps": 902, "total_steps": 1630, "loss": 0.0186, "lr": 2.0875129946137557e-06, "epoch": 5.533742331288344, "percentage": 55.34, "elapsed_time": "1:02:21", "remaining_time": "0:50:19"} -{"current_steps": 903, "total_steps": 1630, "loss": 0.0499, "lr": 2.0827614088499624e-06, "epoch": 5.539877300613497, "percentage": 55.4, "elapsed_time": "1:02:23", "remaining_time": "0:50:14"} -{"current_steps": 904, "total_steps": 1630, "loss": 0.0322, "lr": 2.0780113730047056e-06, "epoch": 5.54601226993865, "percentage": 55.46, "elapsed_time": "1:02:28", "remaining_time": "0:50:10"} -{"current_steps": 905, "total_steps": 1630, "loss": 0.0265, "lr": 2.0732629047229712e-06, "epoch": 5.552147239263804, "percentage": 55.52, "elapsed_time": "1:02:29", "remaining_time": "0:50:03"} -{"current_steps": 906, "total_steps": 1630, "loss": 0.0229, "lr": 2.0685160216439205e-06, "epoch": 5.558282208588957, "percentage": 55.58, "elapsed_time": "1:02:32", "remaining_time": "0:49:58"} -{"current_steps": 907, "total_steps": 1630, "loss": 0.0266, "lr": 2.0637707414008267e-06, "epoch": 5.564417177914111, "percentage": 55.64, "elapsed_time": "1:02:34", "remaining_time": "0:49:53"} -{"current_steps": 908, "total_steps": 1630, "loss": 0.018, "lr": 2.0590270816210077e-06, "epoch": 5.570552147239264, "percentage": 55.71, "elapsed_time": "1:02:37", "remaining_time": "0:49:47"} -{"current_steps": 909, "total_steps": 1630, "loss": 0.0377, "lr": 2.0542850599257647e-06, "epoch": 5.576687116564417, "percentage": 55.77, "elapsed_time": "1:02:39", "remaining_time": "0:49:41"} -{"current_steps": 910, "total_steps": 1630, "loss": 0.1224, "lr": 2.0495446939303122e-06, "epoch": 5.58282208588957, "percentage": 55.83, "elapsed_time": "1:02:43", "remaining_time": "0:49:37"} -{"current_steps": 911, "total_steps": 1630, "loss": 0.1457, "lr": 2.044806001243714e-06, "epoch": 5.588957055214724, "percentage": 55.89, "elapsed_time": "1:02:46", "remaining_time": "0:49:32"} -{"current_steps": 912, "total_steps": 1630, "loss": 0.0429, "lr": 2.040068999468818e-06, "epoch": 5.595092024539877, "percentage": 55.95, "elapsed_time": "1:02:49", "remaining_time": "0:49:27"} -{"current_steps": 913, "total_steps": 1630, "loss": 0.0634, "lr": 2.035333706202192e-06, "epoch": 5.601226993865031, "percentage": 56.01, "elapsed_time": "1:02:50", "remaining_time": "0:49:21"} -{"current_steps": 914, "total_steps": 1630, "loss": 0.0178, "lr": 2.0306001390340565e-06, "epoch": 5.6073619631901845, "percentage": 56.07, "elapsed_time": "1:02:52", "remaining_time": "0:49:14"} -{"current_steps": 915, "total_steps": 1630, "loss": 0.037, "lr": 2.02586831554822e-06, "epoch": 5.613496932515337, "percentage": 56.13, "elapsed_time": "1:02:53", "remaining_time": "0:49:08"} -{"current_steps": 916, "total_steps": 1630, "loss": 0.125, "lr": 2.021138253322012e-06, "epoch": 5.61963190184049, "percentage": 56.2, "elapsed_time": "1:03:00", "remaining_time": "0:49:06"} -{"current_steps": 917, "total_steps": 1630, "loss": 0.1897, "lr": 2.016409969926224e-06, "epoch": 5.625766871165644, "percentage": 56.26, "elapsed_time": "1:03:03", "remaining_time": "0:49:02"} -{"current_steps": 918, "total_steps": 1630, "loss": 0.0546, "lr": 2.0116834829250355e-06, "epoch": 5.631901840490798, "percentage": 56.32, "elapsed_time": "1:03:05", "remaining_time": "0:48:55"} -{"current_steps": 919, "total_steps": 1630, "loss": 0.0911, "lr": 2.0069588098759545e-06, "epoch": 5.638036809815951, "percentage": 56.38, "elapsed_time": "1:03:06", "remaining_time": "0:48:49"} -{"current_steps": 920, "total_steps": 1630, "loss": 0.0527, "lr": 2.00223596832975e-06, "epoch": 5.644171779141105, "percentage": 56.44, "elapsed_time": "1:03:09", "remaining_time": "0:48:44"} -{"current_steps": 921, "total_steps": 1630, "loss": 0.0384, "lr": 1.9975149758303885e-06, "epoch": 5.6503067484662575, "percentage": 56.5, "elapsed_time": "1:03:15", "remaining_time": "0:48:41"} -{"current_steps": 922, "total_steps": 1630, "loss": 0.1033, "lr": 1.992795849914967e-06, "epoch": 5.656441717791411, "percentage": 56.56, "elapsed_time": "1:03:18", "remaining_time": "0:48:36"} -{"current_steps": 923, "total_steps": 1630, "loss": 0.08, "lr": 1.9880786081136498e-06, "epoch": 5.662576687116564, "percentage": 56.63, "elapsed_time": "1:03:19", "remaining_time": "0:48:30"} -{"current_steps": 924, "total_steps": 1630, "loss": 0.0819, "lr": 1.9833632679496008e-06, "epoch": 5.668711656441718, "percentage": 56.69, "elapsed_time": "1:03:22", "remaining_time": "0:48:25"} -{"current_steps": 925, "total_steps": 1630, "loss": 0.117, "lr": 1.97864984693892e-06, "epoch": 5.674846625766871, "percentage": 56.75, "elapsed_time": "1:03:25", "remaining_time": "0:48:20"} -{"current_steps": 926, "total_steps": 1630, "loss": 0.0215, "lr": 1.97393836259058e-06, "epoch": 5.680981595092025, "percentage": 56.81, "elapsed_time": "1:03:27", "remaining_time": "0:48:14"} -{"current_steps": 927, "total_steps": 1630, "loss": 0.1422, "lr": 1.969228832406358e-06, "epoch": 5.6871165644171775, "percentage": 56.87, "elapsed_time": "1:03:30", "remaining_time": "0:48:09"} -{"current_steps": 928, "total_steps": 1630, "loss": 0.0538, "lr": 1.964521273880772e-06, "epoch": 5.693251533742331, "percentage": 56.93, "elapsed_time": "1:03:31", "remaining_time": "0:48:03"} -{"current_steps": 929, "total_steps": 1630, "loss": 0.114, "lr": 1.9598157045010162e-06, "epoch": 5.699386503067485, "percentage": 56.99, "elapsed_time": "1:03:35", "remaining_time": "0:47:59"} -{"current_steps": 930, "total_steps": 1630, "loss": 0.053, "lr": 1.9551121417468955e-06, "epoch": 5.705521472392638, "percentage": 57.06, "elapsed_time": "1:03:38", "remaining_time": "0:47:54"} -{"current_steps": 931, "total_steps": 1630, "loss": 0.0866, "lr": 1.9504106030907605e-06, "epoch": 5.711656441717792, "percentage": 57.12, "elapsed_time": "1:03:42", "remaining_time": "0:47:50"} -{"current_steps": 932, "total_steps": 1630, "loss": 0.0908, "lr": 1.945711105997444e-06, "epoch": 5.717791411042945, "percentage": 57.18, "elapsed_time": "1:03:45", "remaining_time": "0:47:45"} -{"current_steps": 933, "total_steps": 1630, "loss": 0.0612, "lr": 1.941013667924194e-06, "epoch": 5.723926380368098, "percentage": 57.24, "elapsed_time": "1:03:47", "remaining_time": "0:47:38"} -{"current_steps": 934, "total_steps": 1630, "loss": 0.0283, "lr": 1.9363183063206097e-06, "epoch": 5.730061349693251, "percentage": 57.3, "elapsed_time": "1:03:50", "remaining_time": "0:47:34"} -{"current_steps": 935, "total_steps": 1630, "loss": 0.0948, "lr": 1.931625038628577e-06, "epoch": 5.736196319018405, "percentage": 57.36, "elapsed_time": "1:03:51", "remaining_time": "0:47:28"} -{"current_steps": 936, "total_steps": 1630, "loss": 0.0769, "lr": 1.9269338822822047e-06, "epoch": 5.742331288343558, "percentage": 57.42, "elapsed_time": "1:03:56", "remaining_time": "0:47:24"} -{"current_steps": 937, "total_steps": 1630, "loss": 0.098, "lr": 1.9222448547077573e-06, "epoch": 5.748466257668712, "percentage": 57.48, "elapsed_time": "1:03:58", "remaining_time": "0:47:19"} -{"current_steps": 938, "total_steps": 1630, "loss": 0.0363, "lr": 1.917557973323591e-06, "epoch": 5.754601226993865, "percentage": 57.55, "elapsed_time": "1:04:01", "remaining_time": "0:47:14"} -{"current_steps": 939, "total_steps": 1630, "loss": 0.0205, "lr": 1.9128732555400915e-06, "epoch": 5.7607361963190185, "percentage": 57.61, "elapsed_time": "1:04:04", "remaining_time": "0:47:08"} -{"current_steps": 940, "total_steps": 1630, "loss": 0.0548, "lr": 1.9081907187596054e-06, "epoch": 5.766871165644172, "percentage": 57.67, "elapsed_time": "1:04:08", "remaining_time": "0:47:04"} -{"current_steps": 941, "total_steps": 1630, "loss": 0.0454, "lr": 1.9035103803763793e-06, "epoch": 5.773006134969325, "percentage": 57.73, "elapsed_time": "1:04:11", "remaining_time": "0:47:00"} -{"current_steps": 942, "total_steps": 1630, "loss": 0.0514, "lr": 1.8988322577764918e-06, "epoch": 5.779141104294479, "percentage": 57.79, "elapsed_time": "1:04:15", "remaining_time": "0:46:55"} -{"current_steps": 943, "total_steps": 1630, "loss": 0.1361, "lr": 1.8941563683377905e-06, "epoch": 5.785276073619632, "percentage": 57.85, "elapsed_time": "1:04:19", "remaining_time": "0:46:51"} -{"current_steps": 944, "total_steps": 1630, "loss": 0.0139, "lr": 1.8894827294298296e-06, "epoch": 5.791411042944786, "percentage": 57.91, "elapsed_time": "1:04:21", "remaining_time": "0:46:45"} -{"current_steps": 945, "total_steps": 1630, "loss": 0.0311, "lr": 1.884811358413801e-06, "epoch": 5.7975460122699385, "percentage": 57.98, "elapsed_time": "1:04:23", "remaining_time": "0:46:40"} -{"current_steps": 946, "total_steps": 1630, "loss": 0.0227, "lr": 1.8801422726424735e-06, "epoch": 5.803680981595092, "percentage": 58.04, "elapsed_time": "1:04:26", "remaining_time": "0:46:35"} -{"current_steps": 947, "total_steps": 1630, "loss": 0.0157, "lr": 1.8754754894601252e-06, "epoch": 5.809815950920245, "percentage": 58.1, "elapsed_time": "1:04:29", "remaining_time": "0:46:30"} -{"current_steps": 948, "total_steps": 1630, "loss": 0.1093, "lr": 1.870811026202482e-06, "epoch": 5.815950920245399, "percentage": 58.16, "elapsed_time": "1:04:33", "remaining_time": "0:46:26"} -{"current_steps": 949, "total_steps": 1630, "loss": 0.021, "lr": 1.8661489001966526e-06, "epoch": 5.822085889570552, "percentage": 58.22, "elapsed_time": "1:04:36", "remaining_time": "0:46:21"} -{"current_steps": 950, "total_steps": 1630, "loss": 0.0663, "lr": 1.8614891287610621e-06, "epoch": 5.828220858895706, "percentage": 58.28, "elapsed_time": "1:04:37", "remaining_time": "0:46:15"} -{"current_steps": 951, "total_steps": 1630, "loss": 0.1008, "lr": 1.8568317292053894e-06, "epoch": 5.8343558282208585, "percentage": 58.34, "elapsed_time": "1:04:43", "remaining_time": "0:46:12"} -{"current_steps": 952, "total_steps": 1630, "loss": 0.0451, "lr": 1.8521767188305023e-06, "epoch": 5.840490797546012, "percentage": 58.4, "elapsed_time": "1:04:47", "remaining_time": "0:46:08"} -{"current_steps": 953, "total_steps": 1630, "loss": 0.0561, "lr": 1.8475241149283957e-06, "epoch": 5.846625766871165, "percentage": 58.47, "elapsed_time": "1:04:51", "remaining_time": "0:46:04"} -{"current_steps": 954, "total_steps": 1630, "loss": 0.0265, "lr": 1.842873934782122e-06, "epoch": 5.852760736196319, "percentage": 58.53, "elapsed_time": "1:04:55", "remaining_time": "0:46:00"} -{"current_steps": 955, "total_steps": 1630, "loss": 0.1196, "lr": 1.8382261956657318e-06, "epoch": 5.858895705521473, "percentage": 58.59, "elapsed_time": "1:05:01", "remaining_time": "0:45:57"} -{"current_steps": 956, "total_steps": 1630, "loss": 0.1356, "lr": 1.8335809148442074e-06, "epoch": 5.865030674846626, "percentage": 58.65, "elapsed_time": "1:05:05", "remaining_time": "0:45:53"} -{"current_steps": 957, "total_steps": 1630, "loss": 0.0444, "lr": 1.8289381095734005e-06, "epoch": 5.871165644171779, "percentage": 58.71, "elapsed_time": "1:05:07", "remaining_time": "0:45:47"} -{"current_steps": 958, "total_steps": 1630, "loss": 0.0622, "lr": 1.8242977970999643e-06, "epoch": 5.877300613496932, "percentage": 58.77, "elapsed_time": "1:05:13", "remaining_time": "0:45:45"} -{"current_steps": 959, "total_steps": 1630, "loss": 0.0762, "lr": 1.8196599946612956e-06, "epoch": 5.883435582822086, "percentage": 58.83, "elapsed_time": "1:05:15", "remaining_time": "0:45:39"} -{"current_steps": 960, "total_steps": 1630, "loss": 0.0207, "lr": 1.8150247194854642e-06, "epoch": 5.889570552147239, "percentage": 58.9, "elapsed_time": "1:05:17", "remaining_time": "0:45:33"} -{"current_steps": 961, "total_steps": 1630, "loss": 0.1122, "lr": 1.8103919887911525e-06, "epoch": 5.895705521472393, "percentage": 58.96, "elapsed_time": "1:05:23", "remaining_time": "0:45:31"} -{"current_steps": 962, "total_steps": 1630, "loss": 0.0284, "lr": 1.8057618197875914e-06, "epoch": 5.901840490797546, "percentage": 59.02, "elapsed_time": "1:05:25", "remaining_time": "0:45:25"} -{"current_steps": 963, "total_steps": 1630, "loss": 0.0239, "lr": 1.8011342296744961e-06, "epoch": 5.9079754601226995, "percentage": 59.08, "elapsed_time": "1:05:28", "remaining_time": "0:45:21"} -{"current_steps": 964, "total_steps": 1630, "loss": 0.0425, "lr": 1.796509235642001e-06, "epoch": 5.914110429447852, "percentage": 59.14, "elapsed_time": "1:05:31", "remaining_time": "0:45:15"} -{"current_steps": 965, "total_steps": 1630, "loss": 0.2094, "lr": 1.7918868548705982e-06, "epoch": 5.920245398773006, "percentage": 59.2, "elapsed_time": "1:05:33", "remaining_time": "0:45:10"} -{"current_steps": 966, "total_steps": 1630, "loss": 0.0632, "lr": 1.7872671045310703e-06, "epoch": 5.92638036809816, "percentage": 59.26, "elapsed_time": "1:05:36", "remaining_time": "0:45:05"} -{"current_steps": 967, "total_steps": 1630, "loss": 0.1411, "lr": 1.782650001784431e-06, "epoch": 5.932515337423313, "percentage": 59.33, "elapsed_time": "1:05:39", "remaining_time": "0:45:01"} -{"current_steps": 968, "total_steps": 1630, "loss": 0.0965, "lr": 1.7780355637818568e-06, "epoch": 5.938650306748467, "percentage": 59.39, "elapsed_time": "1:05:40", "remaining_time": "0:44:55"} -{"current_steps": 969, "total_steps": 1630, "loss": 0.0568, "lr": 1.7734238076646277e-06, "epoch": 5.9447852760736195, "percentage": 59.45, "elapsed_time": "1:05:45", "remaining_time": "0:44:51"} -{"current_steps": 970, "total_steps": 1630, "loss": 0.0182, "lr": 1.7688147505640581e-06, "epoch": 5.950920245398773, "percentage": 59.51, "elapsed_time": "1:05:46", "remaining_time": "0:44:45"} -{"current_steps": 971, "total_steps": 1630, "loss": 0.0547, "lr": 1.7642084096014405e-06, "epoch": 5.957055214723926, "percentage": 59.57, "elapsed_time": "1:05:47", "remaining_time": "0:44:39"} -{"current_steps": 972, "total_steps": 1630, "loss": 0.0775, "lr": 1.759604801887974e-06, "epoch": 5.96319018404908, "percentage": 59.63, "elapsed_time": "1:05:50", "remaining_time": "0:44:34"} -{"current_steps": 973, "total_steps": 1630, "loss": 0.0541, "lr": 1.7550039445247069e-06, "epoch": 5.969325153374233, "percentage": 59.69, "elapsed_time": "1:05:53", "remaining_time": "0:44:29"} -{"current_steps": 974, "total_steps": 1630, "loss": 0.0257, "lr": 1.7504058546024694e-06, "epoch": 5.975460122699387, "percentage": 59.75, "elapsed_time": "1:05:54", "remaining_time": "0:44:23"} -{"current_steps": 975, "total_steps": 1630, "loss": 0.0767, "lr": 1.7458105492018114e-06, "epoch": 5.9815950920245395, "percentage": 59.82, "elapsed_time": "1:05:56", "remaining_time": "0:44:17"} -{"current_steps": 976, "total_steps": 1630, "loss": 0.025, "lr": 1.7412180453929412e-06, "epoch": 5.987730061349693, "percentage": 59.88, "elapsed_time": "1:06:00", "remaining_time": "0:44:14"} -{"current_steps": 977, "total_steps": 1630, "loss": 0.0183, "lr": 1.736628360235657e-06, "epoch": 5.993865030674847, "percentage": 59.94, "elapsed_time": "1:06:02", "remaining_time": "0:44:08"} -{"current_steps": 978, "total_steps": 1630, "loss": 0.1369, "lr": 1.7320415107792893e-06, "epoch": 6.0, "percentage": 60.0, "elapsed_time": "1:06:05", "remaining_time": "0:44:03"} -{"current_steps": 979, "total_steps": 1630, "loss": 0.1011, "lr": 1.7274575140626318e-06, "epoch": 6.006134969325154, "percentage": 60.06, "elapsed_time": "1:09:33", "remaining_time": "0:46:15"} -{"current_steps": 980, "total_steps": 1630, "loss": 0.0105, "lr": 1.7228763871138845e-06, "epoch": 6.012269938650307, "percentage": 60.12, "elapsed_time": "1:09:34", "remaining_time": "0:46:08"} -{"current_steps": 981, "total_steps": 1630, "loss": 0.0373, "lr": 1.718298146950585e-06, "epoch": 6.0184049079754605, "percentage": 60.18, "elapsed_time": "1:09:36", "remaining_time": "0:46:03"} -{"current_steps": 982, "total_steps": 1630, "loss": 0.0072, "lr": 1.7137228105795473e-06, "epoch": 6.024539877300613, "percentage": 60.25, "elapsed_time": "1:09:37", "remaining_time": "0:45:56"} -{"current_steps": 983, "total_steps": 1630, "loss": 0.0126, "lr": 1.7091503949967987e-06, "epoch": 6.030674846625767, "percentage": 60.31, "elapsed_time": "1:09:40", "remaining_time": "0:45:51"} -{"current_steps": 984, "total_steps": 1630, "loss": 0.0198, "lr": 1.7045809171875183e-06, "epoch": 6.03680981595092, "percentage": 60.37, "elapsed_time": "1:09:41", "remaining_time": "0:45:45"} -{"current_steps": 985, "total_steps": 1630, "loss": 0.0186, "lr": 1.70001439412597e-06, "epoch": 6.042944785276074, "percentage": 60.43, "elapsed_time": "1:09:43", "remaining_time": "0:45:39"} -{"current_steps": 986, "total_steps": 1630, "loss": 0.0197, "lr": 1.6954508427754435e-06, "epoch": 6.049079754601227, "percentage": 60.49, "elapsed_time": "1:09:45", "remaining_time": "0:45:33"} -{"current_steps": 987, "total_steps": 1630, "loss": 0.0192, "lr": 1.690890280088187e-06, "epoch": 6.0552147239263805, "percentage": 60.55, "elapsed_time": "1:09:47", "remaining_time": "0:45:27"} -{"current_steps": 988, "total_steps": 1630, "loss": 0.0105, "lr": 1.6863327230053506e-06, "epoch": 6.061349693251533, "percentage": 60.61, "elapsed_time": "1:09:50", "remaining_time": "0:45:22"} -{"current_steps": 989, "total_steps": 1630, "loss": 0.0275, "lr": 1.6817781884569146e-06, "epoch": 6.067484662576687, "percentage": 60.67, "elapsed_time": "1:09:52", "remaining_time": "0:45:17"} -{"current_steps": 990, "total_steps": 1630, "loss": 0.0095, "lr": 1.677226693361636e-06, "epoch": 6.07361963190184, "percentage": 60.74, "elapsed_time": "1:09:54", "remaining_time": "0:45:11"} -{"current_steps": 991, "total_steps": 1630, "loss": 0.0483, "lr": 1.6726782546269793e-06, "epoch": 6.079754601226994, "percentage": 60.8, "elapsed_time": "1:09:57", "remaining_time": "0:45:06"} -{"current_steps": 992, "total_steps": 1630, "loss": 0.0815, "lr": 1.6681328891490544e-06, "epoch": 6.085889570552148, "percentage": 60.86, "elapsed_time": "1:10:04", "remaining_time": "0:45:03"} -{"current_steps": 993, "total_steps": 1630, "loss": 0.0216, "lr": 1.663590613812556e-06, "epoch": 6.0920245398773005, "percentage": 60.92, "elapsed_time": "1:10:05", "remaining_time": "0:44:57"} -{"current_steps": 994, "total_steps": 1630, "loss": 0.0243, "lr": 1.6590514454907007e-06, "epoch": 6.098159509202454, "percentage": 60.98, "elapsed_time": "1:10:10", "remaining_time": "0:44:53"} -{"current_steps": 995, "total_steps": 1630, "loss": 0.0669, "lr": 1.6545154010451613e-06, "epoch": 6.104294478527607, "percentage": 61.04, "elapsed_time": "1:10:13", "remaining_time": "0:44:48"} -{"current_steps": 996, "total_steps": 1630, "loss": 0.0309, "lr": 1.6499824973260086e-06, "epoch": 6.110429447852761, "percentage": 61.1, "elapsed_time": "1:10:15", "remaining_time": "0:44:43"} -{"current_steps": 997, "total_steps": 1630, "loss": 0.026, "lr": 1.645452751171645e-06, "epoch": 6.116564417177914, "percentage": 61.17, "elapsed_time": "1:10:20", "remaining_time": "0:44:39"} -{"current_steps": 998, "total_steps": 1630, "loss": 0.0191, "lr": 1.6409261794087438e-06, "epoch": 6.122699386503068, "percentage": 61.23, "elapsed_time": "1:10:23", "remaining_time": "0:44:34"} -{"current_steps": 999, "total_steps": 1630, "loss": 0.045, "lr": 1.6364027988521875e-06, "epoch": 6.128834355828221, "percentage": 61.29, "elapsed_time": "1:10:26", "remaining_time": "0:44:29"} -{"current_steps": 1000, "total_steps": 1630, "loss": 0.0197, "lr": 1.6318826263050022e-06, "epoch": 6.134969325153374, "percentage": 61.35, "elapsed_time": "1:10:27", "remaining_time": "0:44:23"} -{"current_steps": 1001, "total_steps": 1630, "loss": 0.0092, "lr": 1.6273656785582986e-06, "epoch": 6.141104294478527, "percentage": 61.41, "elapsed_time": "1:10:32", "remaining_time": "0:44:19"} -{"current_steps": 1002, "total_steps": 1630, "loss": 0.0141, "lr": 1.6228519723912073e-06, "epoch": 6.147239263803681, "percentage": 61.47, "elapsed_time": "1:10:36", "remaining_time": "0:44:15"} -{"current_steps": 1003, "total_steps": 1630, "loss": 0.0131, "lr": 1.618341524570819e-06, "epoch": 6.153374233128835, "percentage": 61.53, "elapsed_time": "1:10:39", "remaining_time": "0:44:10"} -{"current_steps": 1004, "total_steps": 1630, "loss": 0.0686, "lr": 1.613834351852119e-06, "epoch": 6.159509202453988, "percentage": 61.6, "elapsed_time": "1:10:46", "remaining_time": "0:44:07"} -{"current_steps": 1005, "total_steps": 1630, "loss": 0.036, "lr": 1.6093304709779273e-06, "epoch": 6.1656441717791415, "percentage": 61.66, "elapsed_time": "1:10:48", "remaining_time": "0:44:02"} -{"current_steps": 1006, "total_steps": 1630, "loss": 0.0216, "lr": 1.6048298986788345e-06, "epoch": 6.171779141104294, "percentage": 61.72, "elapsed_time": "1:10:54", "remaining_time": "0:43:58"} -{"current_steps": 1007, "total_steps": 1630, "loss": 0.024, "lr": 1.6003326516731431e-06, "epoch": 6.177914110429448, "percentage": 61.78, "elapsed_time": "1:10:56", "remaining_time": "0:43:53"} -{"current_steps": 1008, "total_steps": 1630, "loss": 0.0133, "lr": 1.5958387466668015e-06, "epoch": 6.184049079754601, "percentage": 61.84, "elapsed_time": "1:10:59", "remaining_time": "0:43:48"} -{"current_steps": 1009, "total_steps": 1630, "loss": 0.0331, "lr": 1.5913482003533437e-06, "epoch": 6.190184049079755, "percentage": 61.9, "elapsed_time": "1:11:00", "remaining_time": "0:43:42"} -{"current_steps": 1010, "total_steps": 1630, "loss": 0.0111, "lr": 1.5868610294138264e-06, "epoch": 6.196319018404908, "percentage": 61.96, "elapsed_time": "1:11:02", "remaining_time": "0:43:36"} -{"current_steps": 1011, "total_steps": 1630, "loss": 0.0112, "lr": 1.58237725051677e-06, "epoch": 6.2024539877300615, "percentage": 62.02, "elapsed_time": "1:11:06", "remaining_time": "0:43:32"} -{"current_steps": 1012, "total_steps": 1630, "loss": 0.0181, "lr": 1.577896880318093e-06, "epoch": 6.208588957055214, "percentage": 62.09, "elapsed_time": "1:11:08", "remaining_time": "0:43:26"} -{"current_steps": 1013, "total_steps": 1630, "loss": 0.0135, "lr": 1.5734199354610513e-06, "epoch": 6.214723926380368, "percentage": 62.15, "elapsed_time": "1:11:09", "remaining_time": "0:43:20"} -{"current_steps": 1014, "total_steps": 1630, "loss": 0.0163, "lr": 1.5689464325761764e-06, "epoch": 6.220858895705521, "percentage": 62.21, "elapsed_time": "1:11:12", "remaining_time": "0:43:15"} -{"current_steps": 1015, "total_steps": 1630, "loss": 0.0068, "lr": 1.564476388281216e-06, "epoch": 6.226993865030675, "percentage": 62.27, "elapsed_time": "1:11:14", "remaining_time": "0:43:10"} -{"current_steps": 1016, "total_steps": 1630, "loss": 0.021, "lr": 1.5600098191810682e-06, "epoch": 6.233128834355828, "percentage": 62.33, "elapsed_time": "1:11:17", "remaining_time": "0:43:05"} -{"current_steps": 1017, "total_steps": 1630, "loss": 0.0349, "lr": 1.555546741867722e-06, "epoch": 6.2392638036809815, "percentage": 62.39, "elapsed_time": "1:11:18", "remaining_time": "0:42:59"} -{"current_steps": 1018, "total_steps": 1630, "loss": 0.0626, "lr": 1.5510871729201953e-06, "epoch": 6.245398773006135, "percentage": 62.45, "elapsed_time": "1:11:24", "remaining_time": "0:42:55"} -{"current_steps": 1019, "total_steps": 1630, "loss": 0.0082, "lr": 1.5466311289044755e-06, "epoch": 6.251533742331288, "percentage": 62.52, "elapsed_time": "1:11:26", "remaining_time": "0:42:50"} -{"current_steps": 1020, "total_steps": 1630, "loss": 0.0212, "lr": 1.5421786263734524e-06, "epoch": 6.257668711656442, "percentage": 62.58, "elapsed_time": "1:11:28", "remaining_time": "0:42:44"} -{"current_steps": 1021, "total_steps": 1630, "loss": 0.0963, "lr": 1.5377296818668638e-06, "epoch": 6.263803680981595, "percentage": 62.64, "elapsed_time": "1:11:32", "remaining_time": "0:42:40"} -{"current_steps": 1022, "total_steps": 1630, "loss": 0.011, "lr": 1.5332843119112285e-06, "epoch": 6.269938650306749, "percentage": 62.7, "elapsed_time": "1:11:34", "remaining_time": "0:42:34"} -{"current_steps": 1023, "total_steps": 1630, "loss": 0.018, "lr": 1.5288425330197864e-06, "epoch": 6.276073619631902, "percentage": 62.76, "elapsed_time": "1:11:35", "remaining_time": "0:42:28"} -{"current_steps": 1024, "total_steps": 1630, "loss": 0.0162, "lr": 1.5244043616924389e-06, "epoch": 6.282208588957055, "percentage": 62.82, "elapsed_time": "1:11:37", "remaining_time": "0:42:23"} -{"current_steps": 1025, "total_steps": 1630, "loss": 0.0468, "lr": 1.5199698144156865e-06, "epoch": 6.288343558282208, "percentage": 62.88, "elapsed_time": "1:11:40", "remaining_time": "0:42:18"} -{"current_steps": 1026, "total_steps": 1630, "loss": 0.0064, "lr": 1.5155389076625663e-06, "epoch": 6.294478527607362, "percentage": 62.94, "elapsed_time": "1:11:41", "remaining_time": "0:42:12"} -{"current_steps": 1027, "total_steps": 1630, "loss": 0.035, "lr": 1.5111116578925924e-06, "epoch": 6.300613496932515, "percentage": 63.01, "elapsed_time": "1:11:43", "remaining_time": "0:42:06"} -{"current_steps": 1028, "total_steps": 1630, "loss": 0.0197, "lr": 1.5066880815516943e-06, "epoch": 6.306748466257669, "percentage": 63.07, "elapsed_time": "1:11:45", "remaining_time": "0:42:01"} -{"current_steps": 1029, "total_steps": 1630, "loss": 0.0059, "lr": 1.5022681950721565e-06, "epoch": 6.3128834355828225, "percentage": 63.13, "elapsed_time": "1:11:47", "remaining_time": "0:41:55"} -{"current_steps": 1030, "total_steps": 1630, "loss": 0.006, "lr": 1.4978520148725558e-06, "epoch": 6.319018404907975, "percentage": 63.19, "elapsed_time": "1:11:48", "remaining_time": "0:41:49"} -{"current_steps": 1031, "total_steps": 1630, "loss": 0.0126, "lr": 1.4934395573577016e-06, "epoch": 6.325153374233129, "percentage": 63.25, "elapsed_time": "1:11:49", "remaining_time": "0:41:43"} -{"current_steps": 1032, "total_steps": 1630, "loss": 0.0131, "lr": 1.4890308389185743e-06, "epoch": 6.331288343558282, "percentage": 63.31, "elapsed_time": "1:11:52", "remaining_time": "0:41:38"} -{"current_steps": 1033, "total_steps": 1630, "loss": 0.016, "lr": 1.484625875932265e-06, "epoch": 6.337423312883436, "percentage": 63.37, "elapsed_time": "1:11:56", "remaining_time": "0:41:34"} -{"current_steps": 1034, "total_steps": 1630, "loss": 0.1059, "lr": 1.480224684761915e-06, "epoch": 6.343558282208589, "percentage": 63.44, "elapsed_time": "1:11:58", "remaining_time": "0:41:29"} -{"current_steps": 1035, "total_steps": 1630, "loss": 0.0312, "lr": 1.4758272817566538e-06, "epoch": 6.3496932515337425, "percentage": 63.5, "elapsed_time": "1:12:02", "remaining_time": "0:41:25"} -{"current_steps": 1036, "total_steps": 1630, "loss": 0.0215, "lr": 1.4714336832515386e-06, "epoch": 6.355828220858895, "percentage": 63.56, "elapsed_time": "1:12:05", "remaining_time": "0:41:20"} -{"current_steps": 1037, "total_steps": 1630, "loss": 0.0718, "lr": 1.467043905567494e-06, "epoch": 6.361963190184049, "percentage": 63.62, "elapsed_time": "1:12:09", "remaining_time": "0:41:15"} -{"current_steps": 1038, "total_steps": 1630, "loss": 0.0166, "lr": 1.4626579650112533e-06, "epoch": 6.368098159509202, "percentage": 63.68, "elapsed_time": "1:12:12", "remaining_time": "0:41:10"} -{"current_steps": 1039, "total_steps": 1630, "loss": 0.0448, "lr": 1.4582758778752926e-06, "epoch": 6.374233128834356, "percentage": 63.74, "elapsed_time": "1:12:17", "remaining_time": "0:41:07"} -{"current_steps": 1040, "total_steps": 1630, "loss": 0.0297, "lr": 1.4538976604377781e-06, "epoch": 6.38036809815951, "percentage": 63.8, "elapsed_time": "1:12:23", "remaining_time": "0:41:04"} -{"current_steps": 1041, "total_steps": 1630, "loss": 0.0409, "lr": 1.449523328962496e-06, "epoch": 6.386503067484663, "percentage": 63.87, "elapsed_time": "1:12:24", "remaining_time": "0:40:58"} -{"current_steps": 1042, "total_steps": 1630, "loss": 0.0127, "lr": 1.4451528996988018e-06, "epoch": 6.392638036809816, "percentage": 63.93, "elapsed_time": "1:12:28", "remaining_time": "0:40:53"} -{"current_steps": 1043, "total_steps": 1630, "loss": 0.0788, "lr": 1.4407863888815527e-06, "epoch": 6.398773006134969, "percentage": 63.99, "elapsed_time": "1:12:34", "remaining_time": "0:40:50"} -{"current_steps": 1044, "total_steps": 1630, "loss": 0.0082, "lr": 1.436423812731051e-06, "epoch": 6.404907975460123, "percentage": 64.05, "elapsed_time": "1:12:36", "remaining_time": "0:40:45"} -{"current_steps": 1045, "total_steps": 1630, "loss": 0.0086, "lr": 1.432065187452984e-06, "epoch": 6.411042944785276, "percentage": 64.11, "elapsed_time": "1:12:37", "remaining_time": "0:40:39"} -{"current_steps": 1046, "total_steps": 1630, "loss": 0.04, "lr": 1.4277105292383594e-06, "epoch": 6.41717791411043, "percentage": 64.17, "elapsed_time": "1:12:43", "remaining_time": "0:40:36"} -{"current_steps": 1047, "total_steps": 1630, "loss": 0.0054, "lr": 1.4233598542634519e-06, "epoch": 6.423312883435583, "percentage": 64.23, "elapsed_time": "1:12:47", "remaining_time": "0:40:31"} -{"current_steps": 1048, "total_steps": 1630, "loss": 0.0263, "lr": 1.4190131786897388e-06, "epoch": 6.429447852760736, "percentage": 64.29, "elapsed_time": "1:12:50", "remaining_time": "0:40:26"} -{"current_steps": 1049, "total_steps": 1630, "loss": 0.0098, "lr": 1.4146705186638388e-06, "epoch": 6.435582822085889, "percentage": 64.36, "elapsed_time": "1:12:52", "remaining_time": "0:40:21"} -{"current_steps": 1050, "total_steps": 1630, "loss": 0.0322, "lr": 1.410331890317457e-06, "epoch": 6.441717791411043, "percentage": 64.42, "elapsed_time": "1:12:54", "remaining_time": "0:40:16"} -{"current_steps": 1051, "total_steps": 1630, "loss": 0.0729, "lr": 1.4059973097673187e-06, "epoch": 6.447852760736196, "percentage": 64.48, "elapsed_time": "1:12:56", "remaining_time": "0:40:10"} -{"current_steps": 1052, "total_steps": 1630, "loss": 0.0094, "lr": 1.4016667931151156e-06, "epoch": 6.45398773006135, "percentage": 64.54, "elapsed_time": "1:12:58", "remaining_time": "0:40:05"} -{"current_steps": 1053, "total_steps": 1630, "loss": 0.0078, "lr": 1.3973403564474422e-06, "epoch": 6.460122699386503, "percentage": 64.6, "elapsed_time": "1:13:00", "remaining_time": "0:40:00"} -{"current_steps": 1054, "total_steps": 1630, "loss": 0.0231, "lr": 1.393018015835737e-06, "epoch": 6.466257668711656, "percentage": 64.66, "elapsed_time": "1:13:01", "remaining_time": "0:39:54"} -{"current_steps": 1055, "total_steps": 1630, "loss": 0.0153, "lr": 1.388699787336224e-06, "epoch": 6.47239263803681, "percentage": 64.72, "elapsed_time": "1:13:05", "remaining_time": "0:39:50"} -{"current_steps": 1056, "total_steps": 1630, "loss": 0.0136, "lr": 1.3843856869898486e-06, "epoch": 6.478527607361963, "percentage": 64.79, "elapsed_time": "1:13:08", "remaining_time": "0:39:45"} -{"current_steps": 1057, "total_steps": 1630, "loss": 0.0819, "lr": 1.3800757308222263e-06, "epoch": 6.484662576687117, "percentage": 64.85, "elapsed_time": "1:13:15", "remaining_time": "0:39:42"} -{"current_steps": 1058, "total_steps": 1630, "loss": 0.0658, "lr": 1.3757699348435726e-06, "epoch": 6.49079754601227, "percentage": 64.91, "elapsed_time": "1:13:21", "remaining_time": "0:39:39"} -{"current_steps": 1059, "total_steps": 1630, "loss": 0.0106, "lr": 1.3714683150486534e-06, "epoch": 6.4969325153374236, "percentage": 64.97, "elapsed_time": "1:13:23", "remaining_time": "0:39:34"} -{"current_steps": 1060, "total_steps": 1630, "loss": 0.0151, "lr": 1.3671708874167211e-06, "epoch": 6.5030674846625764, "percentage": 65.03, "elapsed_time": "1:13:25", "remaining_time": "0:39:29"} -{"current_steps": 1061, "total_steps": 1630, "loss": 0.0114, "lr": 1.3628776679114516e-06, "epoch": 6.50920245398773, "percentage": 65.09, "elapsed_time": "1:13:27", "remaining_time": "0:39:23"} -{"current_steps": 1062, "total_steps": 1630, "loss": 0.0117, "lr": 1.3585886724808934e-06, "epoch": 6.515337423312883, "percentage": 65.15, "elapsed_time": "1:13:30", "remaining_time": "0:39:19"} -{"current_steps": 1063, "total_steps": 1630, "loss": 0.0381, "lr": 1.3543039170574022e-06, "epoch": 6.521472392638037, "percentage": 65.21, "elapsed_time": "1:13:33", "remaining_time": "0:39:14"} -{"current_steps": 1064, "total_steps": 1630, "loss": 0.0072, "lr": 1.350023417557581e-06, "epoch": 6.52760736196319, "percentage": 65.28, "elapsed_time": "1:13:37", "remaining_time": "0:39:09"} -{"current_steps": 1065, "total_steps": 1630, "loss": 0.0302, "lr": 1.345747189882228e-06, "epoch": 6.533742331288344, "percentage": 65.34, "elapsed_time": "1:13:41", "remaining_time": "0:39:05"} -{"current_steps": 1066, "total_steps": 1630, "loss": 0.0095, "lr": 1.3414752499162676e-06, "epoch": 6.539877300613497, "percentage": 65.4, "elapsed_time": "1:13:43", "remaining_time": "0:39:00"} -{"current_steps": 1067, "total_steps": 1630, "loss": 0.067, "lr": 1.3372076135287005e-06, "epoch": 6.54601226993865, "percentage": 65.46, "elapsed_time": "1:13:46", "remaining_time": "0:38:55"} -{"current_steps": 1068, "total_steps": 1630, "loss": 0.0203, "lr": 1.33294429657254e-06, "epoch": 6.552147239263804, "percentage": 65.52, "elapsed_time": "1:13:51", "remaining_time": "0:38:52"} -{"current_steps": 1069, "total_steps": 1630, "loss": 0.0217, "lr": 1.3286853148847523e-06, "epoch": 6.558282208588957, "percentage": 65.58, "elapsed_time": "1:13:53", "remaining_time": "0:38:46"} -{"current_steps": 1070, "total_steps": 1630, "loss": 0.0223, "lr": 1.3244306842862007e-06, "epoch": 6.564417177914111, "percentage": 65.64, "elapsed_time": "1:13:55", "remaining_time": "0:38:41"} -{"current_steps": 1071, "total_steps": 1630, "loss": 0.027, "lr": 1.3201804205815872e-06, "epoch": 6.570552147239264, "percentage": 65.71, "elapsed_time": "1:13:59", "remaining_time": "0:38:37"} -{"current_steps": 1072, "total_steps": 1630, "loss": 0.0212, "lr": 1.3159345395593876e-06, "epoch": 6.576687116564417, "percentage": 65.77, "elapsed_time": "1:14:04", "remaining_time": "0:38:33"} -{"current_steps": 1073, "total_steps": 1630, "loss": 0.0182, "lr": 1.3116930569918024e-06, "epoch": 6.58282208588957, "percentage": 65.83, "elapsed_time": "1:14:05", "remaining_time": "0:38:27"} -{"current_steps": 1074, "total_steps": 1630, "loss": 0.1187, "lr": 1.3074559886346886e-06, "epoch": 6.588957055214724, "percentage": 65.89, "elapsed_time": "1:14:08", "remaining_time": "0:38:23"} -{"current_steps": 1075, "total_steps": 1630, "loss": 0.0103, "lr": 1.3032233502275089e-06, "epoch": 6.595092024539877, "percentage": 65.95, "elapsed_time": "1:14:09", "remaining_time": "0:38:17"} -{"current_steps": 1076, "total_steps": 1630, "loss": 0.0115, "lr": 1.2989951574932693e-06, "epoch": 6.601226993865031, "percentage": 66.01, "elapsed_time": "1:14:10", "remaining_time": "0:38:11"} -{"current_steps": 1077, "total_steps": 1630, "loss": 0.0155, "lr": 1.2947714261384602e-06, "epoch": 6.6073619631901845, "percentage": 66.07, "elapsed_time": "1:14:13", "remaining_time": "0:38:06"} -{"current_steps": 1078, "total_steps": 1630, "loss": 0.0125, "lr": 1.2905521718530012e-06, "epoch": 6.613496932515337, "percentage": 66.13, "elapsed_time": "1:14:18", "remaining_time": "0:38:02"} -{"current_steps": 1079, "total_steps": 1630, "loss": 0.0181, "lr": 1.2863374103101784e-06, "epoch": 6.61963190184049, "percentage": 66.2, "elapsed_time": "1:14:22", "remaining_time": "0:37:58"} -{"current_steps": 1080, "total_steps": 1630, "loss": 0.0102, "lr": 1.2821271571665912e-06, "epoch": 6.625766871165644, "percentage": 66.26, "elapsed_time": "1:14:24", "remaining_time": "0:37:53"} -{"current_steps": 1081, "total_steps": 1630, "loss": 0.0969, "lr": 1.277921428062091e-06, "epoch": 6.631901840490798, "percentage": 66.32, "elapsed_time": "1:14:28", "remaining_time": "0:37:49"} -{"current_steps": 1082, "total_steps": 1630, "loss": 0.0383, "lr": 1.2737202386197222e-06, "epoch": 6.638036809815951, "percentage": 66.38, "elapsed_time": "1:14:30", "remaining_time": "0:37:44"} -{"current_steps": 1083, "total_steps": 1630, "loss": 0.0184, "lr": 1.2695236044456672e-06, "epoch": 6.644171779141105, "percentage": 66.44, "elapsed_time": "1:14:33", "remaining_time": "0:37:39"} -{"current_steps": 1084, "total_steps": 1630, "loss": 0.0327, "lr": 1.2653315411291867e-06, "epoch": 6.6503067484662575, "percentage": 66.5, "elapsed_time": "1:14:36", "remaining_time": "0:37:34"} -{"current_steps": 1085, "total_steps": 1630, "loss": 0.0399, "lr": 1.2611440642425617e-06, "epoch": 6.656441717791411, "percentage": 66.56, "elapsed_time": "1:14:41", "remaining_time": "0:37:31"} -{"current_steps": 1086, "total_steps": 1630, "loss": 0.0385, "lr": 1.2569611893410374e-06, "epoch": 6.662576687116564, "percentage": 66.63, "elapsed_time": "1:14:43", "remaining_time": "0:37:25"} -{"current_steps": 1087, "total_steps": 1630, "loss": 0.0123, "lr": 1.2527829319627604e-06, "epoch": 6.668711656441718, "percentage": 66.69, "elapsed_time": "1:14:45", "remaining_time": "0:37:20"} -{"current_steps": 1088, "total_steps": 1630, "loss": 0.0302, "lr": 1.248609307628729e-06, "epoch": 6.674846625766871, "percentage": 66.75, "elapsed_time": "1:14:47", "remaining_time": "0:37:15"} -{"current_steps": 1089, "total_steps": 1630, "loss": 0.0296, "lr": 1.2444403318427268e-06, "epoch": 6.680981595092025, "percentage": 66.81, "elapsed_time": "1:14:49", "remaining_time": "0:37:10"} -{"current_steps": 1090, "total_steps": 1630, "loss": 0.1532, "lr": 1.2402760200912725e-06, "epoch": 6.6871165644171775, "percentage": 66.87, "elapsed_time": "1:14:51", "remaining_time": "0:37:04"} -{"current_steps": 1091, "total_steps": 1630, "loss": 0.0126, "lr": 1.2361163878435594e-06, "epoch": 6.693251533742331, "percentage": 66.93, "elapsed_time": "1:14:52", "remaining_time": "0:36:59"} -{"current_steps": 1092, "total_steps": 1630, "loss": 0.0086, "lr": 1.2319614505513953e-06, "epoch": 6.699386503067485, "percentage": 66.99, "elapsed_time": "1:14:54", "remaining_time": "0:36:54"} -{"current_steps": 1093, "total_steps": 1630, "loss": 0.0041, "lr": 1.227811223649149e-06, "epoch": 6.705521472392638, "percentage": 67.06, "elapsed_time": "1:14:55", "remaining_time": "0:36:48"} -{"current_steps": 1094, "total_steps": 1630, "loss": 0.0103, "lr": 1.2236657225536938e-06, "epoch": 6.711656441717792, "percentage": 67.12, "elapsed_time": "1:14:59", "remaining_time": "0:36:44"} -{"current_steps": 1095, "total_steps": 1630, "loss": 0.0063, "lr": 1.2195249626643432e-06, "epoch": 6.717791411042945, "percentage": 67.18, "elapsed_time": "1:15:00", "remaining_time": "0:36:39"} -{"current_steps": 1096, "total_steps": 1630, "loss": 0.0571, "lr": 1.2153889593628032e-06, "epoch": 6.723926380368098, "percentage": 67.24, "elapsed_time": "1:15:02", "remaining_time": "0:36:33"} -{"current_steps": 1097, "total_steps": 1630, "loss": 0.0269, "lr": 1.211257728013107e-06, "epoch": 6.730061349693251, "percentage": 67.3, "elapsed_time": "1:15:04", "remaining_time": "0:36:28"} -{"current_steps": 1098, "total_steps": 1630, "loss": 0.0396, "lr": 1.2071312839615634e-06, "epoch": 6.736196319018405, "percentage": 67.36, "elapsed_time": "1:15:06", "remaining_time": "0:36:23"} -{"current_steps": 1099, "total_steps": 1630, "loss": 0.0261, "lr": 1.2030096425366985e-06, "epoch": 6.742331288343558, "percentage": 67.42, "elapsed_time": "1:15:07", "remaining_time": "0:36:17"} -{"current_steps": 1100, "total_steps": 1630, "loss": 0.013, "lr": 1.1988928190491948e-06, "epoch": 6.748466257668712, "percentage": 67.48, "elapsed_time": "1:15:10", "remaining_time": "0:36:13"} -{"current_steps": 1101, "total_steps": 1630, "loss": 0.0113, "lr": 1.1947808287918406e-06, "epoch": 6.754601226993865, "percentage": 67.55, "elapsed_time": "1:15:12", "remaining_time": "0:36:07"} -{"current_steps": 1102, "total_steps": 1630, "loss": 0.0195, "lr": 1.19067368703947e-06, "epoch": 6.7607361963190185, "percentage": 67.61, "elapsed_time": "1:15:13", "remaining_time": "0:36:02"} -{"current_steps": 1103, "total_steps": 1630, "loss": 0.0105, "lr": 1.1865714090489038e-06, "epoch": 6.766871165644172, "percentage": 67.67, "elapsed_time": "1:15:16", "remaining_time": "0:35:58"} -{"current_steps": 1104, "total_steps": 1630, "loss": 0.0554, "lr": 1.1824740100588991e-06, "epoch": 6.773006134969325, "percentage": 67.73, "elapsed_time": "1:15:24", "remaining_time": "0:35:55"} -{"current_steps": 1105, "total_steps": 1630, "loss": 0.0118, "lr": 1.1783815052900848e-06, "epoch": 6.779141104294479, "percentage": 67.79, "elapsed_time": "1:15:26", "remaining_time": "0:35:50"} -{"current_steps": 1106, "total_steps": 1630, "loss": 0.0901, "lr": 1.1742939099449126e-06, "epoch": 6.785276073619632, "percentage": 67.85, "elapsed_time": "1:15:32", "remaining_time": "0:35:47"} -{"current_steps": 1107, "total_steps": 1630, "loss": 0.0833, "lr": 1.1702112392075966e-06, "epoch": 6.791411042944786, "percentage": 67.91, "elapsed_time": "1:15:35", "remaining_time": "0:35:42"} -{"current_steps": 1108, "total_steps": 1630, "loss": 0.0078, "lr": 1.1661335082440545e-06, "epoch": 6.7975460122699385, "percentage": 67.98, "elapsed_time": "1:15:39", "remaining_time": "0:35:38"} -{"current_steps": 1109, "total_steps": 1630, "loss": 0.0092, "lr": 1.1620607322018587e-06, "epoch": 6.803680981595092, "percentage": 68.04, "elapsed_time": "1:15:41", "remaining_time": "0:35:33"} -{"current_steps": 1110, "total_steps": 1630, "loss": 0.0283, "lr": 1.1579929262101712e-06, "epoch": 6.809815950920245, "percentage": 68.1, "elapsed_time": "1:15:44", "remaining_time": "0:35:28"} -{"current_steps": 1111, "total_steps": 1630, "loss": 0.0066, "lr": 1.153930105379695e-06, "epoch": 6.815950920245399, "percentage": 68.16, "elapsed_time": "1:15:46", "remaining_time": "0:35:23"} -{"current_steps": 1112, "total_steps": 1630, "loss": 0.0402, "lr": 1.1498722848026142e-06, "epoch": 6.822085889570552, "percentage": 68.22, "elapsed_time": "1:15:52", "remaining_time": "0:35:20"} -{"current_steps": 1113, "total_steps": 1630, "loss": 0.0101, "lr": 1.1458194795525354e-06, "epoch": 6.828220858895706, "percentage": 68.28, "elapsed_time": "1:15:55", "remaining_time": "0:35:16"} -{"current_steps": 1114, "total_steps": 1630, "loss": 0.0109, "lr": 1.1417717046844385e-06, "epoch": 6.8343558282208585, "percentage": 68.34, "elapsed_time": "1:15:57", "remaining_time": "0:35:11"} -{"current_steps": 1115, "total_steps": 1630, "loss": 0.0297, "lr": 1.137728975234615e-06, "epoch": 6.840490797546012, "percentage": 68.4, "elapsed_time": "1:16:02", "remaining_time": "0:35:07"} -{"current_steps": 1116, "total_steps": 1630, "loss": 0.0393, "lr": 1.1336913062206157e-06, "epoch": 6.846625766871165, "percentage": 68.47, "elapsed_time": "1:16:04", "remaining_time": "0:35:02"} -{"current_steps": 1117, "total_steps": 1630, "loss": 0.0269, "lr": 1.129658712641192e-06, "epoch": 6.852760736196319, "percentage": 68.53, "elapsed_time": "1:16:06", "remaining_time": "0:34:57"} -{"current_steps": 1118, "total_steps": 1630, "loss": 0.0708, "lr": 1.125631209476241e-06, "epoch": 6.858895705521473, "percentage": 68.59, "elapsed_time": "1:16:12", "remaining_time": "0:34:54"} -{"current_steps": 1119, "total_steps": 1630, "loss": 0.0835, "lr": 1.1216088116867524e-06, "epoch": 6.865030674846626, "percentage": 68.65, "elapsed_time": "1:16:15", "remaining_time": "0:34:49"} -{"current_steps": 1120, "total_steps": 1630, "loss": 0.0107, "lr": 1.1175915342147486e-06, "epoch": 6.871165644171779, "percentage": 68.71, "elapsed_time": "1:16:19", "remaining_time": "0:34:45"} -{"current_steps": 1121, "total_steps": 1630, "loss": 0.0139, "lr": 1.1135793919832336e-06, "epoch": 6.877300613496932, "percentage": 68.77, "elapsed_time": "1:16:21", "remaining_time": "0:34:40"} -{"current_steps": 1122, "total_steps": 1630, "loss": 0.0154, "lr": 1.1095723998961353e-06, "epoch": 6.883435582822086, "percentage": 68.83, "elapsed_time": "1:16:23", "remaining_time": "0:34:35"} -{"current_steps": 1123, "total_steps": 1630, "loss": 0.0072, "lr": 1.1055705728382482e-06, "epoch": 6.889570552147239, "percentage": 68.9, "elapsed_time": "1:16:26", "remaining_time": "0:34:30"} -{"current_steps": 1124, "total_steps": 1630, "loss": 0.0202, "lr": 1.1015739256751826e-06, "epoch": 6.895705521472393, "percentage": 68.96, "elapsed_time": "1:16:29", "remaining_time": "0:34:26"} -{"current_steps": 1125, "total_steps": 1630, "loss": 0.0559, "lr": 1.0975824732533066e-06, "epoch": 6.901840490797546, "percentage": 69.02, "elapsed_time": "1:16:33", "remaining_time": "0:34:21"} -{"current_steps": 1126, "total_steps": 1630, "loss": 0.0385, "lr": 1.09359623039969e-06, "epoch": 6.9079754601226995, "percentage": 69.08, "elapsed_time": "1:16:37", "remaining_time": "0:34:17"} -{"current_steps": 1127, "total_steps": 1630, "loss": 0.0535, "lr": 1.0896152119220525e-06, "epoch": 6.914110429447852, "percentage": 69.14, "elapsed_time": "1:16:43", "remaining_time": "0:34:14"} -{"current_steps": 1128, "total_steps": 1630, "loss": 0.0104, "lr": 1.0856394326087045e-06, "epoch": 6.920245398773006, "percentage": 69.2, "elapsed_time": "1:16:46", "remaining_time": "0:34:10"} -{"current_steps": 1129, "total_steps": 1630, "loss": 0.0121, "lr": 1.0816689072284962e-06, "epoch": 6.92638036809816, "percentage": 69.26, "elapsed_time": "1:16:51", "remaining_time": "0:34:06"} -{"current_steps": 1130, "total_steps": 1630, "loss": 0.0056, "lr": 1.0777036505307616e-06, "epoch": 6.932515337423313, "percentage": 69.33, "elapsed_time": "1:16:52", "remaining_time": "0:34:00"} -{"current_steps": 1131, "total_steps": 1630, "loss": 0.0198, "lr": 1.0737436772452602e-06, "epoch": 6.938650306748467, "percentage": 69.39, "elapsed_time": "1:16:54", "remaining_time": "0:33:55"} -{"current_steps": 1132, "total_steps": 1630, "loss": 0.0077, "lr": 1.0697890020821292e-06, "epoch": 6.9447852760736195, "percentage": 69.45, "elapsed_time": "1:16:55", "remaining_time": "0:33:50"} -{"current_steps": 1133, "total_steps": 1630, "loss": 0.0329, "lr": 1.0658396397318203e-06, "epoch": 6.950920245398773, "percentage": 69.51, "elapsed_time": "1:16:58", "remaining_time": "0:33:45"} -{"current_steps": 1134, "total_steps": 1630, "loss": 0.0113, "lr": 1.061895604865053e-06, "epoch": 6.957055214723926, "percentage": 69.57, "elapsed_time": "1:17:02", "remaining_time": "0:33:41"} -{"current_steps": 1135, "total_steps": 1630, "loss": 0.0376, "lr": 1.057956912132757e-06, "epoch": 6.96319018404908, "percentage": 69.63, "elapsed_time": "1:17:03", "remaining_time": "0:33:36"} -{"current_steps": 1136, "total_steps": 1630, "loss": 0.0517, "lr": 1.054023576166014e-06, "epoch": 6.969325153374233, "percentage": 69.69, "elapsed_time": "1:17:07", "remaining_time": "0:33:32"} -{"current_steps": 1137, "total_steps": 1630, "loss": 0.0373, "lr": 1.0500956115760105e-06, "epoch": 6.975460122699387, "percentage": 69.75, "elapsed_time": "1:17:09", "remaining_time": "0:33:27"} -{"current_steps": 1138, "total_steps": 1630, "loss": 0.019, "lr": 1.0461730329539794e-06, "epoch": 6.9815950920245395, "percentage": 69.82, "elapsed_time": "1:17:11", "remaining_time": "0:33:22"} -{"current_steps": 1139, "total_steps": 1630, "loss": 0.0073, "lr": 1.0422558548711434e-06, "epoch": 6.987730061349693, "percentage": 69.88, "elapsed_time": "1:17:14", "remaining_time": "0:33:17"} -{"current_steps": 1140, "total_steps": 1630, "loss": 0.0099, "lr": 1.0383440918786684e-06, "epoch": 6.993865030674847, "percentage": 69.94, "elapsed_time": "1:17:16", "remaining_time": "0:33:12"} -{"current_steps": 1141, "total_steps": 1630, "loss": 0.0218, "lr": 1.0344377585076e-06, "epoch": 7.0, "percentage": 70.0, "elapsed_time": "1:17:17", "remaining_time": "0:33:07"} -{"current_steps": 1142, "total_steps": 1630, "loss": 0.0024, "lr": 1.0305368692688175e-06, "epoch": 7.006134969325154, "percentage": 70.06, "elapsed_time": "1:23:54", "remaining_time": "0:35:51"} -{"current_steps": 1143, "total_steps": 1630, "loss": 0.0059, "lr": 1.0266414386529775e-06, "epoch": 7.012269938650307, "percentage": 70.12, "elapsed_time": "1:23:57", "remaining_time": "0:35:46"} -{"current_steps": 1144, "total_steps": 1630, "loss": 0.0843, "lr": 1.0227514811304556e-06, "epoch": 7.0184049079754605, "percentage": 70.18, "elapsed_time": "1:24:00", "remaining_time": "0:35:41"} -{"current_steps": 1145, "total_steps": 1630, "loss": 0.0098, "lr": 1.0188670111513002e-06, "epoch": 7.024539877300613, "percentage": 70.25, "elapsed_time": "1:24:06", "remaining_time": "0:35:37"} -{"current_steps": 1146, "total_steps": 1630, "loss": 0.0042, "lr": 1.0149880431451736e-06, "epoch": 7.030674846625767, "percentage": 70.31, "elapsed_time": "1:24:08", "remaining_time": "0:35:32"} -{"current_steps": 1147, "total_steps": 1630, "loss": 0.003, "lr": 1.0111145915213e-06, "epoch": 7.03680981595092, "percentage": 70.37, "elapsed_time": "1:24:12", "remaining_time": "0:35:27"} -{"current_steps": 1148, "total_steps": 1630, "loss": 0.0027, "lr": 1.0072466706684127e-06, "epoch": 7.042944785276074, "percentage": 70.43, "elapsed_time": "1:24:15", "remaining_time": "0:35:22"} -{"current_steps": 1149, "total_steps": 1630, "loss": 0.0105, "lr": 1.0033842949546974e-06, "epoch": 7.049079754601227, "percentage": 70.49, "elapsed_time": "1:24:17", "remaining_time": "0:35:17"} -{"current_steps": 1150, "total_steps": 1630, "loss": 0.0233, "lr": 9.995274787277445e-07, "epoch": 7.0552147239263805, "percentage": 70.55, "elapsed_time": "1:24:19", "remaining_time": "0:35:11"} -{"current_steps": 1151, "total_steps": 1630, "loss": 0.0031, "lr": 9.956762363144892e-07, "epoch": 7.061349693251533, "percentage": 70.61, "elapsed_time": "1:24:21", "remaining_time": "0:35:06"} -{"current_steps": 1152, "total_steps": 1630, "loss": 0.0047, "lr": 9.918305820211643e-07, "epoch": 7.067484662576687, "percentage": 70.67, "elapsed_time": "1:24:23", "remaining_time": "0:35:00"} -{"current_steps": 1153, "total_steps": 1630, "loss": 0.0334, "lr": 9.879905301332439e-07, "epoch": 7.07361963190184, "percentage": 70.74, "elapsed_time": "1:24:26", "remaining_time": "0:34:55"} -{"current_steps": 1154, "total_steps": 1630, "loss": 0.0191, "lr": 9.84156094915389e-07, "epoch": 7.079754601226994, "percentage": 70.8, "elapsed_time": "1:24:32", "remaining_time": "0:34:52"} -{"current_steps": 1155, "total_steps": 1630, "loss": 0.0045, "lr": 9.803272906113978e-07, "epoch": 7.085889570552148, "percentage": 70.86, "elapsed_time": "1:24:35", "remaining_time": "0:34:47"} -{"current_steps": 1156, "total_steps": 1630, "loss": 0.0042, "lr": 9.765041314441529e-07, "epoch": 7.0920245398773005, "percentage": 70.92, "elapsed_time": "1:24:38", "remaining_time": "0:34:42"} -{"current_steps": 1157, "total_steps": 1630, "loss": 0.0066, "lr": 9.72686631615563e-07, "epoch": 7.098159509202454, "percentage": 70.98, "elapsed_time": "1:24:40", "remaining_time": "0:34:36"} -{"current_steps": 1158, "total_steps": 1630, "loss": 0.0058, "lr": 9.688748053065179e-07, "epoch": 7.104294478527607, "percentage": 71.04, "elapsed_time": "1:24:43", "remaining_time": "0:34:32"} -{"current_steps": 1159, "total_steps": 1630, "loss": 0.0067, "lr": 9.65068666676828e-07, "epoch": 7.110429447852761, "percentage": 71.1, "elapsed_time": "1:24:44", "remaining_time": "0:34:26"} -{"current_steps": 1160, "total_steps": 1630, "loss": 0.0052, "lr": 9.612682298651792e-07, "epoch": 7.116564417177914, "percentage": 71.17, "elapsed_time": "1:24:46", "remaining_time": "0:34:20"} -{"current_steps": 1161, "total_steps": 1630, "loss": 0.0035, "lr": 9.574735089890765e-07, "epoch": 7.122699386503068, "percentage": 71.23, "elapsed_time": "1:24:49", "remaining_time": "0:34:15"} -{"current_steps": 1162, "total_steps": 1630, "loss": 0.0126, "lr": 9.53684518144789e-07, "epoch": 7.128834355828221, "percentage": 71.29, "elapsed_time": "1:24:52", "remaining_time": "0:34:10"} -{"current_steps": 1163, "total_steps": 1630, "loss": 0.0345, "lr": 9.499012714073036e-07, "epoch": 7.134969325153374, "percentage": 71.35, "elapsed_time": "1:24:55", "remaining_time": "0:34:06"} -{"current_steps": 1164, "total_steps": 1630, "loss": 0.0144, "lr": 9.461237828302666e-07, "epoch": 7.141104294478527, "percentage": 71.41, "elapsed_time": "1:25:00", "remaining_time": "0:34:02"} -{"current_steps": 1165, "total_steps": 1630, "loss": 0.0135, "lr": 9.423520664459374e-07, "epoch": 7.147239263803681, "percentage": 71.47, "elapsed_time": "1:25:03", "remaining_time": "0:33:56"} -{"current_steps": 1166, "total_steps": 1630, "loss": 0.0138, "lr": 9.385861362651322e-07, "epoch": 7.153374233128835, "percentage": 71.53, "elapsed_time": "1:25:04", "remaining_time": "0:33:51"} -{"current_steps": 1167, "total_steps": 1630, "loss": 0.0093, "lr": 9.348260062771713e-07, "epoch": 7.159509202453988, "percentage": 71.6, "elapsed_time": "1:25:06", "remaining_time": "0:33:45"} -{"current_steps": 1168, "total_steps": 1630, "loss": 0.003, "lr": 9.310716904498321e-07, "epoch": 7.1656441717791415, "percentage": 71.66, "elapsed_time": "1:25:08", "remaining_time": "0:33:40"} -{"current_steps": 1169, "total_steps": 1630, "loss": 0.0033, "lr": 9.273232027292933e-07, "epoch": 7.171779141104294, "percentage": 71.72, "elapsed_time": "1:25:10", "remaining_time": "0:33:35"} -{"current_steps": 1170, "total_steps": 1630, "loss": 0.0024, "lr": 9.235805570400813e-07, "epoch": 7.177914110429448, "percentage": 71.78, "elapsed_time": "1:25:13", "remaining_time": "0:33:30"} -{"current_steps": 1171, "total_steps": 1630, "loss": 0.0118, "lr": 9.198437672850249e-07, "epoch": 7.184049079754601, "percentage": 71.84, "elapsed_time": "1:25:16", "remaining_time": "0:33:25"} -{"current_steps": 1172, "total_steps": 1630, "loss": 0.0173, "lr": 9.161128473451967e-07, "epoch": 7.190184049079755, "percentage": 71.9, "elapsed_time": "1:25:17", "remaining_time": "0:33:19"} -{"current_steps": 1173, "total_steps": 1630, "loss": 0.0142, "lr": 9.123878110798662e-07, "epoch": 7.196319018404908, "percentage": 71.96, "elapsed_time": "1:25:19", "remaining_time": "0:33:14"} -{"current_steps": 1174, "total_steps": 1630, "loss": 0.012, "lr": 9.086686723264474e-07, "epoch": 7.2024539877300615, "percentage": 72.02, "elapsed_time": "1:25:21", "remaining_time": "0:33:09"} -{"current_steps": 1175, "total_steps": 1630, "loss": 0.0055, "lr": 9.049554449004447e-07, "epoch": 7.208588957055214, "percentage": 72.09, "elapsed_time": "1:25:24", "remaining_time": "0:33:04"} -{"current_steps": 1176, "total_steps": 1630, "loss": 0.0043, "lr": 9.012481425954053e-07, "epoch": 7.214723926380368, "percentage": 72.15, "elapsed_time": "1:25:27", "remaining_time": "0:32:59"} -{"current_steps": 1177, "total_steps": 1630, "loss": 0.0443, "lr": 8.97546779182866e-07, "epoch": 7.220858895705521, "percentage": 72.21, "elapsed_time": "1:25:31", "remaining_time": "0:32:54"} -{"current_steps": 1178, "total_steps": 1630, "loss": 0.0082, "lr": 8.938513684123024e-07, "epoch": 7.226993865030675, "percentage": 72.27, "elapsed_time": "1:25:33", "remaining_time": "0:32:49"} -{"current_steps": 1179, "total_steps": 1630, "loss": 0.0071, "lr": 8.901619240110781e-07, "epoch": 7.233128834355828, "percentage": 72.33, "elapsed_time": "1:25:35", "remaining_time": "0:32:44"} -{"current_steps": 1180, "total_steps": 1630, "loss": 0.0056, "lr": 8.864784596843917e-07, "epoch": 7.2392638036809815, "percentage": 72.39, "elapsed_time": "1:25:36", "remaining_time": "0:32:39"} -{"current_steps": 1181, "total_steps": 1630, "loss": 0.0076, "lr": 8.828009891152301e-07, "epoch": 7.245398773006135, "percentage": 72.45, "elapsed_time": "1:25:38", "remaining_time": "0:32:33"} -{"current_steps": 1182, "total_steps": 1630, "loss": 0.0141, "lr": 8.791295259643126e-07, "epoch": 7.251533742331288, "percentage": 72.52, "elapsed_time": "1:25:40", "remaining_time": "0:32:28"} -{"current_steps": 1183, "total_steps": 1630, "loss": 0.01, "lr": 8.754640838700443e-07, "epoch": 7.257668711656442, "percentage": 72.58, "elapsed_time": "1:25:42", "remaining_time": "0:32:23"} -{"current_steps": 1184, "total_steps": 1630, "loss": 0.009, "lr": 8.718046764484648e-07, "epoch": 7.263803680981595, "percentage": 72.64, "elapsed_time": "1:25:47", "remaining_time": "0:32:19"} -{"current_steps": 1185, "total_steps": 1630, "loss": 0.0291, "lr": 8.681513172931935e-07, "epoch": 7.269938650306749, "percentage": 72.7, "elapsed_time": "1:25:49", "remaining_time": "0:32:13"} -{"current_steps": 1186, "total_steps": 1630, "loss": 0.0064, "lr": 8.64504019975386e-07, "epoch": 7.276073619631902, "percentage": 72.76, "elapsed_time": "1:25:51", "remaining_time": "0:32:08"} -{"current_steps": 1187, "total_steps": 1630, "loss": 0.0135, "lr": 8.608627980436765e-07, "epoch": 7.282208588957055, "percentage": 72.82, "elapsed_time": "1:25:53", "remaining_time": "0:32:03"} -{"current_steps": 1188, "total_steps": 1630, "loss": 0.0061, "lr": 8.572276650241329e-07, "epoch": 7.288343558282208, "percentage": 72.88, "elapsed_time": "1:25:56", "remaining_time": "0:31:58"} -{"current_steps": 1189, "total_steps": 1630, "loss": 0.0051, "lr": 8.535986344202057e-07, "epoch": 7.294478527607362, "percentage": 72.94, "elapsed_time": "1:26:00", "remaining_time": "0:31:54"} -{"current_steps": 1190, "total_steps": 1630, "loss": 0.0052, "lr": 8.499757197126732e-07, "epoch": 7.300613496932515, "percentage": 73.01, "elapsed_time": "1:26:02", "remaining_time": "0:31:48"} -{"current_steps": 1191, "total_steps": 1630, "loss": 0.0111, "lr": 8.463589343595976e-07, "epoch": 7.306748466257669, "percentage": 73.07, "elapsed_time": "1:26:08", "remaining_time": "0:31:45"} -{"current_steps": 1192, "total_steps": 1630, "loss": 0.0279, "lr": 8.427482917962734e-07, "epoch": 7.3128834355828225, "percentage": 73.13, "elapsed_time": "1:26:12", "remaining_time": "0:31:40"} -{"current_steps": 1193, "total_steps": 1630, "loss": 0.0105, "lr": 8.391438054351725e-07, "epoch": 7.319018404907975, "percentage": 73.19, "elapsed_time": "1:26:15", "remaining_time": "0:31:35"} -{"current_steps": 1194, "total_steps": 1630, "loss": 0.0028, "lr": 8.355454886659026e-07, "epoch": 7.325153374233129, "percentage": 73.25, "elapsed_time": "1:26:16", "remaining_time": "0:31:30"} -{"current_steps": 1195, "total_steps": 1630, "loss": 0.0102, "lr": 8.319533548551492e-07, "epoch": 7.331288343558282, "percentage": 73.31, "elapsed_time": "1:26:20", "remaining_time": "0:31:25"} -{"current_steps": 1196, "total_steps": 1630, "loss": 0.0396, "lr": 8.28367417346633e-07, "epoch": 7.337423312883436, "percentage": 73.37, "elapsed_time": "1:26:23", "remaining_time": "0:31:20"} -{"current_steps": 1197, "total_steps": 1630, "loss": 0.006, "lr": 8.247876894610568e-07, "epoch": 7.343558282208589, "percentage": 73.44, "elapsed_time": "1:26:25", "remaining_time": "0:31:15"} -{"current_steps": 1198, "total_steps": 1630, "loss": 0.0075, "lr": 8.212141844960544e-07, "epoch": 7.3496932515337425, "percentage": 73.5, "elapsed_time": "1:26:29", "remaining_time": "0:31:11"} -{"current_steps": 1199, "total_steps": 1630, "loss": 0.0042, "lr": 8.17646915726146e-07, "epoch": 7.355828220858895, "percentage": 73.56, "elapsed_time": "1:26:31", "remaining_time": "0:31:06"} -{"current_steps": 1200, "total_steps": 1630, "loss": 0.0032, "lr": 8.140858964026849e-07, "epoch": 7.361963190184049, "percentage": 73.62, "elapsed_time": "1:26:34", "remaining_time": "0:31:01"} -{"current_steps": 1201, "total_steps": 1630, "loss": 0.032, "lr": 8.105311397538085e-07, "epoch": 7.368098159509202, "percentage": 73.68, "elapsed_time": "1:26:38", "remaining_time": "0:30:56"} -{"current_steps": 1202, "total_steps": 1630, "loss": 0.0185, "lr": 8.069826589843929e-07, "epoch": 7.374233128834356, "percentage": 73.74, "elapsed_time": "1:26:41", "remaining_time": "0:30:52"} -{"current_steps": 1203, "total_steps": 1630, "loss": 0.0034, "lr": 8.034404672759977e-07, "epoch": 7.38036809815951, "percentage": 73.8, "elapsed_time": "1:26:44", "remaining_time": "0:30:47"} -{"current_steps": 1204, "total_steps": 1630, "loss": 0.034, "lr": 7.99904577786823e-07, "epoch": 7.386503067484663, "percentage": 73.87, "elapsed_time": "1:26:50", "remaining_time": "0:30:43"} -{"current_steps": 1205, "total_steps": 1630, "loss": 0.005, "lr": 7.963750036516585e-07, "epoch": 7.392638036809816, "percentage": 73.93, "elapsed_time": "1:26:52", "remaining_time": "0:30:38"} -{"current_steps": 1206, "total_steps": 1630, "loss": 0.0073, "lr": 7.928517579818312e-07, "epoch": 7.398773006134969, "percentage": 73.99, "elapsed_time": "1:26:55", "remaining_time": "0:30:33"} -{"current_steps": 1207, "total_steps": 1630, "loss": 0.015, "lr": 7.893348538651635e-07, "epoch": 7.404907975460123, "percentage": 74.05, "elapsed_time": "1:27:02", "remaining_time": "0:30:30"} -{"current_steps": 1208, "total_steps": 1630, "loss": 0.004, "lr": 7.858243043659161e-07, "epoch": 7.411042944785276, "percentage": 74.11, "elapsed_time": "1:27:04", "remaining_time": "0:30:25"} -{"current_steps": 1209, "total_steps": 1630, "loss": 0.003, "lr": 7.823201225247496e-07, "epoch": 7.41717791411043, "percentage": 74.17, "elapsed_time": "1:27:05", "remaining_time": "0:30:19"} -{"current_steps": 1210, "total_steps": 1630, "loss": 0.0096, "lr": 7.788223213586677e-07, "epoch": 7.423312883435583, "percentage": 74.23, "elapsed_time": "1:27:08", "remaining_time": "0:30:14"} -{"current_steps": 1211, "total_steps": 1630, "loss": 0.006, "lr": 7.753309138609705e-07, "epoch": 7.429447852760736, "percentage": 74.29, "elapsed_time": "1:27:13", "remaining_time": "0:30:10"} -{"current_steps": 1212, "total_steps": 1630, "loss": 0.0074, "lr": 7.71845913001211e-07, "epoch": 7.435582822085889, "percentage": 74.36, "elapsed_time": "1:27:16", "remaining_time": "0:30:05"} -{"current_steps": 1213, "total_steps": 1630, "loss": 0.0051, "lr": 7.683673317251392e-07, "epoch": 7.441717791411043, "percentage": 74.42, "elapsed_time": "1:27:18", "remaining_time": "0:30:00"} -{"current_steps": 1214, "total_steps": 1630, "loss": 0.0271, "lr": 7.648951829546619e-07, "epoch": 7.447852760736196, "percentage": 74.48, "elapsed_time": "1:27:25", "remaining_time": "0:29:57"} -{"current_steps": 1215, "total_steps": 1630, "loss": 0.0155, "lr": 7.6142947958779e-07, "epoch": 7.45398773006135, "percentage": 74.54, "elapsed_time": "1:27:28", "remaining_time": "0:29:52"} -{"current_steps": 1216, "total_steps": 1630, "loss": 0.0032, "lr": 7.579702344985899e-07, "epoch": 7.460122699386503, "percentage": 74.6, "elapsed_time": "1:27:30", "remaining_time": "0:29:47"} -{"current_steps": 1217, "total_steps": 1630, "loss": 0.0037, "lr": 7.545174605371403e-07, "epoch": 7.466257668711656, "percentage": 74.66, "elapsed_time": "1:27:31", "remaining_time": "0:29:42"} -{"current_steps": 1218, "total_steps": 1630, "loss": 0.0064, "lr": 7.510711705294782e-07, "epoch": 7.47239263803681, "percentage": 74.72, "elapsed_time": "1:27:36", "remaining_time": "0:29:37"} -{"current_steps": 1219, "total_steps": 1630, "loss": 0.0055, "lr": 7.476313772775578e-07, "epoch": 7.478527607361963, "percentage": 74.79, "elapsed_time": "1:27:38", "remaining_time": "0:29:33"} -{"current_steps": 1220, "total_steps": 1630, "loss": 0.0049, "lr": 7.441980935591986e-07, "epoch": 7.484662576687117, "percentage": 74.85, "elapsed_time": "1:27:42", "remaining_time": "0:29:28"} -{"current_steps": 1221, "total_steps": 1630, "loss": 0.0123, "lr": 7.407713321280377e-07, "epoch": 7.49079754601227, "percentage": 74.91, "elapsed_time": "1:27:46", "remaining_time": "0:29:23"} -{"current_steps": 1222, "total_steps": 1630, "loss": 0.0056, "lr": 7.373511057134855e-07, "epoch": 7.4969325153374236, "percentage": 74.97, "elapsed_time": "1:27:48", "remaining_time": "0:29:19"} -{"current_steps": 1223, "total_steps": 1630, "loss": 0.0155, "lr": 7.339374270206772e-07, "epoch": 7.5030674846625764, "percentage": 75.03, "elapsed_time": "1:27:50", "remaining_time": "0:29:13"} -{"current_steps": 1224, "total_steps": 1630, "loss": 0.0303, "lr": 7.305303087304227e-07, "epoch": 7.50920245398773, "percentage": 75.09, "elapsed_time": "1:27:57", "remaining_time": "0:29:10"} -{"current_steps": 1225, "total_steps": 1630, "loss": 0.0018, "lr": 7.271297634991651e-07, "epoch": 7.515337423312883, "percentage": 75.15, "elapsed_time": "1:27:59", "remaining_time": "0:29:05"} -{"current_steps": 1226, "total_steps": 1630, "loss": 0.0064, "lr": 7.237358039589271e-07, "epoch": 7.521472392638037, "percentage": 75.21, "elapsed_time": "1:28:03", "remaining_time": "0:29:01"} -{"current_steps": 1227, "total_steps": 1630, "loss": 0.0025, "lr": 7.203484427172702e-07, "epoch": 7.52760736196319, "percentage": 75.28, "elapsed_time": "1:28:05", "remaining_time": "0:28:55"} -{"current_steps": 1228, "total_steps": 1630, "loss": 0.0067, "lr": 7.169676923572447e-07, "epoch": 7.533742331288344, "percentage": 75.34, "elapsed_time": "1:28:06", "remaining_time": "0:28:50"} -{"current_steps": 1229, "total_steps": 1630, "loss": 0.0082, "lr": 7.135935654373416e-07, "epoch": 7.539877300613497, "percentage": 75.4, "elapsed_time": "1:28:10", "remaining_time": "0:28:46"} -{"current_steps": 1230, "total_steps": 1630, "loss": 0.0042, "lr": 7.102260744914499e-07, "epoch": 7.54601226993865, "percentage": 75.46, "elapsed_time": "1:28:14", "remaining_time": "0:28:41"} -{"current_steps": 1231, "total_steps": 1630, "loss": 0.0374, "lr": 7.068652320288081e-07, "epoch": 7.552147239263804, "percentage": 75.52, "elapsed_time": "1:28:21", "remaining_time": "0:28:38"} -{"current_steps": 1232, "total_steps": 1630, "loss": 0.0022, "lr": 7.035110505339546e-07, "epoch": 7.558282208588957, "percentage": 75.58, "elapsed_time": "1:28:24", "remaining_time": "0:28:33"} -{"current_steps": 1233, "total_steps": 1630, "loss": 0.006, "lr": 7.001635424666878e-07, "epoch": 7.564417177914111, "percentage": 75.64, "elapsed_time": "1:28:28", "remaining_time": "0:28:29"} -{"current_steps": 1234, "total_steps": 1630, "loss": 0.0137, "lr": 6.968227202620137e-07, "epoch": 7.570552147239264, "percentage": 75.71, "elapsed_time": "1:28:31", "remaining_time": "0:28:24"} -{"current_steps": 1235, "total_steps": 1630, "loss": 0.0216, "lr": 6.934885963301033e-07, "epoch": 7.576687116564417, "percentage": 75.77, "elapsed_time": "1:28:37", "remaining_time": "0:28:20"} -{"current_steps": 1236, "total_steps": 1630, "loss": 0.0027, "lr": 6.901611830562469e-07, "epoch": 7.58282208588957, "percentage": 75.83, "elapsed_time": "1:28:40", "remaining_time": "0:28:15"} -{"current_steps": 1237, "total_steps": 1630, "loss": 0.0391, "lr": 6.868404928008035e-07, "epoch": 7.588957055214724, "percentage": 75.89, "elapsed_time": "1:28:43", "remaining_time": "0:28:11"} -{"current_steps": 1238, "total_steps": 1630, "loss": 0.0053, "lr": 6.835265378991613e-07, "epoch": 7.595092024539877, "percentage": 75.95, "elapsed_time": "1:28:46", "remaining_time": "0:28:06"} -{"current_steps": 1239, "total_steps": 1630, "loss": 0.0395, "lr": 6.802193306616858e-07, "epoch": 7.601226993865031, "percentage": 76.01, "elapsed_time": "1:28:49", "remaining_time": "0:28:01"} -{"current_steps": 1240, "total_steps": 1630, "loss": 0.0055, "lr": 6.769188833736781e-07, "epoch": 7.6073619631901845, "percentage": 76.07, "elapsed_time": "1:28:54", "remaining_time": "0:27:57"} -{"current_steps": 1241, "total_steps": 1630, "loss": 0.0072, "lr": 6.736252082953307e-07, "epoch": 7.613496932515337, "percentage": 76.13, "elapsed_time": "1:28:56", "remaining_time": "0:27:52"} -{"current_steps": 1242, "total_steps": 1630, "loss": 0.0046, "lr": 6.703383176616743e-07, "epoch": 7.61963190184049, "percentage": 76.2, "elapsed_time": "1:28:58", "remaining_time": "0:27:47"} -{"current_steps": 1243, "total_steps": 1630, "loss": 0.0742, "lr": 6.670582236825421e-07, "epoch": 7.625766871165644, "percentage": 76.26, "elapsed_time": "1:28:59", "remaining_time": "0:27:42"} -{"current_steps": 1244, "total_steps": 1630, "loss": 0.0069, "lr": 6.637849385425157e-07, "epoch": 7.631901840490798, "percentage": 76.32, "elapsed_time": "1:29:02", "remaining_time": "0:27:37"} -{"current_steps": 1245, "total_steps": 1630, "loss": 0.0031, "lr": 6.605184744008866e-07, "epoch": 7.638036809815951, "percentage": 76.38, "elapsed_time": "1:29:03", "remaining_time": "0:27:32"} -{"current_steps": 1246, "total_steps": 1630, "loss": 0.0316, "lr": 6.572588433916082e-07, "epoch": 7.644171779141105, "percentage": 76.44, "elapsed_time": "1:29:06", "remaining_time": "0:27:27"} -{"current_steps": 1247, "total_steps": 1630, "loss": 0.0472, "lr": 6.540060576232488e-07, "epoch": 7.6503067484662575, "percentage": 76.5, "elapsed_time": "1:29:10", "remaining_time": "0:27:23"} -{"current_steps": 1248, "total_steps": 1630, "loss": 0.0059, "lr": 6.507601291789515e-07, "epoch": 7.656441717791411, "percentage": 76.56, "elapsed_time": "1:29:11", "remaining_time": "0:27:17"} -{"current_steps": 1249, "total_steps": 1630, "loss": 0.0023, "lr": 6.475210701163828e-07, "epoch": 7.662576687116564, "percentage": 76.63, "elapsed_time": "1:29:13", "remaining_time": "0:27:13"} -{"current_steps": 1250, "total_steps": 1630, "loss": 0.0207, "lr": 6.442888924676951e-07, "epoch": 7.668711656441718, "percentage": 76.69, "elapsed_time": "1:29:17", "remaining_time": "0:27:08"} -{"current_steps": 1251, "total_steps": 1630, "loss": 0.002, "lr": 6.410636082394772e-07, "epoch": 7.674846625766871, "percentage": 76.75, "elapsed_time": "1:29:19", "remaining_time": "0:27:03"} -{"current_steps": 1252, "total_steps": 1630, "loss": 0.0038, "lr": 6.378452294127091e-07, "epoch": 7.680981595092025, "percentage": 76.81, "elapsed_time": "1:29:22", "remaining_time": "0:26:59"} -{"current_steps": 1253, "total_steps": 1630, "loss": 0.0024, "lr": 6.346337679427214e-07, "epoch": 7.6871165644171775, "percentage": 76.87, "elapsed_time": "1:29:25", "remaining_time": "0:26:54"} -{"current_steps": 1254, "total_steps": 1630, "loss": 0.0027, "lr": 6.314292357591489e-07, "epoch": 7.693251533742331, "percentage": 76.93, "elapsed_time": "1:29:27", "remaining_time": "0:26:49"} -{"current_steps": 1255, "total_steps": 1630, "loss": 0.0048, "lr": 6.282316447658837e-07, "epoch": 7.699386503067485, "percentage": 76.99, "elapsed_time": "1:29:29", "remaining_time": "0:26:44"} -{"current_steps": 1256, "total_steps": 1630, "loss": 0.0064, "lr": 6.250410068410367e-07, "epoch": 7.705521472392638, "percentage": 77.06, "elapsed_time": "1:29:30", "remaining_time": "0:26:39"} -{"current_steps": 1257, "total_steps": 1630, "loss": 0.0056, "lr": 6.218573338368869e-07, "epoch": 7.711656441717792, "percentage": 77.12, "elapsed_time": "1:29:32", "remaining_time": "0:26:34"} -{"current_steps": 1258, "total_steps": 1630, "loss": 0.0073, "lr": 6.186806375798429e-07, "epoch": 7.717791411042945, "percentage": 77.18, "elapsed_time": "1:29:35", "remaining_time": "0:26:29"} -{"current_steps": 1259, "total_steps": 1630, "loss": 0.0043, "lr": 6.155109298703968e-07, "epoch": 7.723926380368098, "percentage": 77.24, "elapsed_time": "1:29:39", "remaining_time": "0:26:25"} -{"current_steps": 1260, "total_steps": 1630, "loss": 0.0108, "lr": 6.123482224830787e-07, "epoch": 7.730061349693251, "percentage": 77.3, "elapsed_time": "1:29:41", "remaining_time": "0:26:20"} -{"current_steps": 1261, "total_steps": 1630, "loss": 0.0337, "lr": 6.091925271664156e-07, "epoch": 7.736196319018405, "percentage": 77.36, "elapsed_time": "1:29:44", "remaining_time": "0:26:15"} -{"current_steps": 1262, "total_steps": 1630, "loss": 0.0019, "lr": 6.060438556428877e-07, "epoch": 7.742331288343558, "percentage": 77.42, "elapsed_time": "1:29:46", "remaining_time": "0:26:10"} -{"current_steps": 1263, "total_steps": 1630, "loss": 0.0089, "lr": 6.02902219608881e-07, "epoch": 7.748466257668712, "percentage": 77.48, "elapsed_time": "1:29:48", "remaining_time": "0:26:05"} -{"current_steps": 1264, "total_steps": 1630, "loss": 0.0045, "lr": 5.997676307346504e-07, "epoch": 7.754601226993865, "percentage": 77.55, "elapsed_time": "1:29:50", "remaining_time": "0:26:00"} -{"current_steps": 1265, "total_steps": 1630, "loss": 0.0028, "lr": 5.966401006642689e-07, "epoch": 7.7607361963190185, "percentage": 77.61, "elapsed_time": "1:29:52", "remaining_time": "0:25:55"} -{"current_steps": 1266, "total_steps": 1630, "loss": 0.009, "lr": 5.93519641015591e-07, "epoch": 7.766871165644172, "percentage": 77.67, "elapsed_time": "1:29:53", "remaining_time": "0:25:50"} -{"current_steps": 1267, "total_steps": 1630, "loss": 0.0168, "lr": 5.904062633802066e-07, "epoch": 7.773006134969325, "percentage": 77.73, "elapsed_time": "1:29:59", "remaining_time": "0:25:47"} -{"current_steps": 1268, "total_steps": 1630, "loss": 0.0029, "lr": 5.872999793233952e-07, "epoch": 7.779141104294479, "percentage": 77.79, "elapsed_time": "1:30:00", "remaining_time": "0:25:41"} -{"current_steps": 1269, "total_steps": 1630, "loss": 0.015, "lr": 5.842008003840891e-07, "epoch": 7.785276073619632, "percentage": 77.85, "elapsed_time": "1:30:06", "remaining_time": "0:25:38"} -{"current_steps": 1270, "total_steps": 1630, "loss": 0.011, "lr": 5.811087380748245e-07, "epoch": 7.791411042944786, "percentage": 77.91, "elapsed_time": "1:30:10", "remaining_time": "0:25:33"} -{"current_steps": 1271, "total_steps": 1630, "loss": 0.0057, "lr": 5.780238038817035e-07, "epoch": 7.7975460122699385, "percentage": 77.98, "elapsed_time": "1:30:13", "remaining_time": "0:25:29"} -{"current_steps": 1272, "total_steps": 1630, "loss": 0.0131, "lr": 5.74946009264348e-07, "epoch": 7.803680981595092, "percentage": 78.04, "elapsed_time": "1:30:15", "remaining_time": "0:25:24"} -{"current_steps": 1273, "total_steps": 1630, "loss": 0.0088, "lr": 5.71875365655859e-07, "epoch": 7.809815950920245, "percentage": 78.1, "elapsed_time": "1:30:17", "remaining_time": "0:25:19"} -{"current_steps": 1274, "total_steps": 1630, "loss": 0.0033, "lr": 5.688118844627746e-07, "epoch": 7.815950920245399, "percentage": 78.16, "elapsed_time": "1:30:19", "remaining_time": "0:25:14"} -{"current_steps": 1275, "total_steps": 1630, "loss": 0.0206, "lr": 5.657555770650241e-07, "epoch": 7.822085889570552, "percentage": 78.22, "elapsed_time": "1:30:25", "remaining_time": "0:25:10"} -{"current_steps": 1276, "total_steps": 1630, "loss": 0.0096, "lr": 5.627064548158903e-07, "epoch": 7.828220858895706, "percentage": 78.28, "elapsed_time": "1:30:27", "remaining_time": "0:25:05"} -{"current_steps": 1277, "total_steps": 1630, "loss": 0.008, "lr": 5.596645290419653e-07, "epoch": 7.8343558282208585, "percentage": 78.34, "elapsed_time": "1:30:28", "remaining_time": "0:25:00"} -{"current_steps": 1278, "total_steps": 1630, "loss": 0.0016, "lr": 5.566298110431068e-07, "epoch": 7.840490797546012, "percentage": 78.4, "elapsed_time": "1:30:29", "remaining_time": "0:24:55"} -{"current_steps": 1279, "total_steps": 1630, "loss": 0.0033, "lr": 5.536023120924e-07, "epoch": 7.846625766871165, "percentage": 78.47, "elapsed_time": "1:30:33", "remaining_time": "0:24:51"} -{"current_steps": 1280, "total_steps": 1630, "loss": 0.0084, "lr": 5.505820434361108e-07, "epoch": 7.852760736196319, "percentage": 78.53, "elapsed_time": "1:30:36", "remaining_time": "0:24:46"} -{"current_steps": 1281, "total_steps": 1630, "loss": 0.0049, "lr": 5.47569016293649e-07, "epoch": 7.858895705521473, "percentage": 78.59, "elapsed_time": "1:30:38", "remaining_time": "0:24:41"} -{"current_steps": 1282, "total_steps": 1630, "loss": 0.0019, "lr": 5.445632418575239e-07, "epoch": 7.865030674846626, "percentage": 78.65, "elapsed_time": "1:30:40", "remaining_time": "0:24:36"} -{"current_steps": 1283, "total_steps": 1630, "loss": 0.0062, "lr": 5.415647312933015e-07, "epoch": 7.871165644171779, "percentage": 78.71, "elapsed_time": "1:30:42", "remaining_time": "0:24:31"} -{"current_steps": 1284, "total_steps": 1630, "loss": 0.0081, "lr": 5.385734957395664e-07, "epoch": 7.877300613496932, "percentage": 78.77, "elapsed_time": "1:30:44", "remaining_time": "0:24:27"} -{"current_steps": 1285, "total_steps": 1630, "loss": 0.0048, "lr": 5.355895463078789e-07, "epoch": 7.883435582822086, "percentage": 78.83, "elapsed_time": "1:30:48", "remaining_time": "0:24:22"} -{"current_steps": 1286, "total_steps": 1630, "loss": 0.0088, "lr": 5.326128940827313e-07, "epoch": 7.889570552147239, "percentage": 78.9, "elapsed_time": "1:30:51", "remaining_time": "0:24:18"} -{"current_steps": 1287, "total_steps": 1630, "loss": 0.0043, "lr": 5.296435501215116e-07, "epoch": 7.895705521472393, "percentage": 78.96, "elapsed_time": "1:30:53", "remaining_time": "0:24:13"} -{"current_steps": 1288, "total_steps": 1630, "loss": 0.0099, "lr": 5.266815254544572e-07, "epoch": 7.901840490797546, "percentage": 79.02, "elapsed_time": "1:30:57", "remaining_time": "0:24:09"} -{"current_steps": 1289, "total_steps": 1630, "loss": 0.0086, "lr": 5.237268310846183e-07, "epoch": 7.9079754601226995, "percentage": 79.08, "elapsed_time": "1:30:59", "remaining_time": "0:24:04"} -{"current_steps": 1290, "total_steps": 1630, "loss": 0.0442, "lr": 5.207794779878156e-07, "epoch": 7.914110429447852, "percentage": 79.14, "elapsed_time": "1:31:02", "remaining_time": "0:23:59"} -{"current_steps": 1291, "total_steps": 1630, "loss": 0.0071, "lr": 5.178394771125969e-07, "epoch": 7.920245398773006, "percentage": 79.2, "elapsed_time": "1:31:05", "remaining_time": "0:23:55"} -{"current_steps": 1292, "total_steps": 1630, "loss": 0.0192, "lr": 5.149068393802009e-07, "epoch": 7.92638036809816, "percentage": 79.26, "elapsed_time": "1:31:08", "remaining_time": "0:23:50"} -{"current_steps": 1293, "total_steps": 1630, "loss": 0.0032, "lr": 5.119815756845123e-07, "epoch": 7.932515337423313, "percentage": 79.33, "elapsed_time": "1:31:10", "remaining_time": "0:23:45"} -{"current_steps": 1294, "total_steps": 1630, "loss": 0.0139, "lr": 5.090636968920252e-07, "epoch": 7.938650306748467, "percentage": 79.39, "elapsed_time": "1:31:13", "remaining_time": "0:23:41"} -{"current_steps": 1295, "total_steps": 1630, "loss": 0.0071, "lr": 5.061532138418013e-07, "epoch": 7.9447852760736195, "percentage": 79.45, "elapsed_time": "1:31:15", "remaining_time": "0:23:36"} -{"current_steps": 1296, "total_steps": 1630, "loss": 0.0056, "lr": 5.032501373454266e-07, "epoch": 7.950920245398773, "percentage": 79.51, "elapsed_time": "1:31:17", "remaining_time": "0:23:31"} -{"current_steps": 1297, "total_steps": 1630, "loss": 0.0239, "lr": 5.003544781869762e-07, "epoch": 7.957055214723926, "percentage": 79.57, "elapsed_time": "1:31:23", "remaining_time": "0:23:27"} -{"current_steps": 1298, "total_steps": 1630, "loss": 0.0336, "lr": 4.974662471229727e-07, "epoch": 7.96319018404908, "percentage": 79.63, "elapsed_time": "1:31:29", "remaining_time": "0:23:24"} -{"current_steps": 1299, "total_steps": 1630, "loss": 0.0049, "lr": 4.945854548823425e-07, "epoch": 7.969325153374233, "percentage": 79.69, "elapsed_time": "1:31:31", "remaining_time": "0:23:19"} -{"current_steps": 1300, "total_steps": 1630, "loss": 0.0103, "lr": 4.917121121663823e-07, "epoch": 7.975460122699387, "percentage": 79.75, "elapsed_time": "1:31:37", "remaining_time": "0:23:15"} -{"current_steps": 1301, "total_steps": 1630, "loss": 0.0036, "lr": 4.888462296487129e-07, "epoch": 7.9815950920245395, "percentage": 79.82, "elapsed_time": "1:31:38", "remaining_time": "0:23:10"} -{"current_steps": 1302, "total_steps": 1630, "loss": 0.0119, "lr": 4.859878179752448e-07, "epoch": 7.987730061349693, "percentage": 79.88, "elapsed_time": "1:31:40", "remaining_time": "0:23:05"} -{"current_steps": 1303, "total_steps": 1630, "loss": 0.0365, "lr": 4.83136887764136e-07, "epoch": 7.993865030674847, "percentage": 79.94, "elapsed_time": "1:31:44", "remaining_time": "0:23:01"} -{"current_steps": 1304, "total_steps": 1630, "loss": 0.0046, "lr": 4.802934496057527e-07, "epoch": 8.0, "percentage": 80.0, "elapsed_time": "1:31:47", "remaining_time": "0:22:56"} -{"current_steps": 1305, "total_steps": 1630, "loss": 0.0235, "lr": 4.774575140626317e-07, "epoch": 8.006134969325153, "percentage": 80.06, "elapsed_time": "1:35:51", "remaining_time": "0:23:52"} -{"current_steps": 1306, "total_steps": 1630, "loss": 0.0029, "lr": 4.746290916694368e-07, "epoch": 8.012269938650308, "percentage": 80.12, "elapsed_time": "1:35:53", "remaining_time": "0:23:47"} -{"current_steps": 1307, "total_steps": 1630, "loss": 0.0019, "lr": 4.71808192932926e-07, "epoch": 8.01840490797546, "percentage": 80.18, "elapsed_time": "1:35:57", "remaining_time": "0:23:42"} -{"current_steps": 1308, "total_steps": 1630, "loss": 0.0024, "lr": 4.6899482833190765e-07, "epoch": 8.024539877300613, "percentage": 80.25, "elapsed_time": "1:36:01", "remaining_time": "0:23:38"} -{"current_steps": 1309, "total_steps": 1630, "loss": 0.0166, "lr": 4.661890083172019e-07, "epoch": 8.030674846625766, "percentage": 80.31, "elapsed_time": "1:36:05", "remaining_time": "0:23:33"} -{"current_steps": 1310, "total_steps": 1630, "loss": 0.0047, "lr": 4.633907433116053e-07, "epoch": 8.036809815950921, "percentage": 80.37, "elapsed_time": "1:36:09", "remaining_time": "0:23:29"} -{"current_steps": 1311, "total_steps": 1630, "loss": 0.0013, "lr": 4.6060004370984763e-07, "epoch": 8.042944785276074, "percentage": 80.43, "elapsed_time": "1:36:11", "remaining_time": "0:23:24"} -{"current_steps": 1312, "total_steps": 1630, "loss": 0.0016, "lr": 4.5781691987855676e-07, "epoch": 8.049079754601227, "percentage": 80.49, "elapsed_time": "1:36:13", "remaining_time": "0:23:19"} -{"current_steps": 1313, "total_steps": 1630, "loss": 0.0026, "lr": 4.5504138215621915e-07, "epoch": 8.05521472392638, "percentage": 80.55, "elapsed_time": "1:36:17", "remaining_time": "0:23:14"} -{"current_steps": 1314, "total_steps": 1630, "loss": 0.002, "lr": 4.5227344085313873e-07, "epoch": 8.061349693251534, "percentage": 80.61, "elapsed_time": "1:36:20", "remaining_time": "0:23:10"} -{"current_steps": 1315, "total_steps": 1630, "loss": 0.0299, "lr": 4.495131062514038e-07, "epoch": 8.067484662576687, "percentage": 80.67, "elapsed_time": "1:36:23", "remaining_time": "0:23:05"} -{"current_steps": 1316, "total_steps": 1630, "loss": 0.003, "lr": 4.467603886048452e-07, "epoch": 8.07361963190184, "percentage": 80.74, "elapsed_time": "1:36:27", "remaining_time": "0:23:01"} -{"current_steps": 1317, "total_steps": 1630, "loss": 0.0129, "lr": 4.440152981389972e-07, "epoch": 8.079754601226995, "percentage": 80.8, "elapsed_time": "1:36:31", "remaining_time": "0:22:56"} -{"current_steps": 1318, "total_steps": 1630, "loss": 0.0086, "lr": 4.412778450510641e-07, "epoch": 8.085889570552148, "percentage": 80.86, "elapsed_time": "1:36:37", "remaining_time": "0:22:52"} -{"current_steps": 1319, "total_steps": 1630, "loss": 0.002, "lr": 4.3854803950987736e-07, "epoch": 8.0920245398773, "percentage": 80.92, "elapsed_time": "1:36:42", "remaining_time": "0:22:48"} -{"current_steps": 1320, "total_steps": 1630, "loss": 0.0016, "lr": 4.358258916558611e-07, "epoch": 8.098159509202453, "percentage": 80.98, "elapsed_time": "1:36:44", "remaining_time": "0:22:43"} -{"current_steps": 1321, "total_steps": 1630, "loss": 0.0156, "lr": 4.331114116009938e-07, "epoch": 8.104294478527608, "percentage": 81.04, "elapsed_time": "1:36:47", "remaining_time": "0:22:38"} -{"current_steps": 1322, "total_steps": 1630, "loss": 0.0021, "lr": 4.3040460942876896e-07, "epoch": 8.110429447852761, "percentage": 81.1, "elapsed_time": "1:36:50", "remaining_time": "0:22:33"} -{"current_steps": 1323, "total_steps": 1630, "loss": 0.0021, "lr": 4.277054951941609e-07, "epoch": 8.116564417177914, "percentage": 81.17, "elapsed_time": "1:36:52", "remaining_time": "0:22:28"} -{"current_steps": 1324, "total_steps": 1630, "loss": 0.0036, "lr": 4.250140789235829e-07, "epoch": 8.122699386503067, "percentage": 81.23, "elapsed_time": "1:36:55", "remaining_time": "0:22:23"} -{"current_steps": 1325, "total_steps": 1630, "loss": 0.0031, "lr": 4.223303706148549e-07, "epoch": 8.128834355828221, "percentage": 81.29, "elapsed_time": "1:36:57", "remaining_time": "0:22:19"} -{"current_steps": 1326, "total_steps": 1630, "loss": 0.0102, "lr": 4.196543802371641e-07, "epoch": 8.134969325153374, "percentage": 81.35, "elapsed_time": "1:37:02", "remaining_time": "0:22:14"} -{"current_steps": 1327, "total_steps": 1630, "loss": 0.0023, "lr": 4.1698611773102525e-07, "epoch": 8.141104294478527, "percentage": 81.41, "elapsed_time": "1:37:04", "remaining_time": "0:22:10"} -{"current_steps": 1328, "total_steps": 1630, "loss": 0.0074, "lr": 4.14325593008249e-07, "epoch": 8.14723926380368, "percentage": 81.47, "elapsed_time": "1:37:11", "remaining_time": "0:22:06"} -{"current_steps": 1329, "total_steps": 1630, "loss": 0.0017, "lr": 4.1167281595190206e-07, "epoch": 8.153374233128835, "percentage": 81.53, "elapsed_time": "1:37:12", "remaining_time": "0:22:00"} -{"current_steps": 1330, "total_steps": 1630, "loss": 0.0013, "lr": 4.090277964162692e-07, "epoch": 8.159509202453988, "percentage": 81.6, "elapsed_time": "1:37:14", "remaining_time": "0:21:56"} -{"current_steps": 1331, "total_steps": 1630, "loss": 0.0014, "lr": 4.063905442268201e-07, "epoch": 8.16564417177914, "percentage": 81.66, "elapsed_time": "1:37:17", "remaining_time": "0:21:51"} -{"current_steps": 1332, "total_steps": 1630, "loss": 0.0009, "lr": 4.037610691801694e-07, "epoch": 8.171779141104295, "percentage": 81.72, "elapsed_time": "1:37:21", "remaining_time": "0:21:46"} -{"current_steps": 1333, "total_steps": 1630, "loss": 0.0022, "lr": 4.011393810440431e-07, "epoch": 8.177914110429448, "percentage": 81.78, "elapsed_time": "1:37:23", "remaining_time": "0:21:41"} -{"current_steps": 1334, "total_steps": 1630, "loss": 0.0024, "lr": 3.985254895572413e-07, "epoch": 8.184049079754601, "percentage": 81.84, "elapsed_time": "1:37:24", "remaining_time": "0:21:36"} -{"current_steps": 1335, "total_steps": 1630, "loss": 0.0011, "lr": 3.959194044296011e-07, "epoch": 8.190184049079754, "percentage": 81.9, "elapsed_time": "1:37:28", "remaining_time": "0:21:32"} -{"current_steps": 1336, "total_steps": 1630, "loss": 0.0028, "lr": 3.9332113534196194e-07, "epoch": 8.196319018404909, "percentage": 81.96, "elapsed_time": "1:37:30", "remaining_time": "0:21:27"} -{"current_steps": 1337, "total_steps": 1630, "loss": 0.0228, "lr": 3.907306919461279e-07, "epoch": 8.202453987730062, "percentage": 82.02, "elapsed_time": "1:37:34", "remaining_time": "0:21:22"} -{"current_steps": 1338, "total_steps": 1630, "loss": 0.0027, "lr": 3.8814808386483385e-07, "epoch": 8.208588957055214, "percentage": 82.09, "elapsed_time": "1:37:39", "remaining_time": "0:21:18"} -{"current_steps": 1339, "total_steps": 1630, "loss": 0.0037, "lr": 3.855733206917095e-07, "epoch": 8.214723926380367, "percentage": 82.15, "elapsed_time": "1:37:41", "remaining_time": "0:21:13"} -{"current_steps": 1340, "total_steps": 1630, "loss": 0.0011, "lr": 3.8300641199124024e-07, "epoch": 8.220858895705522, "percentage": 82.21, "elapsed_time": "1:37:44", "remaining_time": "0:21:09"} -{"current_steps": 1341, "total_steps": 1630, "loss": 0.0008, "lr": 3.80447367298738e-07, "epoch": 8.226993865030675, "percentage": 82.27, "elapsed_time": "1:37:47", "remaining_time": "0:21:04"} -{"current_steps": 1342, "total_steps": 1630, "loss": 0.0012, "lr": 3.77896196120299e-07, "epoch": 8.233128834355828, "percentage": 82.33, "elapsed_time": "1:37:49", "remaining_time": "0:20:59"} -{"current_steps": 1343, "total_steps": 1630, "loss": 0.0047, "lr": 3.7535290793277364e-07, "epoch": 8.239263803680982, "percentage": 82.39, "elapsed_time": "1:37:54", "remaining_time": "0:20:55"} -{"current_steps": 1344, "total_steps": 1630, "loss": 0.0007, "lr": 3.7281751218372965e-07, "epoch": 8.245398773006135, "percentage": 82.45, "elapsed_time": "1:37:57", "remaining_time": "0:20:50"} -{"current_steps": 1345, "total_steps": 1630, "loss": 0.0018, "lr": 3.7029001829141457e-07, "epoch": 8.251533742331288, "percentage": 82.52, "elapsed_time": "1:38:00", "remaining_time": "0:20:46"} -{"current_steps": 1346, "total_steps": 1630, "loss": 0.0213, "lr": 3.677704356447254e-07, "epoch": 8.257668711656441, "percentage": 82.58, "elapsed_time": "1:38:03", "remaining_time": "0:20:41"} -{"current_steps": 1347, "total_steps": 1630, "loss": 0.0009, "lr": 3.6525877360316875e-07, "epoch": 8.263803680981596, "percentage": 82.64, "elapsed_time": "1:38:04", "remaining_time": "0:20:36"} -{"current_steps": 1348, "total_steps": 1630, "loss": 0.0132, "lr": 3.627550414968303e-07, "epoch": 8.269938650306749, "percentage": 82.7, "elapsed_time": "1:38:10", "remaining_time": "0:20:32"} -{"current_steps": 1349, "total_steps": 1630, "loss": 0.0006, "lr": 3.6025924862633814e-07, "epoch": 8.276073619631902, "percentage": 82.76, "elapsed_time": "1:38:12", "remaining_time": "0:20:27"} -{"current_steps": 1350, "total_steps": 1630, "loss": 0.01, "lr": 3.577714042628272e-07, "epoch": 8.282208588957054, "percentage": 82.82, "elapsed_time": "1:38:14", "remaining_time": "0:20:22"} -{"current_steps": 1351, "total_steps": 1630, "loss": 0.0031, "lr": 3.5529151764790715e-07, "epoch": 8.28834355828221, "percentage": 82.88, "elapsed_time": "1:38:16", "remaining_time": "0:20:17"} -{"current_steps": 1352, "total_steps": 1630, "loss": 0.0053, "lr": 3.5281959799362775e-07, "epoch": 8.294478527607362, "percentage": 82.94, "elapsed_time": "1:38:22", "remaining_time": "0:20:13"} -{"current_steps": 1353, "total_steps": 1630, "loss": 0.0021, "lr": 3.503556544824413e-07, "epoch": 8.300613496932515, "percentage": 83.01, "elapsed_time": "1:38:25", "remaining_time": "0:20:08"} -{"current_steps": 1354, "total_steps": 1630, "loss": 0.0019, "lr": 3.4789969626717377e-07, "epoch": 8.30674846625767, "percentage": 83.07, "elapsed_time": "1:38:26", "remaining_time": "0:20:03"} -{"current_steps": 1355, "total_steps": 1630, "loss": 0.0019, "lr": 3.454517324709858e-07, "epoch": 8.312883435582823, "percentage": 83.13, "elapsed_time": "1:38:26", "remaining_time": "0:19:58"} -{"current_steps": 1356, "total_steps": 1630, "loss": 0.0011, "lr": 3.43011772187343e-07, "epoch": 8.319018404907975, "percentage": 83.19, "elapsed_time": "1:38:28", "remaining_time": "0:19:53"} -{"current_steps": 1357, "total_steps": 1630, "loss": 0.0006, "lr": 3.405798244799799e-07, "epoch": 8.325153374233128, "percentage": 83.25, "elapsed_time": "1:38:30", "remaining_time": "0:19:49"} -{"current_steps": 1358, "total_steps": 1630, "loss": 0.002, "lr": 3.3815589838286535e-07, "epoch": 8.331288343558283, "percentage": 83.31, "elapsed_time": "1:38:32", "remaining_time": "0:19:44"} -{"current_steps": 1359, "total_steps": 1630, "loss": 0.002, "lr": 3.3574000290017174e-07, "epoch": 8.337423312883436, "percentage": 83.37, "elapsed_time": "1:38:34", "remaining_time": "0:19:39"} -{"current_steps": 1360, "total_steps": 1630, "loss": 0.0153, "lr": 3.3333214700623976e-07, "epoch": 8.343558282208589, "percentage": 83.44, "elapsed_time": "1:38:41", "remaining_time": "0:19:35"} -{"current_steps": 1361, "total_steps": 1630, "loss": 0.0014, "lr": 3.3093233964554464e-07, "epoch": 8.349693251533742, "percentage": 83.5, "elapsed_time": "1:38:44", "remaining_time": "0:19:31"} -{"current_steps": 1362, "total_steps": 1630, "loss": 0.0107, "lr": 3.2854058973266547e-07, "epoch": 8.355828220858896, "percentage": 83.56, "elapsed_time": "1:38:51", "remaining_time": "0:19:27"} -{"current_steps": 1363, "total_steps": 1630, "loss": 0.0197, "lr": 3.261569061522474e-07, "epoch": 8.36196319018405, "percentage": 83.62, "elapsed_time": "1:38:54", "remaining_time": "0:19:22"} -{"current_steps": 1364, "total_steps": 1630, "loss": 0.0009, "lr": 3.237812977589738e-07, "epoch": 8.368098159509202, "percentage": 83.68, "elapsed_time": "1:38:56", "remaining_time": "0:19:17"} -{"current_steps": 1365, "total_steps": 1630, "loss": 0.0026, "lr": 3.2141377337753105e-07, "epoch": 8.374233128834355, "percentage": 83.74, "elapsed_time": "1:38:59", "remaining_time": "0:19:13"} -{"current_steps": 1366, "total_steps": 1630, "loss": 0.0533, "lr": 3.190543418025749e-07, "epoch": 8.38036809815951, "percentage": 83.8, "elapsed_time": "1:39:03", "remaining_time": "0:19:08"} -{"current_steps": 1367, "total_steps": 1630, "loss": 0.0007, "lr": 3.167030117986994e-07, "epoch": 8.386503067484663, "percentage": 83.87, "elapsed_time": "1:39:06", "remaining_time": "0:19:04"} -{"current_steps": 1368, "total_steps": 1630, "loss": 0.001, "lr": 3.143597921004027e-07, "epoch": 8.392638036809815, "percentage": 83.93, "elapsed_time": "1:39:10", "remaining_time": "0:18:59"} -{"current_steps": 1369, "total_steps": 1630, "loss": 0.002, "lr": 3.120246914120564e-07, "epoch": 8.39877300613497, "percentage": 83.99, "elapsed_time": "1:39:11", "remaining_time": "0:18:54"} -{"current_steps": 1370, "total_steps": 1630, "loss": 0.0025, "lr": 3.096977184078731e-07, "epoch": 8.404907975460123, "percentage": 84.05, "elapsed_time": "1:39:14", "remaining_time": "0:18:50"} -{"current_steps": 1371, "total_steps": 1630, "loss": 0.0014, "lr": 3.0737888173187067e-07, "epoch": 8.411042944785276, "percentage": 84.11, "elapsed_time": "1:39:16", "remaining_time": "0:18:45"} -{"current_steps": 1372, "total_steps": 1630, "loss": 0.0149, "lr": 3.050681899978464e-07, "epoch": 8.417177914110429, "percentage": 84.17, "elapsed_time": "1:39:19", "remaining_time": "0:18:40"} -{"current_steps": 1373, "total_steps": 1630, "loss": 0.0178, "lr": 3.0276565178933847e-07, "epoch": 8.423312883435583, "percentage": 84.23, "elapsed_time": "1:39:25", "remaining_time": "0:18:36"} -{"current_steps": 1374, "total_steps": 1630, "loss": 0.0053, "lr": 3.004712756595993e-07, "epoch": 8.429447852760736, "percentage": 84.29, "elapsed_time": "1:39:28", "remaining_time": "0:18:32"} -{"current_steps": 1375, "total_steps": 1630, "loss": 0.0013, "lr": 2.9818507013156085e-07, "epoch": 8.43558282208589, "percentage": 84.36, "elapsed_time": "1:39:30", "remaining_time": "0:18:27"} -{"current_steps": 1376, "total_steps": 1630, "loss": 0.0039, "lr": 2.9590704369780313e-07, "epoch": 8.441717791411042, "percentage": 84.42, "elapsed_time": "1:39:32", "remaining_time": "0:18:22"} -{"current_steps": 1377, "total_steps": 1630, "loss": 0.0025, "lr": 2.9363720482052436e-07, "epoch": 8.447852760736197, "percentage": 84.48, "elapsed_time": "1:39:33", "remaining_time": "0:18:17"} -{"current_steps": 1378, "total_steps": 1630, "loss": 0.0033, "lr": 2.91375561931507e-07, "epoch": 8.45398773006135, "percentage": 84.54, "elapsed_time": "1:39:36", "remaining_time": "0:18:12"} -{"current_steps": 1379, "total_steps": 1630, "loss": 0.0006, "lr": 2.89122123432089e-07, "epoch": 8.460122699386503, "percentage": 84.6, "elapsed_time": "1:39:39", "remaining_time": "0:18:08"} -{"current_steps": 1380, "total_steps": 1630, "loss": 0.001, "lr": 2.868768976931313e-07, "epoch": 8.466257668711656, "percentage": 84.66, "elapsed_time": "1:39:40", "remaining_time": "0:18:03"} -{"current_steps": 1381, "total_steps": 1630, "loss": 0.0008, "lr": 2.8463989305498596e-07, "epoch": 8.47239263803681, "percentage": 84.72, "elapsed_time": "1:39:43", "remaining_time": "0:17:58"} -{"current_steps": 1382, "total_steps": 1630, "loss": 0.0025, "lr": 2.824111178274669e-07, "epoch": 8.478527607361963, "percentage": 84.79, "elapsed_time": "1:39:47", "remaining_time": "0:17:54"} -{"current_steps": 1383, "total_steps": 1630, "loss": 0.0031, "lr": 2.801905802898183e-07, "epoch": 8.484662576687116, "percentage": 84.85, "elapsed_time": "1:39:49", "remaining_time": "0:17:49"} -{"current_steps": 1384, "total_steps": 1630, "loss": 0.0013, "lr": 2.779782886906829e-07, "epoch": 8.49079754601227, "percentage": 84.91, "elapsed_time": "1:39:51", "remaining_time": "0:17:44"} -{"current_steps": 1385, "total_steps": 1630, "loss": 0.0013, "lr": 2.7577425124807324e-07, "epoch": 8.496932515337424, "percentage": 84.97, "elapsed_time": "1:39:55", "remaining_time": "0:17:40"} -{"current_steps": 1386, "total_steps": 1630, "loss": 0.0031, "lr": 2.7357847614933876e-07, "epoch": 8.503067484662576, "percentage": 85.03, "elapsed_time": "1:39:57", "remaining_time": "0:17:35"} -{"current_steps": 1387, "total_steps": 1630, "loss": 0.0045, "lr": 2.713909715511384e-07, "epoch": 8.50920245398773, "percentage": 85.09, "elapsed_time": "1:40:00", "remaining_time": "0:17:31"} -{"current_steps": 1388, "total_steps": 1630, "loss": 0.0017, "lr": 2.692117455794077e-07, "epoch": 8.515337423312884, "percentage": 85.15, "elapsed_time": "1:40:02", "remaining_time": "0:17:26"} -{"current_steps": 1389, "total_steps": 1630, "loss": 0.0014, "lr": 2.6704080632932895e-07, "epoch": 8.521472392638037, "percentage": 85.21, "elapsed_time": "1:40:05", "remaining_time": "0:17:21"} -{"current_steps": 1390, "total_steps": 1630, "loss": 0.002, "lr": 2.6487816186530263e-07, "epoch": 8.52760736196319, "percentage": 85.28, "elapsed_time": "1:40:08", "remaining_time": "0:17:17"} -{"current_steps": 1391, "total_steps": 1630, "loss": 0.0028, "lr": 2.6272382022091704e-07, "epoch": 8.533742331288344, "percentage": 85.34, "elapsed_time": "1:40:12", "remaining_time": "0:17:13"} -{"current_steps": 1392, "total_steps": 1630, "loss": 0.011, "lr": 2.6057778939891614e-07, "epoch": 8.539877300613497, "percentage": 85.4, "elapsed_time": "1:40:19", "remaining_time": "0:17:09"} -{"current_steps": 1393, "total_steps": 1630, "loss": 0.0037, "lr": 2.584400773711737e-07, "epoch": 8.54601226993865, "percentage": 85.46, "elapsed_time": "1:40:22", "remaining_time": "0:17:04"} -{"current_steps": 1394, "total_steps": 1630, "loss": 0.0023, "lr": 2.5631069207865926e-07, "epoch": 8.552147239263803, "percentage": 85.52, "elapsed_time": "1:40:25", "remaining_time": "0:17:00"} -{"current_steps": 1395, "total_steps": 1630, "loss": 0.0053, "lr": 2.541896414314132e-07, "epoch": 8.558282208588958, "percentage": 85.58, "elapsed_time": "1:40:28", "remaining_time": "0:16:55"} -{"current_steps": 1396, "total_steps": 1630, "loss": 0.0008, "lr": 2.520769333085141e-07, "epoch": 8.56441717791411, "percentage": 85.64, "elapsed_time": "1:40:30", "remaining_time": "0:16:50"} -{"current_steps": 1397, "total_steps": 1630, "loss": 0.001, "lr": 2.4997257555805064e-07, "epoch": 8.570552147239264, "percentage": 85.71, "elapsed_time": "1:40:34", "remaining_time": "0:16:46"} -{"current_steps": 1398, "total_steps": 1630, "loss": 0.0041, "lr": 2.4787657599709276e-07, "epoch": 8.576687116564417, "percentage": 85.77, "elapsed_time": "1:40:36", "remaining_time": "0:16:41"} -{"current_steps": 1399, "total_steps": 1630, "loss": 0.0029, "lr": 2.4578894241166135e-07, "epoch": 8.582822085889571, "percentage": 85.83, "elapsed_time": "1:40:39", "remaining_time": "0:16:37"} -{"current_steps": 1400, "total_steps": 1630, "loss": 0.001, "lr": 2.4370968255670093e-07, "epoch": 8.588957055214724, "percentage": 85.89, "elapsed_time": "1:40:43", "remaining_time": "0:16:32"} -{"current_steps": 1401, "total_steps": 1630, "loss": 0.0005, "lr": 2.4163880415604913e-07, "epoch": 8.595092024539877, "percentage": 85.95, "elapsed_time": "1:40:46", "remaining_time": "0:16:28"} -{"current_steps": 1402, "total_steps": 1630, "loss": 0.0034, "lr": 2.395763149024102e-07, "epoch": 8.60122699386503, "percentage": 86.01, "elapsed_time": "1:40:48", "remaining_time": "0:16:23"} -{"current_steps": 1403, "total_steps": 1630, "loss": 0.0036, "lr": 2.3752222245732454e-07, "epoch": 8.607361963190185, "percentage": 86.07, "elapsed_time": "1:40:49", "remaining_time": "0:16:18"} -{"current_steps": 1404, "total_steps": 1630, "loss": 0.0013, "lr": 2.3547653445114032e-07, "epoch": 8.613496932515337, "percentage": 86.13, "elapsed_time": "1:40:51", "remaining_time": "0:16:14"} -{"current_steps": 1405, "total_steps": 1630, "loss": 0.0008, "lr": 2.334392584829867e-07, "epoch": 8.61963190184049, "percentage": 86.2, "elapsed_time": "1:40:52", "remaining_time": "0:16:09"} -{"current_steps": 1406, "total_steps": 1630, "loss": 0.0198, "lr": 2.3141040212074445e-07, "epoch": 8.625766871165645, "percentage": 86.26, "elapsed_time": "1:40:56", "remaining_time": "0:16:04"} -{"current_steps": 1407, "total_steps": 1630, "loss": 0.0033, "lr": 2.293899729010171e-07, "epoch": 8.631901840490798, "percentage": 86.32, "elapsed_time": "1:41:01", "remaining_time": "0:16:00"} -{"current_steps": 1408, "total_steps": 1630, "loss": 0.0007, "lr": 2.2737797832910498e-07, "epoch": 8.63803680981595, "percentage": 86.38, "elapsed_time": "1:41:02", "remaining_time": "0:15:55"} -{"current_steps": 1409, "total_steps": 1630, "loss": 0.0045, "lr": 2.2537442587897474e-07, "epoch": 8.644171779141104, "percentage": 86.44, "elapsed_time": "1:41:08", "remaining_time": "0:15:51"} -{"current_steps": 1410, "total_steps": 1630, "loss": 0.001, "lr": 2.2337932299323434e-07, "epoch": 8.650306748466258, "percentage": 86.5, "elapsed_time": "1:41:13", "remaining_time": "0:15:47"} -{"current_steps": 1411, "total_steps": 1630, "loss": 0.0005, "lr": 2.2139267708310457e-07, "epoch": 8.656441717791411, "percentage": 86.56, "elapsed_time": "1:41:15", "remaining_time": "0:15:42"} -{"current_steps": 1412, "total_steps": 1630, "loss": 0.0022, "lr": 2.194144955283886e-07, "epoch": 8.662576687116564, "percentage": 86.63, "elapsed_time": "1:41:19", "remaining_time": "0:15:38"} -{"current_steps": 1413, "total_steps": 1630, "loss": 0.0023, "lr": 2.1744478567744947e-07, "epoch": 8.668711656441717, "percentage": 86.69, "elapsed_time": "1:41:23", "remaining_time": "0:15:34"} -{"current_steps": 1414, "total_steps": 1630, "loss": 0.0027, "lr": 2.154835548471798e-07, "epoch": 8.674846625766872, "percentage": 86.75, "elapsed_time": "1:41:25", "remaining_time": "0:15:29"} -{"current_steps": 1415, "total_steps": 1630, "loss": 0.0015, "lr": 2.1353081032297356e-07, "epoch": 8.680981595092025, "percentage": 86.81, "elapsed_time": "1:41:28", "remaining_time": "0:15:25"} -{"current_steps": 1416, "total_steps": 1630, "loss": 0.0025, "lr": 2.1158655935870325e-07, "epoch": 8.687116564417177, "percentage": 86.87, "elapsed_time": "1:41:30", "remaining_time": "0:15:20"} -{"current_steps": 1417, "total_steps": 1630, "loss": 0.002, "lr": 2.0965080917668744e-07, "epoch": 8.69325153374233, "percentage": 86.93, "elapsed_time": "1:41:32", "remaining_time": "0:15:15"} -{"current_steps": 1418, "total_steps": 1630, "loss": 0.0023, "lr": 2.077235669676689e-07, "epoch": 8.699386503067485, "percentage": 86.99, "elapsed_time": "1:41:34", "remaining_time": "0:15:11"} -{"current_steps": 1419, "total_steps": 1630, "loss": 0.0005, "lr": 2.0580483989078525e-07, "epoch": 8.705521472392638, "percentage": 87.06, "elapsed_time": "1:41:35", "remaining_time": "0:15:06"} -{"current_steps": 1420, "total_steps": 1630, "loss": 0.0122, "lr": 2.0389463507354211e-07, "epoch": 8.71165644171779, "percentage": 87.12, "elapsed_time": "1:41:42", "remaining_time": "0:15:02"} -{"current_steps": 1421, "total_steps": 1630, "loss": 0.0005, "lr": 2.0199295961178893e-07, "epoch": 8.717791411042946, "percentage": 87.18, "elapsed_time": "1:41:46", "remaining_time": "0:14:58"} -{"current_steps": 1422, "total_steps": 1630, "loss": 0.004, "lr": 2.000998205696894e-07, "epoch": 8.723926380368098, "percentage": 87.24, "elapsed_time": "1:41:48", "remaining_time": "0:14:53"} -{"current_steps": 1423, "total_steps": 1630, "loss": 0.0004, "lr": 1.9821522497969813e-07, "epoch": 8.730061349693251, "percentage": 87.3, "elapsed_time": "1:41:49", "remaining_time": "0:14:48"} -{"current_steps": 1424, "total_steps": 1630, "loss": 0.001, "lr": 1.9633917984253294e-07, "epoch": 8.736196319018404, "percentage": 87.36, "elapsed_time": "1:41:51", "remaining_time": "0:14:44"} -{"current_steps": 1425, "total_steps": 1630, "loss": 0.0019, "lr": 1.944716921271489e-07, "epoch": 8.742331288343559, "percentage": 87.42, "elapsed_time": "1:41:54", "remaining_time": "0:14:39"} -{"current_steps": 1426, "total_steps": 1630, "loss": 0.0055, "lr": 1.9261276877071354e-07, "epoch": 8.748466257668712, "percentage": 87.48, "elapsed_time": "1:41:57", "remaining_time": "0:14:35"} -{"current_steps": 1427, "total_steps": 1630, "loss": 0.0048, "lr": 1.9076241667857988e-07, "epoch": 8.754601226993865, "percentage": 87.55, "elapsed_time": "1:42:00", "remaining_time": "0:14:30"} -{"current_steps": 1428, "total_steps": 1630, "loss": 0.0079, "lr": 1.8892064272426042e-07, "epoch": 8.76073619631902, "percentage": 87.61, "elapsed_time": "1:42:03", "remaining_time": "0:14:26"} -{"current_steps": 1429, "total_steps": 1630, "loss": 0.0013, "lr": 1.8708745374940469e-07, "epoch": 8.766871165644172, "percentage": 87.67, "elapsed_time": "1:42:04", "remaining_time": "0:14:21"} -{"current_steps": 1430, "total_steps": 1630, "loss": 0.0046, "lr": 1.8526285656376873e-07, "epoch": 8.773006134969325, "percentage": 87.73, "elapsed_time": "1:42:10", "remaining_time": "0:14:17"} -{"current_steps": 1431, "total_steps": 1630, "loss": 0.006, "lr": 1.8344685794519507e-07, "epoch": 8.779141104294478, "percentage": 87.79, "elapsed_time": "1:42:16", "remaining_time": "0:14:13"} -{"current_steps": 1432, "total_steps": 1630, "loss": 0.0094, "lr": 1.8163946463958276e-07, "epoch": 8.785276073619633, "percentage": 87.85, "elapsed_time": "1:42:19", "remaining_time": "0:14:08"} -{"current_steps": 1433, "total_steps": 1630, "loss": 0.0009, "lr": 1.7984068336086652e-07, "epoch": 8.791411042944786, "percentage": 87.91, "elapsed_time": "1:42:22", "remaining_time": "0:14:04"} -{"current_steps": 1434, "total_steps": 1630, "loss": 0.0014, "lr": 1.780505207909894e-07, "epoch": 8.797546012269938, "percentage": 87.98, "elapsed_time": "1:42:23", "remaining_time": "0:13:59"} -{"current_steps": 1435, "total_steps": 1630, "loss": 0.0013, "lr": 1.7626898357987782e-07, "epoch": 8.803680981595091, "percentage": 88.04, "elapsed_time": "1:42:25", "remaining_time": "0:13:55"} -{"current_steps": 1436, "total_steps": 1630, "loss": 0.0024, "lr": 1.744960783454186e-07, "epoch": 8.809815950920246, "percentage": 88.1, "elapsed_time": "1:42:29", "remaining_time": "0:13:50"} -{"current_steps": 1437, "total_steps": 1630, "loss": 0.0015, "lr": 1.727318116734328e-07, "epoch": 8.815950920245399, "percentage": 88.16, "elapsed_time": "1:42:30", "remaining_time": "0:13:46"} -{"current_steps": 1438, "total_steps": 1630, "loss": 0.0017, "lr": 1.7097619011765127e-07, "epoch": 8.822085889570552, "percentage": 88.22, "elapsed_time": "1:42:32", "remaining_time": "0:13:41"} -{"current_steps": 1439, "total_steps": 1630, "loss": 0.0009, "lr": 1.6922922019969145e-07, "epoch": 8.828220858895705, "percentage": 88.28, "elapsed_time": "1:42:33", "remaining_time": "0:13:36"} -{"current_steps": 1440, "total_steps": 1630, "loss": 0.0013, "lr": 1.6749090840903233e-07, "epoch": 8.83435582822086, "percentage": 88.34, "elapsed_time": "1:42:36", "remaining_time": "0:13:32"} -{"current_steps": 1441, "total_steps": 1630, "loss": 0.0029, "lr": 1.6576126120299046e-07, "epoch": 8.840490797546012, "percentage": 88.4, "elapsed_time": "1:42:37", "remaining_time": "0:13:27"} -{"current_steps": 1442, "total_steps": 1630, "loss": 0.0034, "lr": 1.6404028500669633e-07, "epoch": 8.846625766871165, "percentage": 88.47, "elapsed_time": "1:42:40", "remaining_time": "0:13:23"} -{"current_steps": 1443, "total_steps": 1630, "loss": 0.0022, "lr": 1.6232798621306918e-07, "epoch": 8.85276073619632, "percentage": 88.53, "elapsed_time": "1:42:42", "remaining_time": "0:13:18"} -{"current_steps": 1444, "total_steps": 1630, "loss": 0.0329, "lr": 1.606243711827951e-07, "epoch": 8.858895705521473, "percentage": 88.59, "elapsed_time": "1:42:43", "remaining_time": "0:13:13"} -{"current_steps": 1445, "total_steps": 1630, "loss": 0.0092, "lr": 1.5892944624430334e-07, "epoch": 8.865030674846626, "percentage": 88.65, "elapsed_time": "1:42:45", "remaining_time": "0:13:09"} -{"current_steps": 1446, "total_steps": 1630, "loss": 0.0005, "lr": 1.5724321769374023e-07, "epoch": 8.871165644171779, "percentage": 88.71, "elapsed_time": "1:42:47", "remaining_time": "0:13:04"} -{"current_steps": 1447, "total_steps": 1630, "loss": 0.0005, "lr": 1.5556569179494857e-07, "epoch": 8.877300613496933, "percentage": 88.77, "elapsed_time": "1:42:48", "remaining_time": "0:13:00"} -{"current_steps": 1448, "total_steps": 1630, "loss": 0.0004, "lr": 1.538968747794431e-07, "epoch": 8.883435582822086, "percentage": 88.83, "elapsed_time": "1:42:50", "remaining_time": "0:12:55"} -{"current_steps": 1449, "total_steps": 1630, "loss": 0.0046, "lr": 1.5223677284638805e-07, "epoch": 8.889570552147239, "percentage": 88.9, "elapsed_time": "1:42:56", "remaining_time": "0:12:51"} -{"current_steps": 1450, "total_steps": 1630, "loss": 0.0048, "lr": 1.5058539216257356e-07, "epoch": 8.895705521472392, "percentage": 88.96, "elapsed_time": "1:43:01", "remaining_time": "0:12:47"} -{"current_steps": 1451, "total_steps": 1630, "loss": 0.0027, "lr": 1.4894273886239208e-07, "epoch": 8.901840490797547, "percentage": 89.02, "elapsed_time": "1:43:02", "remaining_time": "0:12:42"} -{"current_steps": 1452, "total_steps": 1630, "loss": 0.0134, "lr": 1.473088190478178e-07, "epoch": 8.9079754601227, "percentage": 89.08, "elapsed_time": "1:43:06", "remaining_time": "0:12:38"} -{"current_steps": 1453, "total_steps": 1630, "loss": 0.0024, "lr": 1.4568363878838087e-07, "epoch": 8.914110429447852, "percentage": 89.14, "elapsed_time": "1:43:07", "remaining_time": "0:12:33"} -{"current_steps": 1454, "total_steps": 1630, "loss": 0.0019, "lr": 1.4406720412114828e-07, "epoch": 8.920245398773005, "percentage": 89.2, "elapsed_time": "1:43:11", "remaining_time": "0:12:29"} -{"current_steps": 1455, "total_steps": 1630, "loss": 0.0015, "lr": 1.4245952105069905e-07, "epoch": 8.92638036809816, "percentage": 89.26, "elapsed_time": "1:43:12", "remaining_time": "0:12:24"} -{"current_steps": 1456, "total_steps": 1630, "loss": 0.0045, "lr": 1.4086059554910186e-07, "epoch": 8.932515337423313, "percentage": 89.33, "elapsed_time": "1:43:16", "remaining_time": "0:12:20"} -{"current_steps": 1457, "total_steps": 1630, "loss": 0.0011, "lr": 1.3927043355589476e-07, "epoch": 8.938650306748466, "percentage": 89.39, "elapsed_time": "1:43:19", "remaining_time": "0:12:16"} -{"current_steps": 1458, "total_steps": 1630, "loss": 0.0019, "lr": 1.3768904097806153e-07, "epoch": 8.94478527607362, "percentage": 89.45, "elapsed_time": "1:43:21", "remaining_time": "0:12:11"} -{"current_steps": 1459, "total_steps": 1630, "loss": 0.0027, "lr": 1.361164236900092e-07, "epoch": 8.950920245398773, "percentage": 89.51, "elapsed_time": "1:43:23", "remaining_time": "0:12:07"} -{"current_steps": 1460, "total_steps": 1630, "loss": 0.0048, "lr": 1.3455258753354932e-07, "epoch": 8.957055214723926, "percentage": 89.57, "elapsed_time": "1:43:25", "remaining_time": "0:12:02"} -{"current_steps": 1461, "total_steps": 1630, "loss": 0.0011, "lr": 1.3299753831787193e-07, "epoch": 8.963190184049079, "percentage": 89.63, "elapsed_time": "1:43:26", "remaining_time": "0:11:57"} -{"current_steps": 1462, "total_steps": 1630, "loss": 0.0018, "lr": 1.3145128181952737e-07, "epoch": 8.969325153374234, "percentage": 89.69, "elapsed_time": "1:43:31", "remaining_time": "0:11:53"} -{"current_steps": 1463, "total_steps": 1630, "loss": 0.0032, "lr": 1.2991382378240325e-07, "epoch": 8.975460122699387, "percentage": 89.75, "elapsed_time": "1:43:33", "remaining_time": "0:11:49"} -{"current_steps": 1464, "total_steps": 1630, "loss": 0.001, "lr": 1.2838516991770355e-07, "epoch": 8.98159509202454, "percentage": 89.82, "elapsed_time": "1:43:35", "remaining_time": "0:11:44"} -{"current_steps": 1465, "total_steps": 1630, "loss": 0.0024, "lr": 1.2686532590392763e-07, "epoch": 8.987730061349692, "percentage": 89.88, "elapsed_time": "1:43:38", "remaining_time": "0:11:40"} -{"current_steps": 1466, "total_steps": 1630, "loss": 0.0007, "lr": 1.2535429738684822e-07, "epoch": 8.993865030674847, "percentage": 89.94, "elapsed_time": "1:43:39", "remaining_time": "0:11:35"} -{"current_steps": 1467, "total_steps": 1630, "loss": 0.0245, "lr": 1.238520899794915e-07, "epoch": 9.0, "percentage": 90.0, "elapsed_time": "1:43:41", "remaining_time": "0:11:31"} -{"current_steps": 1468, "total_steps": 1630, "loss": 0.0006, "lr": 1.223587092621162e-07, "epoch": 9.006134969325153, "percentage": 90.06, "elapsed_time": "1:47:05", "remaining_time": "0:11:49"} -{"current_steps": 1469, "total_steps": 1630, "loss": 0.0005, "lr": 1.2087416078219144e-07, "epoch": 9.012269938650308, "percentage": 90.12, "elapsed_time": "1:47:07", "remaining_time": "0:11:44"} -{"current_steps": 1470, "total_steps": 1630, "loss": 0.0006, "lr": 1.1939845005437823e-07, "epoch": 9.01840490797546, "percentage": 90.18, "elapsed_time": "1:47:08", "remaining_time": "0:11:39"} -{"current_steps": 1471, "total_steps": 1630, "loss": 0.0004, "lr": 1.1793158256050708e-07, "epoch": 9.024539877300613, "percentage": 90.25, "elapsed_time": "1:47:11", "remaining_time": "0:11:35"} -{"current_steps": 1472, "total_steps": 1630, "loss": 0.0005, "lr": 1.1647356374955926e-07, "epoch": 9.030674846625766, "percentage": 90.31, "elapsed_time": "1:47:14", "remaining_time": "0:11:30"} -{"current_steps": 1473, "total_steps": 1630, "loss": 0.0012, "lr": 1.1502439903764539e-07, "epoch": 9.036809815950921, "percentage": 90.37, "elapsed_time": "1:47:17", "remaining_time": "0:11:26"} -{"current_steps": 1474, "total_steps": 1630, "loss": 0.0004, "lr": 1.1358409380798547e-07, "epoch": 9.042944785276074, "percentage": 90.43, "elapsed_time": "1:47:18", "remaining_time": "0:11:21"} -{"current_steps": 1475, "total_steps": 1630, "loss": 0.0031, "lr": 1.1215265341089021e-07, "epoch": 9.049079754601227, "percentage": 90.49, "elapsed_time": "1:47:24", "remaining_time": "0:11:17"} -{"current_steps": 1476, "total_steps": 1630, "loss": 0.0004, "lr": 1.1073008316373812e-07, "epoch": 9.05521472392638, "percentage": 90.55, "elapsed_time": "1:47:25", "remaining_time": "0:11:12"} -{"current_steps": 1477, "total_steps": 1630, "loss": 0.0056, "lr": 1.093163883509596e-07, "epoch": 9.061349693251534, "percentage": 90.61, "elapsed_time": "1:47:28", "remaining_time": "0:11:08"} -{"current_steps": 1478, "total_steps": 1630, "loss": 0.0006, "lr": 1.0791157422401499e-07, "epoch": 9.067484662576687, "percentage": 90.67, "elapsed_time": "1:47:32", "remaining_time": "0:11:03"} -{"current_steps": 1479, "total_steps": 1630, "loss": 0.0013, "lr": 1.0651564600137443e-07, "epoch": 9.07361963190184, "percentage": 90.74, "elapsed_time": "1:47:34", "remaining_time": "0:10:58"} -{"current_steps": 1480, "total_steps": 1630, "loss": 0.0004, "lr": 1.051286088685008e-07, "epoch": 9.079754601226995, "percentage": 90.8, "elapsed_time": "1:47:36", "remaining_time": "0:10:54"} -{"current_steps": 1481, "total_steps": 1630, "loss": 0.0047, "lr": 1.0375046797782868e-07, "epoch": 9.085889570552148, "percentage": 90.86, "elapsed_time": "1:47:42", "remaining_time": "0:10:50"} -{"current_steps": 1482, "total_steps": 1630, "loss": 0.0004, "lr": 1.0238122844874576e-07, "epoch": 9.0920245398773, "percentage": 90.92, "elapsed_time": "1:47:45", "remaining_time": "0:10:45"} -{"current_steps": 1483, "total_steps": 1630, "loss": 0.0011, "lr": 1.0102089536757398e-07, "epoch": 9.098159509202453, "percentage": 90.98, "elapsed_time": "1:47:49", "remaining_time": "0:10:41"} -{"current_steps": 1484, "total_steps": 1630, "loss": 0.0011, "lr": 9.966947378754949e-08, "epoch": 9.104294478527608, "percentage": 91.04, "elapsed_time": "1:47:52", "remaining_time": "0:10:36"} -{"current_steps": 1485, "total_steps": 1630, "loss": 0.0007, "lr": 9.83269687288066e-08, "epoch": 9.110429447852761, "percentage": 91.1, "elapsed_time": "1:47:56", "remaining_time": "0:10:32"} -{"current_steps": 1486, "total_steps": 1630, "loss": 0.0005, "lr": 9.699338517835611e-08, "epoch": 9.116564417177914, "percentage": 91.17, "elapsed_time": "1:47:57", "remaining_time": "0:10:27"} -{"current_steps": 1487, "total_steps": 1630, "loss": 0.0004, "lr": 9.566872809006783e-08, "epoch": 9.122699386503067, "percentage": 91.23, "elapsed_time": "1:47:58", "remaining_time": "0:10:23"} -{"current_steps": 1488, "total_steps": 1630, "loss": 0.0007, "lr": 9.435300238465339e-08, "epoch": 9.128834355828221, "percentage": 91.29, "elapsed_time": "1:48:00", "remaining_time": "0:10:18"} -{"current_steps": 1489, "total_steps": 1630, "loss": 0.0003, "lr": 9.30462129496465e-08, "epoch": 9.134969325153374, "percentage": 91.35, "elapsed_time": "1:48:02", "remaining_time": "0:10:13"} -{"current_steps": 1490, "total_steps": 1630, "loss": 0.0011, "lr": 9.174836463938464e-08, "epoch": 9.141104294478527, "percentage": 91.41, "elapsed_time": "1:48:08", "remaining_time": "0:10:09"} -{"current_steps": 1491, "total_steps": 1630, "loss": 0.0012, "lr": 9.045946227499298e-08, "epoch": 9.14723926380368, "percentage": 91.47, "elapsed_time": "1:48:10", "remaining_time": "0:10:05"} -{"current_steps": 1492, "total_steps": 1630, "loss": 0.0015, "lr": 8.917951064436382e-08, "epoch": 9.153374233128835, "percentage": 91.53, "elapsed_time": "1:48:12", "remaining_time": "0:10:00"} -{"current_steps": 1493, "total_steps": 1630, "loss": 0.0009, "lr": 8.790851450214106e-08, "epoch": 9.159509202453988, "percentage": 91.6, "elapsed_time": "1:48:14", "remaining_time": "0:09:55"} -{"current_steps": 1494, "total_steps": 1630, "loss": 0.0007, "lr": 8.664647856970076e-08, "epoch": 9.16564417177914, "percentage": 91.66, "elapsed_time": "1:48:15", "remaining_time": "0:09:51"} -{"current_steps": 1495, "total_steps": 1630, "loss": 0.0046, "lr": 8.539340753513508e-08, "epoch": 9.171779141104295, "percentage": 91.72, "elapsed_time": "1:48:22", "remaining_time": "0:09:47"} -{"current_steps": 1496, "total_steps": 1630, "loss": 0.0442, "lr": 8.414930605323445e-08, "epoch": 9.177914110429448, "percentage": 91.78, "elapsed_time": "1:48:25", "remaining_time": "0:09:42"} -{"current_steps": 1497, "total_steps": 1630, "loss": 0.0019, "lr": 8.291417874546875e-08, "epoch": 9.184049079754601, "percentage": 91.84, "elapsed_time": "1:48:28", "remaining_time": "0:09:38"} -{"current_steps": 1498, "total_steps": 1630, "loss": 0.0009, "lr": 8.168803019997312e-08, "epoch": 9.190184049079754, "percentage": 91.9, "elapsed_time": "1:48:30", "remaining_time": "0:09:33"} -{"current_steps": 1499, "total_steps": 1630, "loss": 0.0004, "lr": 8.047086497152801e-08, "epoch": 9.196319018404909, "percentage": 91.96, "elapsed_time": "1:48:33", "remaining_time": "0:09:29"} -{"current_steps": 1500, "total_steps": 1630, "loss": 0.0014, "lr": 7.926268758154416e-08, "epoch": 9.202453987730062, "percentage": 92.02, "elapsed_time": "1:48:37", "remaining_time": "0:09:24"} -{"current_steps": 1501, "total_steps": 1630, "loss": 0.001, "lr": 7.806350251804484e-08, "epoch": 9.208588957055214, "percentage": 92.09, "elapsed_time": "1:48:38", "remaining_time": "0:09:20"} -{"current_steps": 1502, "total_steps": 1630, "loss": 0.0006, "lr": 7.687331423564925e-08, "epoch": 9.214723926380367, "percentage": 92.15, "elapsed_time": "1:48:40", "remaining_time": "0:09:15"} -{"current_steps": 1503, "total_steps": 1630, "loss": 0.0005, "lr": 7.569212715555663e-08, "epoch": 9.220858895705522, "percentage": 92.21, "elapsed_time": "1:48:44", "remaining_time": "0:09:11"} -{"current_steps": 1504, "total_steps": 1630, "loss": 0.0006, "lr": 7.451994566552989e-08, "epoch": 9.226993865030675, "percentage": 92.27, "elapsed_time": "1:48:46", "remaining_time": "0:09:06"} -{"current_steps": 1505, "total_steps": 1630, "loss": 0.0006, "lr": 7.335677411987734e-08, "epoch": 9.233128834355828, "percentage": 92.33, "elapsed_time": "1:48:47", "remaining_time": "0:09:02"} -{"current_steps": 1506, "total_steps": 1630, "loss": 0.0037, "lr": 7.220261683943935e-08, "epoch": 9.239263803680982, "percentage": 92.39, "elapsed_time": "1:48:52", "remaining_time": "0:08:57"} -{"current_steps": 1507, "total_steps": 1630, "loss": 0.001, "lr": 7.105747811156999e-08, "epoch": 9.245398773006135, "percentage": 92.45, "elapsed_time": "1:48:55", "remaining_time": "0:08:53"} -{"current_steps": 1508, "total_steps": 1630, "loss": 0.0008, "lr": 6.992136219012263e-08, "epoch": 9.251533742331288, "percentage": 92.52, "elapsed_time": "1:48:57", "remaining_time": "0:08:48"} -{"current_steps": 1509, "total_steps": 1630, "loss": 0.001, "lr": 6.879427329543414e-08, "epoch": 9.257668711656441, "percentage": 92.58, "elapsed_time": "1:49:00", "remaining_time": "0:08:44"} -{"current_steps": 1510, "total_steps": 1630, "loss": 0.0014, "lr": 6.76762156143071e-08, "epoch": 9.263803680981596, "percentage": 92.64, "elapsed_time": "1:49:03", "remaining_time": "0:08:39"} -{"current_steps": 1511, "total_steps": 1630, "loss": 0.0003, "lr": 6.6567193299997e-08, "epoch": 9.269938650306749, "percentage": 92.7, "elapsed_time": "1:49:04", "remaining_time": "0:08:35"} -{"current_steps": 1512, "total_steps": 1630, "loss": 0.0003, "lr": 6.546721047219568e-08, "epoch": 9.276073619631902, "percentage": 92.76, "elapsed_time": "1:49:05", "remaining_time": "0:08:30"} -{"current_steps": 1513, "total_steps": 1630, "loss": 0.0007, "lr": 6.437627121701456e-08, "epoch": 9.282208588957054, "percentage": 92.82, "elapsed_time": "1:49:07", "remaining_time": "0:08:26"} -{"current_steps": 1514, "total_steps": 1630, "loss": 0.0005, "lr": 6.329437958697282e-08, "epoch": 9.28834355828221, "percentage": 92.88, "elapsed_time": "1:49:11", "remaining_time": "0:08:21"} -{"current_steps": 1515, "total_steps": 1630, "loss": 0.0004, "lr": 6.222153960097871e-08, "epoch": 9.294478527607362, "percentage": 92.94, "elapsed_time": "1:49:13", "remaining_time": "0:08:17"} -{"current_steps": 1516, "total_steps": 1630, "loss": 0.0004, "lr": 6.115775524431711e-08, "epoch": 9.300613496932515, "percentage": 93.01, "elapsed_time": "1:49:14", "remaining_time": "0:08:12"} -{"current_steps": 1517, "total_steps": 1630, "loss": 0.0008, "lr": 6.010303046863397e-08, "epoch": 9.30674846625767, "percentage": 93.07, "elapsed_time": "1:49:16", "remaining_time": "0:08:08"} -{"current_steps": 1518, "total_steps": 1630, "loss": 0.0044, "lr": 5.905736919192107e-08, "epoch": 9.312883435582823, "percentage": 93.13, "elapsed_time": "1:49:19", "remaining_time": "0:08:03"} -{"current_steps": 1519, "total_steps": 1630, "loss": 0.0016, "lr": 5.8020775298502085e-08, "epoch": 9.319018404907975, "percentage": 93.19, "elapsed_time": "1:49:22", "remaining_time": "0:07:59"} -{"current_steps": 1520, "total_steps": 1630, "loss": 0.0004, "lr": 5.699325263901878e-08, "epoch": 9.325153374233128, "percentage": 93.25, "elapsed_time": "1:49:26", "remaining_time": "0:07:55"} -{"current_steps": 1521, "total_steps": 1630, "loss": 0.0016, "lr": 5.597480503041486e-08, "epoch": 9.331288343558283, "percentage": 93.31, "elapsed_time": "1:49:31", "remaining_time": "0:07:50"} -{"current_steps": 1522, "total_steps": 1630, "loss": 0.0006, "lr": 5.496543625592321e-08, "epoch": 9.337423312883436, "percentage": 93.37, "elapsed_time": "1:49:33", "remaining_time": "0:07:46"} -{"current_steps": 1523, "total_steps": 1630, "loss": 0.001, "lr": 5.396515006505204e-08, "epoch": 9.343558282208589, "percentage": 93.44, "elapsed_time": "1:49:37", "remaining_time": "0:07:42"} -{"current_steps": 1524, "total_steps": 1630, "loss": 0.0004, "lr": 5.297395017357015e-08, "epoch": 9.349693251533742, "percentage": 93.5, "elapsed_time": "1:49:39", "remaining_time": "0:07:37"} -{"current_steps": 1525, "total_steps": 1630, "loss": 0.0005, "lr": 5.199184026349308e-08, "epoch": 9.355828220858896, "percentage": 93.56, "elapsed_time": "1:49:41", "remaining_time": "0:07:33"} -{"current_steps": 1526, "total_steps": 1630, "loss": 0.0014, "lr": 5.1018823983070285e-08, "epoch": 9.36196319018405, "percentage": 93.62, "elapsed_time": "1:49:43", "remaining_time": "0:07:28"} -{"current_steps": 1527, "total_steps": 1630, "loss": 0.0009, "lr": 5.005490494677051e-08, "epoch": 9.368098159509202, "percentage": 93.68, "elapsed_time": "1:49:45", "remaining_time": "0:07:24"} -{"current_steps": 1528, "total_steps": 1630, "loss": 0.0006, "lr": 4.91000867352695e-08, "epoch": 9.374233128834355, "percentage": 93.74, "elapsed_time": "1:49:47", "remaining_time": "0:07:19"} -{"current_steps": 1529, "total_steps": 1630, "loss": 0.0006, "lr": 4.815437289543562e-08, "epoch": 9.38036809815951, "percentage": 93.8, "elapsed_time": "1:49:51", "remaining_time": "0:07:15"} -{"current_steps": 1530, "total_steps": 1630, "loss": 0.0018, "lr": 4.7217766940317326e-08, "epoch": 9.386503067484663, "percentage": 93.87, "elapsed_time": "1:49:57", "remaining_time": "0:07:11"} -{"current_steps": 1531, "total_steps": 1630, "loss": 0.0007, "lr": 4.629027234912986e-08, "epoch": 9.392638036809815, "percentage": 93.93, "elapsed_time": "1:50:01", "remaining_time": "0:07:06"} -{"current_steps": 1532, "total_steps": 1630, "loss": 0.0004, "lr": 4.5371892567243336e-08, "epoch": 9.39877300613497, "percentage": 93.99, "elapsed_time": "1:50:02", "remaining_time": "0:07:02"} -{"current_steps": 1533, "total_steps": 1630, "loss": 0.0169, "lr": 4.4462631006167714e-08, "epoch": 9.404907975460123, "percentage": 94.05, "elapsed_time": "1:50:07", "remaining_time": "0:06:58"} -{"current_steps": 1534, "total_steps": 1630, "loss": 0.0005, "lr": 4.356249104354199e-08, "epoch": 9.411042944785276, "percentage": 94.11, "elapsed_time": "1:50:08", "remaining_time": "0:06:53"} -{"current_steps": 1535, "total_steps": 1630, "loss": 0.0018, "lr": 4.267147602312116e-08, "epoch": 9.417177914110429, "percentage": 94.17, "elapsed_time": "1:50:10", "remaining_time": "0:06:49"} -{"current_steps": 1536, "total_steps": 1630, "loss": 0.0028, "lr": 4.178958925476401e-08, "epoch": 9.423312883435583, "percentage": 94.23, "elapsed_time": "1:50:13", "remaining_time": "0:06:44"} -{"current_steps": 1537, "total_steps": 1630, "loss": 0.0124, "lr": 4.0916834014420036e-08, "epoch": 9.429447852760736, "percentage": 94.29, "elapsed_time": "1:50:16", "remaining_time": "0:06:40"} -{"current_steps": 1538, "total_steps": 1630, "loss": 0.0131, "lr": 4.0053213544118116e-08, "epoch": 9.43558282208589, "percentage": 94.36, "elapsed_time": "1:50:19", "remaining_time": "0:06:35"} -{"current_steps": 1539, "total_steps": 1630, "loss": 0.0014, "lr": 3.919873105195371e-08, "epoch": 9.441717791411042, "percentage": 94.42, "elapsed_time": "1:50:20", "remaining_time": "0:06:31"} -{"current_steps": 1540, "total_steps": 1630, "loss": 0.0012, "lr": 3.8353389712078583e-08, "epoch": 9.447852760736197, "percentage": 94.48, "elapsed_time": "1:50:23", "remaining_time": "0:06:27"} -{"current_steps": 1541, "total_steps": 1630, "loss": 0.0102, "lr": 3.7517192664685844e-08, "epoch": 9.45398773006135, "percentage": 94.54, "elapsed_time": "1:50:26", "remaining_time": "0:06:22"} -{"current_steps": 1542, "total_steps": 1630, "loss": 0.0006, "lr": 3.6690143016002155e-08, "epoch": 9.460122699386503, "percentage": 94.6, "elapsed_time": "1:50:28", "remaining_time": "0:06:18"} -{"current_steps": 1543, "total_steps": 1630, "loss": 0.0035, "lr": 3.587224383827331e-08, "epoch": 9.466257668711656, "percentage": 94.66, "elapsed_time": "1:50:32", "remaining_time": "0:06:13"} -{"current_steps": 1544, "total_steps": 1630, "loss": 0.0015, "lr": 3.506349816975368e-08, "epoch": 9.47239263803681, "percentage": 94.72, "elapsed_time": "1:50:33", "remaining_time": "0:06:09"} -{"current_steps": 1545, "total_steps": 1630, "loss": 0.0004, "lr": 3.426390901469595e-08, "epoch": 9.478527607361963, "percentage": 94.79, "elapsed_time": "1:50:35", "remaining_time": "0:06:05"} -{"current_steps": 1546, "total_steps": 1630, "loss": 0.0015, "lr": 3.347347934333778e-08, "epoch": 9.484662576687116, "percentage": 94.85, "elapsed_time": "1:50:37", "remaining_time": "0:06:00"} -{"current_steps": 1547, "total_steps": 1630, "loss": 0.012, "lr": 3.2692212091893215e-08, "epoch": 9.49079754601227, "percentage": 94.91, "elapsed_time": "1:50:43", "remaining_time": "0:05:56"} -{"current_steps": 1548, "total_steps": 1630, "loss": 0.0003, "lr": 3.192011016253965e-08, "epoch": 9.496932515337424, "percentage": 94.97, "elapsed_time": "1:50:45", "remaining_time": "0:05:52"} -{"current_steps": 1549, "total_steps": 1630, "loss": 0.0004, "lr": 3.115717642340893e-08, "epoch": 9.503067484662576, "percentage": 95.03, "elapsed_time": "1:50:47", "remaining_time": "0:05:47"} -{"current_steps": 1550, "total_steps": 1630, "loss": 0.0008, "lr": 3.040341370857486e-08, "epoch": 9.50920245398773, "percentage": 95.09, "elapsed_time": "1:50:49", "remaining_time": "0:05:43"} -{"current_steps": 1551, "total_steps": 1630, "loss": 0.0005, "lr": 2.9658824818044328e-08, "epoch": 9.515337423312884, "percentage": 95.15, "elapsed_time": "1:50:53", "remaining_time": "0:05:38"} -{"current_steps": 1552, "total_steps": 1630, "loss": 0.0002, "lr": 2.8923412517745662e-08, "epoch": 9.521472392638037, "percentage": 95.21, "elapsed_time": "1:50:55", "remaining_time": "0:05:34"} -{"current_steps": 1553, "total_steps": 1630, "loss": 0.0004, "lr": 2.819717953951917e-08, "epoch": 9.52760736196319, "percentage": 95.28, "elapsed_time": "1:50:57", "remaining_time": "0:05:30"} -{"current_steps": 1554, "total_steps": 1630, "loss": 0.0065, "lr": 2.7480128581106602e-08, "epoch": 9.533742331288344, "percentage": 95.34, "elapsed_time": "1:51:03", "remaining_time": "0:05:25"} -{"current_steps": 1555, "total_steps": 1630, "loss": 0.0018, "lr": 2.6772262306141438e-08, "epoch": 9.539877300613497, "percentage": 95.4, "elapsed_time": "1:51:05", "remaining_time": "0:05:21"} -{"current_steps": 1556, "total_steps": 1630, "loss": 0.0006, "lr": 2.607358334413779e-08, "epoch": 9.54601226993865, "percentage": 95.46, "elapsed_time": "1:51:06", "remaining_time": "0:05:17"} -{"current_steps": 1557, "total_steps": 1630, "loss": 0.0006, "lr": 2.5384094290482886e-08, "epoch": 9.552147239263803, "percentage": 95.52, "elapsed_time": "1:51:08", "remaining_time": "0:05:12"} -{"current_steps": 1558, "total_steps": 1630, "loss": 0.0068, "lr": 2.4703797706425725e-08, "epoch": 9.558282208588958, "percentage": 95.58, "elapsed_time": "1:51:15", "remaining_time": "0:05:08"} -{"current_steps": 1559, "total_steps": 1630, "loss": 0.0014, "lr": 2.4032696119067332e-08, "epoch": 9.56441717791411, "percentage": 95.64, "elapsed_time": "1:51:17", "remaining_time": "0:05:04"} -{"current_steps": 1560, "total_steps": 1630, "loss": 0.0011, "lr": 2.337079202135273e-08, "epoch": 9.570552147239264, "percentage": 95.71, "elapsed_time": "1:51:19", "remaining_time": "0:04:59"} -{"current_steps": 1561, "total_steps": 1630, "loss": 0.0025, "lr": 2.2718087872060925e-08, "epoch": 9.576687116564417, "percentage": 95.77, "elapsed_time": "1:51:21", "remaining_time": "0:04:55"} -{"current_steps": 1562, "total_steps": 1630, "loss": 0.0021, "lr": 2.207458609579549e-08, "epoch": 9.582822085889571, "percentage": 95.83, "elapsed_time": "1:51:23", "remaining_time": "0:04:50"} -{"current_steps": 1563, "total_steps": 1630, "loss": 0.0007, "lr": 2.144028908297624e-08, "epoch": 9.588957055214724, "percentage": 95.89, "elapsed_time": "1:51:25", "remaining_time": "0:04:46"} -{"current_steps": 1564, "total_steps": 1630, "loss": 0.0014, "lr": 2.081519918982977e-08, "epoch": 9.595092024539877, "percentage": 95.95, "elapsed_time": "1:51:27", "remaining_time": "0:04:42"} -{"current_steps": 1565, "total_steps": 1630, "loss": 0.0008, "lr": 2.019931873838088e-08, "epoch": 9.60122699386503, "percentage": 96.01, "elapsed_time": "1:51:29", "remaining_time": "0:04:37"} -{"current_steps": 1566, "total_steps": 1630, "loss": 0.0005, "lr": 1.9592650016444503e-08, "epoch": 9.607361963190185, "percentage": 96.07, "elapsed_time": "1:51:31", "remaining_time": "0:04:33"} -{"current_steps": 1567, "total_steps": 1630, "loss": 0.0004, "lr": 1.8995195277616284e-08, "epoch": 9.613496932515337, "percentage": 96.13, "elapsed_time": "1:51:33", "remaining_time": "0:04:29"} -{"current_steps": 1568, "total_steps": 1630, "loss": 0.0004, "lr": 1.8406956741264247e-08, "epoch": 9.61963190184049, "percentage": 96.2, "elapsed_time": "1:51:36", "remaining_time": "0:04:24"} -{"current_steps": 1569, "total_steps": 1630, "loss": 0.0014, "lr": 1.7827936592521856e-08, "epoch": 9.625766871165645, "percentage": 96.26, "elapsed_time": "1:51:39", "remaining_time": "0:04:20"} -{"current_steps": 1570, "total_steps": 1630, "loss": 0.0009, "lr": 1.7258136982278296e-08, "epoch": 9.631901840490798, "percentage": 96.32, "elapsed_time": "1:51:40", "remaining_time": "0:04:16"} -{"current_steps": 1571, "total_steps": 1630, "loss": 0.0019, "lr": 1.6697560027171543e-08, "epoch": 9.63803680981595, "percentage": 96.38, "elapsed_time": "1:51:43", "remaining_time": "0:04:11"} -{"current_steps": 1572, "total_steps": 1630, "loss": 0.0072, "lr": 1.6146207809579762e-08, "epoch": 9.644171779141104, "percentage": 96.44, "elapsed_time": "1:51:46", "remaining_time": "0:04:07"} -{"current_steps": 1573, "total_steps": 1630, "loss": 0.001, "lr": 1.5604082377614072e-08, "epoch": 9.650306748466258, "percentage": 96.5, "elapsed_time": "1:51:50", "remaining_time": "0:04:03"} -{"current_steps": 1574, "total_steps": 1630, "loss": 0.0017, "lr": 1.507118574511135e-08, "epoch": 9.656441717791411, "percentage": 96.56, "elapsed_time": "1:51:53", "remaining_time": "0:03:58"} -{"current_steps": 1575, "total_steps": 1630, "loss": 0.0106, "lr": 1.454751989162506e-08, "epoch": 9.662576687116564, "percentage": 96.63, "elapsed_time": "1:51:59", "remaining_time": "0:03:54"} -{"current_steps": 1576, "total_steps": 1630, "loss": 0.0004, "lr": 1.4033086762419989e-08, "epoch": 9.668711656441717, "percentage": 96.69, "elapsed_time": "1:52:02", "remaining_time": "0:03:50"} -{"current_steps": 1577, "total_steps": 1630, "loss": 0.002, "lr": 1.3527888268463907e-08, "epoch": 9.674846625766872, "percentage": 96.75, "elapsed_time": "1:52:08", "remaining_time": "0:03:46"} -{"current_steps": 1578, "total_steps": 1630, "loss": 0.0007, "lr": 1.303192628642036e-08, "epoch": 9.680981595092025, "percentage": 96.81, "elapsed_time": "1:52:10", "remaining_time": "0:03:41"} -{"current_steps": 1579, "total_steps": 1630, "loss": 0.0004, "lr": 1.2545202658642008e-08, "epoch": 9.687116564417177, "percentage": 96.87, "elapsed_time": "1:52:14", "remaining_time": "0:03:37"} -{"current_steps": 1580, "total_steps": 1630, "loss": 0.0005, "lr": 1.2067719193163962e-08, "epoch": 9.69325153374233, "percentage": 96.93, "elapsed_time": "1:52:16", "remaining_time": "0:03:33"} -{"current_steps": 1581, "total_steps": 1630, "loss": 0.0062, "lr": 1.1599477663696845e-08, "epoch": 9.699386503067485, "percentage": 96.99, "elapsed_time": "1:52:18", "remaining_time": "0:03:28"} -{"current_steps": 1582, "total_steps": 1630, "loss": 0.0005, "lr": 1.1140479809619576e-08, "epoch": 9.705521472392638, "percentage": 97.06, "elapsed_time": "1:52:20", "remaining_time": "0:03:24"} -{"current_steps": 1583, "total_steps": 1630, "loss": 0.0007, "lr": 1.069072733597465e-08, "epoch": 9.71165644171779, "percentage": 97.12, "elapsed_time": "1:52:24", "remaining_time": "0:03:20"} -{"current_steps": 1584, "total_steps": 1630, "loss": 0.0003, "lr": 1.025022191346009e-08, "epoch": 9.717791411042946, "percentage": 97.18, "elapsed_time": "1:52:25", "remaining_time": "0:03:15"} -{"current_steps": 1585, "total_steps": 1630, "loss": 0.0083, "lr": 9.818965178423345e-09, "epoch": 9.723926380368098, "percentage": 97.24, "elapsed_time": "1:52:28", "remaining_time": "0:03:11"} -{"current_steps": 1586, "total_steps": 1630, "loss": 0.001, "lr": 9.396958732856843e-09, "epoch": 9.730061349693251, "percentage": 97.3, "elapsed_time": "1:52:30", "remaining_time": "0:03:07"} -{"current_steps": 1587, "total_steps": 1630, "loss": 0.0006, "lr": 8.984204144389941e-09, "epoch": 9.736196319018404, "percentage": 97.36, "elapsed_time": "1:52:31", "remaining_time": "0:03:02"} -{"current_steps": 1588, "total_steps": 1630, "loss": 0.0109, "lr": 8.580702946284491e-09, "epoch": 9.742331288343559, "percentage": 97.42, "elapsed_time": "1:52:35", "remaining_time": "0:02:58"} -{"current_steps": 1589, "total_steps": 1630, "loss": 0.0016, "lr": 8.186456637428453e-09, "epoch": 9.748466257668712, "percentage": 97.48, "elapsed_time": "1:52:39", "remaining_time": "0:02:54"} -{"current_steps": 1590, "total_steps": 1630, "loss": 0.0004, "lr": 7.801466682331172e-09, "epoch": 9.754601226993865, "percentage": 97.55, "elapsed_time": "1:52:39", "remaining_time": "0:02:50"} -{"current_steps": 1591, "total_steps": 1630, "loss": 0.008, "lr": 7.425734511117e-09, "epoch": 9.76073619631902, "percentage": 97.61, "elapsed_time": "1:52:43", "remaining_time": "0:02:45"} -{"current_steps": 1592, "total_steps": 1630, "loss": 0.003, "lr": 7.059261519520022e-09, "epoch": 9.766871165644172, "percentage": 97.67, "elapsed_time": "1:52:45", "remaining_time": "0:02:41"} -{"current_steps": 1593, "total_steps": 1630, "loss": 0.0009, "lr": 6.702049068879613e-09, "epoch": 9.773006134969325, "percentage": 97.73, "elapsed_time": "1:52:46", "remaining_time": "0:02:37"} -{"current_steps": 1594, "total_steps": 1630, "loss": 0.0049, "lr": 6.354098486135163e-09, "epoch": 9.779141104294478, "percentage": 97.79, "elapsed_time": "1:52:53", "remaining_time": "0:02:32"} -{"current_steps": 1595, "total_steps": 1630, "loss": 0.0003, "lr": 6.015411063820253e-09, "epoch": 9.785276073619633, "percentage": 97.85, "elapsed_time": "1:52:55", "remaining_time": "0:02:28"} -{"current_steps": 1596, "total_steps": 1630, "loss": 0.001, "lr": 5.685988060059045e-09, "epoch": 9.791411042944786, "percentage": 97.91, "elapsed_time": "1:52:58", "remaining_time": "0:02:24"} -{"current_steps": 1597, "total_steps": 1630, "loss": 0.0012, "lr": 5.36583069856128e-09, "epoch": 9.797546012269938, "percentage": 97.98, "elapsed_time": "1:53:01", "remaining_time": "0:02:20"} -{"current_steps": 1598, "total_steps": 1630, "loss": 0.0005, "lr": 5.054940168617018e-09, "epoch": 9.803680981595091, "percentage": 98.04, "elapsed_time": "1:53:05", "remaining_time": "0:02:15"} -{"current_steps": 1599, "total_steps": 1630, "loss": 0.0002, "lr": 4.753317625093013e-09, "epoch": 9.809815950920246, "percentage": 98.1, "elapsed_time": "1:53:07", "remaining_time": "0:02:11"} -{"current_steps": 1600, "total_steps": 1630, "loss": 0.0007, "lr": 4.4609641884285625e-09, "epoch": 9.815950920245399, "percentage": 98.16, "elapsed_time": "1:53:10", "remaining_time": "0:02:07"} -{"current_steps": 1601, "total_steps": 1630, "loss": 0.0004, "lr": 4.17788094463023e-09, "epoch": 9.822085889570552, "percentage": 98.22, "elapsed_time": "1:53:13", "remaining_time": "0:02:03"} -{"current_steps": 1602, "total_steps": 1630, "loss": 0.0006, "lr": 3.904068945269346e-09, "epoch": 9.828220858895705, "percentage": 98.28, "elapsed_time": "1:53:14", "remaining_time": "0:01:58"} -{"current_steps": 1603, "total_steps": 1630, "loss": 0.0015, "lr": 3.639529207476733e-09, "epoch": 9.83435582822086, "percentage": 98.34, "elapsed_time": "1:53:16", "remaining_time": "0:01:54"} -{"current_steps": 1604, "total_steps": 1630, "loss": 0.0004, "lr": 3.384262713939379e-09, "epoch": 9.840490797546012, "percentage": 98.4, "elapsed_time": "1:53:20", "remaining_time": "0:01:50"} -{"current_steps": 1605, "total_steps": 1630, "loss": 0.0004, "lr": 3.1382704128973818e-09, "epoch": 9.846625766871165, "percentage": 98.47, "elapsed_time": "1:53:23", "remaining_time": "0:01:45"} -{"current_steps": 1606, "total_steps": 1630, "loss": 0.0007, "lr": 2.9015532181397854e-09, "epoch": 9.85276073619632, "percentage": 98.53, "elapsed_time": "1:53:24", "remaining_time": "0:01:41"} -{"current_steps": 1607, "total_steps": 1630, "loss": 0.0021, "lr": 2.674112009000973e-09, "epoch": 9.858895705521473, "percentage": 98.59, "elapsed_time": "1:53:30", "remaining_time": "0:01:37"} -{"current_steps": 1608, "total_steps": 1630, "loss": 0.0004, "lr": 2.4559476303584463e-09, "epoch": 9.865030674846626, "percentage": 98.65, "elapsed_time": "1:53:31", "remaining_time": "0:01:33"} -{"current_steps": 1609, "total_steps": 1630, "loss": 0.0004, "lr": 2.2470608926283833e-09, "epoch": 9.871165644171779, "percentage": 98.71, "elapsed_time": "1:53:34", "remaining_time": "0:01:28"} -{"current_steps": 1610, "total_steps": 1630, "loss": 0.0008, "lr": 2.0474525717639747e-09, "epoch": 9.877300613496933, "percentage": 98.77, "elapsed_time": "1:53:38", "remaining_time": "0:01:24"} -{"current_steps": 1611, "total_steps": 1630, "loss": 0.0035, "lr": 1.857123409250705e-09, "epoch": 9.883435582822086, "percentage": 98.83, "elapsed_time": "1:53:41", "remaining_time": "0:01:20"} -{"current_steps": 1612, "total_steps": 1630, "loss": 0.008, "lr": 1.6760741121057966e-09, "epoch": 9.889570552147239, "percentage": 98.9, "elapsed_time": "1:53:45", "remaining_time": "0:01:16"} -{"current_steps": 1613, "total_steps": 1630, "loss": 0.0009, "lr": 1.504305352874047e-09, "epoch": 9.895705521472392, "percentage": 98.96, "elapsed_time": "1:53:47", "remaining_time": "0:01:11"} -{"current_steps": 1614, "total_steps": 1630, "loss": 0.0003, "lr": 1.3418177696256086e-09, "epoch": 9.901840490797547, "percentage": 99.02, "elapsed_time": "1:53:49", "remaining_time": "0:01:07"} -{"current_steps": 1615, "total_steps": 1630, "loss": 0.0005, "lr": 1.1886119659543227e-09, "epoch": 9.9079754601227, "percentage": 99.08, "elapsed_time": "1:53:50", "remaining_time": "0:01:03"} -{"current_steps": 1616, "total_steps": 1630, "loss": 0.0008, "lr": 1.0446885109746673e-09, "epoch": 9.914110429447852, "percentage": 99.14, "elapsed_time": "1:53:54", "remaining_time": "0:00:59"} -{"current_steps": 1617, "total_steps": 1630, "loss": 0.0033, "lr": 9.100479393195361e-10, "epoch": 9.920245398773005, "percentage": 99.2, "elapsed_time": "1:53:59", "remaining_time": "0:00:54"} -{"current_steps": 1618, "total_steps": 1630, "loss": 0.0009, "lr": 7.846907511394052e-10, "epoch": 9.92638036809816, "percentage": 99.26, "elapsed_time": "1:54:03", "remaining_time": "0:00:50"} -{"current_steps": 1619, "total_steps": 1630, "loss": 0.0012, "lr": 6.686174120990042e-10, "epoch": 9.932515337423313, "percentage": 99.33, "elapsed_time": "1:54:05", "remaining_time": "0:00:46"} -{"current_steps": 1620, "total_steps": 1630, "loss": 0.007, "lr": 5.618283533767588e-10, "epoch": 9.938650306748466, "percentage": 99.39, "elapsed_time": "1:54:12", "remaining_time": "0:00:42"} -{"current_steps": 1621, "total_steps": 1630, "loss": 0.0004, "lr": 4.6432397166285e-10, "epoch": 9.94478527607362, "percentage": 99.45, "elapsed_time": "1:54:13", "remaining_time": "0:00:38"} -{"current_steps": 1622, "total_steps": 1630, "loss": 0.0015, "lr": 3.7610462915699255e-10, "epoch": 9.950920245398773, "percentage": 99.51, "elapsed_time": "1:54:15", "remaining_time": "0:00:33"} -{"current_steps": 1623, "total_steps": 1630, "loss": 0.0007, "lr": 2.9717065356815733e-10, "epoch": 9.957055214723926, "percentage": 99.57, "elapsed_time": "1:54:19", "remaining_time": "0:00:29"} -{"current_steps": 1624, "total_steps": 1630, "loss": 0.0025, "lr": 2.2752233811262901e-10, "epoch": 9.963190184049079, "percentage": 99.63, "elapsed_time": "1:54:25", "remaining_time": "0:00:25"} -{"current_steps": 1625, "total_steps": 1630, "loss": 0.0015, "lr": 1.6715994151400572e-10, "epoch": 9.969325153374234, "percentage": 99.69, "elapsed_time": "1:54:27", "remaining_time": "0:00:21"} -{"current_steps": 1626, "total_steps": 1630, "loss": 0.001, "lr": 1.160836880001459e-10, "epoch": 9.975460122699387, "percentage": 99.75, "elapsed_time": "1:54:32", "remaining_time": "0:00:16"} -{"current_steps": 1627, "total_steps": 1630, "loss": 0.0046, "lr": 7.429376730483385e-11, "epoch": 9.98159509202454, "percentage": 99.82, "elapsed_time": "1:54:35", "remaining_time": "0:00:12"} -{"current_steps": 1628, "total_steps": 1630, "loss": 0.0006, "lr": 4.179033466500393e-11, "epoch": 9.987730061349692, "percentage": 99.88, "elapsed_time": "1:54:37", "remaining_time": "0:00:08"} -{"current_steps": 1629, "total_steps": 1630, "loss": 0.0016, "lr": 1.8573510821295882e-11, "epoch": 9.993865030674847, "percentage": 99.94, "elapsed_time": "1:54:39", "remaining_time": "0:00:04"} -{"current_steps": 1630, "total_steps": 1630, "loss": 0.0036, "lr": 4.643382017499587e-12, "epoch": 10.0, "percentage": 100.0, "elapsed_time": "1:54:41", "remaining_time": "0:00:00"} -{"current_steps": 1630, "total_steps": 1630, "epoch": 10.0, "percentage": 100.0, "elapsed_time": "1:57:50", "remaining_time": "0:00:00"} diff --git a/metallama3_8b/limo_filtered_correct/trainer_state.json b/metallama3_8b/limo_filtered_correct/trainer_state.json deleted file mode 100644 index 2ff03889d212c781cd52ba1ee1d463551d575df3..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_correct/trainer_state.json +++ /dev/null @@ -1,11453 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 10.0, - "eval_steps": 500, - "global_step": 1630, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.006134969325153374, - "grad_norm": 5.908512115478516, - "learning_rate": 5e-06, - "loss": 0.9606, - "step": 1 - }, - { - "epoch": 0.012269938650306749, - "grad_norm": 4.304474353790283, - "learning_rate": 4.999995356617983e-06, - "loss": 0.8609, - "step": 2 - }, - { - "epoch": 0.018404907975460124, - "grad_norm": 5.63697624206543, - "learning_rate": 4.999981426489179e-06, - "loss": 1.3543, - "step": 3 - }, - { - "epoch": 0.024539877300613498, - "grad_norm": 3.6674246788024902, - "learning_rate": 4.999958209665336e-06, - "loss": 0.787, - "step": 4 - }, - { - "epoch": 0.03067484662576687, - "grad_norm": 48.14854431152344, - "learning_rate": 4.999925706232695e-06, - "loss": 1.7786, - "step": 5 - }, - { - "epoch": 0.03680981595092025, - "grad_norm": 7.8689866065979, - "learning_rate": 4.999883916312e-06, - "loss": 1.2175, - "step": 6 - }, - { - "epoch": 0.04294478527607362, - "grad_norm": 5.119968891143799, - "learning_rate": 4.9998328400584864e-06, - "loss": 0.8998, - "step": 7 - }, - { - "epoch": 0.049079754601226995, - "grad_norm": 3.730757713317871, - "learning_rate": 4.999772477661888e-06, - "loss": 0.8419, - "step": 8 - }, - { - "epoch": 0.05521472392638037, - "grad_norm": 27.314565658569336, - "learning_rate": 4.999702829346432e-06, - "loss": 1.7948, - "step": 9 - }, - { - "epoch": 0.06134969325153374, - "grad_norm": 3.822697162628174, - "learning_rate": 4.999623895370843e-06, - "loss": 1.0461, - "step": 10 - }, - { - "epoch": 0.06748466257668712, - "grad_norm": 4.71220588684082, - "learning_rate": 4.999535676028338e-06, - "loss": 1.0, - "step": 11 - }, - { - "epoch": 0.0736196319018405, - "grad_norm": 3.2378087043762207, - "learning_rate": 4.999438171646624e-06, - "loss": 0.9475, - "step": 12 - }, - { - "epoch": 0.07975460122699386, - "grad_norm": 3.475543737411499, - "learning_rate": 4.999331382587901e-06, - "loss": 0.8654, - "step": 13 - }, - { - "epoch": 0.08588957055214724, - "grad_norm": 10.06365966796875, - "learning_rate": 4.999215309248861e-06, - "loss": 1.2042, - "step": 14 - }, - { - "epoch": 0.09202453987730061, - "grad_norm": 3.785153865814209, - "learning_rate": 4.999089952060681e-06, - "loss": 0.8846, - "step": 15 - }, - { - "epoch": 0.09815950920245399, - "grad_norm": 2.944488048553467, - "learning_rate": 4.998955311489025e-06, - "loss": 0.8805, - "step": 16 - }, - { - "epoch": 0.10429447852760736, - "grad_norm": 39.89304733276367, - "learning_rate": 4.998811388034046e-06, - "loss": 1.5882, - "step": 17 - }, - { - "epoch": 0.11042944785276074, - "grad_norm": 3.5883963108062744, - "learning_rate": 4.9986581822303746e-06, - "loss": 0.9222, - "step": 18 - }, - { - "epoch": 0.1165644171779141, - "grad_norm": 6.972247123718262, - "learning_rate": 4.998495694647127e-06, - "loss": 1.4088, - "step": 19 - }, - { - "epoch": 0.12269938650306748, - "grad_norm": 3.948991298675537, - "learning_rate": 4.998323925887895e-06, - "loss": 1.454, - "step": 20 - }, - { - "epoch": 0.12883435582822086, - "grad_norm": 3.8690035343170166, - "learning_rate": 4.998142876590749e-06, - "loss": 0.6335, - "step": 21 - }, - { - "epoch": 0.13496932515337423, - "grad_norm": 5.243765830993652, - "learning_rate": 4.997952547428236e-06, - "loss": 0.6725, - "step": 22 - }, - { - "epoch": 0.1411042944785276, - "grad_norm": 3.5994043350219727, - "learning_rate": 4.997752939107372e-06, - "loss": 0.7814, - "step": 23 - }, - { - "epoch": 0.147239263803681, - "grad_norm": 4.06965970993042, - "learning_rate": 4.997544052369642e-06, - "loss": 0.9683, - "step": 24 - }, - { - "epoch": 0.15337423312883436, - "grad_norm": 3.3247246742248535, - "learning_rate": 4.997325887990999e-06, - "loss": 0.9414, - "step": 25 - }, - { - "epoch": 0.15950920245398773, - "grad_norm": 5.811742782592773, - "learning_rate": 4.997098446781861e-06, - "loss": 0.8894, - "step": 26 - }, - { - "epoch": 0.1656441717791411, - "grad_norm": 2.661334753036499, - "learning_rate": 4.996861729587103e-06, - "loss": 0.7708, - "step": 27 - }, - { - "epoch": 0.17177914110429449, - "grad_norm": 2.863943576812744, - "learning_rate": 4.996615737286061e-06, - "loss": 0.6995, - "step": 28 - }, - { - "epoch": 0.17791411042944785, - "grad_norm": 20.376733779907227, - "learning_rate": 4.996360470792524e-06, - "loss": 1.2563, - "step": 29 - }, - { - "epoch": 0.18404907975460122, - "grad_norm": 3.62265682220459, - "learning_rate": 4.996095931054731e-06, - "loss": 0.7266, - "step": 30 - }, - { - "epoch": 0.1901840490797546, - "grad_norm": 3.915076732635498, - "learning_rate": 4.9958221190553705e-06, - "loss": 0.9227, - "step": 31 - }, - { - "epoch": 0.19631901840490798, - "grad_norm": 3.129855155944824, - "learning_rate": 4.995539035811572e-06, - "loss": 0.701, - "step": 32 - }, - { - "epoch": 0.20245398773006135, - "grad_norm": 2.7532224655151367, - "learning_rate": 4.9952466823749076e-06, - "loss": 0.6491, - "step": 33 - }, - { - "epoch": 0.2085889570552147, - "grad_norm": 2.8444128036499023, - "learning_rate": 4.9949450598313835e-06, - "loss": 0.8029, - "step": 34 - }, - { - "epoch": 0.2147239263803681, - "grad_norm": 2.57743239402771, - "learning_rate": 4.994634169301439e-06, - "loss": 0.8785, - "step": 35 - }, - { - "epoch": 0.22085889570552147, - "grad_norm": 3.280055284500122, - "learning_rate": 4.994314011939941e-06, - "loss": 1.034, - "step": 36 - }, - { - "epoch": 0.22699386503067484, - "grad_norm": 2.455838680267334, - "learning_rate": 4.99398458893618e-06, - "loss": 0.8557, - "step": 37 - }, - { - "epoch": 0.2331288343558282, - "grad_norm": 4.72681188583374, - "learning_rate": 4.993645901513865e-06, - "loss": 1.1904, - "step": 38 - }, - { - "epoch": 0.2392638036809816, - "grad_norm": 3.0585641860961914, - "learning_rate": 4.993297950931121e-06, - "loss": 0.7668, - "step": 39 - }, - { - "epoch": 0.24539877300613497, - "grad_norm": 2.4603540897369385, - "learning_rate": 4.9929407384804806e-06, - "loss": 0.8812, - "step": 40 - }, - { - "epoch": 0.25153374233128833, - "grad_norm": 2.9702436923980713, - "learning_rate": 4.992574265488883e-06, - "loss": 0.8878, - "step": 41 - }, - { - "epoch": 0.25766871165644173, - "grad_norm": 2.6973602771759033, - "learning_rate": 4.9921985333176694e-06, - "loss": 0.7251, - "step": 42 - }, - { - "epoch": 0.26380368098159507, - "grad_norm": 2.5542335510253906, - "learning_rate": 4.991813543362572e-06, - "loss": 0.6638, - "step": 43 - }, - { - "epoch": 0.26993865030674846, - "grad_norm": 3.7530782222747803, - "learning_rate": 4.991419297053716e-06, - "loss": 1.0725, - "step": 44 - }, - { - "epoch": 0.27607361963190186, - "grad_norm": 2.6483025550842285, - "learning_rate": 4.991015795855611e-06, - "loss": 0.7238, - "step": 45 - }, - { - "epoch": 0.2822085889570552, - "grad_norm": 3.434422492980957, - "learning_rate": 4.990603041267144e-06, - "loss": 0.9188, - "step": 46 - }, - { - "epoch": 0.2883435582822086, - "grad_norm": 2.914340019226074, - "learning_rate": 4.990181034821578e-06, - "loss": 0.6158, - "step": 47 - }, - { - "epoch": 0.294478527607362, - "grad_norm": 2.7211625576019287, - "learning_rate": 4.98974977808654e-06, - "loss": 0.7165, - "step": 48 - }, - { - "epoch": 0.3006134969325153, - "grad_norm": 2.8414249420166016, - "learning_rate": 4.989309272664026e-06, - "loss": 0.7277, - "step": 49 - }, - { - "epoch": 0.3067484662576687, - "grad_norm": 3.683204412460327, - "learning_rate": 4.988859520190381e-06, - "loss": 0.9793, - "step": 50 - }, - { - "epoch": 0.3128834355828221, - "grad_norm": 3.1732583045959473, - "learning_rate": 4.988400522336304e-06, - "loss": 0.8966, - "step": 51 - }, - { - "epoch": 0.31901840490797545, - "grad_norm": 2.7789194583892822, - "learning_rate": 4.9879322808068365e-06, - "loss": 0.8191, - "step": 52 - }, - { - "epoch": 0.32515337423312884, - "grad_norm": 2.754816770553589, - "learning_rate": 4.987454797341358e-06, - "loss": 0.6308, - "step": 53 - }, - { - "epoch": 0.3312883435582822, - "grad_norm": 2.730104684829712, - "learning_rate": 4.98696807371358e-06, - "loss": 0.8226, - "step": 54 - }, - { - "epoch": 0.3374233128834356, - "grad_norm": 3.2225449085235596, - "learning_rate": 4.986472111731536e-06, - "loss": 0.9184, - "step": 55 - }, - { - "epoch": 0.34355828220858897, - "grad_norm": 3.2684760093688965, - "learning_rate": 4.985966913237581e-06, - "loss": 0.6593, - "step": 56 - }, - { - "epoch": 0.3496932515337423, - "grad_norm": 2.43105411529541, - "learning_rate": 4.985452480108376e-06, - "loss": 0.6994, - "step": 57 - }, - { - "epoch": 0.3558282208588957, - "grad_norm": 7.366360664367676, - "learning_rate": 4.984928814254889e-06, - "loss": 1.1374, - "step": 58 - }, - { - "epoch": 0.3619631901840491, - "grad_norm": 2.81864333152771, - "learning_rate": 4.984395917622387e-06, - "loss": 0.8097, - "step": 59 - }, - { - "epoch": 0.36809815950920244, - "grad_norm": 3.1107730865478516, - "learning_rate": 4.9838537921904206e-06, - "loss": 0.8511, - "step": 60 - }, - { - "epoch": 0.37423312883435583, - "grad_norm": 2.460545301437378, - "learning_rate": 4.9833024399728295e-06, - "loss": 0.898, - "step": 61 - }, - { - "epoch": 0.3803680981595092, - "grad_norm": 2.921992778778076, - "learning_rate": 4.982741863017722e-06, - "loss": 0.6671, - "step": 62 - }, - { - "epoch": 0.38650306748466257, - "grad_norm": 3.3006443977355957, - "learning_rate": 4.982172063407479e-06, - "loss": 1.0559, - "step": 63 - }, - { - "epoch": 0.39263803680981596, - "grad_norm": 2.642587661743164, - "learning_rate": 4.9815930432587365e-06, - "loss": 0.6663, - "step": 64 - }, - { - "epoch": 0.3987730061349693, - "grad_norm": 2.905898094177246, - "learning_rate": 4.981004804722384e-06, - "loss": 0.6895, - "step": 65 - }, - { - "epoch": 0.4049079754601227, - "grad_norm": 2.9174182415008545, - "learning_rate": 4.980407349983556e-06, - "loss": 0.7982, - "step": 66 - }, - { - "epoch": 0.4110429447852761, - "grad_norm": 2.214322805404663, - "learning_rate": 4.979800681261619e-06, - "loss": 0.6808, - "step": 67 - }, - { - "epoch": 0.4171779141104294, - "grad_norm": 2.7152462005615234, - "learning_rate": 4.9791848008101705e-06, - "loss": 0.567, - "step": 68 - }, - { - "epoch": 0.4233128834355828, - "grad_norm": 2.5657734870910645, - "learning_rate": 4.978559710917024e-06, - "loss": 0.7745, - "step": 69 - }, - { - "epoch": 0.4294478527607362, - "grad_norm": 3.9103832244873047, - "learning_rate": 4.977925413904205e-06, - "loss": 0.9815, - "step": 70 - }, - { - "epoch": 0.43558282208588955, - "grad_norm": 4.610236644744873, - "learning_rate": 4.9772819121279395e-06, - "loss": 1.164, - "step": 71 - }, - { - "epoch": 0.44171779141104295, - "grad_norm": 3.01170015335083, - "learning_rate": 4.976629207978648e-06, - "loss": 0.7587, - "step": 72 - }, - { - "epoch": 0.44785276073619634, - "grad_norm": 3.175889253616333, - "learning_rate": 4.975967303880933e-06, - "loss": 0.58, - "step": 73 - }, - { - "epoch": 0.4539877300613497, - "grad_norm": 2.503741502761841, - "learning_rate": 4.975296202293575e-06, - "loss": 0.7253, - "step": 74 - }, - { - "epoch": 0.4601226993865031, - "grad_norm": 2.6778078079223633, - "learning_rate": 4.974615905709518e-06, - "loss": 0.7352, - "step": 75 - }, - { - "epoch": 0.4662576687116564, - "grad_norm": 5.950812816619873, - "learning_rate": 4.973926416655863e-06, - "loss": 1.0643, - "step": 76 - }, - { - "epoch": 0.4723926380368098, - "grad_norm": 3.0165305137634277, - "learning_rate": 4.973227737693858e-06, - "loss": 0.6699, - "step": 77 - }, - { - "epoch": 0.4785276073619632, - "grad_norm": 4.793259620666504, - "learning_rate": 4.972519871418894e-06, - "loss": 1.0315, - "step": 78 - }, - { - "epoch": 0.48466257668711654, - "grad_norm": 3.632815361022949, - "learning_rate": 4.971802820460481e-06, - "loss": 0.7003, - "step": 79 - }, - { - "epoch": 0.49079754601226994, - "grad_norm": 3.077507734298706, - "learning_rate": 4.971076587482254e-06, - "loss": 0.6776, - "step": 80 - }, - { - "epoch": 0.49693251533742333, - "grad_norm": 3.3886241912841797, - "learning_rate": 4.970341175181957e-06, - "loss": 0.7422, - "step": 81 - }, - { - "epoch": 0.5030674846625767, - "grad_norm": 2.71288800239563, - "learning_rate": 4.969596586291425e-06, - "loss": 0.7471, - "step": 82 - }, - { - "epoch": 0.50920245398773, - "grad_norm": 2.777920961380005, - "learning_rate": 4.968842823576592e-06, - "loss": 0.8111, - "step": 83 - }, - { - "epoch": 0.5153374233128835, - "grad_norm": 6.496985912322998, - "learning_rate": 4.968079889837461e-06, - "loss": 0.9965, - "step": 84 - }, - { - "epoch": 0.5214723926380368, - "grad_norm": 2.6163430213928223, - "learning_rate": 4.967307787908108e-06, - "loss": 0.6833, - "step": 85 - }, - { - "epoch": 0.5276073619631901, - "grad_norm": 3.244098663330078, - "learning_rate": 4.966526520656663e-06, - "loss": 0.8373, - "step": 86 - }, - { - "epoch": 0.5337423312883436, - "grad_norm": 2.9027860164642334, - "learning_rate": 4.965736090985305e-06, - "loss": 0.8529, - "step": 87 - }, - { - "epoch": 0.5398773006134969, - "grad_norm": 2.3786230087280273, - "learning_rate": 4.964936501830246e-06, - "loss": 0.6577, - "step": 88 - }, - { - "epoch": 0.5460122699386503, - "grad_norm": 7.3099045753479, - "learning_rate": 4.964127756161727e-06, - "loss": 1.1184, - "step": 89 - }, - { - "epoch": 0.5521472392638037, - "grad_norm": 3.068873167037964, - "learning_rate": 4.963309856983998e-06, - "loss": 0.7906, - "step": 90 - }, - { - "epoch": 0.558282208588957, - "grad_norm": 3.082547426223755, - "learning_rate": 4.9624828073353144e-06, - "loss": 0.8107, - "step": 91 - }, - { - "epoch": 0.5644171779141104, - "grad_norm": 2.4586973190307617, - "learning_rate": 4.961646610287922e-06, - "loss": 0.7421, - "step": 92 - }, - { - "epoch": 0.5705521472392638, - "grad_norm": 2.779277801513672, - "learning_rate": 4.960801268948047e-06, - "loss": 0.7134, - "step": 93 - }, - { - "epoch": 0.5766871165644172, - "grad_norm": 3.2255213260650635, - "learning_rate": 4.959946786455882e-06, - "loss": 0.5875, - "step": 94 - }, - { - "epoch": 0.5828220858895705, - "grad_norm": 2.783395528793335, - "learning_rate": 4.959083165985581e-06, - "loss": 0.6595, - "step": 95 - }, - { - "epoch": 0.588957055214724, - "grad_norm": 2.240114212036133, - "learning_rate": 4.958210410745237e-06, - "loss": 0.793, - "step": 96 - }, - { - "epoch": 0.5950920245398773, - "grad_norm": 2.9399421215057373, - "learning_rate": 4.957328523976879e-06, - "loss": 0.5896, - "step": 97 - }, - { - "epoch": 0.6012269938650306, - "grad_norm": 3.4449355602264404, - "learning_rate": 4.956437508956458e-06, - "loss": 0.8658, - "step": 98 - }, - { - "epoch": 0.6073619631901841, - "grad_norm": 4.273710250854492, - "learning_rate": 4.9555373689938325e-06, - "loss": 0.8316, - "step": 99 - }, - { - "epoch": 0.6134969325153374, - "grad_norm": 3.4222047328948975, - "learning_rate": 4.954628107432757e-06, - "loss": 1.0613, - "step": 100 - }, - { - "epoch": 0.6196319018404908, - "grad_norm": 2.5318963527679443, - "learning_rate": 4.95370972765087e-06, - "loss": 0.7194, - "step": 101 - }, - { - "epoch": 0.6257668711656442, - "grad_norm": 2.7852585315704346, - "learning_rate": 4.952782233059683e-06, - "loss": 0.5927, - "step": 102 - }, - { - "epoch": 0.6319018404907976, - "grad_norm": 2.6532323360443115, - "learning_rate": 4.951845627104565e-06, - "loss": 0.8505, - "step": 103 - }, - { - "epoch": 0.6380368098159509, - "grad_norm": 2.3213467597961426, - "learning_rate": 4.95089991326473e-06, - "loss": 0.8682, - "step": 104 - }, - { - "epoch": 0.6441717791411042, - "grad_norm": 2.607992649078369, - "learning_rate": 4.9499450950532305e-06, - "loss": 0.8735, - "step": 105 - }, - { - "epoch": 0.6503067484662577, - "grad_norm": 3.9820072650909424, - "learning_rate": 4.94898117601693e-06, - "loss": 1.0571, - "step": 106 - }, - { - "epoch": 0.656441717791411, - "grad_norm": 3.3878824710845947, - "learning_rate": 4.948008159736507e-06, - "loss": 0.7831, - "step": 107 - }, - { - "epoch": 0.6625766871165644, - "grad_norm": 2.6935670375823975, - "learning_rate": 4.94702604982643e-06, - "loss": 0.5968, - "step": 108 - }, - { - "epoch": 0.6687116564417178, - "grad_norm": 2.78190016746521, - "learning_rate": 4.9460348499349485e-06, - "loss": 0.7504, - "step": 109 - }, - { - "epoch": 0.6748466257668712, - "grad_norm": 2.973083972930908, - "learning_rate": 4.945034563744077e-06, - "loss": 0.6728, - "step": 110 - }, - { - "epoch": 0.6809815950920245, - "grad_norm": 2.631803512573242, - "learning_rate": 4.944025194969586e-06, - "loss": 0.609, - "step": 111 - }, - { - "epoch": 0.6871165644171779, - "grad_norm": 2.7443883419036865, - "learning_rate": 4.9430067473609825e-06, - "loss": 0.8713, - "step": 112 - }, - { - "epoch": 0.6932515337423313, - "grad_norm": 2.543769121170044, - "learning_rate": 4.941979224701499e-06, - "loss": 0.8035, - "step": 113 - }, - { - "epoch": 0.6993865030674846, - "grad_norm": 3.7799901962280273, - "learning_rate": 4.94094263080808e-06, - "loss": 0.9341, - "step": 114 - }, - { - "epoch": 0.7055214723926381, - "grad_norm": 3.1234734058380127, - "learning_rate": 4.939896969531367e-06, - "loss": 1.1066, - "step": 115 - }, - { - "epoch": 0.7116564417177914, - "grad_norm": 2.356036424636841, - "learning_rate": 4.938842244755683e-06, - "loss": 0.853, - "step": 116 - }, - { - "epoch": 0.7177914110429447, - "grad_norm": 3.6231274604797363, - "learning_rate": 4.937778460399022e-06, - "loss": 0.9116, - "step": 117 - }, - { - "epoch": 0.7239263803680982, - "grad_norm": 3.1277005672454834, - "learning_rate": 4.936705620413028e-06, - "loss": 0.5888, - "step": 118 - }, - { - "epoch": 0.7300613496932515, - "grad_norm": 2.7338361740112305, - "learning_rate": 4.935623728782986e-06, - "loss": 0.592, - "step": 119 - }, - { - "epoch": 0.7361963190184049, - "grad_norm": 2.748363733291626, - "learning_rate": 4.934532789527805e-06, - "loss": 0.8713, - "step": 120 - }, - { - "epoch": 0.7423312883435583, - "grad_norm": 4.460031986236572, - "learning_rate": 4.933432806700004e-06, - "loss": 0.6791, - "step": 121 - }, - { - "epoch": 0.7484662576687117, - "grad_norm": 2.392911911010742, - "learning_rate": 4.932323784385693e-06, - "loss": 0.7531, - "step": 122 - }, - { - "epoch": 0.754601226993865, - "grad_norm": 2.7804384231567383, - "learning_rate": 4.931205726704566e-06, - "loss": 0.7547, - "step": 123 - }, - { - "epoch": 0.7607361963190185, - "grad_norm": 2.7664780616760254, - "learning_rate": 4.930078637809878e-06, - "loss": 0.7849, - "step": 124 - }, - { - "epoch": 0.7668711656441718, - "grad_norm": 2.592808723449707, - "learning_rate": 4.928942521888431e-06, - "loss": 0.7015, - "step": 125 - }, - { - "epoch": 0.7730061349693251, - "grad_norm": 2.7080585956573486, - "learning_rate": 4.927797383160561e-06, - "loss": 1.0028, - "step": 126 - }, - { - "epoch": 0.7791411042944786, - "grad_norm": 2.7941503524780273, - "learning_rate": 4.926643225880123e-06, - "loss": 0.602, - "step": 127 - }, - { - "epoch": 0.7852760736196319, - "grad_norm": 3.2796623706817627, - "learning_rate": 4.925480054334471e-06, - "loss": 0.7473, - "step": 128 - }, - { - "epoch": 0.7914110429447853, - "grad_norm": 2.7623610496520996, - "learning_rate": 4.924307872844444e-06, - "loss": 1.0573, - "step": 129 - }, - { - "epoch": 0.7975460122699386, - "grad_norm": 2.6224453449249268, - "learning_rate": 4.923126685764351e-06, - "loss": 0.7399, - "step": 130 - }, - { - "epoch": 0.803680981595092, - "grad_norm": 17.736326217651367, - "learning_rate": 4.921936497481956e-06, - "loss": 0.9548, - "step": 131 - }, - { - "epoch": 0.8098159509202454, - "grad_norm": 2.504213333129883, - "learning_rate": 4.920737312418456e-06, - "loss": 0.6748, - "step": 132 - }, - { - "epoch": 0.8159509202453987, - "grad_norm": 3.617077350616455, - "learning_rate": 4.919529135028473e-06, - "loss": 0.8431, - "step": 133 - }, - { - "epoch": 0.8220858895705522, - "grad_norm": 2.6559832096099854, - "learning_rate": 4.918311969800027e-06, - "loss": 0.7243, - "step": 134 - }, - { - "epoch": 0.8282208588957055, - "grad_norm": 2.7539305686950684, - "learning_rate": 4.917085821254532e-06, - "loss": 0.7845, - "step": 135 - }, - { - "epoch": 0.8343558282208589, - "grad_norm": 3.3587615489959717, - "learning_rate": 4.915850693946766e-06, - "loss": 0.4891, - "step": 136 - }, - { - "epoch": 0.8404907975460123, - "grad_norm": 3.064354181289673, - "learning_rate": 4.914606592464865e-06, - "loss": 0.7917, - "step": 137 - }, - { - "epoch": 0.8466257668711656, - "grad_norm": 3.2505199909210205, - "learning_rate": 4.9133535214303e-06, - "loss": 0.9681, - "step": 138 - }, - { - "epoch": 0.852760736196319, - "grad_norm": 3.8027830123901367, - "learning_rate": 4.91209148549786e-06, - "loss": 0.9275, - "step": 139 - }, - { - "epoch": 0.8588957055214724, - "grad_norm": 2.4154372215270996, - "learning_rate": 4.910820489355637e-06, - "loss": 0.7259, - "step": 140 - }, - { - "epoch": 0.8650306748466258, - "grad_norm": 2.892462968826294, - "learning_rate": 4.909540537725007e-06, - "loss": 0.6061, - "step": 141 - }, - { - "epoch": 0.8711656441717791, - "grad_norm": 3.3398196697235107, - "learning_rate": 4.908251635360616e-06, - "loss": 1.0559, - "step": 142 - }, - { - "epoch": 0.8773006134969326, - "grad_norm": 3.022512197494507, - "learning_rate": 4.906953787050354e-06, - "loss": 0.7372, - "step": 143 - }, - { - "epoch": 0.8834355828220859, - "grad_norm": 2.658661365509033, - "learning_rate": 4.905646997615347e-06, - "loss": 0.6234, - "step": 144 - }, - { - "epoch": 0.8895705521472392, - "grad_norm": 3.454400062561035, - "learning_rate": 4.904331271909932e-06, - "loss": 0.8066, - "step": 145 - }, - { - "epoch": 0.8957055214723927, - "grad_norm": 3.1300277709960938, - "learning_rate": 4.903006614821645e-06, - "loss": 0.6861, - "step": 146 - }, - { - "epoch": 0.901840490797546, - "grad_norm": 2.362537145614624, - "learning_rate": 4.901673031271194e-06, - "loss": 0.6112, - "step": 147 - }, - { - "epoch": 0.9079754601226994, - "grad_norm": 3.375577688217163, - "learning_rate": 4.900330526212451e-06, - "loss": 0.6314, - "step": 148 - }, - { - "epoch": 0.9141104294478528, - "grad_norm": 2.955656051635742, - "learning_rate": 4.898979104632427e-06, - "loss": 0.889, - "step": 149 - }, - { - "epoch": 0.9202453987730062, - "grad_norm": 2.9285926818847656, - "learning_rate": 4.897618771551255e-06, - "loss": 0.6406, - "step": 150 - }, - { - "epoch": 0.9263803680981595, - "grad_norm": 2.131819725036621, - "learning_rate": 4.8962495320221714e-06, - "loss": 0.6368, - "step": 151 - }, - { - "epoch": 0.9325153374233128, - "grad_norm": 2.780649185180664, - "learning_rate": 4.8948713911315e-06, - "loss": 0.8642, - "step": 152 - }, - { - "epoch": 0.9386503067484663, - "grad_norm": 2.941500186920166, - "learning_rate": 4.8934843539986266e-06, - "loss": 0.714, - "step": 153 - }, - { - "epoch": 0.9447852760736196, - "grad_norm": 2.7729203701019287, - "learning_rate": 4.892088425775986e-06, - "loss": 0.8365, - "step": 154 - }, - { - "epoch": 0.950920245398773, - "grad_norm": 2.6887171268463135, - "learning_rate": 4.890683611649041e-06, - "loss": 0.7937, - "step": 155 - }, - { - "epoch": 0.9570552147239264, - "grad_norm": 3.7638463973999023, - "learning_rate": 4.8892699168362626e-06, - "loss": 0.7485, - "step": 156 - }, - { - "epoch": 0.9631901840490797, - "grad_norm": 2.8132755756378174, - "learning_rate": 4.887847346589111e-06, - "loss": 0.6467, - "step": 157 - }, - { - "epoch": 0.9693251533742331, - "grad_norm": 2.652247190475464, - "learning_rate": 4.886415906192015e-06, - "loss": 0.4651, - "step": 158 - }, - { - "epoch": 0.9754601226993865, - "grad_norm": 2.5854647159576416, - "learning_rate": 4.884975600962355e-06, - "loss": 0.8756, - "step": 159 - }, - { - "epoch": 0.9815950920245399, - "grad_norm": 3.1630544662475586, - "learning_rate": 4.883526436250441e-06, - "loss": 0.7339, - "step": 160 - }, - { - "epoch": 0.9877300613496932, - "grad_norm": 2.84452748298645, - "learning_rate": 4.8820684174394935e-06, - "loss": 0.7808, - "step": 161 - }, - { - "epoch": 0.9938650306748467, - "grad_norm": 3.604048490524292, - "learning_rate": 4.880601549945622e-06, - "loss": 0.96, - "step": 162 - }, - { - "epoch": 1.0, - "grad_norm": 2.302924871444702, - "learning_rate": 4.879125839217808e-06, - "loss": 0.8122, - "step": 163 - }, - { - "epoch": 1.0061349693251533, - "grad_norm": 3.1254405975341797, - "learning_rate": 4.8776412907378845e-06, - "loss": 0.7307, - "step": 164 - }, - { - "epoch": 1.0122699386503067, - "grad_norm": 2.745603322982788, - "learning_rate": 4.8761479100205085e-06, - "loss": 0.7554, - "step": 165 - }, - { - "epoch": 1.01840490797546, - "grad_norm": 2.494840145111084, - "learning_rate": 4.874645702613152e-06, - "loss": 0.4372, - "step": 166 - }, - { - "epoch": 1.0245398773006136, - "grad_norm": 2.3526735305786133, - "learning_rate": 4.873134674096072e-06, - "loss": 0.3597, - "step": 167 - }, - { - "epoch": 1.030674846625767, - "grad_norm": 2.945887804031372, - "learning_rate": 4.871614830082297e-06, - "loss": 0.5854, - "step": 168 - }, - { - "epoch": 1.0368098159509203, - "grad_norm": 3.5723934173583984, - "learning_rate": 4.870086176217597e-06, - "loss": 0.7978, - "step": 169 - }, - { - "epoch": 1.0429447852760736, - "grad_norm": 3.2997145652770996, - "learning_rate": 4.868548718180473e-06, - "loss": 0.5593, - "step": 170 - }, - { - "epoch": 1.049079754601227, - "grad_norm": 3.4120635986328125, - "learning_rate": 4.867002461682129e-06, - "loss": 0.4083, - "step": 171 - }, - { - "epoch": 1.0552147239263803, - "grad_norm": 2.697617292404175, - "learning_rate": 4.8654474124664505e-06, - "loss": 0.4752, - "step": 172 - }, - { - "epoch": 1.0613496932515338, - "grad_norm": 5.082247734069824, - "learning_rate": 4.863883576309991e-06, - "loss": 0.7435, - "step": 173 - }, - { - "epoch": 1.0674846625766872, - "grad_norm": 2.773864984512329, - "learning_rate": 4.8623109590219395e-06, - "loss": 0.4612, - "step": 174 - }, - { - "epoch": 1.0736196319018405, - "grad_norm": 3.429703712463379, - "learning_rate": 4.860729566444106e-06, - "loss": 0.4644, - "step": 175 - }, - { - "epoch": 1.0797546012269938, - "grad_norm": 2.997938394546509, - "learning_rate": 4.8591394044508985e-06, - "loss": 0.4852, - "step": 176 - }, - { - "epoch": 1.0858895705521472, - "grad_norm": 2.549513339996338, - "learning_rate": 4.857540478949302e-06, - "loss": 0.4574, - "step": 177 - }, - { - "epoch": 1.0920245398773005, - "grad_norm": 3.459400177001953, - "learning_rate": 4.855932795878852e-06, - "loss": 0.8095, - "step": 178 - }, - { - "epoch": 1.098159509202454, - "grad_norm": 2.8103644847869873, - "learning_rate": 4.854316361211619e-06, - "loss": 0.4578, - "step": 179 - }, - { - "epoch": 1.1042944785276074, - "grad_norm": 2.631221055984497, - "learning_rate": 4.852691180952183e-06, - "loss": 0.5473, - "step": 180 - }, - { - "epoch": 1.1104294478527608, - "grad_norm": 3.189946174621582, - "learning_rate": 4.851057261137608e-06, - "loss": 0.4313, - "step": 181 - }, - { - "epoch": 1.116564417177914, - "grad_norm": 2.891418933868408, - "learning_rate": 4.8494146078374274e-06, - "loss": 0.4197, - "step": 182 - }, - { - "epoch": 1.1226993865030674, - "grad_norm": 3.239637613296509, - "learning_rate": 4.847763227153612e-06, - "loss": 0.5865, - "step": 183 - }, - { - "epoch": 1.1288343558282208, - "grad_norm": 2.484644651412964, - "learning_rate": 4.846103125220557e-06, - "loss": 0.3866, - "step": 184 - }, - { - "epoch": 1.1349693251533743, - "grad_norm": 3.1045992374420166, - "learning_rate": 4.844434308205052e-06, - "loss": 0.5357, - "step": 185 - }, - { - "epoch": 1.1411042944785277, - "grad_norm": 2.648472309112549, - "learning_rate": 4.842756782306261e-06, - "loss": 0.4783, - "step": 186 - }, - { - "epoch": 1.147239263803681, - "grad_norm": 2.5685644149780273, - "learning_rate": 4.841070553755697e-06, - "loss": 0.3733, - "step": 187 - }, - { - "epoch": 1.1533742331288344, - "grad_norm": 3.7727200984954834, - "learning_rate": 4.839375628817205e-06, - "loss": 0.6039, - "step": 188 - }, - { - "epoch": 1.1595092024539877, - "grad_norm": 2.8237369060516357, - "learning_rate": 4.837672013786931e-06, - "loss": 0.5372, - "step": 189 - }, - { - "epoch": 1.165644171779141, - "grad_norm": 3.0312252044677734, - "learning_rate": 4.835959714993305e-06, - "loss": 0.5162, - "step": 190 - }, - { - "epoch": 1.1717791411042944, - "grad_norm": 2.821498394012451, - "learning_rate": 4.8342387387970105e-06, - "loss": 0.4537, - "step": 191 - }, - { - "epoch": 1.177914110429448, - "grad_norm": 2.7834129333496094, - "learning_rate": 4.832509091590968e-06, - "loss": 0.6165, - "step": 192 - }, - { - "epoch": 1.1840490797546013, - "grad_norm": 2.9274091720581055, - "learning_rate": 4.830770779800309e-06, - "loss": 0.7475, - "step": 193 - }, - { - "epoch": 1.1901840490797546, - "grad_norm": 2.813945770263672, - "learning_rate": 4.829023809882349e-06, - "loss": 0.4629, - "step": 194 - }, - { - "epoch": 1.196319018404908, - "grad_norm": 2.27876877784729, - "learning_rate": 4.827268188326567e-06, - "loss": 0.5208, - "step": 195 - }, - { - "epoch": 1.2024539877300613, - "grad_norm": 2.8444204330444336, - "learning_rate": 4.825503921654582e-06, - "loss": 0.6521, - "step": 196 - }, - { - "epoch": 1.2085889570552146, - "grad_norm": 3.3730578422546387, - "learning_rate": 4.823731016420122e-06, - "loss": 0.7491, - "step": 197 - }, - { - "epoch": 1.2147239263803682, - "grad_norm": 2.9717822074890137, - "learning_rate": 4.821949479209011e-06, - "loss": 0.3866, - "step": 198 - }, - { - "epoch": 1.2208588957055215, - "grad_norm": 2.6570653915405273, - "learning_rate": 4.820159316639133e-06, - "loss": 0.499, - "step": 199 - }, - { - "epoch": 1.2269938650306749, - "grad_norm": 2.819960117340088, - "learning_rate": 4.818360535360418e-06, - "loss": 0.556, - "step": 200 - }, - { - "epoch": 1.2331288343558282, - "grad_norm": 2.7912111282348633, - "learning_rate": 4.816553142054806e-06, - "loss": 0.3433, - "step": 201 - }, - { - "epoch": 1.2392638036809815, - "grad_norm": 2.6427981853485107, - "learning_rate": 4.814737143436232e-06, - "loss": 0.8808, - "step": 202 - }, - { - "epoch": 1.2453987730061349, - "grad_norm": 2.5917580127716064, - "learning_rate": 4.812912546250595e-06, - "loss": 0.5718, - "step": 203 - }, - { - "epoch": 1.2515337423312882, - "grad_norm": 3.770759344100952, - "learning_rate": 4.81107935727574e-06, - "loss": 0.9743, - "step": 204 - }, - { - "epoch": 1.2576687116564418, - "grad_norm": 2.558248996734619, - "learning_rate": 4.809237583321421e-06, - "loss": 0.2821, - "step": 205 - }, - { - "epoch": 1.2638036809815951, - "grad_norm": 2.692087173461914, - "learning_rate": 4.807387231229287e-06, - "loss": 0.7524, - "step": 206 - }, - { - "epoch": 1.2699386503067485, - "grad_norm": 2.661738157272339, - "learning_rate": 4.8055283078728525e-06, - "loss": 0.4304, - "step": 207 - }, - { - "epoch": 1.2760736196319018, - "grad_norm": 2.9232122898101807, - "learning_rate": 4.803660820157468e-06, - "loss": 0.6986, - "step": 208 - }, - { - "epoch": 1.2822085889570551, - "grad_norm": 2.665097951889038, - "learning_rate": 4.801784775020303e-06, - "loss": 0.7112, - "step": 209 - }, - { - "epoch": 1.2883435582822087, - "grad_norm": 2.4504497051239014, - "learning_rate": 4.799900179430312e-06, - "loss": 0.4125, - "step": 210 - }, - { - "epoch": 1.294478527607362, - "grad_norm": 3.076204538345337, - "learning_rate": 4.798007040388212e-06, - "loss": 0.7057, - "step": 211 - }, - { - "epoch": 1.3006134969325154, - "grad_norm": 2.406977653503418, - "learning_rate": 4.7961053649264585e-06, - "loss": 0.708, - "step": 212 - }, - { - "epoch": 1.3067484662576687, - "grad_norm": 2.6545324325561523, - "learning_rate": 4.794195160109215e-06, - "loss": 0.7608, - "step": 213 - }, - { - "epoch": 1.312883435582822, - "grad_norm": 4.3817033767700195, - "learning_rate": 4.7922764330323315e-06, - "loss": 0.4779, - "step": 214 - }, - { - "epoch": 1.3190184049079754, - "grad_norm": 3.534566879272461, - "learning_rate": 4.790349190823313e-06, - "loss": 0.5464, - "step": 215 - }, - { - "epoch": 1.3251533742331287, - "grad_norm": 3.0323140621185303, - "learning_rate": 4.788413440641297e-06, - "loss": 0.6198, - "step": 216 - }, - { - "epoch": 1.331288343558282, - "grad_norm": 2.612746238708496, - "learning_rate": 4.786469189677026e-06, - "loss": 0.6695, - "step": 217 - }, - { - "epoch": 1.3374233128834356, - "grad_norm": 3.0299434661865234, - "learning_rate": 4.784516445152821e-06, - "loss": 0.4902, - "step": 218 - }, - { - "epoch": 1.343558282208589, - "grad_norm": 3.4521942138671875, - "learning_rate": 4.78255521432255e-06, - "loss": 0.7411, - "step": 219 - }, - { - "epoch": 1.3496932515337423, - "grad_norm": 2.6712653636932373, - "learning_rate": 4.780585504471612e-06, - "loss": 0.8767, - "step": 220 - }, - { - "epoch": 1.3558282208588956, - "grad_norm": 2.5099475383758545, - "learning_rate": 4.778607322916896e-06, - "loss": 0.4266, - "step": 221 - }, - { - "epoch": 1.3619631901840492, - "grad_norm": 2.641799211502075, - "learning_rate": 4.776620677006766e-06, - "loss": 0.4982, - "step": 222 - }, - { - "epoch": 1.3680981595092025, - "grad_norm": 3.1119771003723145, - "learning_rate": 4.7746255741210256e-06, - "loss": 0.6012, - "step": 223 - }, - { - "epoch": 1.3742331288343559, - "grad_norm": 3.9957170486450195, - "learning_rate": 4.772622021670897e-06, - "loss": 0.7585, - "step": 224 - }, - { - "epoch": 1.3803680981595092, - "grad_norm": 3.1070823669433594, - "learning_rate": 4.770610027098983e-06, - "loss": 0.5266, - "step": 225 - }, - { - "epoch": 1.3865030674846626, - "grad_norm": 2.7630460262298584, - "learning_rate": 4.7685895978792564e-06, - "loss": 0.6261, - "step": 226 - }, - { - "epoch": 1.392638036809816, - "grad_norm": 2.6509556770324707, - "learning_rate": 4.766560741517014e-06, - "loss": 0.7081, - "step": 227 - }, - { - "epoch": 1.3987730061349692, - "grad_norm": 3.0212976932525635, - "learning_rate": 4.76452346554886e-06, - "loss": 0.5041, - "step": 228 - }, - { - "epoch": 1.4049079754601226, - "grad_norm": 3.0454728603363037, - "learning_rate": 4.762477777542676e-06, - "loss": 0.49, - "step": 229 - }, - { - "epoch": 1.4110429447852761, - "grad_norm": 3.4296791553497314, - "learning_rate": 4.7604236850975905e-06, - "loss": 0.7056, - "step": 230 - }, - { - "epoch": 1.4171779141104295, - "grad_norm": 4.1885600090026855, - "learning_rate": 4.7583611958439514e-06, - "loss": 0.7762, - "step": 231 - }, - { - "epoch": 1.4233128834355828, - "grad_norm": 3.065854072570801, - "learning_rate": 4.7562903174433e-06, - "loss": 0.5347, - "step": 232 - }, - { - "epoch": 1.4294478527607362, - "grad_norm": 2.793851852416992, - "learning_rate": 4.75421105758834e-06, - "loss": 0.503, - "step": 233 - }, - { - "epoch": 1.4355828220858895, - "grad_norm": 3.123730421066284, - "learning_rate": 4.752123424002908e-06, - "loss": 0.5081, - "step": 234 - }, - { - "epoch": 1.441717791411043, - "grad_norm": 3.230161666870117, - "learning_rate": 4.750027424441949e-06, - "loss": 0.7523, - "step": 235 - }, - { - "epoch": 1.4478527607361964, - "grad_norm": 2.4970247745513916, - "learning_rate": 4.747923066691487e-06, - "loss": 0.5575, - "step": 236 - }, - { - "epoch": 1.4539877300613497, - "grad_norm": 2.9880685806274414, - "learning_rate": 4.745810358568588e-06, - "loss": 0.7264, - "step": 237 - }, - { - "epoch": 1.460122699386503, - "grad_norm": 2.555328369140625, - "learning_rate": 4.743689307921342e-06, - "loss": 0.4545, - "step": 238 - }, - { - "epoch": 1.4662576687116564, - "grad_norm": 3.144932746887207, - "learning_rate": 4.741559922628828e-06, - "loss": 0.5429, - "step": 239 - }, - { - "epoch": 1.4723926380368098, - "grad_norm": 3.059807062149048, - "learning_rate": 4.739422210601085e-06, - "loss": 0.5086, - "step": 240 - }, - { - "epoch": 1.478527607361963, - "grad_norm": 3.374303102493286, - "learning_rate": 4.7372761797790836e-06, - "loss": 0.6109, - "step": 241 - }, - { - "epoch": 1.4846625766871164, - "grad_norm": 2.4506947994232178, - "learning_rate": 4.735121838134697e-06, - "loss": 0.4317, - "step": 242 - }, - { - "epoch": 1.49079754601227, - "grad_norm": 2.9039974212646484, - "learning_rate": 4.732959193670672e-06, - "loss": 0.6414, - "step": 243 - }, - { - "epoch": 1.4969325153374233, - "grad_norm": 2.9412453174591064, - "learning_rate": 4.730788254420593e-06, - "loss": 0.5166, - "step": 244 - }, - { - "epoch": 1.5030674846625767, - "grad_norm": 2.500716209411621, - "learning_rate": 4.728609028448862e-06, - "loss": 0.4982, - "step": 245 - }, - { - "epoch": 1.50920245398773, - "grad_norm": 2.4233803749084473, - "learning_rate": 4.726421523850662e-06, - "loss": 0.7552, - "step": 246 - }, - { - "epoch": 1.5153374233128836, - "grad_norm": 2.357003688812256, - "learning_rate": 4.7242257487519275e-06, - "loss": 0.4365, - "step": 247 - }, - { - "epoch": 1.521472392638037, - "grad_norm": 2.6406495571136475, - "learning_rate": 4.722021711309317e-06, - "loss": 0.6002, - "step": 248 - }, - { - "epoch": 1.5276073619631902, - "grad_norm": 2.736884832382202, - "learning_rate": 4.7198094197101826e-06, - "loss": 0.4993, - "step": 249 - }, - { - "epoch": 1.5337423312883436, - "grad_norm": 3.5238845348358154, - "learning_rate": 4.7175888821725335e-06, - "loss": 0.4637, - "step": 250 - }, - { - "epoch": 1.539877300613497, - "grad_norm": 3.3783695697784424, - "learning_rate": 4.715360106945015e-06, - "loss": 0.9711, - "step": 251 - }, - { - "epoch": 1.5460122699386503, - "grad_norm": 2.9685862064361572, - "learning_rate": 4.713123102306869e-06, - "loss": 0.5452, - "step": 252 - }, - { - "epoch": 1.5521472392638036, - "grad_norm": 3.143733263015747, - "learning_rate": 4.710877876567912e-06, - "loss": 0.5034, - "step": 253 - }, - { - "epoch": 1.558282208588957, - "grad_norm": 2.8005623817443848, - "learning_rate": 4.708624438068494e-06, - "loss": 0.4236, - "step": 254 - }, - { - "epoch": 1.5644171779141103, - "grad_norm": 2.66581130027771, - "learning_rate": 4.706362795179476e-06, - "loss": 0.6095, - "step": 255 - }, - { - "epoch": 1.5705521472392638, - "grad_norm": 4.598043441772461, - "learning_rate": 4.7040929563021975e-06, - "loss": 0.738, - "step": 256 - }, - { - "epoch": 1.5766871165644172, - "grad_norm": 3.5643506050109863, - "learning_rate": 4.70181492986844e-06, - "loss": 0.6726, - "step": 257 - }, - { - "epoch": 1.5828220858895705, - "grad_norm": 2.865339994430542, - "learning_rate": 4.699528724340401e-06, - "loss": 0.4862, - "step": 258 - }, - { - "epoch": 1.588957055214724, - "grad_norm": 2.95529842376709, - "learning_rate": 4.6972343482106615e-06, - "loss": 0.5003, - "step": 259 - }, - { - "epoch": 1.5950920245398774, - "grad_norm": 2.45206356048584, - "learning_rate": 4.6949318100021546e-06, - "loss": 0.6734, - "step": 260 - }, - { - "epoch": 1.6012269938650308, - "grad_norm": 2.6789939403533936, - "learning_rate": 4.6926211182681295e-06, - "loss": 0.5639, - "step": 261 - }, - { - "epoch": 1.607361963190184, - "grad_norm": 3.307732582092285, - "learning_rate": 4.690302281592128e-06, - "loss": 0.7032, - "step": 262 - }, - { - "epoch": 1.6134969325153374, - "grad_norm": 2.8950445652008057, - "learning_rate": 4.687975308587944e-06, - "loss": 0.4937, - "step": 263 - }, - { - "epoch": 1.6196319018404908, - "grad_norm": 2.969377040863037, - "learning_rate": 4.685640207899598e-06, - "loss": 0.5829, - "step": 264 - }, - { - "epoch": 1.6257668711656441, - "grad_norm": 3.106433391571045, - "learning_rate": 4.683296988201301e-06, - "loss": 0.3805, - "step": 265 - }, - { - "epoch": 1.6319018404907975, - "grad_norm": 3.5599050521850586, - "learning_rate": 4.680945658197425e-06, - "loss": 0.7939, - "step": 266 - }, - { - "epoch": 1.6380368098159508, - "grad_norm": 5.008603096008301, - "learning_rate": 4.6785862266224695e-06, - "loss": 0.7511, - "step": 267 - }, - { - "epoch": 1.6441717791411041, - "grad_norm": 3.1393773555755615, - "learning_rate": 4.676218702241026e-06, - "loss": 0.8984, - "step": 268 - }, - { - "epoch": 1.6503067484662577, - "grad_norm": 3.0241408348083496, - "learning_rate": 4.673843093847753e-06, - "loss": 0.5473, - "step": 269 - }, - { - "epoch": 1.656441717791411, - "grad_norm": 2.9029417037963867, - "learning_rate": 4.6714594102673355e-06, - "loss": 0.6626, - "step": 270 - }, - { - "epoch": 1.6625766871165644, - "grad_norm": 3.4709246158599854, - "learning_rate": 4.669067660354456e-06, - "loss": 0.5015, - "step": 271 - }, - { - "epoch": 1.668711656441718, - "grad_norm": 2.988635778427124, - "learning_rate": 4.666667852993761e-06, - "loss": 0.5384, - "step": 272 - }, - { - "epoch": 1.6748466257668713, - "grad_norm": 3.418140411376953, - "learning_rate": 4.664259997099829e-06, - "loss": 0.7491, - "step": 273 - }, - { - "epoch": 1.6809815950920246, - "grad_norm": 2.592416763305664, - "learning_rate": 4.661844101617135e-06, - "loss": 0.6451, - "step": 274 - }, - { - "epoch": 1.687116564417178, - "grad_norm": 3.1174306869506836, - "learning_rate": 4.6594201755200205e-06, - "loss": 0.6299, - "step": 275 - }, - { - "epoch": 1.6932515337423313, - "grad_norm": 2.6569998264312744, - "learning_rate": 4.656988227812658e-06, - "loss": 0.4477, - "step": 276 - }, - { - "epoch": 1.6993865030674846, - "grad_norm": 3.5733959674835205, - "learning_rate": 4.654548267529015e-06, - "loss": 0.5473, - "step": 277 - }, - { - "epoch": 1.705521472392638, - "grad_norm": 2.7240824699401855, - "learning_rate": 4.652100303732827e-06, - "loss": 0.496, - "step": 278 - }, - { - "epoch": 1.7116564417177913, - "grad_norm": 4.1965460777282715, - "learning_rate": 4.64964434551756e-06, - "loss": 0.932, - "step": 279 - }, - { - "epoch": 1.7177914110429446, - "grad_norm": 2.3237173557281494, - "learning_rate": 4.647180402006372e-06, - "loss": 0.4648, - "step": 280 - }, - { - "epoch": 1.7239263803680982, - "grad_norm": 3.395045042037964, - "learning_rate": 4.644708482352093e-06, - "loss": 0.7237, - "step": 281 - }, - { - "epoch": 1.7300613496932515, - "grad_norm": 3.238593816757202, - "learning_rate": 4.6422285957371735e-06, - "loss": 0.5531, - "step": 282 - }, - { - "epoch": 1.7361963190184049, - "grad_norm": 3.9651403427124023, - "learning_rate": 4.639740751373663e-06, - "loss": 0.6706, - "step": 283 - }, - { - "epoch": 1.7423312883435584, - "grad_norm": 3.0042061805725098, - "learning_rate": 4.63724495850317e-06, - "loss": 0.56, - "step": 284 - }, - { - "epoch": 1.7484662576687118, - "grad_norm": 3.094310760498047, - "learning_rate": 4.634741226396832e-06, - "loss": 0.6138, - "step": 285 - }, - { - "epoch": 1.7546012269938651, - "grad_norm": 2.838168144226074, - "learning_rate": 4.632229564355275e-06, - "loss": 0.4908, - "step": 286 - }, - { - "epoch": 1.7607361963190185, - "grad_norm": 3.3452796936035156, - "learning_rate": 4.629709981708586e-06, - "loss": 0.8181, - "step": 287 - }, - { - "epoch": 1.7668711656441718, - "grad_norm": 2.6630783081054688, - "learning_rate": 4.6271824878162704e-06, - "loss": 0.5625, - "step": 288 - }, - { - "epoch": 1.7730061349693251, - "grad_norm": 2.583650588989258, - "learning_rate": 4.624647092067226e-06, - "loss": 0.3416, - "step": 289 - }, - { - "epoch": 1.7791411042944785, - "grad_norm": 2.73132586479187, - "learning_rate": 4.622103803879702e-06, - "loss": 0.3889, - "step": 290 - }, - { - "epoch": 1.7852760736196318, - "grad_norm": 4.1010260581970215, - "learning_rate": 4.619552632701263e-06, - "loss": 0.611, - "step": 291 - }, - { - "epoch": 1.7914110429447851, - "grad_norm": 4.53068208694458, - "learning_rate": 4.61699358800876e-06, - "loss": 0.7219, - "step": 292 - }, - { - "epoch": 1.7975460122699385, - "grad_norm": 3.4877254962921143, - "learning_rate": 4.614426679308291e-06, - "loss": 0.6402, - "step": 293 - }, - { - "epoch": 1.803680981595092, - "grad_norm": 2.9445226192474365, - "learning_rate": 4.611851916135166e-06, - "loss": 0.509, - "step": 294 - }, - { - "epoch": 1.8098159509202454, - "grad_norm": 2.6622228622436523, - "learning_rate": 4.609269308053872e-06, - "loss": 0.6167, - "step": 295 - }, - { - "epoch": 1.8159509202453987, - "grad_norm": 3.131530523300171, - "learning_rate": 4.606678864658039e-06, - "loss": 0.8039, - "step": 296 - }, - { - "epoch": 1.8220858895705523, - "grad_norm": 3.212188482284546, - "learning_rate": 4.604080595570399e-06, - "loss": 0.5754, - "step": 297 - }, - { - "epoch": 1.8282208588957056, - "grad_norm": 3.522850275039673, - "learning_rate": 4.601474510442759e-06, - "loss": 0.4432, - "step": 298 - }, - { - "epoch": 1.834355828220859, - "grad_norm": 2.5877151489257812, - "learning_rate": 4.598860618955957e-06, - "loss": 0.6541, - "step": 299 - }, - { - "epoch": 1.8404907975460123, - "grad_norm": 2.803833484649658, - "learning_rate": 4.596238930819832e-06, - "loss": 0.5824, - "step": 300 - }, - { - "epoch": 1.8466257668711656, - "grad_norm": 2.7125494480133057, - "learning_rate": 4.5936094557731815e-06, - "loss": 0.6976, - "step": 301 - }, - { - "epoch": 1.852760736196319, - "grad_norm": 3.6549370288848877, - "learning_rate": 4.590972203583732e-06, - "loss": 0.7105, - "step": 302 - }, - { - "epoch": 1.8588957055214723, - "grad_norm": 3.3241944313049316, - "learning_rate": 4.588327184048099e-06, - "loss": 0.7446, - "step": 303 - }, - { - "epoch": 1.8650306748466257, - "grad_norm": 2.8388822078704834, - "learning_rate": 4.585674406991752e-06, - "loss": 0.4926, - "step": 304 - }, - { - "epoch": 1.871165644171779, - "grad_norm": 2.9760420322418213, - "learning_rate": 4.5830138822689755e-06, - "loss": 0.7368, - "step": 305 - }, - { - "epoch": 1.8773006134969326, - "grad_norm": 2.5437633991241455, - "learning_rate": 4.5803456197628374e-06, - "loss": 0.4678, - "step": 306 - }, - { - "epoch": 1.883435582822086, - "grad_norm": 3.0044775009155273, - "learning_rate": 4.577669629385145e-06, - "loss": 0.4241, - "step": 307 - }, - { - "epoch": 1.8895705521472392, - "grad_norm": 2.6150901317596436, - "learning_rate": 4.574985921076418e-06, - "loss": 0.5327, - "step": 308 - }, - { - "epoch": 1.8957055214723928, - "grad_norm": 2.4425182342529297, - "learning_rate": 4.572294504805841e-06, - "loss": 0.7504, - "step": 309 - }, - { - "epoch": 1.9018404907975461, - "grad_norm": 2.9920194149017334, - "learning_rate": 4.569595390571232e-06, - "loss": 0.5194, - "step": 310 - }, - { - "epoch": 1.9079754601226995, - "grad_norm": 2.701087713241577, - "learning_rate": 4.566888588399007e-06, - "loss": 0.6862, - "step": 311 - }, - { - "epoch": 1.9141104294478528, - "grad_norm": 7.628893852233887, - "learning_rate": 4.564174108344139e-06, - "loss": 0.6867, - "step": 312 - }, - { - "epoch": 1.9202453987730062, - "grad_norm": 2.712947130203247, - "learning_rate": 4.561451960490123e-06, - "loss": 0.6942, - "step": 313 - }, - { - "epoch": 1.9263803680981595, - "grad_norm": 3.0063202381134033, - "learning_rate": 4.558722154948937e-06, - "loss": 0.6346, - "step": 314 - }, - { - "epoch": 1.9325153374233128, - "grad_norm": 2.957218647003174, - "learning_rate": 4.5559847018610034e-06, - "loss": 0.464, - "step": 315 - }, - { - "epoch": 1.9386503067484662, - "grad_norm": 3.322282552719116, - "learning_rate": 4.553239611395156e-06, - "loss": 0.6334, - "step": 316 - }, - { - "epoch": 1.9447852760736195, - "grad_norm": 3.0638647079467773, - "learning_rate": 4.550486893748596e-06, - "loss": 0.4227, - "step": 317 - }, - { - "epoch": 1.9509202453987728, - "grad_norm": 3.079087257385254, - "learning_rate": 4.547726559146862e-06, - "loss": 0.3719, - "step": 318 - }, - { - "epoch": 1.9570552147239264, - "grad_norm": 2.409914255142212, - "learning_rate": 4.544958617843782e-06, - "loss": 0.3331, - "step": 319 - }, - { - "epoch": 1.9631901840490797, - "grad_norm": 3.3441262245178223, - "learning_rate": 4.542183080121444e-06, - "loss": 0.6931, - "step": 320 - }, - { - "epoch": 1.969325153374233, - "grad_norm": 2.6624436378479004, - "learning_rate": 4.539399956290152e-06, - "loss": 0.6578, - "step": 321 - }, - { - "epoch": 1.9754601226993866, - "grad_norm": 3.463789224624634, - "learning_rate": 4.536609256688396e-06, - "loss": 0.5748, - "step": 322 - }, - { - "epoch": 1.98159509202454, - "grad_norm": 3.6827807426452637, - "learning_rate": 4.533810991682799e-06, - "loss": 0.5249, - "step": 323 - }, - { - "epoch": 1.9877300613496933, - "grad_norm": 4.125547409057617, - "learning_rate": 4.531005171668093e-06, - "loss": 0.3065, - "step": 324 - }, - { - "epoch": 1.9938650306748467, - "grad_norm": 2.935978412628174, - "learning_rate": 4.528191807067074e-06, - "loss": 0.5523, - "step": 325 - }, - { - "epoch": 2.0, - "grad_norm": 2.654388427734375, - "learning_rate": 4.525370908330564e-06, - "loss": 0.4157, - "step": 326 - }, - { - "epoch": 2.0061349693251533, - "grad_norm": 3.213925838470459, - "learning_rate": 4.522542485937369e-06, - "loss": 0.4243, - "step": 327 - }, - { - "epoch": 2.0122699386503067, - "grad_norm": 3.5483286380767822, - "learning_rate": 4.519706550394248e-06, - "loss": 0.4137, - "step": 328 - }, - { - "epoch": 2.01840490797546, - "grad_norm": 3.32084059715271, - "learning_rate": 4.516863112235864e-06, - "loss": 0.5389, - "step": 329 - }, - { - "epoch": 2.0245398773006134, - "grad_norm": 3.427666425704956, - "learning_rate": 4.514012182024756e-06, - "loss": 0.285, - "step": 330 - }, - { - "epoch": 2.0306748466257667, - "grad_norm": 3.3269975185394287, - "learning_rate": 4.511153770351288e-06, - "loss": 0.4877, - "step": 331 - }, - { - "epoch": 2.03680981595092, - "grad_norm": 5.258850574493408, - "learning_rate": 4.508287887833619e-06, - "loss": 0.5168, - "step": 332 - }, - { - "epoch": 2.042944785276074, - "grad_norm": 4.316092491149902, - "learning_rate": 4.505414545117658e-06, - "loss": 0.4791, - "step": 333 - }, - { - "epoch": 2.049079754601227, - "grad_norm": 3.952056884765625, - "learning_rate": 4.502533752877028e-06, - "loss": 0.3014, - "step": 334 - }, - { - "epoch": 2.0552147239263805, - "grad_norm": 4.0617194175720215, - "learning_rate": 4.499645521813024e-06, - "loss": 0.4313, - "step": 335 - }, - { - "epoch": 2.061349693251534, - "grad_norm": 3.7869274616241455, - "learning_rate": 4.496749862654574e-06, - "loss": 0.4807, - "step": 336 - }, - { - "epoch": 2.067484662576687, - "grad_norm": 3.8181991577148438, - "learning_rate": 4.4938467861582e-06, - "loss": 0.4002, - "step": 337 - }, - { - "epoch": 2.0736196319018405, - "grad_norm": 3.8289854526519775, - "learning_rate": 4.490936303107975e-06, - "loss": 0.618, - "step": 338 - }, - { - "epoch": 2.079754601226994, - "grad_norm": 3.121443271636963, - "learning_rate": 4.488018424315488e-06, - "loss": 0.4203, - "step": 339 - }, - { - "epoch": 2.085889570552147, - "grad_norm": 3.141782283782959, - "learning_rate": 4.4850931606198e-06, - "loss": 0.3618, - "step": 340 - }, - { - "epoch": 2.0920245398773005, - "grad_norm": 3.1279287338256836, - "learning_rate": 4.482160522887404e-06, - "loss": 0.4571, - "step": 341 - }, - { - "epoch": 2.098159509202454, - "grad_norm": 3.2418482303619385, - "learning_rate": 4.479220522012185e-06, - "loss": 0.2674, - "step": 342 - }, - { - "epoch": 2.104294478527607, - "grad_norm": 10.230683326721191, - "learning_rate": 4.476273168915382e-06, - "loss": 0.5479, - "step": 343 - }, - { - "epoch": 2.1104294478527605, - "grad_norm": 3.588361978530884, - "learning_rate": 4.473318474545544e-06, - "loss": 0.3654, - "step": 344 - }, - { - "epoch": 2.116564417177914, - "grad_norm": 3.0913164615631104, - "learning_rate": 4.470356449878489e-06, - "loss": 0.2704, - "step": 345 - }, - { - "epoch": 2.1226993865030677, - "grad_norm": 3.972447633743286, - "learning_rate": 4.467387105917269e-06, - "loss": 0.3029, - "step": 346 - }, - { - "epoch": 2.128834355828221, - "grad_norm": 3.7174713611602783, - "learning_rate": 4.464410453692122e-06, - "loss": 0.6536, - "step": 347 - }, - { - "epoch": 2.1349693251533743, - "grad_norm": 3.9333994388580322, - "learning_rate": 4.461426504260434e-06, - "loss": 0.3806, - "step": 348 - }, - { - "epoch": 2.1411042944785277, - "grad_norm": 4.752816200256348, - "learning_rate": 4.458435268706699e-06, - "loss": 0.4019, - "step": 349 - }, - { - "epoch": 2.147239263803681, - "grad_norm": 2.505603790283203, - "learning_rate": 4.455436758142477e-06, - "loss": 0.2348, - "step": 350 - }, - { - "epoch": 2.1533742331288344, - "grad_norm": 3.3050570487976074, - "learning_rate": 4.452430983706351e-06, - "loss": 0.505, - "step": 351 - }, - { - "epoch": 2.1595092024539877, - "grad_norm": 5.387442588806152, - "learning_rate": 4.44941795656389e-06, - "loss": 0.399, - "step": 352 - }, - { - "epoch": 2.165644171779141, - "grad_norm": 3.4759480953216553, - "learning_rate": 4.446397687907601e-06, - "loss": 0.5664, - "step": 353 - }, - { - "epoch": 2.1717791411042944, - "grad_norm": 2.949445962905884, - "learning_rate": 4.4433701889568935e-06, - "loss": 0.2128, - "step": 354 - }, - { - "epoch": 2.1779141104294477, - "grad_norm": 3.2884252071380615, - "learning_rate": 4.440335470958035e-06, - "loss": 0.3138, - "step": 355 - }, - { - "epoch": 2.184049079754601, - "grad_norm": 3.1605632305145264, - "learning_rate": 4.437293545184111e-06, - "loss": 0.349, - "step": 356 - }, - { - "epoch": 2.190184049079755, - "grad_norm": 2.9996821880340576, - "learning_rate": 4.434244422934976e-06, - "loss": 0.343, - "step": 357 - }, - { - "epoch": 2.196319018404908, - "grad_norm": 3.6373324394226074, - "learning_rate": 4.431188115537226e-06, - "loss": 0.5656, - "step": 358 - }, - { - "epoch": 2.2024539877300615, - "grad_norm": 4.667621612548828, - "learning_rate": 4.428124634344141e-06, - "loss": 0.2335, - "step": 359 - }, - { - "epoch": 2.208588957055215, - "grad_norm": 3.815484046936035, - "learning_rate": 4.425053990735653e-06, - "loss": 0.2173, - "step": 360 - }, - { - "epoch": 2.214723926380368, - "grad_norm": 4.689478874206543, - "learning_rate": 4.421976196118297e-06, - "loss": 0.5071, - "step": 361 - }, - { - "epoch": 2.2208588957055215, - "grad_norm": 4.016942024230957, - "learning_rate": 4.4188912619251765e-06, - "loss": 0.384, - "step": 362 - }, - { - "epoch": 2.226993865030675, - "grad_norm": 3.5336828231811523, - "learning_rate": 4.415799199615912e-06, - "loss": 0.3133, - "step": 363 - }, - { - "epoch": 2.233128834355828, - "grad_norm": 2.9195592403411865, - "learning_rate": 4.4127000206766055e-06, - "loss": 0.3847, - "step": 364 - }, - { - "epoch": 2.2392638036809815, - "grad_norm": 2.6843531131744385, - "learning_rate": 4.409593736619795e-06, - "loss": 0.3539, - "step": 365 - }, - { - "epoch": 2.245398773006135, - "grad_norm": 2.8692703247070312, - "learning_rate": 4.40648035898441e-06, - "loss": 0.3664, - "step": 366 - }, - { - "epoch": 2.2515337423312882, - "grad_norm": 2.820422649383545, - "learning_rate": 4.403359899335732e-06, - "loss": 0.4606, - "step": 367 - }, - { - "epoch": 2.2576687116564416, - "grad_norm": 3.8641669750213623, - "learning_rate": 4.400232369265351e-06, - "loss": 0.2931, - "step": 368 - }, - { - "epoch": 2.263803680981595, - "grad_norm": 2.75347638130188, - "learning_rate": 4.39709778039112e-06, - "loss": 0.3393, - "step": 369 - }, - { - "epoch": 2.2699386503067487, - "grad_norm": 15.150428771972656, - "learning_rate": 4.393956144357113e-06, - "loss": 0.65, - "step": 370 - }, - { - "epoch": 2.276073619631902, - "grad_norm": 2.4876065254211426, - "learning_rate": 4.390807472833585e-06, - "loss": 0.372, - "step": 371 - }, - { - "epoch": 2.2822085889570554, - "grad_norm": 2.7328054904937744, - "learning_rate": 4.3876517775169216e-06, - "loss": 0.2802, - "step": 372 - }, - { - "epoch": 2.2883435582822087, - "grad_norm": 2.903221368789673, - "learning_rate": 4.384489070129604e-06, - "loss": 0.1964, - "step": 373 - }, - { - "epoch": 2.294478527607362, - "grad_norm": 3.9368724822998047, - "learning_rate": 4.381319362420158e-06, - "loss": 0.4272, - "step": 374 - }, - { - "epoch": 2.3006134969325154, - "grad_norm": 5.431981086730957, - "learning_rate": 4.378142666163114e-06, - "loss": 0.4513, - "step": 375 - }, - { - "epoch": 2.3067484662576687, - "grad_norm": 3.661733627319336, - "learning_rate": 4.374958993158965e-06, - "loss": 0.6087, - "step": 376 - }, - { - "epoch": 2.312883435582822, - "grad_norm": 3.004450559616089, - "learning_rate": 4.371768355234116e-06, - "loss": 0.2206, - "step": 377 - }, - { - "epoch": 2.3190184049079754, - "grad_norm": 4.3785576820373535, - "learning_rate": 4.368570764240852e-06, - "loss": 0.6055, - "step": 378 - }, - { - "epoch": 2.3251533742331287, - "grad_norm": 3.4699394702911377, - "learning_rate": 4.365366232057279e-06, - "loss": 0.6286, - "step": 379 - }, - { - "epoch": 2.331288343558282, - "grad_norm": 2.6862998008728027, - "learning_rate": 4.3621547705872915e-06, - "loss": 0.2622, - "step": 380 - }, - { - "epoch": 2.3374233128834354, - "grad_norm": 3.056382179260254, - "learning_rate": 4.358936391760524e-06, - "loss": 0.3439, - "step": 381 - }, - { - "epoch": 2.3435582822085887, - "grad_norm": 2.6211307048797607, - "learning_rate": 4.355711107532305e-06, - "loss": 0.3677, - "step": 382 - }, - { - "epoch": 2.3496932515337425, - "grad_norm": 2.682060956954956, - "learning_rate": 4.3524789298836175e-06, - "loss": 0.3068, - "step": 383 - }, - { - "epoch": 2.355828220858896, - "grad_norm": 3.482539415359497, - "learning_rate": 4.349239870821049e-06, - "loss": 0.3737, - "step": 384 - }, - { - "epoch": 2.361963190184049, - "grad_norm": 2.8645472526550293, - "learning_rate": 4.345993942376752e-06, - "loss": 0.2837, - "step": 385 - }, - { - "epoch": 2.3680981595092025, - "grad_norm": 3.6142354011535645, - "learning_rate": 4.342741156608392e-06, - "loss": 0.6162, - "step": 386 - }, - { - "epoch": 2.374233128834356, - "grad_norm": 3.0748162269592285, - "learning_rate": 4.3394815255991135e-06, - "loss": 0.2986, - "step": 387 - }, - { - "epoch": 2.3803680981595092, - "grad_norm": 5.090906620025635, - "learning_rate": 4.336215061457485e-06, - "loss": 0.5383, - "step": 388 - }, - { - "epoch": 2.3865030674846626, - "grad_norm": 3.9235823154449463, - "learning_rate": 4.332941776317458e-06, - "loss": 0.4179, - "step": 389 - }, - { - "epoch": 2.392638036809816, - "grad_norm": 3.482926368713379, - "learning_rate": 4.329661682338325e-06, - "loss": 0.3938, - "step": 390 - }, - { - "epoch": 2.3987730061349692, - "grad_norm": 4.274583339691162, - "learning_rate": 4.32637479170467e-06, - "loss": 0.3349, - "step": 391 - }, - { - "epoch": 2.4049079754601226, - "grad_norm": 3.326012372970581, - "learning_rate": 4.323081116626322e-06, - "loss": 0.3336, - "step": 392 - }, - { - "epoch": 2.411042944785276, - "grad_norm": 3.174591541290283, - "learning_rate": 4.319780669338316e-06, - "loss": 0.2983, - "step": 393 - }, - { - "epoch": 2.4171779141104293, - "grad_norm": 3.9073634147644043, - "learning_rate": 4.31647346210084e-06, - "loss": 0.8401, - "step": 394 - }, - { - "epoch": 2.4233128834355826, - "grad_norm": 3.4787721633911133, - "learning_rate": 4.313159507199197e-06, - "loss": 0.2583, - "step": 395 - }, - { - "epoch": 2.4294478527607364, - "grad_norm": 3.19903564453125, - "learning_rate": 4.309838816943755e-06, - "loss": 0.2861, - "step": 396 - }, - { - "epoch": 2.4355828220858897, - "grad_norm": 3.184246778488159, - "learning_rate": 4.306511403669897e-06, - "loss": 0.2956, - "step": 397 - }, - { - "epoch": 2.441717791411043, - "grad_norm": 3.8991878032684326, - "learning_rate": 4.303177279737988e-06, - "loss": 0.5378, - "step": 398 - }, - { - "epoch": 2.4478527607361964, - "grad_norm": 3.411949872970581, - "learning_rate": 4.299836457533313e-06, - "loss": 0.3423, - "step": 399 - }, - { - "epoch": 2.4539877300613497, - "grad_norm": 3.445502996444702, - "learning_rate": 4.296488949466046e-06, - "loss": 0.5608, - "step": 400 - }, - { - "epoch": 2.460122699386503, - "grad_norm": 3.066798210144043, - "learning_rate": 4.293134767971193e-06, - "loss": 0.3214, - "step": 401 - }, - { - "epoch": 2.4662576687116564, - "grad_norm": 3.0581583976745605, - "learning_rate": 4.28977392550855e-06, - "loss": 0.5117, - "step": 402 - }, - { - "epoch": 2.4723926380368098, - "grad_norm": 4.207413673400879, - "learning_rate": 4.286406434562659e-06, - "loss": 0.2666, - "step": 403 - }, - { - "epoch": 2.478527607361963, - "grad_norm": 2.9934990406036377, - "learning_rate": 4.283032307642756e-06, - "loss": 0.2878, - "step": 404 - }, - { - "epoch": 2.4846625766871164, - "grad_norm": 3.800593614578247, - "learning_rate": 4.2796515572827305e-06, - "loss": 0.2619, - "step": 405 - }, - { - "epoch": 2.4907975460122698, - "grad_norm": 3.2029523849487305, - "learning_rate": 4.276264196041074e-06, - "loss": 0.1735, - "step": 406 - }, - { - "epoch": 2.4969325153374236, - "grad_norm": 3.515634059906006, - "learning_rate": 4.2728702365008356e-06, - "loss": 0.4741, - "step": 407 - }, - { - "epoch": 2.5030674846625764, - "grad_norm": 3.8354873657226562, - "learning_rate": 4.269469691269577e-06, - "loss": 0.3713, - "step": 408 - }, - { - "epoch": 2.5092024539877302, - "grad_norm": 3.902904510498047, - "learning_rate": 4.266062572979323e-06, - "loss": 0.5189, - "step": 409 - }, - { - "epoch": 2.5153374233128836, - "grad_norm": 3.3276097774505615, - "learning_rate": 4.262648894286515e-06, - "loss": 0.2461, - "step": 410 - }, - { - "epoch": 2.521472392638037, - "grad_norm": 2.9457011222839355, - "learning_rate": 4.259228667871963e-06, - "loss": 0.3013, - "step": 411 - }, - { - "epoch": 2.5276073619631902, - "grad_norm": 2.8941617012023926, - "learning_rate": 4.255801906440803e-06, - "loss": 0.2784, - "step": 412 - }, - { - "epoch": 2.5337423312883436, - "grad_norm": 2.949399471282959, - "learning_rate": 4.252368622722443e-06, - "loss": 0.457, - "step": 413 - }, - { - "epoch": 2.539877300613497, - "grad_norm": 3.342108726501465, - "learning_rate": 4.248928829470522e-06, - "loss": 0.487, - "step": 414 - }, - { - "epoch": 2.5460122699386503, - "grad_norm": 3.9556386470794678, - "learning_rate": 4.245482539462861e-06, - "loss": 0.6118, - "step": 415 - }, - { - "epoch": 2.5521472392638036, - "grad_norm": 3.6936280727386475, - "learning_rate": 4.242029765501411e-06, - "loss": 0.6131, - "step": 416 - }, - { - "epoch": 2.558282208588957, - "grad_norm": 2.79897403717041, - "learning_rate": 4.2385705204122104e-06, - "loss": 0.4209, - "step": 417 - }, - { - "epoch": 2.5644171779141103, - "grad_norm": 4.093318462371826, - "learning_rate": 4.235104817045338e-06, - "loss": 0.5375, - "step": 418 - }, - { - "epoch": 2.5705521472392636, - "grad_norm": 3.138263463973999, - "learning_rate": 4.231632668274861e-06, - "loss": 0.4682, - "step": 419 - }, - { - "epoch": 2.5766871165644174, - "grad_norm": 3.1465651988983154, - "learning_rate": 4.22815408699879e-06, - "loss": 0.2522, - "step": 420 - }, - { - "epoch": 2.5828220858895703, - "grad_norm": 3.5166101455688477, - "learning_rate": 4.22466908613903e-06, - "loss": 0.4776, - "step": 421 - }, - { - "epoch": 2.588957055214724, - "grad_norm": 2.8498189449310303, - "learning_rate": 4.221177678641333e-06, - "loss": 0.3067, - "step": 422 - }, - { - "epoch": 2.5950920245398774, - "grad_norm": 2.8046035766601562, - "learning_rate": 4.217679877475251e-06, - "loss": 0.2402, - "step": 423 - }, - { - "epoch": 2.6012269938650308, - "grad_norm": 4.204788684844971, - "learning_rate": 4.214175695634084e-06, - "loss": 0.2608, - "step": 424 - }, - { - "epoch": 2.607361963190184, - "grad_norm": 2.5569400787353516, - "learning_rate": 4.210665146134838e-06, - "loss": 0.2801, - "step": 425 - }, - { - "epoch": 2.6134969325153374, - "grad_norm": 3.5359091758728027, - "learning_rate": 4.20714824201817e-06, - "loss": 0.2027, - "step": 426 - }, - { - "epoch": 2.6196319018404908, - "grad_norm": 3.5132668018341064, - "learning_rate": 4.203624996348343e-06, - "loss": 0.4253, - "step": 427 - }, - { - "epoch": 2.625766871165644, - "grad_norm": 3.5076472759246826, - "learning_rate": 4.200095422213177e-06, - "loss": 0.3014, - "step": 428 - }, - { - "epoch": 2.6319018404907975, - "grad_norm": 3.6501238346099854, - "learning_rate": 4.196559532724004e-06, - "loss": 0.6526, - "step": 429 - }, - { - "epoch": 2.638036809815951, - "grad_norm": 2.849924325942993, - "learning_rate": 4.193017341015608e-06, - "loss": 0.4487, - "step": 430 - }, - { - "epoch": 2.644171779141104, - "grad_norm": 3.2228448390960693, - "learning_rate": 4.189468860246192e-06, - "loss": 0.5386, - "step": 431 - }, - { - "epoch": 2.6503067484662575, - "grad_norm": 2.532102108001709, - "learning_rate": 4.185914103597316e-06, - "loss": 0.3034, - "step": 432 - }, - { - "epoch": 2.6564417177914113, - "grad_norm": 2.862720251083374, - "learning_rate": 4.182353084273855e-06, - "loss": 0.5862, - "step": 433 - }, - { - "epoch": 2.662576687116564, - "grad_norm": 3.4617464542388916, - "learning_rate": 4.178785815503946e-06, - "loss": 0.3954, - "step": 434 - }, - { - "epoch": 2.668711656441718, - "grad_norm": 2.627758741378784, - "learning_rate": 4.1752123105389444e-06, - "loss": 0.4367, - "step": 435 - }, - { - "epoch": 2.6748466257668713, - "grad_norm": 3.2868380546569824, - "learning_rate": 4.171632582653368e-06, - "loss": 0.2997, - "step": 436 - }, - { - "epoch": 2.6809815950920246, - "grad_norm": 3.4260897636413574, - "learning_rate": 4.168046645144851e-06, - "loss": 0.3354, - "step": 437 - }, - { - "epoch": 2.687116564417178, - "grad_norm": 3.1415748596191406, - "learning_rate": 4.164454511334098e-06, - "loss": 0.5538, - "step": 438 - }, - { - "epoch": 2.6932515337423313, - "grad_norm": 3.3700919151306152, - "learning_rate": 4.160856194564828e-06, - "loss": 0.5731, - "step": 439 - }, - { - "epoch": 2.6993865030674846, - "grad_norm": 3.146968364715576, - "learning_rate": 4.157251708203728e-06, - "loss": 0.4429, - "step": 440 - }, - { - "epoch": 2.705521472392638, - "grad_norm": 3.7495830059051514, - "learning_rate": 4.153641065640402e-06, - "loss": 0.6361, - "step": 441 - }, - { - "epoch": 2.7116564417177913, - "grad_norm": 3.426499128341675, - "learning_rate": 4.150024280287327e-06, - "loss": 0.2418, - "step": 442 - }, - { - "epoch": 2.7177914110429446, - "grad_norm": 3.213719606399536, - "learning_rate": 4.146401365579795e-06, - "loss": 0.2549, - "step": 443 - }, - { - "epoch": 2.7239263803680984, - "grad_norm": 3.457742929458618, - "learning_rate": 4.142772334975868e-06, - "loss": 0.3822, - "step": 444 - }, - { - "epoch": 2.7300613496932513, - "grad_norm": 3.130410671234131, - "learning_rate": 4.139137201956324e-06, - "loss": 0.3107, - "step": 445 - }, - { - "epoch": 2.736196319018405, - "grad_norm": 2.7337112426757812, - "learning_rate": 4.1354959800246155e-06, - "loss": 0.2829, - "step": 446 - }, - { - "epoch": 2.7423312883435584, - "grad_norm": 3.427006483078003, - "learning_rate": 4.131848682706807e-06, - "loss": 0.3045, - "step": 447 - }, - { - "epoch": 2.7484662576687118, - "grad_norm": 3.3742318153381348, - "learning_rate": 4.128195323551536e-06, - "loss": 0.316, - "step": 448 - }, - { - "epoch": 2.754601226993865, - "grad_norm": 3.086738109588623, - "learning_rate": 4.1245359161299555e-06, - "loss": 0.5278, - "step": 449 - }, - { - "epoch": 2.7607361963190185, - "grad_norm": 3.4609954357147217, - "learning_rate": 4.120870474035687e-06, - "loss": 0.447, - "step": 450 - }, - { - "epoch": 2.766871165644172, - "grad_norm": 3.552663803100586, - "learning_rate": 4.1171990108847705e-06, - "loss": 0.6127, - "step": 451 - }, - { - "epoch": 2.773006134969325, - "grad_norm": 4.413427352905273, - "learning_rate": 4.113521540315609e-06, - "loss": 0.3304, - "step": 452 - }, - { - "epoch": 2.7791411042944785, - "grad_norm": 3.3408143520355225, - "learning_rate": 4.109838075988922e-06, - "loss": 0.5871, - "step": 453 - }, - { - "epoch": 2.785276073619632, - "grad_norm": 3.0659773349761963, - "learning_rate": 4.106148631587697e-06, - "loss": 0.3578, - "step": 454 - }, - { - "epoch": 2.791411042944785, - "grad_norm": 3.2854816913604736, - "learning_rate": 4.102453220817134e-06, - "loss": 0.4685, - "step": 455 - }, - { - "epoch": 2.7975460122699385, - "grad_norm": 3.4940855503082275, - "learning_rate": 4.098751857404595e-06, - "loss": 0.2818, - "step": 456 - }, - { - "epoch": 2.8036809815950923, - "grad_norm": 2.4630730152130127, - "learning_rate": 4.0950445550995566e-06, - "loss": 0.3497, - "step": 457 - }, - { - "epoch": 2.809815950920245, - "grad_norm": 3.3870959281921387, - "learning_rate": 4.091331327673554e-06, - "loss": 0.4954, - "step": 458 - }, - { - "epoch": 2.815950920245399, - "grad_norm": 2.3676836490631104, - "learning_rate": 4.087612188920135e-06, - "loss": 0.3884, - "step": 459 - }, - { - "epoch": 2.8220858895705523, - "grad_norm": 3.2477807998657227, - "learning_rate": 4.083887152654804e-06, - "loss": 0.375, - "step": 460 - }, - { - "epoch": 2.8282208588957056, - "grad_norm": 3.295673131942749, - "learning_rate": 4.080156232714976e-06, - "loss": 0.3272, - "step": 461 - }, - { - "epoch": 2.834355828220859, - "grad_norm": 2.800847291946411, - "learning_rate": 4.07641944295992e-06, - "loss": 0.2936, - "step": 462 - }, - { - "epoch": 2.8404907975460123, - "grad_norm": 3.443336009979248, - "learning_rate": 4.072676797270708e-06, - "loss": 0.2363, - "step": 463 - }, - { - "epoch": 2.8466257668711656, - "grad_norm": 3.1334242820739746, - "learning_rate": 4.0689283095501684e-06, - "loss": 0.4827, - "step": 464 - }, - { - "epoch": 2.852760736196319, - "grad_norm": 3.950672149658203, - "learning_rate": 4.06517399372283e-06, - "loss": 0.3163, - "step": 465 - }, - { - "epoch": 2.8588957055214723, - "grad_norm": 4.243579387664795, - "learning_rate": 4.061413863734869e-06, - "loss": 0.2827, - "step": 466 - }, - { - "epoch": 2.8650306748466257, - "grad_norm": 4.076017379760742, - "learning_rate": 4.057647933554063e-06, - "loss": 0.3466, - "step": 467 - }, - { - "epoch": 2.871165644171779, - "grad_norm": 2.846989631652832, - "learning_rate": 4.053876217169734e-06, - "loss": 0.4632, - "step": 468 - }, - { - "epoch": 2.8773006134969323, - "grad_norm": 2.74981689453125, - "learning_rate": 4.050098728592698e-06, - "loss": 0.2001, - "step": 469 - }, - { - "epoch": 2.883435582822086, - "grad_norm": 3.062068462371826, - "learning_rate": 4.046315481855211e-06, - "loss": 0.5425, - "step": 470 - }, - { - "epoch": 2.889570552147239, - "grad_norm": 2.8630964756011963, - "learning_rate": 4.0425264910109245e-06, - "loss": 0.424, - "step": 471 - }, - { - "epoch": 2.895705521472393, - "grad_norm": 3.537442922592163, - "learning_rate": 4.03873177013482e-06, - "loss": 0.2443, - "step": 472 - }, - { - "epoch": 2.901840490797546, - "grad_norm": 3.128535270690918, - "learning_rate": 4.034931333323173e-06, - "loss": 0.3734, - "step": 473 - }, - { - "epoch": 2.9079754601226995, - "grad_norm": 3.021897792816162, - "learning_rate": 4.031125194693484e-06, - "loss": 0.3762, - "step": 474 - }, - { - "epoch": 2.914110429447853, - "grad_norm": 3.0943546295166016, - "learning_rate": 4.0273133683844375e-06, - "loss": 0.3721, - "step": 475 - }, - { - "epoch": 2.920245398773006, - "grad_norm": 3.443448305130005, - "learning_rate": 4.023495868555848e-06, - "loss": 0.2868, - "step": 476 - }, - { - "epoch": 2.9263803680981595, - "grad_norm": 2.865227222442627, - "learning_rate": 4.0196727093886024e-06, - "loss": 0.5086, - "step": 477 - }, - { - "epoch": 2.932515337423313, - "grad_norm": 3.1272058486938477, - "learning_rate": 4.015843905084612e-06, - "loss": 0.4616, - "step": 478 - }, - { - "epoch": 2.938650306748466, - "grad_norm": 3.0584447383880615, - "learning_rate": 4.012009469866756e-06, - "loss": 0.403, - "step": 479 - }, - { - "epoch": 2.9447852760736195, - "grad_norm": 4.42616081237793, - "learning_rate": 4.008169417978836e-06, - "loss": 0.5801, - "step": 480 - }, - { - "epoch": 2.950920245398773, - "grad_norm": 2.8444535732269287, - "learning_rate": 4.004323763685511e-06, - "loss": 0.5808, - "step": 481 - }, - { - "epoch": 2.957055214723926, - "grad_norm": 2.591719627380371, - "learning_rate": 4.0004725212722565e-06, - "loss": 0.2584, - "step": 482 - }, - { - "epoch": 2.96319018404908, - "grad_norm": 2.5496113300323486, - "learning_rate": 3.996615705045302e-06, - "loss": 0.462, - "step": 483 - }, - { - "epoch": 2.969325153374233, - "grad_norm": 2.9932925701141357, - "learning_rate": 3.992753329331588e-06, - "loss": 0.3502, - "step": 484 - }, - { - "epoch": 2.9754601226993866, - "grad_norm": 3.136871337890625, - "learning_rate": 3.9888854084786995e-06, - "loss": 0.5989, - "step": 485 - }, - { - "epoch": 2.98159509202454, - "grad_norm": 3.6654274463653564, - "learning_rate": 3.985011956854826e-06, - "loss": 0.6772, - "step": 486 - }, - { - "epoch": 2.9877300613496933, - "grad_norm": 2.5398948192596436, - "learning_rate": 3.9811329888487004e-06, - "loss": 0.4192, - "step": 487 - }, - { - "epoch": 2.9938650306748467, - "grad_norm": 4.89943790435791, - "learning_rate": 3.977248518869545e-06, - "loss": 0.4031, - "step": 488 - }, - { - "epoch": 3.0, - "grad_norm": 3.4729995727539062, - "learning_rate": 3.973358561347024e-06, - "loss": 0.7764, - "step": 489 - }, - { - "epoch": 3.0061349693251533, - "grad_norm": 5.331607818603516, - "learning_rate": 3.969463130731183e-06, - "loss": 0.3267, - "step": 490 - }, - { - "epoch": 3.0122699386503067, - "grad_norm": 3.453650712966919, - "learning_rate": 3.965562241492401e-06, - "loss": 0.2719, - "step": 491 - }, - { - "epoch": 3.01840490797546, - "grad_norm": 3.232313632965088, - "learning_rate": 3.9616559081213335e-06, - "loss": 0.1825, - "step": 492 - }, - { - "epoch": 3.0245398773006134, - "grad_norm": 3.4860260486602783, - "learning_rate": 3.957744145128858e-06, - "loss": 0.1854, - "step": 493 - }, - { - "epoch": 3.0306748466257667, - "grad_norm": 3.4357805252075195, - "learning_rate": 3.953826967046021e-06, - "loss": 0.2224, - "step": 494 - }, - { - "epoch": 3.03680981595092, - "grad_norm": 4.557503700256348, - "learning_rate": 3.9499043884239894e-06, - "loss": 0.349, - "step": 495 - }, - { - "epoch": 3.042944785276074, - "grad_norm": 4.685214042663574, - "learning_rate": 3.945976423833987e-06, - "loss": 0.175, - "step": 496 - }, - { - "epoch": 3.049079754601227, - "grad_norm": 3.7430171966552734, - "learning_rate": 3.942043087867244e-06, - "loss": 0.2773, - "step": 497 - }, - { - "epoch": 3.0552147239263805, - "grad_norm": 3.756450653076172, - "learning_rate": 3.938104395134947e-06, - "loss": 0.4445, - "step": 498 - }, - { - "epoch": 3.061349693251534, - "grad_norm": 4.049175262451172, - "learning_rate": 3.9341603602681805e-06, - "loss": 0.3046, - "step": 499 - }, - { - "epoch": 3.067484662576687, - "grad_norm": 3.7689461708068848, - "learning_rate": 3.930210997917871e-06, - "loss": 0.2544, - "step": 500 - }, - { - "epoch": 3.0736196319018405, - "grad_norm": 4.027602195739746, - "learning_rate": 3.92625632275474e-06, - "loss": 0.3154, - "step": 501 - }, - { - "epoch": 3.079754601226994, - "grad_norm": 2.8449292182922363, - "learning_rate": 3.922296349469239e-06, - "loss": 0.2804, - "step": 502 - }, - { - "epoch": 3.085889570552147, - "grad_norm": 2.9555234909057617, - "learning_rate": 3.918331092771505e-06, - "loss": 0.2393, - "step": 503 - }, - { - "epoch": 3.0920245398773005, - "grad_norm": 2.621042013168335, - "learning_rate": 3.914360567391296e-06, - "loss": 0.1403, - "step": 504 - }, - { - "epoch": 3.098159509202454, - "grad_norm": 3.2348620891571045, - "learning_rate": 3.910384788077949e-06, - "loss": 0.1537, - "step": 505 - }, - { - "epoch": 3.104294478527607, - "grad_norm": 3.030179977416992, - "learning_rate": 3.906403769600311e-06, - "loss": 0.2921, - "step": 506 - }, - { - "epoch": 3.1104294478527605, - "grad_norm": 3.146428346633911, - "learning_rate": 3.902417526746694e-06, - "loss": 0.2036, - "step": 507 - }, - { - "epoch": 3.116564417177914, - "grad_norm": 3.6201512813568115, - "learning_rate": 3.898426074324818e-06, - "loss": 0.2655, - "step": 508 - }, - { - "epoch": 3.1226993865030677, - "grad_norm": 3.7674012184143066, - "learning_rate": 3.8944294271617524e-06, - "loss": 0.3938, - "step": 509 - }, - { - "epoch": 3.128834355828221, - "grad_norm": 4.54722785949707, - "learning_rate": 3.890427600103865e-06, - "loss": 0.3051, - "step": 510 - }, - { - "epoch": 3.1349693251533743, - "grad_norm": 4.228236675262451, - "learning_rate": 3.886420608016767e-06, - "loss": 0.3719, - "step": 511 - }, - { - "epoch": 3.1411042944785277, - "grad_norm": 4.355110168457031, - "learning_rate": 3.882408465785252e-06, - "loss": 0.1863, - "step": 512 - }, - { - "epoch": 3.147239263803681, - "grad_norm": 3.451460838317871, - "learning_rate": 3.878391188313249e-06, - "loss": 0.1479, - "step": 513 - }, - { - "epoch": 3.1533742331288344, - "grad_norm": 4.395524501800537, - "learning_rate": 3.87436879052376e-06, - "loss": 0.238, - "step": 514 - }, - { - "epoch": 3.1595092024539877, - "grad_norm": 2.940717935562134, - "learning_rate": 3.870341287358809e-06, - "loss": 0.2069, - "step": 515 - }, - { - "epoch": 3.165644171779141, - "grad_norm": 2.5817320346832275, - "learning_rate": 3.8663086937793845e-06, - "loss": 0.1189, - "step": 516 - }, - { - "epoch": 3.1717791411042944, - "grad_norm": 3.9863343238830566, - "learning_rate": 3.862271024765385e-06, - "loss": 0.3434, - "step": 517 - }, - { - "epoch": 3.1779141104294477, - "grad_norm": 3.609004259109497, - "learning_rate": 3.8582282953155626e-06, - "loss": 0.1602, - "step": 518 - }, - { - "epoch": 3.184049079754601, - "grad_norm": 3.207533121109009, - "learning_rate": 3.854180520447465e-06, - "loss": 0.3452, - "step": 519 - }, - { - "epoch": 3.190184049079755, - "grad_norm": 3.593388795852661, - "learning_rate": 3.850127715197387e-06, - "loss": 0.2832, - "step": 520 - }, - { - "epoch": 3.196319018404908, - "grad_norm": 3.409064531326294, - "learning_rate": 3.846069894620306e-06, - "loss": 0.1481, - "step": 521 - }, - { - "epoch": 3.2024539877300615, - "grad_norm": 3.461498737335205, - "learning_rate": 3.84200707378983e-06, - "loss": 0.1283, - "step": 522 - }, - { - "epoch": 3.208588957055215, - "grad_norm": 3.708467483520508, - "learning_rate": 3.8379392677981434e-06, - "loss": 0.2468, - "step": 523 - }, - { - "epoch": 3.214723926380368, - "grad_norm": 2.802381753921509, - "learning_rate": 3.833866491755947e-06, - "loss": 0.2685, - "step": 524 - }, - { - "epoch": 3.2208588957055215, - "grad_norm": 3.0787744522094727, - "learning_rate": 3.8297887607924044e-06, - "loss": 0.2595, - "step": 525 - }, - { - "epoch": 3.226993865030675, - "grad_norm": 3.3952548503875732, - "learning_rate": 3.825706090055088e-06, - "loss": 0.4099, - "step": 526 - }, - { - "epoch": 3.233128834355828, - "grad_norm": 3.3497085571289062, - "learning_rate": 3.821618494709916e-06, - "loss": 0.287, - "step": 527 - }, - { - "epoch": 3.2392638036809815, - "grad_norm": 4.050611972808838, - "learning_rate": 3.817525989941102e-06, - "loss": 0.2369, - "step": 528 - }, - { - "epoch": 3.245398773006135, - "grad_norm": 2.87642240524292, - "learning_rate": 3.8134285909510972e-06, - "loss": 0.2751, - "step": 529 - }, - { - "epoch": 3.2515337423312882, - "grad_norm": 3.821941614151001, - "learning_rate": 3.8093263129605305e-06, - "loss": 0.2363, - "step": 530 - }, - { - "epoch": 3.2576687116564416, - "grad_norm": 2.8066117763519287, - "learning_rate": 3.80521917120816e-06, - "loss": 0.094, - "step": 531 - }, - { - "epoch": 3.263803680981595, - "grad_norm": 3.849768877029419, - "learning_rate": 3.801107180950806e-06, - "loss": 0.4117, - "step": 532 - }, - { - "epoch": 3.2699386503067487, - "grad_norm": 2.4161250591278076, - "learning_rate": 3.7969903574633028e-06, - "loss": 0.1183, - "step": 533 - }, - { - "epoch": 3.276073619631902, - "grad_norm": 3.6743111610412598, - "learning_rate": 3.792868716038437e-06, - "loss": 0.2296, - "step": 534 - }, - { - "epoch": 3.2822085889570554, - "grad_norm": 4.378123760223389, - "learning_rate": 3.7887422719868937e-06, - "loss": 0.2678, - "step": 535 - }, - { - "epoch": 3.2883435582822087, - "grad_norm": 4.816481590270996, - "learning_rate": 3.784611040637198e-06, - "loss": 0.4887, - "step": 536 - }, - { - "epoch": 3.294478527607362, - "grad_norm": 3.5712430477142334, - "learning_rate": 3.7804750373356576e-06, - "loss": 0.3827, - "step": 537 - }, - { - "epoch": 3.3006134969325154, - "grad_norm": 3.6877355575561523, - "learning_rate": 3.776334277446307e-06, - "loss": 0.3233, - "step": 538 - }, - { - "epoch": 3.3067484662576687, - "grad_norm": 3.442706346511841, - "learning_rate": 3.7721887763508512e-06, - "loss": 0.1256, - "step": 539 - }, - { - "epoch": 3.312883435582822, - "grad_norm": 3.9265615940093994, - "learning_rate": 3.7680385494486053e-06, - "loss": 0.3845, - "step": 540 - }, - { - "epoch": 3.3190184049079754, - "grad_norm": 3.5030126571655273, - "learning_rate": 3.7638836121564414e-06, - "loss": 0.2905, - "step": 541 - }, - { - "epoch": 3.3251533742331287, - "grad_norm": 3.6685378551483154, - "learning_rate": 3.7597239799087283e-06, - "loss": 0.3561, - "step": 542 - }, - { - "epoch": 3.331288343558282, - "grad_norm": 3.8484046459198, - "learning_rate": 3.7555596681572736e-06, - "loss": 0.1157, - "step": 543 - }, - { - "epoch": 3.3374233128834354, - "grad_norm": 3.7977402210235596, - "learning_rate": 3.751390692371272e-06, - "loss": 0.3049, - "step": 544 - }, - { - "epoch": 3.3435582822085887, - "grad_norm": 3.4409852027893066, - "learning_rate": 3.7472170680372398e-06, - "loss": 0.1626, - "step": 545 - }, - { - "epoch": 3.3496932515337425, - "grad_norm": 3.801541328430176, - "learning_rate": 3.7430388106589632e-06, - "loss": 0.2414, - "step": 546 - }, - { - "epoch": 3.355828220858896, - "grad_norm": 4.025203704833984, - "learning_rate": 3.738855935757438e-06, - "loss": 0.3441, - "step": 547 - }, - { - "epoch": 3.361963190184049, - "grad_norm": 4.242798805236816, - "learning_rate": 3.7346684588708135e-06, - "loss": 0.5244, - "step": 548 - }, - { - "epoch": 3.3680981595092025, - "grad_norm": 3.0516819953918457, - "learning_rate": 3.7304763955543332e-06, - "loss": 0.1984, - "step": 549 - }, - { - "epoch": 3.374233128834356, - "grad_norm": 3.894667625427246, - "learning_rate": 3.726279761380279e-06, - "loss": 0.2715, - "step": 550 - }, - { - "epoch": 3.3803680981595092, - "grad_norm": 3.171208143234253, - "learning_rate": 3.72207857193791e-06, - "loss": 0.1537, - "step": 551 - }, - { - "epoch": 3.3865030674846626, - "grad_norm": 4.344860553741455, - "learning_rate": 3.7178728428334092e-06, - "loss": 0.2388, - "step": 552 - }, - { - "epoch": 3.392638036809816, - "grad_norm": 2.766317367553711, - "learning_rate": 3.7136625896898226e-06, - "loss": 0.1726, - "step": 553 - }, - { - "epoch": 3.3987730061349692, - "grad_norm": 3.550662040710449, - "learning_rate": 3.7094478281470003e-06, - "loss": 0.2942, - "step": 554 - }, - { - "epoch": 3.4049079754601226, - "grad_norm": 3.4576945304870605, - "learning_rate": 3.7052285738615412e-06, - "loss": 0.1665, - "step": 555 - }, - { - "epoch": 3.411042944785276, - "grad_norm": 4.026793003082275, - "learning_rate": 3.7010048425067317e-06, - "loss": 0.3954, - "step": 556 - }, - { - "epoch": 3.4171779141104293, - "grad_norm": 4.600133419036865, - "learning_rate": 3.696776649772492e-06, - "loss": 0.3207, - "step": 557 - }, - { - "epoch": 3.4233128834355826, - "grad_norm": 4.747331142425537, - "learning_rate": 3.692544011365312e-06, - "loss": 0.1325, - "step": 558 - }, - { - "epoch": 3.4294478527607364, - "grad_norm": 3.781464099884033, - "learning_rate": 3.6883069430081986e-06, - "loss": 0.1644, - "step": 559 - }, - { - "epoch": 3.4355828220858897, - "grad_norm": 2.905986785888672, - "learning_rate": 3.6840654604406135e-06, - "loss": 0.2469, - "step": 560 - }, - { - "epoch": 3.441717791411043, - "grad_norm": 2.3747711181640625, - "learning_rate": 3.679819579418414e-06, - "loss": 0.1146, - "step": 561 - }, - { - "epoch": 3.4478527607361964, - "grad_norm": 3.2683632373809814, - "learning_rate": 3.6755693157137995e-06, - "loss": 0.3236, - "step": 562 - }, - { - "epoch": 3.4539877300613497, - "grad_norm": 3.7750496864318848, - "learning_rate": 3.6713146851152487e-06, - "loss": 0.399, - "step": 563 - }, - { - "epoch": 3.460122699386503, - "grad_norm": 3.3912384510040283, - "learning_rate": 3.667055703427461e-06, - "loss": 0.1259, - "step": 564 - }, - { - "epoch": 3.4662576687116564, - "grad_norm": 3.0224430561065674, - "learning_rate": 3.6627923864713e-06, - "loss": 0.1835, - "step": 565 - }, - { - "epoch": 3.4723926380368098, - "grad_norm": 3.642258405685425, - "learning_rate": 3.658524750083733e-06, - "loss": 0.2763, - "step": 566 - }, - { - "epoch": 3.478527607361963, - "grad_norm": 3.409890651702881, - "learning_rate": 3.654252810117773e-06, - "loss": 0.2496, - "step": 567 - }, - { - "epoch": 3.4846625766871164, - "grad_norm": 3.0416476726531982, - "learning_rate": 3.6499765824424195e-06, - "loss": 0.1287, - "step": 568 - }, - { - "epoch": 3.4907975460122698, - "grad_norm": 3.1963987350463867, - "learning_rate": 3.6456960829425987e-06, - "loss": 0.1747, - "step": 569 - }, - { - "epoch": 3.4969325153374236, - "grad_norm": 3.198448657989502, - "learning_rate": 3.641411327519107e-06, - "loss": 0.1913, - "step": 570 - }, - { - "epoch": 3.5030674846625764, - "grad_norm": 3.7023441791534424, - "learning_rate": 3.6371223320885492e-06, - "loss": 0.3224, - "step": 571 - }, - { - "epoch": 3.5092024539877302, - "grad_norm": 4.54288387298584, - "learning_rate": 3.6328291125832803e-06, - "loss": 0.2364, - "step": 572 - }, - { - "epoch": 3.5153374233128836, - "grad_norm": 3.5064890384674072, - "learning_rate": 3.628531684951347e-06, - "loss": 0.2552, - "step": 573 - }, - { - "epoch": 3.521472392638037, - "grad_norm": 3.987583875656128, - "learning_rate": 3.6242300651564276e-06, - "loss": 0.3232, - "step": 574 - }, - { - "epoch": 3.5276073619631902, - "grad_norm": 3.179642915725708, - "learning_rate": 3.6199242691777745e-06, - "loss": 0.32, - "step": 575 - }, - { - "epoch": 3.5337423312883436, - "grad_norm": 3.3078157901763916, - "learning_rate": 3.6156143130101516e-06, - "loss": 0.2922, - "step": 576 - }, - { - "epoch": 3.539877300613497, - "grad_norm": 3.1628613471984863, - "learning_rate": 3.6113002126637765e-06, - "loss": 0.2005, - "step": 577 - }, - { - "epoch": 3.5460122699386503, - "grad_norm": 3.4515540599823, - "learning_rate": 3.606981984164263e-06, - "loss": 0.2138, - "step": 578 - }, - { - "epoch": 3.5521472392638036, - "grad_norm": 5.132473945617676, - "learning_rate": 3.6026596435525578e-06, - "loss": 0.4382, - "step": 579 - }, - { - "epoch": 3.558282208588957, - "grad_norm": 3.397614002227783, - "learning_rate": 3.5983332068848855e-06, - "loss": 0.3326, - "step": 580 - }, - { - "epoch": 3.5644171779141103, - "grad_norm": 4.79497766494751, - "learning_rate": 3.5940026902326825e-06, - "loss": 0.4748, - "step": 581 - }, - { - "epoch": 3.5705521472392636, - "grad_norm": 3.7675018310546875, - "learning_rate": 3.5896681096825446e-06, - "loss": 0.2692, - "step": 582 - }, - { - "epoch": 3.5766871165644174, - "grad_norm": 3.0637521743774414, - "learning_rate": 3.5853294813361614e-06, - "loss": 0.3658, - "step": 583 - }, - { - "epoch": 3.5828220858895703, - "grad_norm": 2.8949790000915527, - "learning_rate": 3.5809868213102623e-06, - "loss": 0.1661, - "step": 584 - }, - { - "epoch": 3.588957055214724, - "grad_norm": 3.163419246673584, - "learning_rate": 3.5766401457365485e-06, - "loss": 0.1233, - "step": 585 - }, - { - "epoch": 3.5950920245398774, - "grad_norm": 3.1787965297698975, - "learning_rate": 3.5722894707616417e-06, - "loss": 0.278, - "step": 586 - }, - { - "epoch": 3.6012269938650308, - "grad_norm": 2.9397857189178467, - "learning_rate": 3.5679348125470175e-06, - "loss": 0.1541, - "step": 587 - }, - { - "epoch": 3.607361963190184, - "grad_norm": 3.2690396308898926, - "learning_rate": 3.56357618726895e-06, - "loss": 0.1575, - "step": 588 - }, - { - "epoch": 3.6134969325153374, - "grad_norm": 5.444014072418213, - "learning_rate": 3.5592136111184483e-06, - "loss": 0.8079, - "step": 589 - }, - { - "epoch": 3.6196319018404908, - "grad_norm": 3.1688313484191895, - "learning_rate": 3.554847100301199e-06, - "loss": 0.341, - "step": 590 - }, - { - "epoch": 3.625766871165644, - "grad_norm": 2.469212532043457, - "learning_rate": 3.550476671037505e-06, - "loss": 0.1625, - "step": 591 - }, - { - "epoch": 3.6319018404907975, - "grad_norm": 3.3956527709960938, - "learning_rate": 3.546102339562223e-06, - "loss": 0.199, - "step": 592 - }, - { - "epoch": 3.638036809815951, - "grad_norm": 2.7287702560424805, - "learning_rate": 3.5417241221247078e-06, - "loss": 0.1493, - "step": 593 - }, - { - "epoch": 3.644171779141104, - "grad_norm": 3.5046865940093994, - "learning_rate": 3.5373420349887477e-06, - "loss": 0.2765, - "step": 594 - }, - { - "epoch": 3.6503067484662575, - "grad_norm": 3.121476650238037, - "learning_rate": 3.5329560944325065e-06, - "loss": 0.2833, - "step": 595 - }, - { - "epoch": 3.6564417177914113, - "grad_norm": 3.276463270187378, - "learning_rate": 3.528566316748462e-06, - "loss": 0.1237, - "step": 596 - }, - { - "epoch": 3.662576687116564, - "grad_norm": 3.382840633392334, - "learning_rate": 3.524172718243347e-06, - "loss": 0.1599, - "step": 597 - }, - { - "epoch": 3.668711656441718, - "grad_norm": 4.801311492919922, - "learning_rate": 3.5197753152380854e-06, - "loss": 0.2997, - "step": 598 - }, - { - "epoch": 3.6748466257668713, - "grad_norm": 4.117336273193359, - "learning_rate": 3.515374124067736e-06, - "loss": 0.2021, - "step": 599 - }, - { - "epoch": 3.6809815950920246, - "grad_norm": 3.611438035964966, - "learning_rate": 3.5109691610814263e-06, - "loss": 0.1726, - "step": 600 - }, - { - "epoch": 3.687116564417178, - "grad_norm": 4.5179972648620605, - "learning_rate": 3.5065604426422995e-06, - "loss": 0.1377, - "step": 601 - }, - { - "epoch": 3.6932515337423313, - "grad_norm": 3.561061382293701, - "learning_rate": 3.502147985127445e-06, - "loss": 0.1497, - "step": 602 - }, - { - "epoch": 3.6993865030674846, - "grad_norm": 3.3497917652130127, - "learning_rate": 3.4977318049278443e-06, - "loss": 0.1589, - "step": 603 - }, - { - "epoch": 3.705521472392638, - "grad_norm": 3.2725470066070557, - "learning_rate": 3.4933119184483065e-06, - "loss": 0.1364, - "step": 604 - }, - { - "epoch": 3.7116564417177913, - "grad_norm": 3.228956460952759, - "learning_rate": 3.4888883421074076e-06, - "loss": 0.177, - "step": 605 - }, - { - "epoch": 3.7177914110429446, - "grad_norm": 3.7648911476135254, - "learning_rate": 3.484461092337434e-06, - "loss": 0.122, - "step": 606 - }, - { - "epoch": 3.7239263803680984, - "grad_norm": 3.5322585105895996, - "learning_rate": 3.4800301855843137e-06, - "loss": 0.2664, - "step": 607 - }, - { - "epoch": 3.7300613496932513, - "grad_norm": 2.951073169708252, - "learning_rate": 3.4755956383075613e-06, - "loss": 0.12, - "step": 608 - }, - { - "epoch": 3.736196319018405, - "grad_norm": 3.0577664375305176, - "learning_rate": 3.471157466980214e-06, - "loss": 0.3926, - "step": 609 - }, - { - "epoch": 3.7423312883435584, - "grad_norm": 4.089846134185791, - "learning_rate": 3.466715688088772e-06, - "loss": 0.6233, - "step": 610 - }, - { - "epoch": 3.7484662576687118, - "grad_norm": 3.081340789794922, - "learning_rate": 3.462270318133136e-06, - "loss": 0.2456, - "step": 611 - }, - { - "epoch": 3.754601226993865, - "grad_norm": 3.034712553024292, - "learning_rate": 3.4578213736265474e-06, - "loss": 0.2683, - "step": 612 - }, - { - "epoch": 3.7607361963190185, - "grad_norm": 3.459815740585327, - "learning_rate": 3.4533688710955255e-06, - "loss": 0.3796, - "step": 613 - }, - { - "epoch": 3.766871165644172, - "grad_norm": 3.523737907409668, - "learning_rate": 3.448912827079805e-06, - "loss": 0.3326, - "step": 614 - }, - { - "epoch": 3.773006134969325, - "grad_norm": 3.333219289779663, - "learning_rate": 3.4444532581322793e-06, - "loss": 0.206, - "step": 615 - }, - { - "epoch": 3.7791411042944785, - "grad_norm": 3.582387685775757, - "learning_rate": 3.4399901808189327e-06, - "loss": 0.244, - "step": 616 - }, - { - "epoch": 3.785276073619632, - "grad_norm": 3.4887266159057617, - "learning_rate": 3.435523611718785e-06, - "loss": 0.1796, - "step": 617 - }, - { - "epoch": 3.791411042944785, - "grad_norm": 4.89408016204834, - "learning_rate": 3.4310535674238242e-06, - "loss": 0.188, - "step": 618 - }, - { - "epoch": 3.7975460122699385, - "grad_norm": 4.338910102844238, - "learning_rate": 3.42658006453895e-06, - "loss": 0.3039, - "step": 619 - }, - { - "epoch": 3.8036809815950923, - "grad_norm": 4.107708930969238, - "learning_rate": 3.4221031196819083e-06, - "loss": 0.3383, - "step": 620 - }, - { - "epoch": 3.809815950920245, - "grad_norm": 3.698777675628662, - "learning_rate": 3.4176227494832305e-06, - "loss": 0.1721, - "step": 621 - }, - { - "epoch": 3.815950920245399, - "grad_norm": 2.6659226417541504, - "learning_rate": 3.413138970586174e-06, - "loss": 0.2211, - "step": 622 - }, - { - "epoch": 3.8220858895705523, - "grad_norm": 3.2398436069488525, - "learning_rate": 3.4086517996466574e-06, - "loss": 0.1871, - "step": 623 - }, - { - "epoch": 3.8282208588957056, - "grad_norm": 4.9128804206848145, - "learning_rate": 3.404161253333199e-06, - "loss": 0.3874, - "step": 624 - }, - { - "epoch": 3.834355828220859, - "grad_norm": 3.508789300918579, - "learning_rate": 3.3996673483268573e-06, - "loss": 0.1739, - "step": 625 - }, - { - "epoch": 3.8404907975460123, - "grad_norm": 3.3016927242279053, - "learning_rate": 3.3951701013211665e-06, - "loss": 0.274, - "step": 626 - }, - { - "epoch": 3.8466257668711656, - "grad_norm": 3.8941333293914795, - "learning_rate": 3.3906695290220736e-06, - "loss": 0.3568, - "step": 627 - }, - { - "epoch": 3.852760736196319, - "grad_norm": 3.512354850769043, - "learning_rate": 3.3861656481478816e-06, - "loss": 0.157, - "step": 628 - }, - { - "epoch": 3.8588957055214723, - "grad_norm": 3.482649326324463, - "learning_rate": 3.3816584754291814e-06, - "loss": 0.1218, - "step": 629 - }, - { - "epoch": 3.8650306748466257, - "grad_norm": 3.1490275859832764, - "learning_rate": 3.377148027608793e-06, - "loss": 0.2234, - "step": 630 - }, - { - "epoch": 3.871165644171779, - "grad_norm": 3.2172653675079346, - "learning_rate": 3.3726343214417023e-06, - "loss": 0.3329, - "step": 631 - }, - { - "epoch": 3.8773006134969323, - "grad_norm": 4.167707443237305, - "learning_rate": 3.3681173736949984e-06, - "loss": 0.1384, - "step": 632 - }, - { - "epoch": 3.883435582822086, - "grad_norm": 3.4743919372558594, - "learning_rate": 3.3635972011478134e-06, - "loss": 0.3807, - "step": 633 - }, - { - "epoch": 3.889570552147239, - "grad_norm": 3.6892173290252686, - "learning_rate": 3.3590738205912566e-06, - "loss": 0.194, - "step": 634 - }, - { - "epoch": 3.895705521472393, - "grad_norm": 3.262967824935913, - "learning_rate": 3.354547248828356e-06, - "loss": 0.202, - "step": 635 - }, - { - "epoch": 3.901840490797546, - "grad_norm": 3.8871562480926514, - "learning_rate": 3.3500175026739916e-06, - "loss": 0.2471, - "step": 636 - }, - { - "epoch": 3.9079754601226995, - "grad_norm": 3.5097084045410156, - "learning_rate": 3.3454845989548385e-06, - "loss": 0.1112, - "step": 637 - }, - { - "epoch": 3.914110429447853, - "grad_norm": 4.163944721221924, - "learning_rate": 3.3409485545092995e-06, - "loss": 0.3368, - "step": 638 - }, - { - "epoch": 3.920245398773006, - "grad_norm": 3.6405045986175537, - "learning_rate": 3.336409386187444e-06, - "loss": 0.1863, - "step": 639 - }, - { - "epoch": 3.9263803680981595, - "grad_norm": 3.2477526664733887, - "learning_rate": 3.331867110850946e-06, - "loss": 0.1491, - "step": 640 - }, - { - "epoch": 3.932515337423313, - "grad_norm": 3.933753490447998, - "learning_rate": 3.327321745373021e-06, - "loss": 0.2484, - "step": 641 - }, - { - "epoch": 3.938650306748466, - "grad_norm": 3.2475059032440186, - "learning_rate": 3.322773306638364e-06, - "loss": 0.2126, - "step": 642 - }, - { - "epoch": 3.9447852760736195, - "grad_norm": 2.628467321395874, - "learning_rate": 3.318221811543086e-06, - "loss": 0.1649, - "step": 643 - }, - { - "epoch": 3.950920245398773, - "grad_norm": 3.2612411975860596, - "learning_rate": 3.313667276994651e-06, - "loss": 0.1442, - "step": 644 - }, - { - "epoch": 3.957055214723926, - "grad_norm": 3.8058395385742188, - "learning_rate": 3.309109719911814e-06, - "loss": 0.359, - "step": 645 - }, - { - "epoch": 3.96319018404908, - "grad_norm": 3.3450071811676025, - "learning_rate": 3.304549157224558e-06, - "loss": 0.4042, - "step": 646 - }, - { - "epoch": 3.969325153374233, - "grad_norm": 3.079601287841797, - "learning_rate": 3.299985605874031e-06, - "loss": 0.1699, - "step": 647 - }, - { - "epoch": 3.9754601226993866, - "grad_norm": 3.8963980674743652, - "learning_rate": 3.295419082812483e-06, - "loss": 0.1888, - "step": 648 - }, - { - "epoch": 3.98159509202454, - "grad_norm": 3.307405948638916, - "learning_rate": 3.2908496050032024e-06, - "loss": 0.2824, - "step": 649 - }, - { - "epoch": 3.9877300613496933, - "grad_norm": 3.227478265762329, - "learning_rate": 3.2862771894204544e-06, - "loss": 0.3038, - "step": 650 - }, - { - "epoch": 3.9938650306748467, - "grad_norm": 4.046506881713867, - "learning_rate": 3.2817018530494164e-06, - "loss": 0.3266, - "step": 651 - }, - { - "epoch": 4.0, - "grad_norm": 7.775874614715576, - "learning_rate": 3.277123612886116e-06, - "loss": 0.2998, - "step": 652 - }, - { - "epoch": 4.006134969325154, - "grad_norm": 3.146462917327881, - "learning_rate": 3.272542485937369e-06, - "loss": 0.2764, - "step": 653 - }, - { - "epoch": 4.012269938650307, - "grad_norm": 3.0539863109588623, - "learning_rate": 3.2679584892207118e-06, - "loss": 0.1157, - "step": 654 - }, - { - "epoch": 4.0184049079754605, - "grad_norm": 3.634021520614624, - "learning_rate": 3.263371639764343e-06, - "loss": 0.0707, - "step": 655 - }, - { - "epoch": 4.024539877300613, - "grad_norm": 3.3474650382995605, - "learning_rate": 3.2587819546070596e-06, - "loss": 0.1067, - "step": 656 - }, - { - "epoch": 4.030674846625767, - "grad_norm": 4.409244537353516, - "learning_rate": 3.254189450798189e-06, - "loss": 0.0564, - "step": 657 - }, - { - "epoch": 4.03680981595092, - "grad_norm": 3.0446252822875977, - "learning_rate": 3.2495941453975312e-06, - "loss": 0.0535, - "step": 658 - }, - { - "epoch": 4.042944785276074, - "grad_norm": 4.014753818511963, - "learning_rate": 3.2449960554752935e-06, - "loss": 0.1245, - "step": 659 - }, - { - "epoch": 4.049079754601227, - "grad_norm": 3.188062906265259, - "learning_rate": 3.240395198112026e-06, - "loss": 0.0626, - "step": 660 - }, - { - "epoch": 4.0552147239263805, - "grad_norm": 3.006086826324463, - "learning_rate": 3.2357915903985605e-06, - "loss": 0.1198, - "step": 661 - }, - { - "epoch": 4.061349693251533, - "grad_norm": 2.8865551948547363, - "learning_rate": 3.2311852494359423e-06, - "loss": 0.0454, - "step": 662 - }, - { - "epoch": 4.067484662576687, - "grad_norm": 4.2888007164001465, - "learning_rate": 3.226576192335373e-06, - "loss": 0.2064, - "step": 663 - }, - { - "epoch": 4.07361963190184, - "grad_norm": 3.1414525508880615, - "learning_rate": 3.2219644362181436e-06, - "loss": 0.2183, - "step": 664 - }, - { - "epoch": 4.079754601226994, - "grad_norm": 2.556277275085449, - "learning_rate": 3.21734999821557e-06, - "loss": 0.0516, - "step": 665 - }, - { - "epoch": 4.085889570552148, - "grad_norm": 2.698118209838867, - "learning_rate": 3.2127328954689307e-06, - "loss": 0.0613, - "step": 666 - }, - { - "epoch": 4.0920245398773005, - "grad_norm": 2.869919538497925, - "learning_rate": 3.2081131451294025e-06, - "loss": 0.0583, - "step": 667 - }, - { - "epoch": 4.098159509202454, - "grad_norm": 3.8786919116973877, - "learning_rate": 3.2034907643579988e-06, - "loss": 0.0766, - "step": 668 - }, - { - "epoch": 4.104294478527607, - "grad_norm": 4.224637031555176, - "learning_rate": 3.1988657703255043e-06, - "loss": 0.1099, - "step": 669 - }, - { - "epoch": 4.110429447852761, - "grad_norm": 4.671669006347656, - "learning_rate": 3.194238180212409e-06, - "loss": 0.1663, - "step": 670 - }, - { - "epoch": 4.116564417177914, - "grad_norm": 3.2484257221221924, - "learning_rate": 3.1896080112088477e-06, - "loss": 0.0587, - "step": 671 - }, - { - "epoch": 4.122699386503068, - "grad_norm": 2.4808075428009033, - "learning_rate": 3.184975280514536e-06, - "loss": 0.0579, - "step": 672 - }, - { - "epoch": 4.128834355828221, - "grad_norm": 3.7106919288635254, - "learning_rate": 3.1803400053387044e-06, - "loss": 0.1083, - "step": 673 - }, - { - "epoch": 4.134969325153374, - "grad_norm": 3.008970260620117, - "learning_rate": 3.175702202900036e-06, - "loss": 0.1355, - "step": 674 - }, - { - "epoch": 4.141104294478527, - "grad_norm": 3.2640793323516846, - "learning_rate": 3.1710618904266006e-06, - "loss": 0.092, - "step": 675 - }, - { - "epoch": 4.147239263803681, - "grad_norm": 3.08042049407959, - "learning_rate": 3.166419085155793e-06, - "loss": 0.0563, - "step": 676 - }, - { - "epoch": 4.153374233128835, - "grad_norm": 2.993530511856079, - "learning_rate": 3.1617738043342695e-06, - "loss": 0.1773, - "step": 677 - }, - { - "epoch": 4.159509202453988, - "grad_norm": 2.6218204498291016, - "learning_rate": 3.157126065217879e-06, - "loss": 0.0489, - "step": 678 - }, - { - "epoch": 4.1656441717791415, - "grad_norm": 4.3173723220825195, - "learning_rate": 3.152475885071606e-06, - "loss": 0.1333, - "step": 679 - }, - { - "epoch": 4.171779141104294, - "grad_norm": 3.659149408340454, - "learning_rate": 3.147823281169498e-06, - "loss": 0.1501, - "step": 680 - }, - { - "epoch": 4.177914110429448, - "grad_norm": 3.0953338146209717, - "learning_rate": 3.143168270794612e-06, - "loss": 0.1067, - "step": 681 - }, - { - "epoch": 4.184049079754601, - "grad_norm": 3.5693907737731934, - "learning_rate": 3.1385108712389394e-06, - "loss": 0.2499, - "step": 682 - }, - { - "epoch": 4.190184049079755, - "grad_norm": 3.3022868633270264, - "learning_rate": 3.1338510998033484e-06, - "loss": 0.1748, - "step": 683 - }, - { - "epoch": 4.196319018404908, - "grad_norm": 3.7468113899230957, - "learning_rate": 3.129188973797519e-06, - "loss": 0.201, - "step": 684 - }, - { - "epoch": 4.2024539877300615, - "grad_norm": 2.8381078243255615, - "learning_rate": 3.124524510539875e-06, - "loss": 0.0735, - "step": 685 - }, - { - "epoch": 4.208588957055214, - "grad_norm": 2.84706974029541, - "learning_rate": 3.119857727357527e-06, - "loss": 0.1806, - "step": 686 - }, - { - "epoch": 4.214723926380368, - "grad_norm": 3.8130292892456055, - "learning_rate": 3.1151886415861993e-06, - "loss": 0.1811, - "step": 687 - }, - { - "epoch": 4.220858895705521, - "grad_norm": 3.528895378112793, - "learning_rate": 3.1105172705701708e-06, - "loss": 0.1634, - "step": 688 - }, - { - "epoch": 4.226993865030675, - "grad_norm": 5.028727054595947, - "learning_rate": 3.1058436316622103e-06, - "loss": 0.1625, - "step": 689 - }, - { - "epoch": 4.233128834355828, - "grad_norm": 4.606889247894287, - "learning_rate": 3.1011677422235093e-06, - "loss": 0.1791, - "step": 690 - }, - { - "epoch": 4.2392638036809815, - "grad_norm": 3.3620636463165283, - "learning_rate": 3.0964896196236217e-06, - "loss": 0.2233, - "step": 691 - }, - { - "epoch": 4.245398773006135, - "grad_norm": 3.7845852375030518, - "learning_rate": 3.0918092812403954e-06, - "loss": 0.1142, - "step": 692 - }, - { - "epoch": 4.251533742331288, - "grad_norm": 3.1204118728637695, - "learning_rate": 3.0871267444599098e-06, - "loss": 0.096, - "step": 693 - }, - { - "epoch": 4.257668711656442, - "grad_norm": 3.686067819595337, - "learning_rate": 3.0824420266764093e-06, - "loss": 0.2749, - "step": 694 - }, - { - "epoch": 4.263803680981595, - "grad_norm": 3.1680829524993896, - "learning_rate": 3.077755145292243e-06, - "loss": 0.2504, - "step": 695 - }, - { - "epoch": 4.269938650306749, - "grad_norm": 3.3179469108581543, - "learning_rate": 3.0730661177177957e-06, - "loss": 0.1324, - "step": 696 - }, - { - "epoch": 4.276073619631902, - "grad_norm": 3.1186370849609375, - "learning_rate": 3.0683749613714238e-06, - "loss": 0.0691, - "step": 697 - }, - { - "epoch": 4.282208588957055, - "grad_norm": 3.086834192276001, - "learning_rate": 3.063681693679391e-06, - "loss": 0.1026, - "step": 698 - }, - { - "epoch": 4.288343558282208, - "grad_norm": 4.629584312438965, - "learning_rate": 3.0589863320758063e-06, - "loss": 0.2646, - "step": 699 - }, - { - "epoch": 4.294478527607362, - "grad_norm": 3.9641213417053223, - "learning_rate": 3.0542888940025562e-06, - "loss": 0.1711, - "step": 700 - }, - { - "epoch": 4.300613496932515, - "grad_norm": 3.75014328956604, - "learning_rate": 3.0495893969092395e-06, - "loss": 0.0589, - "step": 701 - }, - { - "epoch": 4.306748466257669, - "grad_norm": 3.603290319442749, - "learning_rate": 3.044887858253105e-06, - "loss": 0.2244, - "step": 702 - }, - { - "epoch": 4.3128834355828225, - "grad_norm": 3.79404616355896, - "learning_rate": 3.040184295498984e-06, - "loss": 0.1506, - "step": 703 - }, - { - "epoch": 4.319018404907975, - "grad_norm": 3.0890021324157715, - "learning_rate": 3.035478726119228e-06, - "loss": 0.2343, - "step": 704 - }, - { - "epoch": 4.325153374233129, - "grad_norm": 3.6688191890716553, - "learning_rate": 3.0307711675936426e-06, - "loss": 0.0518, - "step": 705 - }, - { - "epoch": 4.331288343558282, - "grad_norm": 5.1836700439453125, - "learning_rate": 3.0260616374094208e-06, - "loss": 0.2363, - "step": 706 - }, - { - "epoch": 4.337423312883436, - "grad_norm": 2.7123284339904785, - "learning_rate": 3.0213501530610807e-06, - "loss": 0.0848, - "step": 707 - }, - { - "epoch": 4.343558282208589, - "grad_norm": 3.5661890506744385, - "learning_rate": 3.0166367320504005e-06, - "loss": 0.149, - "step": 708 - }, - { - "epoch": 4.3496932515337425, - "grad_norm": 3.6454737186431885, - "learning_rate": 3.0119213918863515e-06, - "loss": 0.1133, - "step": 709 - }, - { - "epoch": 4.355828220858895, - "grad_norm": 3.7534968852996826, - "learning_rate": 3.0072041500850343e-06, - "loss": 0.1358, - "step": 710 - }, - { - "epoch": 4.361963190184049, - "grad_norm": 3.40387225151062, - "learning_rate": 3.0024850241696128e-06, - "loss": 0.0706, - "step": 711 - }, - { - "epoch": 4.368098159509202, - "grad_norm": 3.250471591949463, - "learning_rate": 2.9977640316702512e-06, - "loss": 0.1977, - "step": 712 - }, - { - "epoch": 4.374233128834356, - "grad_norm": 3.417781352996826, - "learning_rate": 2.993041190124047e-06, - "loss": 0.2622, - "step": 713 - }, - { - "epoch": 4.38036809815951, - "grad_norm": 2.628434181213379, - "learning_rate": 2.9883165170749657e-06, - "loss": 0.1487, - "step": 714 - }, - { - "epoch": 4.386503067484663, - "grad_norm": 3.240264892578125, - "learning_rate": 2.9835900300737763e-06, - "loss": 0.0822, - "step": 715 - }, - { - "epoch": 4.392638036809816, - "grad_norm": 6.575517177581787, - "learning_rate": 2.9788617466779884e-06, - "loss": 0.3668, - "step": 716 - }, - { - "epoch": 4.398773006134969, - "grad_norm": 4.699089050292969, - "learning_rate": 2.974131684451781e-06, - "loss": 0.2432, - "step": 717 - }, - { - "epoch": 4.404907975460123, - "grad_norm": 2.9815752506256104, - "learning_rate": 2.9693998609659443e-06, - "loss": 0.0689, - "step": 718 - }, - { - "epoch": 4.411042944785276, - "grad_norm": 4.192755222320557, - "learning_rate": 2.9646662937978082e-06, - "loss": 0.1897, - "step": 719 - }, - { - "epoch": 4.41717791411043, - "grad_norm": 2.9729068279266357, - "learning_rate": 2.9599310005311824e-06, - "loss": 0.0457, - "step": 720 - }, - { - "epoch": 4.423312883435583, - "grad_norm": 4.234438896179199, - "learning_rate": 2.9551939987562866e-06, - "loss": 0.2307, - "step": 721 - }, - { - "epoch": 4.429447852760736, - "grad_norm": 3.3982434272766113, - "learning_rate": 2.950455306069688e-06, - "loss": 0.0637, - "step": 722 - }, - { - "epoch": 4.435582822085889, - "grad_norm": 4.539764404296875, - "learning_rate": 2.9457149400742357e-06, - "loss": 0.1924, - "step": 723 - }, - { - "epoch": 4.441717791411043, - "grad_norm": 4.039684772491455, - "learning_rate": 2.940972918378993e-06, - "loss": 0.1275, - "step": 724 - }, - { - "epoch": 4.447852760736196, - "grad_norm": 4.340360641479492, - "learning_rate": 2.936229258599174e-06, - "loss": 0.123, - "step": 725 - }, - { - "epoch": 4.45398773006135, - "grad_norm": 2.8720109462738037, - "learning_rate": 2.93148397835608e-06, - "loss": 0.0555, - "step": 726 - }, - { - "epoch": 4.460122699386503, - "grad_norm": 4.227811336517334, - "learning_rate": 2.926737095277029e-06, - "loss": 0.0991, - "step": 727 - }, - { - "epoch": 4.466257668711656, - "grad_norm": 2.8079142570495605, - "learning_rate": 2.921988626995295e-06, - "loss": 0.0628, - "step": 728 - }, - { - "epoch": 4.47239263803681, - "grad_norm": 4.195122241973877, - "learning_rate": 2.9172385911500385e-06, - "loss": 0.2333, - "step": 729 - }, - { - "epoch": 4.478527607361963, - "grad_norm": 3.223794460296631, - "learning_rate": 2.9124870053862447e-06, - "loss": 0.1317, - "step": 730 - }, - { - "epoch": 4.484662576687117, - "grad_norm": 3.5533759593963623, - "learning_rate": 2.907733887354657e-06, - "loss": 0.2285, - "step": 731 - }, - { - "epoch": 4.49079754601227, - "grad_norm": 3.535673141479492, - "learning_rate": 2.9029792547117088e-06, - "loss": 0.096, - "step": 732 - }, - { - "epoch": 4.4969325153374236, - "grad_norm": 4.031703948974609, - "learning_rate": 2.898223125119461e-06, - "loss": 0.1505, - "step": 733 - }, - { - "epoch": 4.5030674846625764, - "grad_norm": 2.823413610458374, - "learning_rate": 2.893465516245534e-06, - "loss": 0.0327, - "step": 734 - }, - { - "epoch": 4.50920245398773, - "grad_norm": 3.516738176345825, - "learning_rate": 2.8887064457630453e-06, - "loss": 0.0743, - "step": 735 - }, - { - "epoch": 4.515337423312883, - "grad_norm": 3.5523500442504883, - "learning_rate": 2.8839459313505407e-06, - "loss": 0.1768, - "step": 736 - }, - { - "epoch": 4.521472392638037, - "grad_norm": 3.2433223724365234, - "learning_rate": 2.879183990691929e-06, - "loss": 0.1598, - "step": 737 - }, - { - "epoch": 4.52760736196319, - "grad_norm": 3.0156848430633545, - "learning_rate": 2.8744206414764185e-06, - "loss": 0.0829, - "step": 738 - }, - { - "epoch": 4.533742331288344, - "grad_norm": 4.359529495239258, - "learning_rate": 2.8696559013984488e-06, - "loss": 0.1169, - "step": 739 - }, - { - "epoch": 4.539877300613497, - "grad_norm": 2.3862433433532715, - "learning_rate": 2.8648897881576274e-06, - "loss": 0.0962, - "step": 740 - }, - { - "epoch": 4.54601226993865, - "grad_norm": 2.7100136280059814, - "learning_rate": 2.8601223194586613e-06, - "loss": 0.1204, - "step": 741 - }, - { - "epoch": 4.552147239263804, - "grad_norm": 3.8116140365600586, - "learning_rate": 2.8553535130112935e-06, - "loss": 0.0685, - "step": 742 - }, - { - "epoch": 4.558282208588957, - "grad_norm": 2.9640142917633057, - "learning_rate": 2.850583386530235e-06, - "loss": 0.0692, - "step": 743 - }, - { - "epoch": 4.564417177914111, - "grad_norm": 3.264592170715332, - "learning_rate": 2.8458119577351035e-06, - "loss": 0.2128, - "step": 744 - }, - { - "epoch": 4.570552147239264, - "grad_norm": 3.230497360229492, - "learning_rate": 2.841039244350351e-06, - "loss": 0.2409, - "step": 745 - }, - { - "epoch": 4.576687116564417, - "grad_norm": 4.41513204574585, - "learning_rate": 2.8362652641052024e-06, - "loss": 0.1878, - "step": 746 - }, - { - "epoch": 4.58282208588957, - "grad_norm": 3.047248601913452, - "learning_rate": 2.83149003473359e-06, - "loss": 0.1303, - "step": 747 - }, - { - "epoch": 4.588957055214724, - "grad_norm": 2.399754047393799, - "learning_rate": 2.8267135739740836e-06, - "loss": 0.0577, - "step": 748 - }, - { - "epoch": 4.595092024539877, - "grad_norm": 4.608038425445557, - "learning_rate": 2.8219358995698307e-06, - "loss": 0.2329, - "step": 749 - }, - { - "epoch": 4.601226993865031, - "grad_norm": 3.537644147872925, - "learning_rate": 2.8171570292684846e-06, - "loss": 0.1329, - "step": 750 - }, - { - "epoch": 4.6073619631901845, - "grad_norm": 2.8099827766418457, - "learning_rate": 2.8123769808221407e-06, - "loss": 0.1512, - "step": 751 - }, - { - "epoch": 4.613496932515337, - "grad_norm": 3.3169758319854736, - "learning_rate": 2.8075957719872724e-06, - "loss": 0.1267, - "step": 752 - }, - { - "epoch": 4.61963190184049, - "grad_norm": 3.578435182571411, - "learning_rate": 2.8028134205246633e-06, - "loss": 0.147, - "step": 753 - }, - { - "epoch": 4.625766871165644, - "grad_norm": 3.544437885284424, - "learning_rate": 2.7980299441993415e-06, - "loss": 0.0947, - "step": 754 - }, - { - "epoch": 4.631901840490798, - "grad_norm": 3.798776388168335, - "learning_rate": 2.793245360780512e-06, - "loss": 0.1498, - "step": 755 - }, - { - "epoch": 4.638036809815951, - "grad_norm": 3.634991407394409, - "learning_rate": 2.788459688041495e-06, - "loss": 0.2504, - "step": 756 - }, - { - "epoch": 4.644171779141105, - "grad_norm": 20.123680114746094, - "learning_rate": 2.783672943759655e-06, - "loss": 0.2091, - "step": 757 - }, - { - "epoch": 4.6503067484662575, - "grad_norm": 3.9357221126556396, - "learning_rate": 2.778885145716339e-06, - "loss": 0.2045, - "step": 758 - }, - { - "epoch": 4.656441717791411, - "grad_norm": 3.3035309314727783, - "learning_rate": 2.7740963116968063e-06, - "loss": 0.1416, - "step": 759 - }, - { - "epoch": 4.662576687116564, - "grad_norm": 3.096985101699829, - "learning_rate": 2.7693064594901646e-06, - "loss": 0.0455, - "step": 760 - }, - { - "epoch": 4.668711656441718, - "grad_norm": 2.9855458736419678, - "learning_rate": 2.7645156068893075e-06, - "loss": 0.1496, - "step": 761 - }, - { - "epoch": 4.674846625766871, - "grad_norm": 3.9140093326568604, - "learning_rate": 2.759723771690839e-06, - "loss": 0.2061, - "step": 762 - }, - { - "epoch": 4.680981595092025, - "grad_norm": 3.590569496154785, - "learning_rate": 2.754930971695019e-06, - "loss": 0.1017, - "step": 763 - }, - { - "epoch": 4.6871165644171775, - "grad_norm": 3.527254581451416, - "learning_rate": 2.750137224705687e-06, - "loss": 0.1979, - "step": 764 - }, - { - "epoch": 4.693251533742331, - "grad_norm": 4.198459148406982, - "learning_rate": 2.745342548530202e-06, - "loss": 0.1667, - "step": 765 - }, - { - "epoch": 4.699386503067485, - "grad_norm": 2.0246167182922363, - "learning_rate": 2.7405469609793746e-06, - "loss": 0.0346, - "step": 766 - }, - { - "epoch": 4.705521472392638, - "grad_norm": 3.2045300006866455, - "learning_rate": 2.7357504798674004e-06, - "loss": 0.0596, - "step": 767 - }, - { - "epoch": 4.711656441717792, - "grad_norm": 2.736985921859741, - "learning_rate": 2.730953123011796e-06, - "loss": 0.0384, - "step": 768 - }, - { - "epoch": 4.717791411042945, - "grad_norm": 3.0621395111083984, - "learning_rate": 2.726154908233328e-06, - "loss": 0.0558, - "step": 769 - }, - { - "epoch": 4.723926380368098, - "grad_norm": 3.2280497550964355, - "learning_rate": 2.721355853355953e-06, - "loss": 0.2272, - "step": 770 - }, - { - "epoch": 4.730061349693251, - "grad_norm": 3.342226028442383, - "learning_rate": 2.716555976206748e-06, - "loss": 0.074, - "step": 771 - }, - { - "epoch": 4.736196319018405, - "grad_norm": 4.328624248504639, - "learning_rate": 2.7117552946158415e-06, - "loss": 0.1034, - "step": 772 - }, - { - "epoch": 4.742331288343558, - "grad_norm": 2.980215311050415, - "learning_rate": 2.706953826416353e-06, - "loss": 0.1199, - "step": 773 - }, - { - "epoch": 4.748466257668712, - "grad_norm": 2.622478485107422, - "learning_rate": 2.702151589444324e-06, - "loss": 0.0467, - "step": 774 - }, - { - "epoch": 4.754601226993865, - "grad_norm": 2.9958693981170654, - "learning_rate": 2.6973486015386507e-06, - "loss": 0.143, - "step": 775 - }, - { - "epoch": 4.7607361963190185, - "grad_norm": 4.548511505126953, - "learning_rate": 2.6925448805410197e-06, - "loss": 0.3594, - "step": 776 - }, - { - "epoch": 4.766871165644172, - "grad_norm": 3.3429481983184814, - "learning_rate": 2.6877404442958393e-06, - "loss": 0.1397, - "step": 777 - }, - { - "epoch": 4.773006134969325, - "grad_norm": 2.5820136070251465, - "learning_rate": 2.682935310650177e-06, - "loss": 0.054, - "step": 778 - }, - { - "epoch": 4.779141104294479, - "grad_norm": 4.047626495361328, - "learning_rate": 2.6781294974536886e-06, - "loss": 0.1284, - "step": 779 - }, - { - "epoch": 4.785276073619632, - "grad_norm": 3.0227510929107666, - "learning_rate": 2.673323022558557e-06, - "loss": 0.1441, - "step": 780 - }, - { - "epoch": 4.791411042944786, - "grad_norm": 4.731313705444336, - "learning_rate": 2.6685159038194202e-06, - "loss": 0.2859, - "step": 781 - }, - { - "epoch": 4.7975460122699385, - "grad_norm": 3.880655288696289, - "learning_rate": 2.6637081590933096e-06, - "loss": 0.1524, - "step": 782 - }, - { - "epoch": 4.803680981595092, - "grad_norm": 2.375474452972412, - "learning_rate": 2.6588998062395803e-06, - "loss": 0.0338, - "step": 783 - }, - { - "epoch": 4.809815950920245, - "grad_norm": 3.3587446212768555, - "learning_rate": 2.6540908631198498e-06, - "loss": 0.0755, - "step": 784 - }, - { - "epoch": 4.815950920245399, - "grad_norm": 2.767686367034912, - "learning_rate": 2.6492813475979243e-06, - "loss": 0.0631, - "step": 785 - }, - { - "epoch": 4.822085889570552, - "grad_norm": 3.88670015335083, - "learning_rate": 2.6444712775397397e-06, - "loss": 0.0853, - "step": 786 - }, - { - "epoch": 4.828220858895706, - "grad_norm": 3.543276309967041, - "learning_rate": 2.639660670813288e-06, - "loss": 0.1895, - "step": 787 - }, - { - "epoch": 4.8343558282208585, - "grad_norm": 3.659323215484619, - "learning_rate": 2.6348495452885598e-06, - "loss": 0.1745, - "step": 788 - }, - { - "epoch": 4.840490797546012, - "grad_norm": 3.0955021381378174, - "learning_rate": 2.630037918837468e-06, - "loss": 0.0846, - "step": 789 - }, - { - "epoch": 4.846625766871165, - "grad_norm": 3.4473249912261963, - "learning_rate": 2.6252258093337892e-06, - "loss": 0.0808, - "step": 790 - }, - { - "epoch": 4.852760736196319, - "grad_norm": 3.937120199203491, - "learning_rate": 2.6204132346530936e-06, - "loss": 0.2054, - "step": 791 - }, - { - "epoch": 4.858895705521473, - "grad_norm": 4.052806854248047, - "learning_rate": 2.6156002126726788e-06, - "loss": 0.1679, - "step": 792 - }, - { - "epoch": 4.865030674846626, - "grad_norm": 2.6694889068603516, - "learning_rate": 2.6107867612715043e-06, - "loss": 0.0534, - "step": 793 - }, - { - "epoch": 4.871165644171779, - "grad_norm": 3.594649076461792, - "learning_rate": 2.6059728983301267e-06, - "loss": 0.0899, - "step": 794 - }, - { - "epoch": 4.877300613496932, - "grad_norm": 2.7796030044555664, - "learning_rate": 2.601158641730629e-06, - "loss": 0.0596, - "step": 795 - }, - { - "epoch": 4.883435582822086, - "grad_norm": 4.618961334228516, - "learning_rate": 2.5963440093565567e-06, - "loss": 0.3858, - "step": 796 - }, - { - "epoch": 4.889570552147239, - "grad_norm": 3.0783939361572266, - "learning_rate": 2.5915290190928518e-06, - "loss": 0.12, - "step": 797 - }, - { - "epoch": 4.895705521472393, - "grad_norm": 4.078456878662109, - "learning_rate": 2.586713688825786e-06, - "loss": 0.1278, - "step": 798 - }, - { - "epoch": 4.901840490797546, - "grad_norm": 2.9439120292663574, - "learning_rate": 2.5818980364428935e-06, - "loss": 0.0847, - "step": 799 - }, - { - "epoch": 4.9079754601226995, - "grad_norm": 5.140681743621826, - "learning_rate": 2.5770820798329055e-06, - "loss": 0.1718, - "step": 800 - }, - { - "epoch": 4.914110429447852, - "grad_norm": 3.450190305709839, - "learning_rate": 2.572265836885682e-06, - "loss": 0.0895, - "step": 801 - }, - { - "epoch": 4.920245398773006, - "grad_norm": 3.1145224571228027, - "learning_rate": 2.567449325492149e-06, - "loss": 0.0652, - "step": 802 - }, - { - "epoch": 4.92638036809816, - "grad_norm": 2.851768732070923, - "learning_rate": 2.5626325635442283e-06, - "loss": 0.0877, - "step": 803 - }, - { - "epoch": 4.932515337423313, - "grad_norm": 3.3392980098724365, - "learning_rate": 2.5578155689347716e-06, - "loss": 0.2028, - "step": 804 - }, - { - "epoch": 4.938650306748467, - "grad_norm": 3.012439250946045, - "learning_rate": 2.5529983595574964e-06, - "loss": 0.031, - "step": 805 - }, - { - "epoch": 4.9447852760736195, - "grad_norm": 2.7732717990875244, - "learning_rate": 2.548180953306918e-06, - "loss": 0.0415, - "step": 806 - }, - { - "epoch": 4.950920245398773, - "grad_norm": 3.0423903465270996, - "learning_rate": 2.5433633680782817e-06, - "loss": 0.1188, - "step": 807 - }, - { - "epoch": 4.957055214723926, - "grad_norm": 5.056387901306152, - "learning_rate": 2.538545621767498e-06, - "loss": 0.1703, - "step": 808 - }, - { - "epoch": 4.96319018404908, - "grad_norm": 4.052585124969482, - "learning_rate": 2.533727732271077e-06, - "loss": 0.1455, - "step": 809 - }, - { - "epoch": 4.969325153374233, - "grad_norm": 3.4507904052734375, - "learning_rate": 2.5289097174860593e-06, - "loss": 0.0617, - "step": 810 - }, - { - "epoch": 4.975460122699387, - "grad_norm": 2.908266305923462, - "learning_rate": 2.524091595309952e-06, - "loss": 0.1173, - "step": 811 - }, - { - "epoch": 4.9815950920245395, - "grad_norm": 2.5857458114624023, - "learning_rate": 2.519273383640661e-06, - "loss": 0.0538, - "step": 812 - }, - { - "epoch": 4.987730061349693, - "grad_norm": 3.3518428802490234, - "learning_rate": 2.5144551003764227e-06, - "loss": 0.211, - "step": 813 - }, - { - "epoch": 4.993865030674847, - "grad_norm": 3.137981653213501, - "learning_rate": 2.509636763415742e-06, - "loss": 0.0944, - "step": 814 - }, - { - "epoch": 5.0, - "grad_norm": 2.8854241371154785, - "learning_rate": 2.5048183906573227e-06, - "loss": 0.098, - "step": 815 - }, - { - "epoch": 5.006134969325154, - "grad_norm": 3.508527994155884, - "learning_rate": 2.5e-06, - "loss": 0.1102, - "step": 816 - }, - { - "epoch": 5.012269938650307, - "grad_norm": 2.448152542114258, - "learning_rate": 2.495181609342678e-06, - "loss": 0.0712, - "step": 817 - }, - { - "epoch": 5.0184049079754605, - "grad_norm": 3.105818748474121, - "learning_rate": 2.4903632365842587e-06, - "loss": 0.0414, - "step": 818 - }, - { - "epoch": 5.024539877300613, - "grad_norm": 3.8048601150512695, - "learning_rate": 2.4855448996235777e-06, - "loss": 0.0894, - "step": 819 - }, - { - "epoch": 5.030674846625767, - "grad_norm": 3.259834051132202, - "learning_rate": 2.48072661635934e-06, - "loss": 0.0796, - "step": 820 - }, - { - "epoch": 5.03680981595092, - "grad_norm": 2.822364568710327, - "learning_rate": 2.475908404690049e-06, - "loss": 0.0349, - "step": 821 - }, - { - "epoch": 5.042944785276074, - "grad_norm": 4.78808069229126, - "learning_rate": 2.4710902825139415e-06, - "loss": 0.2529, - "step": 822 - }, - { - "epoch": 5.049079754601227, - "grad_norm": 3.5420572757720947, - "learning_rate": 2.466272267728924e-06, - "loss": 0.1405, - "step": 823 - }, - { - "epoch": 5.0552147239263805, - "grad_norm": 2.500713348388672, - "learning_rate": 2.461454378232503e-06, - "loss": 0.0408, - "step": 824 - }, - { - "epoch": 5.061349693251533, - "grad_norm": 3.266291618347168, - "learning_rate": 2.4566366319217196e-06, - "loss": 0.0338, - "step": 825 - }, - { - "epoch": 5.067484662576687, - "grad_norm": 4.071012020111084, - "learning_rate": 2.4518190466930837e-06, - "loss": 0.06, - "step": 826 - }, - { - "epoch": 5.07361963190184, - "grad_norm": 4.3747172355651855, - "learning_rate": 2.4470016404425045e-06, - "loss": 0.1184, - "step": 827 - }, - { - "epoch": 5.079754601226994, - "grad_norm": 3.92030668258667, - "learning_rate": 2.4421844310652296e-06, - "loss": 0.1369, - "step": 828 - }, - { - "epoch": 5.085889570552148, - "grad_norm": 3.3482303619384766, - "learning_rate": 2.437367436455773e-06, - "loss": 0.1166, - "step": 829 - }, - { - "epoch": 5.0920245398773005, - "grad_norm": 3.429368019104004, - "learning_rate": 2.4325506745078524e-06, - "loss": 0.1214, - "step": 830 - }, - { - "epoch": 5.098159509202454, - "grad_norm": 3.4915647506713867, - "learning_rate": 2.427734163114319e-06, - "loss": 0.0454, - "step": 831 - }, - { - "epoch": 5.104294478527607, - "grad_norm": 3.1721251010894775, - "learning_rate": 2.4229179201670954e-06, - "loss": 0.0431, - "step": 832 - }, - { - "epoch": 5.110429447852761, - "grad_norm": 2.552578926086426, - "learning_rate": 2.418101963557107e-06, - "loss": 0.0347, - "step": 833 - }, - { - "epoch": 5.116564417177914, - "grad_norm": 3.518169403076172, - "learning_rate": 2.413286311174214e-06, - "loss": 0.1555, - "step": 834 - }, - { - "epoch": 5.122699386503068, - "grad_norm": 2.4452908039093018, - "learning_rate": 2.4084709809071487e-06, - "loss": 0.035, - "step": 835 - }, - { - "epoch": 5.128834355828221, - "grad_norm": 3.5366528034210205, - "learning_rate": 2.403655990643444e-06, - "loss": 0.0798, - "step": 836 - }, - { - "epoch": 5.134969325153374, - "grad_norm": 2.300065040588379, - "learning_rate": 2.398841358269371e-06, - "loss": 0.0178, - "step": 837 - }, - { - "epoch": 5.141104294478527, - "grad_norm": 2.851393699645996, - "learning_rate": 2.3940271016698733e-06, - "loss": 0.0447, - "step": 838 - }, - { - "epoch": 5.147239263803681, - "grad_norm": 4.085958957672119, - "learning_rate": 2.3892132387284956e-06, - "loss": 0.1626, - "step": 839 - }, - { - "epoch": 5.153374233128835, - "grad_norm": 3.4240522384643555, - "learning_rate": 2.384399787327322e-06, - "loss": 0.0914, - "step": 840 - }, - { - "epoch": 5.159509202453988, - "grad_norm": 4.111586570739746, - "learning_rate": 2.3795867653469072e-06, - "loss": 0.0784, - "step": 841 - }, - { - "epoch": 5.1656441717791415, - "grad_norm": 2.3306312561035156, - "learning_rate": 2.374774190666211e-06, - "loss": 0.0216, - "step": 842 - }, - { - "epoch": 5.171779141104294, - "grad_norm": 2.5006275177001953, - "learning_rate": 2.3699620811625327e-06, - "loss": 0.0516, - "step": 843 - }, - { - "epoch": 5.177914110429448, - "grad_norm": 3.1680967807769775, - "learning_rate": 2.365150454711441e-06, - "loss": 0.0517, - "step": 844 - }, - { - "epoch": 5.184049079754601, - "grad_norm": 1.817044734954834, - "learning_rate": 2.3603393291867122e-06, - "loss": 0.0264, - "step": 845 - }, - { - "epoch": 5.190184049079755, - "grad_norm": 4.445211887359619, - "learning_rate": 2.355528722460261e-06, - "loss": 0.1079, - "step": 846 - }, - { - "epoch": 5.196319018404908, - "grad_norm": 2.918304681777954, - "learning_rate": 2.350718652402076e-06, - "loss": 0.0633, - "step": 847 - }, - { - "epoch": 5.2024539877300615, - "grad_norm": 3.6307432651519775, - "learning_rate": 2.345909136880151e-06, - "loss": 0.1013, - "step": 848 - }, - { - "epoch": 5.208588957055214, - "grad_norm": 3.5696842670440674, - "learning_rate": 2.34110019376042e-06, - "loss": 0.0199, - "step": 849 - }, - { - "epoch": 5.214723926380368, - "grad_norm": 2.2214856147766113, - "learning_rate": 2.336291840906691e-06, - "loss": 0.0288, - "step": 850 - }, - { - "epoch": 5.220858895705521, - "grad_norm": 2.5375778675079346, - "learning_rate": 2.3314840961805806e-06, - "loss": 0.0142, - "step": 851 - }, - { - "epoch": 5.226993865030675, - "grad_norm": 3.0093517303466797, - "learning_rate": 2.326676977441444e-06, - "loss": 0.0911, - "step": 852 - }, - { - "epoch": 5.233128834355828, - "grad_norm": 2.7067151069641113, - "learning_rate": 2.3218705025463118e-06, - "loss": 0.0315, - "step": 853 - }, - { - "epoch": 5.2392638036809815, - "grad_norm": 3.1892940998077393, - "learning_rate": 2.3170646893498237e-06, - "loss": 0.1344, - "step": 854 - }, - { - "epoch": 5.245398773006135, - "grad_norm": 2.8909313678741455, - "learning_rate": 2.312259555704161e-06, - "loss": 0.034, - "step": 855 - }, - { - "epoch": 5.251533742331288, - "grad_norm": 5.097650051116943, - "learning_rate": 2.3074551194589816e-06, - "loss": 0.1889, - "step": 856 - }, - { - "epoch": 5.257668711656442, - "grad_norm": 3.8511006832122803, - "learning_rate": 2.3026513984613506e-06, - "loss": 0.0794, - "step": 857 - }, - { - "epoch": 5.263803680981595, - "grad_norm": 2.2874133586883545, - "learning_rate": 2.297848410555677e-06, - "loss": 0.0238, - "step": 858 - }, - { - "epoch": 5.269938650306749, - "grad_norm": 3.504723310470581, - "learning_rate": 2.293046173583648e-06, - "loss": 0.0369, - "step": 859 - }, - { - "epoch": 5.276073619631902, - "grad_norm": 3.2108154296875, - "learning_rate": 2.28824470538416e-06, - "loss": 0.0677, - "step": 860 - }, - { - "epoch": 5.282208588957055, - "grad_norm": 2.2249386310577393, - "learning_rate": 2.2834440237932537e-06, - "loss": 0.0244, - "step": 861 - }, - { - "epoch": 5.288343558282208, - "grad_norm": 3.141784191131592, - "learning_rate": 2.2786441466440474e-06, - "loss": 0.0628, - "step": 862 - }, - { - "epoch": 5.294478527607362, - "grad_norm": 3.5597352981567383, - "learning_rate": 2.2738450917666727e-06, - "loss": 0.0914, - "step": 863 - }, - { - "epoch": 5.300613496932515, - "grad_norm": 2.991966962814331, - "learning_rate": 2.269046876988204e-06, - "loss": 0.0546, - "step": 864 - }, - { - "epoch": 5.306748466257669, - "grad_norm": 3.100776195526123, - "learning_rate": 2.2642495201325995e-06, - "loss": 0.0473, - "step": 865 - }, - { - "epoch": 5.3128834355828225, - "grad_norm": 2.541754722595215, - "learning_rate": 2.259453039020626e-06, - "loss": 0.0613, - "step": 866 - }, - { - "epoch": 5.319018404907975, - "grad_norm": 2.8117194175720215, - "learning_rate": 2.2546574514697985e-06, - "loss": 0.0533, - "step": 867 - }, - { - "epoch": 5.325153374233129, - "grad_norm": 2.5676379203796387, - "learning_rate": 2.249862775294313e-06, - "loss": 0.018, - "step": 868 - }, - { - "epoch": 5.331288343558282, - "grad_norm": 2.5297701358795166, - "learning_rate": 2.245069028304981e-06, - "loss": 0.0246, - "step": 869 - }, - { - "epoch": 5.337423312883436, - "grad_norm": 2.199498176574707, - "learning_rate": 2.240276228309161e-06, - "loss": 0.0551, - "step": 870 - }, - { - "epoch": 5.343558282208589, - "grad_norm": 2.5793557167053223, - "learning_rate": 2.2354843931106933e-06, - "loss": 0.0258, - "step": 871 - }, - { - "epoch": 5.3496932515337425, - "grad_norm": 3.352058172225952, - "learning_rate": 2.230693540509836e-06, - "loss": 0.0228, - "step": 872 - }, - { - "epoch": 5.355828220858895, - "grad_norm": 2.900599956512451, - "learning_rate": 2.225903688303195e-06, - "loss": 0.0586, - "step": 873 - }, - { - "epoch": 5.361963190184049, - "grad_norm": 3.3317267894744873, - "learning_rate": 2.221114854283662e-06, - "loss": 0.0733, - "step": 874 - }, - { - "epoch": 5.368098159509202, - "grad_norm": 2.79304575920105, - "learning_rate": 2.2163270562403453e-06, - "loss": 0.0251, - "step": 875 - }, - { - "epoch": 5.374233128834356, - "grad_norm": 3.8596227169036865, - "learning_rate": 2.211540311958506e-06, - "loss": 0.0957, - "step": 876 - }, - { - "epoch": 5.38036809815951, - "grad_norm": 2.7464358806610107, - "learning_rate": 2.2067546392194888e-06, - "loss": 0.0457, - "step": 877 - }, - { - "epoch": 5.386503067484663, - "grad_norm": 2.3359906673431396, - "learning_rate": 2.2019700558006598e-06, - "loss": 0.0218, - "step": 878 - }, - { - "epoch": 5.392638036809816, - "grad_norm": 3.2412452697753906, - "learning_rate": 2.197186579475337e-06, - "loss": 0.0494, - "step": 879 - }, - { - "epoch": 5.398773006134969, - "grad_norm": 3.930197238922119, - "learning_rate": 2.1924042280127284e-06, - "loss": 0.0803, - "step": 880 - }, - { - "epoch": 5.404907975460123, - "grad_norm": 2.5752930641174316, - "learning_rate": 2.1876230191778598e-06, - "loss": 0.0356, - "step": 881 - }, - { - "epoch": 5.411042944785276, - "grad_norm": 5.507393836975098, - "learning_rate": 2.182842970731516e-06, - "loss": 0.1245, - "step": 882 - }, - { - "epoch": 5.41717791411043, - "grad_norm": 2.416719436645508, - "learning_rate": 2.17806410043017e-06, - "loss": 0.0224, - "step": 883 - }, - { - "epoch": 5.423312883435583, - "grad_norm": 2.500429630279541, - "learning_rate": 2.173286426025917e-06, - "loss": 0.0499, - "step": 884 - }, - { - "epoch": 5.429447852760736, - "grad_norm": 2.8843860626220703, - "learning_rate": 2.168509965266411e-06, - "loss": 0.075, - "step": 885 - }, - { - "epoch": 5.435582822085889, - "grad_norm": 2.3187198638916016, - "learning_rate": 2.1637347358947984e-06, - "loss": 0.065, - "step": 886 - }, - { - "epoch": 5.441717791411043, - "grad_norm": 2.7135889530181885, - "learning_rate": 2.15896075564965e-06, - "loss": 0.0848, - "step": 887 - }, - { - "epoch": 5.447852760736196, - "grad_norm": 1.751846194267273, - "learning_rate": 2.1541880422648978e-06, - "loss": 0.0112, - "step": 888 - }, - { - "epoch": 5.45398773006135, - "grad_norm": 3.113271713256836, - "learning_rate": 2.1494166134697655e-06, - "loss": 0.077, - "step": 889 - }, - { - "epoch": 5.460122699386503, - "grad_norm": 2.711318016052246, - "learning_rate": 2.1446464869887077e-06, - "loss": 0.03, - "step": 890 - }, - { - "epoch": 5.466257668711656, - "grad_norm": 1.8012003898620605, - "learning_rate": 2.13987768054134e-06, - "loss": 0.0141, - "step": 891 - }, - { - "epoch": 5.47239263803681, - "grad_norm": 2.0968120098114014, - "learning_rate": 2.135110211842374e-06, - "loss": 0.0147, - "step": 892 - }, - { - "epoch": 5.478527607361963, - "grad_norm": 3.1689956188201904, - "learning_rate": 2.1303440986015525e-06, - "loss": 0.1123, - "step": 893 - }, - { - "epoch": 5.484662576687117, - "grad_norm": 4.512697219848633, - "learning_rate": 2.1255793585235827e-06, - "loss": 0.0359, - "step": 894 - }, - { - "epoch": 5.49079754601227, - "grad_norm": 3.5739688873291016, - "learning_rate": 2.120816009308071e-06, - "loss": 0.0635, - "step": 895 - }, - { - "epoch": 5.4969325153374236, - "grad_norm": 4.556554317474365, - "learning_rate": 2.1160540686494597e-06, - "loss": 0.1104, - "step": 896 - }, - { - "epoch": 5.5030674846625764, - "grad_norm": 2.2047064304351807, - "learning_rate": 2.1112935542369546e-06, - "loss": 0.0187, - "step": 897 - }, - { - "epoch": 5.50920245398773, - "grad_norm": 3.0289857387542725, - "learning_rate": 2.106534483754466e-06, - "loss": 0.0874, - "step": 898 - }, - { - "epoch": 5.515337423312883, - "grad_norm": 2.7090444564819336, - "learning_rate": 2.1017768748805396e-06, - "loss": 0.0301, - "step": 899 - }, - { - "epoch": 5.521472392638037, - "grad_norm": 3.0662643909454346, - "learning_rate": 2.0970207452882917e-06, - "loss": 0.1192, - "step": 900 - }, - { - "epoch": 5.52760736196319, - "grad_norm": 2.869401454925537, - "learning_rate": 2.0922661126453436e-06, - "loss": 0.0803, - "step": 901 - }, - { - "epoch": 5.533742331288344, - "grad_norm": 2.229947328567505, - "learning_rate": 2.0875129946137557e-06, - "loss": 0.0186, - "step": 902 - }, - { - "epoch": 5.539877300613497, - "grad_norm": 3.3460421562194824, - "learning_rate": 2.0827614088499624e-06, - "loss": 0.0499, - "step": 903 - }, - { - "epoch": 5.54601226993865, - "grad_norm": 1.9324007034301758, - "learning_rate": 2.0780113730047056e-06, - "loss": 0.0322, - "step": 904 - }, - { - "epoch": 5.552147239263804, - "grad_norm": 2.761482000350952, - "learning_rate": 2.0732629047229712e-06, - "loss": 0.0265, - "step": 905 - }, - { - "epoch": 5.558282208588957, - "grad_norm": 2.4173266887664795, - "learning_rate": 2.0685160216439205e-06, - "loss": 0.0229, - "step": 906 - }, - { - "epoch": 5.564417177914111, - "grad_norm": 2.503661632537842, - "learning_rate": 2.0637707414008267e-06, - "loss": 0.0266, - "step": 907 - }, - { - "epoch": 5.570552147239264, - "grad_norm": 2.312236785888672, - "learning_rate": 2.0590270816210077e-06, - "loss": 0.018, - "step": 908 - }, - { - "epoch": 5.576687116564417, - "grad_norm": 2.569575548171997, - "learning_rate": 2.0542850599257647e-06, - "loss": 0.0377, - "step": 909 - }, - { - "epoch": 5.58282208588957, - "grad_norm": 3.520341157913208, - "learning_rate": 2.0495446939303122e-06, - "loss": 0.1224, - "step": 910 - }, - { - "epoch": 5.588957055214724, - "grad_norm": 3.231363296508789, - "learning_rate": 2.044806001243714e-06, - "loss": 0.1457, - "step": 911 - }, - { - "epoch": 5.595092024539877, - "grad_norm": 3.3211300373077393, - "learning_rate": 2.040068999468818e-06, - "loss": 0.0429, - "step": 912 - }, - { - "epoch": 5.601226993865031, - "grad_norm": 3.3712961673736572, - "learning_rate": 2.035333706202192e-06, - "loss": 0.0634, - "step": 913 - }, - { - "epoch": 5.6073619631901845, - "grad_norm": 2.480177402496338, - "learning_rate": 2.0306001390340565e-06, - "loss": 0.0178, - "step": 914 - }, - { - "epoch": 5.613496932515337, - "grad_norm": 2.9777421951293945, - "learning_rate": 2.02586831554822e-06, - "loss": 0.037, - "step": 915 - }, - { - "epoch": 5.61963190184049, - "grad_norm": 2.9129085540771484, - "learning_rate": 2.021138253322012e-06, - "loss": 0.125, - "step": 916 - }, - { - "epoch": 5.625766871165644, - "grad_norm": 4.041767597198486, - "learning_rate": 2.016409969926224e-06, - "loss": 0.1897, - "step": 917 - }, - { - "epoch": 5.631901840490798, - "grad_norm": 4.088902950286865, - "learning_rate": 2.0116834829250355e-06, - "loss": 0.0546, - "step": 918 - }, - { - "epoch": 5.638036809815951, - "grad_norm": 3.8629167079925537, - "learning_rate": 2.0069588098759545e-06, - "loss": 0.0911, - "step": 919 - }, - { - "epoch": 5.644171779141105, - "grad_norm": 2.616830825805664, - "learning_rate": 2.00223596832975e-06, - "loss": 0.0527, - "step": 920 - }, - { - "epoch": 5.6503067484662575, - "grad_norm": 1.9370782375335693, - "learning_rate": 1.9975149758303885e-06, - "loss": 0.0384, - "step": 921 - }, - { - "epoch": 5.656441717791411, - "grad_norm": 3.7839455604553223, - "learning_rate": 1.992795849914967e-06, - "loss": 0.1033, - "step": 922 - }, - { - "epoch": 5.662576687116564, - "grad_norm": 3.870729923248291, - "learning_rate": 1.9880786081136498e-06, - "loss": 0.08, - "step": 923 - }, - { - "epoch": 5.668711656441718, - "grad_norm": 3.4394288063049316, - "learning_rate": 1.9833632679496008e-06, - "loss": 0.0819, - "step": 924 - }, - { - "epoch": 5.674846625766871, - "grad_norm": 3.1659159660339355, - "learning_rate": 1.97864984693892e-06, - "loss": 0.117, - "step": 925 - }, - { - "epoch": 5.680981595092025, - "grad_norm": 2.2375190258026123, - "learning_rate": 1.97393836259058e-06, - "loss": 0.0215, - "step": 926 - }, - { - "epoch": 5.6871165644171775, - "grad_norm": 3.9375314712524414, - "learning_rate": 1.969228832406358e-06, - "loss": 0.1422, - "step": 927 - }, - { - "epoch": 5.693251533742331, - "grad_norm": 3.1969058513641357, - "learning_rate": 1.964521273880772e-06, - "loss": 0.0538, - "step": 928 - }, - { - "epoch": 5.699386503067485, - "grad_norm": 3.5990066528320312, - "learning_rate": 1.9598157045010162e-06, - "loss": 0.114, - "step": 929 - }, - { - "epoch": 5.705521472392638, - "grad_norm": 3.1764235496520996, - "learning_rate": 1.9551121417468955e-06, - "loss": 0.053, - "step": 930 - }, - { - "epoch": 5.711656441717792, - "grad_norm": 4.1162309646606445, - "learning_rate": 1.9504106030907605e-06, - "loss": 0.0866, - "step": 931 - }, - { - "epoch": 5.717791411042945, - "grad_norm": 3.543071985244751, - "learning_rate": 1.945711105997444e-06, - "loss": 0.0908, - "step": 932 - }, - { - "epoch": 5.723926380368098, - "grad_norm": 4.136870384216309, - "learning_rate": 1.941013667924194e-06, - "loss": 0.0612, - "step": 933 - }, - { - "epoch": 5.730061349693251, - "grad_norm": 1.7658357620239258, - "learning_rate": 1.9363183063206097e-06, - "loss": 0.0283, - "step": 934 - }, - { - "epoch": 5.736196319018405, - "grad_norm": 3.9701411724090576, - "learning_rate": 1.931625038628577e-06, - "loss": 0.0948, - "step": 935 - }, - { - "epoch": 5.742331288343558, - "grad_norm": 3.0636157989501953, - "learning_rate": 1.9269338822822047e-06, - "loss": 0.0769, - "step": 936 - }, - { - "epoch": 5.748466257668712, - "grad_norm": 3.3671388626098633, - "learning_rate": 1.9222448547077573e-06, - "loss": 0.098, - "step": 937 - }, - { - "epoch": 5.754601226993865, - "grad_norm": 3.0725975036621094, - "learning_rate": 1.917557973323591e-06, - "loss": 0.0363, - "step": 938 - }, - { - "epoch": 5.7607361963190185, - "grad_norm": 2.5592041015625, - "learning_rate": 1.9128732555400915e-06, - "loss": 0.0205, - "step": 939 - }, - { - "epoch": 5.766871165644172, - "grad_norm": 2.835740804672241, - "learning_rate": 1.9081907187596054e-06, - "loss": 0.0548, - "step": 940 - }, - { - "epoch": 5.773006134969325, - "grad_norm": 3.3596746921539307, - "learning_rate": 1.9035103803763793e-06, - "loss": 0.0454, - "step": 941 - }, - { - "epoch": 5.779141104294479, - "grad_norm": 3.226579427719116, - "learning_rate": 1.8988322577764918e-06, - "loss": 0.0514, - "step": 942 - }, - { - "epoch": 5.785276073619632, - "grad_norm": 3.2044687271118164, - "learning_rate": 1.8941563683377905e-06, - "loss": 0.1361, - "step": 943 - }, - { - "epoch": 5.791411042944786, - "grad_norm": 1.8300527334213257, - "learning_rate": 1.8894827294298296e-06, - "loss": 0.0139, - "step": 944 - }, - { - "epoch": 5.7975460122699385, - "grad_norm": 2.503735303878784, - "learning_rate": 1.884811358413801e-06, - "loss": 0.0311, - "step": 945 - }, - { - "epoch": 5.803680981595092, - "grad_norm": 2.171309471130371, - "learning_rate": 1.8801422726424735e-06, - "loss": 0.0227, - "step": 946 - }, - { - "epoch": 5.809815950920245, - "grad_norm": 1.8116636276245117, - "learning_rate": 1.8754754894601252e-06, - "loss": 0.0157, - "step": 947 - }, - { - "epoch": 5.815950920245399, - "grad_norm": 3.1412570476531982, - "learning_rate": 1.870811026202482e-06, - "loss": 0.1093, - "step": 948 - }, - { - "epoch": 5.822085889570552, - "grad_norm": 2.3962290287017822, - "learning_rate": 1.8661489001966526e-06, - "loss": 0.021, - "step": 949 - }, - { - "epoch": 5.828220858895706, - "grad_norm": 4.169166564941406, - "learning_rate": 1.8614891287610621e-06, - "loss": 0.0663, - "step": 950 - }, - { - "epoch": 5.8343558282208585, - "grad_norm": 3.1181528568267822, - "learning_rate": 1.8568317292053894e-06, - "loss": 0.1008, - "step": 951 - }, - { - "epoch": 5.840490797546012, - "grad_norm": 3.5155029296875, - "learning_rate": 1.8521767188305023e-06, - "loss": 0.0451, - "step": 952 - }, - { - "epoch": 5.846625766871165, - "grad_norm": 2.975693702697754, - "learning_rate": 1.8475241149283957e-06, - "loss": 0.0561, - "step": 953 - }, - { - "epoch": 5.852760736196319, - "grad_norm": 2.1581289768218994, - "learning_rate": 1.842873934782122e-06, - "loss": 0.0265, - "step": 954 - }, - { - "epoch": 5.858895705521473, - "grad_norm": 2.6281228065490723, - "learning_rate": 1.8382261956657318e-06, - "loss": 0.1196, - "step": 955 - }, - { - "epoch": 5.865030674846626, - "grad_norm": 2.9569528102874756, - "learning_rate": 1.8335809148442074e-06, - "loss": 0.1356, - "step": 956 - }, - { - "epoch": 5.871165644171779, - "grad_norm": 2.450949192047119, - "learning_rate": 1.8289381095734005e-06, - "loss": 0.0444, - "step": 957 - }, - { - "epoch": 5.877300613496932, - "grad_norm": 2.1737027168273926, - "learning_rate": 1.8242977970999643e-06, - "loss": 0.0622, - "step": 958 - }, - { - "epoch": 5.883435582822086, - "grad_norm": 3.350647211074829, - "learning_rate": 1.8196599946612956e-06, - "loss": 0.0762, - "step": 959 - }, - { - "epoch": 5.889570552147239, - "grad_norm": 2.5031936168670654, - "learning_rate": 1.8150247194854642e-06, - "loss": 0.0207, - "step": 960 - }, - { - "epoch": 5.895705521472393, - "grad_norm": 3.7103707790374756, - "learning_rate": 1.8103919887911525e-06, - "loss": 0.1122, - "step": 961 - }, - { - "epoch": 5.901840490797546, - "grad_norm": 2.485322952270508, - "learning_rate": 1.8057618197875914e-06, - "loss": 0.0284, - "step": 962 - }, - { - "epoch": 5.9079754601226995, - "grad_norm": 1.903212547302246, - "learning_rate": 1.8011342296744961e-06, - "loss": 0.0239, - "step": 963 - }, - { - "epoch": 5.914110429447852, - "grad_norm": 3.015552520751953, - "learning_rate": 1.796509235642001e-06, - "loss": 0.0425, - "step": 964 - }, - { - "epoch": 5.920245398773006, - "grad_norm": 4.806198596954346, - "learning_rate": 1.7918868548705982e-06, - "loss": 0.2094, - "step": 965 - }, - { - "epoch": 5.92638036809816, - "grad_norm": 2.949596643447876, - "learning_rate": 1.7872671045310703e-06, - "loss": 0.0632, - "step": 966 - }, - { - "epoch": 5.932515337423313, - "grad_norm": 4.153099536895752, - "learning_rate": 1.782650001784431e-06, - "loss": 0.1411, - "step": 967 - }, - { - "epoch": 5.938650306748467, - "grad_norm": 3.4117565155029297, - "learning_rate": 1.7780355637818568e-06, - "loss": 0.0965, - "step": 968 - }, - { - "epoch": 5.9447852760736195, - "grad_norm": 2.533405303955078, - "learning_rate": 1.7734238076646277e-06, - "loss": 0.0568, - "step": 969 - }, - { - "epoch": 5.950920245398773, - "grad_norm": 2.3604726791381836, - "learning_rate": 1.7688147505640581e-06, - "loss": 0.0182, - "step": 970 - }, - { - "epoch": 5.957055214723926, - "grad_norm": 3.807424306869507, - "learning_rate": 1.7642084096014405e-06, - "loss": 0.0547, - "step": 971 - }, - { - "epoch": 5.96319018404908, - "grad_norm": 2.5735342502593994, - "learning_rate": 1.759604801887974e-06, - "loss": 0.0775, - "step": 972 - }, - { - "epoch": 5.969325153374233, - "grad_norm": 2.9217734336853027, - "learning_rate": 1.7550039445247069e-06, - "loss": 0.0541, - "step": 973 - }, - { - "epoch": 5.975460122699387, - "grad_norm": 2.793104410171509, - "learning_rate": 1.7504058546024694e-06, - "loss": 0.0257, - "step": 974 - }, - { - "epoch": 5.9815950920245395, - "grad_norm": 3.5610134601593018, - "learning_rate": 1.7458105492018114e-06, - "loss": 0.0767, - "step": 975 - }, - { - "epoch": 5.987730061349693, - "grad_norm": 2.0738015174865723, - "learning_rate": 1.7412180453929412e-06, - "loss": 0.025, - "step": 976 - }, - { - "epoch": 5.993865030674847, - "grad_norm": 2.1248421669006348, - "learning_rate": 1.736628360235657e-06, - "loss": 0.0183, - "step": 977 - }, - { - "epoch": 6.0, - "grad_norm": 2.901273727416992, - "learning_rate": 1.7320415107792893e-06, - "loss": 0.1369, - "step": 978 - }, - { - "epoch": 6.006134969325154, - "grad_norm": 3.815110683441162, - "learning_rate": 1.7274575140626318e-06, - "loss": 0.1011, - "step": 979 - }, - { - "epoch": 6.012269938650307, - "grad_norm": 2.421208381652832, - "learning_rate": 1.7228763871138845e-06, - "loss": 0.0105, - "step": 980 - }, - { - "epoch": 6.0184049079754605, - "grad_norm": 2.7103846073150635, - "learning_rate": 1.718298146950585e-06, - "loss": 0.0373, - "step": 981 - }, - { - "epoch": 6.024539877300613, - "grad_norm": 1.3751411437988281, - "learning_rate": 1.7137228105795473e-06, - "loss": 0.0072, - "step": 982 - }, - { - "epoch": 6.030674846625767, - "grad_norm": 1.5235071182250977, - "learning_rate": 1.7091503949967987e-06, - "loss": 0.0126, - "step": 983 - }, - { - "epoch": 6.03680981595092, - "grad_norm": 2.0652546882629395, - "learning_rate": 1.7045809171875183e-06, - "loss": 0.0198, - "step": 984 - }, - { - "epoch": 6.042944785276074, - "grad_norm": 2.010207176208496, - "learning_rate": 1.70001439412597e-06, - "loss": 0.0186, - "step": 985 - }, - { - "epoch": 6.049079754601227, - "grad_norm": 2.0444021224975586, - "learning_rate": 1.6954508427754435e-06, - "loss": 0.0197, - "step": 986 - }, - { - "epoch": 6.0552147239263805, - "grad_norm": 2.6540091037750244, - "learning_rate": 1.690890280088187e-06, - "loss": 0.0192, - "step": 987 - }, - { - "epoch": 6.061349693251533, - "grad_norm": 1.6479653120040894, - "learning_rate": 1.6863327230053506e-06, - "loss": 0.0105, - "step": 988 - }, - { - "epoch": 6.067484662576687, - "grad_norm": 2.4434754848480225, - "learning_rate": 1.6817781884569146e-06, - "loss": 0.0275, - "step": 989 - }, - { - "epoch": 6.07361963190184, - "grad_norm": 1.7472137212753296, - "learning_rate": 1.677226693361636e-06, - "loss": 0.0095, - "step": 990 - }, - { - "epoch": 6.079754601226994, - "grad_norm": 2.952821969985962, - "learning_rate": 1.6726782546269793e-06, - "loss": 0.0483, - "step": 991 - }, - { - "epoch": 6.085889570552148, - "grad_norm": 3.123959541320801, - "learning_rate": 1.6681328891490544e-06, - "loss": 0.0815, - "step": 992 - }, - { - "epoch": 6.0920245398773005, - "grad_norm": 2.9924800395965576, - "learning_rate": 1.663590613812556e-06, - "loss": 0.0216, - "step": 993 - }, - { - "epoch": 6.098159509202454, - "grad_norm": 2.417778730392456, - "learning_rate": 1.6590514454907007e-06, - "loss": 0.0243, - "step": 994 - }, - { - "epoch": 6.104294478527607, - "grad_norm": 2.0682942867279053, - "learning_rate": 1.6545154010451613e-06, - "loss": 0.0669, - "step": 995 - }, - { - "epoch": 6.110429447852761, - "grad_norm": 2.9801135063171387, - "learning_rate": 1.6499824973260086e-06, - "loss": 0.0309, - "step": 996 - }, - { - "epoch": 6.116564417177914, - "grad_norm": 1.5753487348556519, - "learning_rate": 1.645452751171645e-06, - "loss": 0.026, - "step": 997 - }, - { - "epoch": 6.122699386503068, - "grad_norm": 2.461124897003174, - "learning_rate": 1.6409261794087438e-06, - "loss": 0.0191, - "step": 998 - }, - { - "epoch": 6.128834355828221, - "grad_norm": 3.839308261871338, - "learning_rate": 1.6364027988521875e-06, - "loss": 0.045, - "step": 999 - }, - { - "epoch": 6.134969325153374, - "grad_norm": 2.9653189182281494, - "learning_rate": 1.6318826263050022e-06, - "loss": 0.0197, - "step": 1000 - }, - { - "epoch": 6.141104294478527, - "grad_norm": 1.1804074048995972, - "learning_rate": 1.6273656785582986e-06, - "loss": 0.0092, - "step": 1001 - }, - { - "epoch": 6.147239263803681, - "grad_norm": 1.9027175903320312, - "learning_rate": 1.6228519723912073e-06, - "loss": 0.0141, - "step": 1002 - }, - { - "epoch": 6.153374233128835, - "grad_norm": 1.831039309501648, - "learning_rate": 1.618341524570819e-06, - "loss": 0.0131, - "step": 1003 - }, - { - "epoch": 6.159509202453988, - "grad_norm": 2.547327756881714, - "learning_rate": 1.613834351852119e-06, - "loss": 0.0686, - "step": 1004 - }, - { - "epoch": 6.1656441717791415, - "grad_norm": 2.746947765350342, - "learning_rate": 1.6093304709779273e-06, - "loss": 0.036, - "step": 1005 - }, - { - "epoch": 6.171779141104294, - "grad_norm": 2.0104732513427734, - "learning_rate": 1.6048298986788345e-06, - "loss": 0.0216, - "step": 1006 - }, - { - "epoch": 6.177914110429448, - "grad_norm": 2.655977725982666, - "learning_rate": 1.6003326516731431e-06, - "loss": 0.024, - "step": 1007 - }, - { - "epoch": 6.184049079754601, - "grad_norm": 2.0733132362365723, - "learning_rate": 1.5958387466668015e-06, - "loss": 0.0133, - "step": 1008 - }, - { - "epoch": 6.190184049079755, - "grad_norm": 2.5398054122924805, - "learning_rate": 1.5913482003533437e-06, - "loss": 0.0331, - "step": 1009 - }, - { - "epoch": 6.196319018404908, - "grad_norm": 1.7983721494674683, - "learning_rate": 1.5868610294138264e-06, - "loss": 0.0111, - "step": 1010 - }, - { - "epoch": 6.2024539877300615, - "grad_norm": 1.7259647846221924, - "learning_rate": 1.58237725051677e-06, - "loss": 0.0112, - "step": 1011 - }, - { - "epoch": 6.208588957055214, - "grad_norm": 1.7722725868225098, - "learning_rate": 1.577896880318093e-06, - "loss": 0.0181, - "step": 1012 - }, - { - "epoch": 6.214723926380368, - "grad_norm": 3.633545398712158, - "learning_rate": 1.5734199354610513e-06, - "loss": 0.0135, - "step": 1013 - }, - { - "epoch": 6.220858895705521, - "grad_norm": 1.8951494693756104, - "learning_rate": 1.5689464325761764e-06, - "loss": 0.0163, - "step": 1014 - }, - { - "epoch": 6.226993865030675, - "grad_norm": 1.637170433998108, - "learning_rate": 1.564476388281216e-06, - "loss": 0.0068, - "step": 1015 - }, - { - "epoch": 6.233128834355828, - "grad_norm": 2.2963850498199463, - "learning_rate": 1.5600098191810682e-06, - "loss": 0.021, - "step": 1016 - }, - { - "epoch": 6.2392638036809815, - "grad_norm": 2.777996063232422, - "learning_rate": 1.555546741867722e-06, - "loss": 0.0349, - "step": 1017 - }, - { - "epoch": 6.245398773006135, - "grad_norm": 2.1580724716186523, - "learning_rate": 1.5510871729201953e-06, - "loss": 0.0626, - "step": 1018 - }, - { - "epoch": 6.251533742331288, - "grad_norm": 1.4158363342285156, - "learning_rate": 1.5466311289044755e-06, - "loss": 0.0082, - "step": 1019 - }, - { - "epoch": 6.257668711656442, - "grad_norm": 3.287564516067505, - "learning_rate": 1.5421786263734524e-06, - "loss": 0.0212, - "step": 1020 - }, - { - "epoch": 6.263803680981595, - "grad_norm": 2.4552016258239746, - "learning_rate": 1.5377296818668638e-06, - "loss": 0.0963, - "step": 1021 - }, - { - "epoch": 6.269938650306749, - "grad_norm": 1.877556562423706, - "learning_rate": 1.5332843119112285e-06, - "loss": 0.011, - "step": 1022 - }, - { - "epoch": 6.276073619631902, - "grad_norm": 3.720372438430786, - "learning_rate": 1.5288425330197864e-06, - "loss": 0.018, - "step": 1023 - }, - { - "epoch": 6.282208588957055, - "grad_norm": 1.9751925468444824, - "learning_rate": 1.5244043616924389e-06, - "loss": 0.0162, - "step": 1024 - }, - { - "epoch": 6.288343558282208, - "grad_norm": 2.5137453079223633, - "learning_rate": 1.5199698144156865e-06, - "loss": 0.0468, - "step": 1025 - }, - { - "epoch": 6.294478527607362, - "grad_norm": 2.111983299255371, - "learning_rate": 1.5155389076625663e-06, - "loss": 0.0064, - "step": 1026 - }, - { - "epoch": 6.300613496932515, - "grad_norm": 2.572223663330078, - "learning_rate": 1.5111116578925924e-06, - "loss": 0.035, - "step": 1027 - }, - { - "epoch": 6.306748466257669, - "grad_norm": 2.7881019115448, - "learning_rate": 1.5066880815516943e-06, - "loss": 0.0197, - "step": 1028 - }, - { - "epoch": 6.3128834355828225, - "grad_norm": 1.2287017107009888, - "learning_rate": 1.5022681950721565e-06, - "loss": 0.0059, - "step": 1029 - }, - { - "epoch": 6.319018404907975, - "grad_norm": 1.764028549194336, - "learning_rate": 1.4978520148725558e-06, - "loss": 0.006, - "step": 1030 - }, - { - "epoch": 6.325153374233129, - "grad_norm": 2.399787664413452, - "learning_rate": 1.4934395573577016e-06, - "loss": 0.0126, - "step": 1031 - }, - { - "epoch": 6.331288343558282, - "grad_norm": 1.9056172370910645, - "learning_rate": 1.4890308389185743e-06, - "loss": 0.0131, - "step": 1032 - }, - { - "epoch": 6.337423312883436, - "grad_norm": 1.7394744157791138, - "learning_rate": 1.484625875932265e-06, - "loss": 0.016, - "step": 1033 - }, - { - "epoch": 6.343558282208589, - "grad_norm": 4.352719306945801, - "learning_rate": 1.480224684761915e-06, - "loss": 0.1059, - "step": 1034 - }, - { - "epoch": 6.3496932515337425, - "grad_norm": 2.148385524749756, - "learning_rate": 1.4758272817566538e-06, - "loss": 0.0312, - "step": 1035 - }, - { - "epoch": 6.355828220858895, - "grad_norm": 2.483872175216675, - "learning_rate": 1.4714336832515386e-06, - "loss": 0.0215, - "step": 1036 - }, - { - "epoch": 6.361963190184049, - "grad_norm": 2.6151270866394043, - "learning_rate": 1.467043905567494e-06, - "loss": 0.0718, - "step": 1037 - }, - { - "epoch": 6.368098159509202, - "grad_norm": 2.554600954055786, - "learning_rate": 1.4626579650112533e-06, - "loss": 0.0166, - "step": 1038 - }, - { - "epoch": 6.374233128834356, - "grad_norm": 3.013974905014038, - "learning_rate": 1.4582758778752926e-06, - "loss": 0.0448, - "step": 1039 - }, - { - "epoch": 6.38036809815951, - "grad_norm": 2.1542789936065674, - "learning_rate": 1.4538976604377781e-06, - "loss": 0.0297, - "step": 1040 - }, - { - "epoch": 6.386503067484663, - "grad_norm": 3.4402377605438232, - "learning_rate": 1.449523328962496e-06, - "loss": 0.0409, - "step": 1041 - }, - { - "epoch": 6.392638036809816, - "grad_norm": 1.6200538873672485, - "learning_rate": 1.4451528996988018e-06, - "loss": 0.0127, - "step": 1042 - }, - { - "epoch": 6.398773006134969, - "grad_norm": 3.081733465194702, - "learning_rate": 1.4407863888815527e-06, - "loss": 0.0788, - "step": 1043 - }, - { - "epoch": 6.404907975460123, - "grad_norm": 1.9813143014907837, - "learning_rate": 1.436423812731051e-06, - "loss": 0.0082, - "step": 1044 - }, - { - "epoch": 6.411042944785276, - "grad_norm": 1.7354048490524292, - "learning_rate": 1.432065187452984e-06, - "loss": 0.0086, - "step": 1045 - }, - { - "epoch": 6.41717791411043, - "grad_norm": 1.8812576532363892, - "learning_rate": 1.4277105292383594e-06, - "loss": 0.04, - "step": 1046 - }, - { - "epoch": 6.423312883435583, - "grad_norm": 1.117837905883789, - "learning_rate": 1.4233598542634519e-06, - "loss": 0.0054, - "step": 1047 - }, - { - "epoch": 6.429447852760736, - "grad_norm": 1.9587867259979248, - "learning_rate": 1.4190131786897388e-06, - "loss": 0.0263, - "step": 1048 - }, - { - "epoch": 6.435582822085889, - "grad_norm": 1.2712376117706299, - "learning_rate": 1.4146705186638388e-06, - "loss": 0.0098, - "step": 1049 - }, - { - "epoch": 6.441717791411043, - "grad_norm": 2.6563849449157715, - "learning_rate": 1.410331890317457e-06, - "loss": 0.0322, - "step": 1050 - }, - { - "epoch": 6.447852760736196, - "grad_norm": 3.136518955230713, - "learning_rate": 1.4059973097673187e-06, - "loss": 0.0729, - "step": 1051 - }, - { - "epoch": 6.45398773006135, - "grad_norm": 1.3937572240829468, - "learning_rate": 1.4016667931151156e-06, - "loss": 0.0094, - "step": 1052 - }, - { - "epoch": 6.460122699386503, - "grad_norm": 1.7218928337097168, - "learning_rate": 1.3973403564474422e-06, - "loss": 0.0078, - "step": 1053 - }, - { - "epoch": 6.466257668711656, - "grad_norm": 2.35612416267395, - "learning_rate": 1.393018015835737e-06, - "loss": 0.0231, - "step": 1054 - }, - { - "epoch": 6.47239263803681, - "grad_norm": 1.96125066280365, - "learning_rate": 1.388699787336224e-06, - "loss": 0.0153, - "step": 1055 - }, - { - "epoch": 6.478527607361963, - "grad_norm": 2.1789233684539795, - "learning_rate": 1.3843856869898486e-06, - "loss": 0.0136, - "step": 1056 - }, - { - "epoch": 6.484662576687117, - "grad_norm": 3.1261701583862305, - "learning_rate": 1.3800757308222263e-06, - "loss": 0.0819, - "step": 1057 - }, - { - "epoch": 6.49079754601227, - "grad_norm": 2.93422794342041, - "learning_rate": 1.3757699348435726e-06, - "loss": 0.0658, - "step": 1058 - }, - { - "epoch": 6.4969325153374236, - "grad_norm": 2.1311776638031006, - "learning_rate": 1.3714683150486534e-06, - "loss": 0.0106, - "step": 1059 - }, - { - "epoch": 6.5030674846625764, - "grad_norm": 1.699877381324768, - "learning_rate": 1.3671708874167211e-06, - "loss": 0.0151, - "step": 1060 - }, - { - "epoch": 6.50920245398773, - "grad_norm": 1.7288825511932373, - "learning_rate": 1.3628776679114516e-06, - "loss": 0.0114, - "step": 1061 - }, - { - "epoch": 6.515337423312883, - "grad_norm": 1.8437966108322144, - "learning_rate": 1.3585886724808934e-06, - "loss": 0.0117, - "step": 1062 - }, - { - "epoch": 6.521472392638037, - "grad_norm": 3.073568344116211, - "learning_rate": 1.3543039170574022e-06, - "loss": 0.0381, - "step": 1063 - }, - { - "epoch": 6.52760736196319, - "grad_norm": 1.6069157123565674, - "learning_rate": 1.350023417557581e-06, - "loss": 0.0072, - "step": 1064 - }, - { - "epoch": 6.533742331288344, - "grad_norm": 2.48502779006958, - "learning_rate": 1.345747189882228e-06, - "loss": 0.0302, - "step": 1065 - }, - { - "epoch": 6.539877300613497, - "grad_norm": 1.6879143714904785, - "learning_rate": 1.3414752499162676e-06, - "loss": 0.0095, - "step": 1066 - }, - { - "epoch": 6.54601226993865, - "grad_norm": 2.2126848697662354, - "learning_rate": 1.3372076135287005e-06, - "loss": 0.067, - "step": 1067 - }, - { - "epoch": 6.552147239263804, - "grad_norm": 2.157269239425659, - "learning_rate": 1.33294429657254e-06, - "loss": 0.0203, - "step": 1068 - }, - { - "epoch": 6.558282208588957, - "grad_norm": 2.725158452987671, - "learning_rate": 1.3286853148847523e-06, - "loss": 0.0217, - "step": 1069 - }, - { - "epoch": 6.564417177914111, - "grad_norm": 2.478426456451416, - "learning_rate": 1.3244306842862007e-06, - "loss": 0.0223, - "step": 1070 - }, - { - "epoch": 6.570552147239264, - "grad_norm": 2.349463939666748, - "learning_rate": 1.3201804205815872e-06, - "loss": 0.027, - "step": 1071 - }, - { - "epoch": 6.576687116564417, - "grad_norm": 2.049593210220337, - "learning_rate": 1.3159345395593876e-06, - "loss": 0.0212, - "step": 1072 - }, - { - "epoch": 6.58282208588957, - "grad_norm": 2.3445141315460205, - "learning_rate": 1.3116930569918024e-06, - "loss": 0.0182, - "step": 1073 - }, - { - "epoch": 6.588957055214724, - "grad_norm": 3.756135940551758, - "learning_rate": 1.3074559886346886e-06, - "loss": 0.1187, - "step": 1074 - }, - { - "epoch": 6.595092024539877, - "grad_norm": 2.4747114181518555, - "learning_rate": 1.3032233502275089e-06, - "loss": 0.0103, - "step": 1075 - }, - { - "epoch": 6.601226993865031, - "grad_norm": 2.0029311180114746, - "learning_rate": 1.2989951574932693e-06, - "loss": 0.0115, - "step": 1076 - }, - { - "epoch": 6.6073619631901845, - "grad_norm": 2.007141351699829, - "learning_rate": 1.2947714261384602e-06, - "loss": 0.0155, - "step": 1077 - }, - { - "epoch": 6.613496932515337, - "grad_norm": 1.5075048208236694, - "learning_rate": 1.2905521718530012e-06, - "loss": 0.0125, - "step": 1078 - }, - { - "epoch": 6.61963190184049, - "grad_norm": 1.9235132932662964, - "learning_rate": 1.2863374103101784e-06, - "loss": 0.0181, - "step": 1079 - }, - { - "epoch": 6.625766871165644, - "grad_norm": 1.7235040664672852, - "learning_rate": 1.2821271571665912e-06, - "loss": 0.0102, - "step": 1080 - }, - { - "epoch": 6.631901840490798, - "grad_norm": 3.503974676132202, - "learning_rate": 1.277921428062091e-06, - "loss": 0.0969, - "step": 1081 - }, - { - "epoch": 6.638036809815951, - "grad_norm": 2.4633288383483887, - "learning_rate": 1.2737202386197222e-06, - "loss": 0.0383, - "step": 1082 - }, - { - "epoch": 6.644171779141105, - "grad_norm": 2.332341432571411, - "learning_rate": 1.2695236044456672e-06, - "loss": 0.0184, - "step": 1083 - }, - { - "epoch": 6.6503067484662575, - "grad_norm": 2.8279805183410645, - "learning_rate": 1.2653315411291867e-06, - "loss": 0.0327, - "step": 1084 - }, - { - "epoch": 6.656441717791411, - "grad_norm": 2.444810628890991, - "learning_rate": 1.2611440642425617e-06, - "loss": 0.0399, - "step": 1085 - }, - { - "epoch": 6.662576687116564, - "grad_norm": 2.9304957389831543, - "learning_rate": 1.2569611893410374e-06, - "loss": 0.0385, - "step": 1086 - }, - { - "epoch": 6.668711656441718, - "grad_norm": 2.1244678497314453, - "learning_rate": 1.2527829319627604e-06, - "loss": 0.0123, - "step": 1087 - }, - { - "epoch": 6.674846625766871, - "grad_norm": 2.129033327102661, - "learning_rate": 1.248609307628729e-06, - "loss": 0.0302, - "step": 1088 - }, - { - "epoch": 6.680981595092025, - "grad_norm": 5.788925647735596, - "learning_rate": 1.2444403318427268e-06, - "loss": 0.0296, - "step": 1089 - }, - { - "epoch": 6.6871165644171775, - "grad_norm": 5.127935886383057, - "learning_rate": 1.2402760200912725e-06, - "loss": 0.1532, - "step": 1090 - }, - { - "epoch": 6.693251533742331, - "grad_norm": 2.2610318660736084, - "learning_rate": 1.2361163878435594e-06, - "loss": 0.0126, - "step": 1091 - }, - { - "epoch": 6.699386503067485, - "grad_norm": 1.7913328409194946, - "learning_rate": 1.2319614505513953e-06, - "loss": 0.0086, - "step": 1092 - }, - { - "epoch": 6.705521472392638, - "grad_norm": 1.5961267948150635, - "learning_rate": 1.227811223649149e-06, - "loss": 0.0041, - "step": 1093 - }, - { - "epoch": 6.711656441717792, - "grad_norm": 1.441754937171936, - "learning_rate": 1.2236657225536938e-06, - "loss": 0.0103, - "step": 1094 - }, - { - "epoch": 6.717791411042945, - "grad_norm": 1.4393174648284912, - "learning_rate": 1.2195249626643432e-06, - "loss": 0.0063, - "step": 1095 - }, - { - "epoch": 6.723926380368098, - "grad_norm": 3.199451208114624, - "learning_rate": 1.2153889593628032e-06, - "loss": 0.0571, - "step": 1096 - }, - { - "epoch": 6.730061349693251, - "grad_norm": 2.1796770095825195, - "learning_rate": 1.211257728013107e-06, - "loss": 0.0269, - "step": 1097 - }, - { - "epoch": 6.736196319018405, - "grad_norm": 3.1798806190490723, - "learning_rate": 1.2071312839615634e-06, - "loss": 0.0396, - "step": 1098 - }, - { - "epoch": 6.742331288343558, - "grad_norm": 3.063633680343628, - "learning_rate": 1.2030096425366985e-06, - "loss": 0.0261, - "step": 1099 - }, - { - "epoch": 6.748466257668712, - "grad_norm": 1.860409140586853, - "learning_rate": 1.1988928190491948e-06, - "loss": 0.013, - "step": 1100 - }, - { - "epoch": 6.754601226993865, - "grad_norm": 1.9303224086761475, - "learning_rate": 1.1947808287918406e-06, - "loss": 0.0113, - "step": 1101 - }, - { - "epoch": 6.7607361963190185, - "grad_norm": 2.1432337760925293, - "learning_rate": 1.19067368703947e-06, - "loss": 0.0195, - "step": 1102 - }, - { - "epoch": 6.766871165644172, - "grad_norm": 1.8998470306396484, - "learning_rate": 1.1865714090489038e-06, - "loss": 0.0105, - "step": 1103 - }, - { - "epoch": 6.773006134969325, - "grad_norm": 2.3260247707366943, - "learning_rate": 1.1824740100588991e-06, - "loss": 0.0554, - "step": 1104 - }, - { - "epoch": 6.779141104294479, - "grad_norm": 1.9272006750106812, - "learning_rate": 1.1783815052900848e-06, - "loss": 0.0118, - "step": 1105 - }, - { - "epoch": 6.785276073619632, - "grad_norm": 3.1646785736083984, - "learning_rate": 1.1742939099449126e-06, - "loss": 0.0901, - "step": 1106 - }, - { - "epoch": 6.791411042944786, - "grad_norm": 3.357422351837158, - "learning_rate": 1.1702112392075966e-06, - "loss": 0.0833, - "step": 1107 - }, - { - "epoch": 6.7975460122699385, - "grad_norm": 1.4302526712417603, - "learning_rate": 1.1661335082440545e-06, - "loss": 0.0078, - "step": 1108 - }, - { - "epoch": 6.803680981595092, - "grad_norm": 1.3046417236328125, - "learning_rate": 1.1620607322018587e-06, - "loss": 0.0092, - "step": 1109 - }, - { - "epoch": 6.809815950920245, - "grad_norm": 2.084237813949585, - "learning_rate": 1.1579929262101712e-06, - "loss": 0.0283, - "step": 1110 - }, - { - "epoch": 6.815950920245399, - "grad_norm": 1.9403250217437744, - "learning_rate": 1.153930105379695e-06, - "loss": 0.0066, - "step": 1111 - }, - { - "epoch": 6.822085889570552, - "grad_norm": 2.282449722290039, - "learning_rate": 1.1498722848026142e-06, - "loss": 0.0402, - "step": 1112 - }, - { - "epoch": 6.828220858895706, - "grad_norm": 1.9357627630233765, - "learning_rate": 1.1458194795525354e-06, - "loss": 0.0101, - "step": 1113 - }, - { - "epoch": 6.8343558282208585, - "grad_norm": 2.0236339569091797, - "learning_rate": 1.1417717046844385e-06, - "loss": 0.0109, - "step": 1114 - }, - { - "epoch": 6.840490797546012, - "grad_norm": 2.386857032775879, - "learning_rate": 1.137728975234615e-06, - "loss": 0.0297, - "step": 1115 - }, - { - "epoch": 6.846625766871165, - "grad_norm": 2.2477970123291016, - "learning_rate": 1.1336913062206157e-06, - "loss": 0.0393, - "step": 1116 - }, - { - "epoch": 6.852760736196319, - "grad_norm": 2.7217776775360107, - "learning_rate": 1.129658712641192e-06, - "loss": 0.0269, - "step": 1117 - }, - { - "epoch": 6.858895705521473, - "grad_norm": 2.6717259883880615, - "learning_rate": 1.125631209476241e-06, - "loss": 0.0708, - "step": 1118 - }, - { - "epoch": 6.865030674846626, - "grad_norm": 2.951939344406128, - "learning_rate": 1.1216088116867524e-06, - "loss": 0.0835, - "step": 1119 - }, - { - "epoch": 6.871165644171779, - "grad_norm": 1.9705166816711426, - "learning_rate": 1.1175915342147486e-06, - "loss": 0.0107, - "step": 1120 - }, - { - "epoch": 6.877300613496932, - "grad_norm": 2.4005937576293945, - "learning_rate": 1.1135793919832336e-06, - "loss": 0.0139, - "step": 1121 - }, - { - "epoch": 6.883435582822086, - "grad_norm": 2.277463674545288, - "learning_rate": 1.1095723998961353e-06, - "loss": 0.0154, - "step": 1122 - }, - { - "epoch": 6.889570552147239, - "grad_norm": 1.5026034116744995, - "learning_rate": 1.1055705728382482e-06, - "loss": 0.0072, - "step": 1123 - }, - { - "epoch": 6.895705521472393, - "grad_norm": 1.9540379047393799, - "learning_rate": 1.1015739256751826e-06, - "loss": 0.0202, - "step": 1124 - }, - { - "epoch": 6.901840490797546, - "grad_norm": 2.3090603351593018, - "learning_rate": 1.0975824732533066e-06, - "loss": 0.0559, - "step": 1125 - }, - { - "epoch": 6.9079754601226995, - "grad_norm": 2.100283622741699, - "learning_rate": 1.09359623039969e-06, - "loss": 0.0385, - "step": 1126 - }, - { - "epoch": 6.914110429447852, - "grad_norm": 2.4120566844940186, - "learning_rate": 1.0896152119220525e-06, - "loss": 0.0535, - "step": 1127 - }, - { - "epoch": 6.920245398773006, - "grad_norm": 2.003495454788208, - "learning_rate": 1.0856394326087045e-06, - "loss": 0.0104, - "step": 1128 - }, - { - "epoch": 6.92638036809816, - "grad_norm": 1.6565535068511963, - "learning_rate": 1.0816689072284962e-06, - "loss": 0.0121, - "step": 1129 - }, - { - "epoch": 6.932515337423313, - "grad_norm": 1.6503472328186035, - "learning_rate": 1.0777036505307616e-06, - "loss": 0.0056, - "step": 1130 - }, - { - "epoch": 6.938650306748467, - "grad_norm": 2.600112199783325, - "learning_rate": 1.0737436772452602e-06, - "loss": 0.0198, - "step": 1131 - }, - { - "epoch": 6.9447852760736195, - "grad_norm": 1.6668883562088013, - "learning_rate": 1.0697890020821292e-06, - "loss": 0.0077, - "step": 1132 - }, - { - "epoch": 6.950920245398773, - "grad_norm": 2.729172706604004, - "learning_rate": 1.0658396397318203e-06, - "loss": 0.0329, - "step": 1133 - }, - { - "epoch": 6.957055214723926, - "grad_norm": 1.5219136476516724, - "learning_rate": 1.061895604865053e-06, - "loss": 0.0113, - "step": 1134 - }, - { - "epoch": 6.96319018404908, - "grad_norm": 3.8395588397979736, - "learning_rate": 1.057956912132757e-06, - "loss": 0.0376, - "step": 1135 - }, - { - "epoch": 6.969325153374233, - "grad_norm": 2.4347221851348877, - "learning_rate": 1.054023576166014e-06, - "loss": 0.0517, - "step": 1136 - }, - { - "epoch": 6.975460122699387, - "grad_norm": 3.079165458679199, - "learning_rate": 1.0500956115760105e-06, - "loss": 0.0373, - "step": 1137 - }, - { - "epoch": 6.9815950920245395, - "grad_norm": 1.9391908645629883, - "learning_rate": 1.0461730329539794e-06, - "loss": 0.019, - "step": 1138 - }, - { - "epoch": 6.987730061349693, - "grad_norm": 1.8693119287490845, - "learning_rate": 1.0422558548711434e-06, - "loss": 0.0073, - "step": 1139 - }, - { - "epoch": 6.993865030674847, - "grad_norm": 3.0920307636260986, - "learning_rate": 1.0383440918786684e-06, - "loss": 0.0099, - "step": 1140 - }, - { - "epoch": 7.0, - "grad_norm": 3.184906244277954, - "learning_rate": 1.0344377585076e-06, - "loss": 0.0218, - "step": 1141 - }, - { - "epoch": 7.006134969325154, - "grad_norm": 0.7609673142433167, - "learning_rate": 1.0305368692688175e-06, - "loss": 0.0024, - "step": 1142 - }, - { - "epoch": 7.012269938650307, - "grad_norm": 1.1493247747421265, - "learning_rate": 1.0266414386529775e-06, - "loss": 0.0059, - "step": 1143 - }, - { - "epoch": 7.0184049079754605, - "grad_norm": 3.534796953201294, - "learning_rate": 1.0227514811304556e-06, - "loss": 0.0843, - "step": 1144 - }, - { - "epoch": 7.024539877300613, - "grad_norm": 1.1876507997512817, - "learning_rate": 1.0188670111513002e-06, - "loss": 0.0098, - "step": 1145 - }, - { - "epoch": 7.030674846625767, - "grad_norm": 1.2825753688812256, - "learning_rate": 1.0149880431451736e-06, - "loss": 0.0042, - "step": 1146 - }, - { - "epoch": 7.03680981595092, - "grad_norm": 0.6842563152313232, - "learning_rate": 1.0111145915213e-06, - "loss": 0.003, - "step": 1147 - }, - { - "epoch": 7.042944785276074, - "grad_norm": 0.6310113072395325, - "learning_rate": 1.0072466706684127e-06, - "loss": 0.0027, - "step": 1148 - }, - { - "epoch": 7.049079754601227, - "grad_norm": 1.484761357307434, - "learning_rate": 1.0033842949546974e-06, - "loss": 0.0105, - "step": 1149 - }, - { - "epoch": 7.0552147239263805, - "grad_norm": 1.9790291786193848, - "learning_rate": 9.995274787277445e-07, - "loss": 0.0233, - "step": 1150 - }, - { - "epoch": 7.061349693251533, - "grad_norm": 1.1398522853851318, - "learning_rate": 9.956762363144892e-07, - "loss": 0.0031, - "step": 1151 - }, - { - "epoch": 7.067484662576687, - "grad_norm": 1.0574359893798828, - "learning_rate": 9.918305820211643e-07, - "loss": 0.0047, - "step": 1152 - }, - { - "epoch": 7.07361963190184, - "grad_norm": 2.463972330093384, - "learning_rate": 9.879905301332439e-07, - "loss": 0.0334, - "step": 1153 - }, - { - "epoch": 7.079754601226994, - "grad_norm": 1.4698575735092163, - "learning_rate": 9.84156094915389e-07, - "loss": 0.0191, - "step": 1154 - }, - { - "epoch": 7.085889570552148, - "grad_norm": 1.2635239362716675, - "learning_rate": 9.803272906113978e-07, - "loss": 0.0045, - "step": 1155 - }, - { - "epoch": 7.0920245398773005, - "grad_norm": 1.7271842956542969, - "learning_rate": 9.765041314441529e-07, - "loss": 0.0042, - "step": 1156 - }, - { - "epoch": 7.098159509202454, - "grad_norm": 1.5738918781280518, - "learning_rate": 9.72686631615563e-07, - "loss": 0.0066, - "step": 1157 - }, - { - "epoch": 7.104294478527607, - "grad_norm": 1.3097981214523315, - "learning_rate": 9.688748053065179e-07, - "loss": 0.0058, - "step": 1158 - }, - { - "epoch": 7.110429447852761, - "grad_norm": 2.076064348220825, - "learning_rate": 9.65068666676828e-07, - "loss": 0.0067, - "step": 1159 - }, - { - "epoch": 7.116564417177914, - "grad_norm": 1.1589064598083496, - "learning_rate": 9.612682298651792e-07, - "loss": 0.0052, - "step": 1160 - }, - { - "epoch": 7.122699386503068, - "grad_norm": 1.6450324058532715, - "learning_rate": 9.574735089890765e-07, - "loss": 0.0035, - "step": 1161 - }, - { - "epoch": 7.128834355828221, - "grad_norm": 1.6968387365341187, - "learning_rate": 9.53684518144789e-07, - "loss": 0.0126, - "step": 1162 - }, - { - "epoch": 7.134969325153374, - "grad_norm": 1.9047832489013672, - "learning_rate": 9.499012714073036e-07, - "loss": 0.0345, - "step": 1163 - }, - { - "epoch": 7.141104294478527, - "grad_norm": 1.7587796449661255, - "learning_rate": 9.461237828302666e-07, - "loss": 0.0144, - "step": 1164 - }, - { - "epoch": 7.147239263803681, - "grad_norm": 1.863775372505188, - "learning_rate": 9.423520664459374e-07, - "loss": 0.0135, - "step": 1165 - }, - { - "epoch": 7.153374233128835, - "grad_norm": 2.6580259799957275, - "learning_rate": 9.385861362651322e-07, - "loss": 0.0138, - "step": 1166 - }, - { - "epoch": 7.159509202453988, - "grad_norm": 2.086371421813965, - "learning_rate": 9.348260062771713e-07, - "loss": 0.0093, - "step": 1167 - }, - { - "epoch": 7.1656441717791415, - "grad_norm": 1.0806611776351929, - "learning_rate": 9.310716904498321e-07, - "loss": 0.003, - "step": 1168 - }, - { - "epoch": 7.171779141104294, - "grad_norm": 1.2487165927886963, - "learning_rate": 9.273232027292933e-07, - "loss": 0.0033, - "step": 1169 - }, - { - "epoch": 7.177914110429448, - "grad_norm": 1.0647703409194946, - "learning_rate": 9.235805570400813e-07, - "loss": 0.0024, - "step": 1170 - }, - { - "epoch": 7.184049079754601, - "grad_norm": 1.6039917469024658, - "learning_rate": 9.198437672850249e-07, - "loss": 0.0118, - "step": 1171 - }, - { - "epoch": 7.190184049079755, - "grad_norm": 2.199977159500122, - "learning_rate": 9.161128473451967e-07, - "loss": 0.0173, - "step": 1172 - }, - { - "epoch": 7.196319018404908, - "grad_norm": 2.51725697517395, - "learning_rate": 9.123878110798662e-07, - "loss": 0.0142, - "step": 1173 - }, - { - "epoch": 7.2024539877300615, - "grad_norm": 1.841742753982544, - "learning_rate": 9.086686723264474e-07, - "loss": 0.012, - "step": 1174 - }, - { - "epoch": 7.208588957055214, - "grad_norm": 1.212876319885254, - "learning_rate": 9.049554449004447e-07, - "loss": 0.0055, - "step": 1175 - }, - { - "epoch": 7.214723926380368, - "grad_norm": 1.3728275299072266, - "learning_rate": 9.012481425954053e-07, - "loss": 0.0043, - "step": 1176 - }, - { - "epoch": 7.220858895705521, - "grad_norm": 2.3055357933044434, - "learning_rate": 8.97546779182866e-07, - "loss": 0.0443, - "step": 1177 - }, - { - "epoch": 7.226993865030675, - "grad_norm": 2.017620801925659, - "learning_rate": 8.938513684123024e-07, - "loss": 0.0082, - "step": 1178 - }, - { - "epoch": 7.233128834355828, - "grad_norm": 1.5641282796859741, - "learning_rate": 8.901619240110781e-07, - "loss": 0.0071, - "step": 1179 - }, - { - "epoch": 7.2392638036809815, - "grad_norm": 1.3781960010528564, - "learning_rate": 8.864784596843917e-07, - "loss": 0.0056, - "step": 1180 - }, - { - "epoch": 7.245398773006135, - "grad_norm": 1.23178231716156, - "learning_rate": 8.828009891152301e-07, - "loss": 0.0076, - "step": 1181 - }, - { - "epoch": 7.251533742331288, - "grad_norm": 2.809582233428955, - "learning_rate": 8.791295259643126e-07, - "loss": 0.0141, - "step": 1182 - }, - { - "epoch": 7.257668711656442, - "grad_norm": 1.6520317792892456, - "learning_rate": 8.754640838700443e-07, - "loss": 0.01, - "step": 1183 - }, - { - "epoch": 7.263803680981595, - "grad_norm": 1.411852478981018, - "learning_rate": 8.718046764484648e-07, - "loss": 0.009, - "step": 1184 - }, - { - "epoch": 7.269938650306749, - "grad_norm": 2.9334425926208496, - "learning_rate": 8.681513172931935e-07, - "loss": 0.0291, - "step": 1185 - }, - { - "epoch": 7.276073619631902, - "grad_norm": 1.4273028373718262, - "learning_rate": 8.64504019975386e-07, - "loss": 0.0064, - "step": 1186 - }, - { - "epoch": 7.282208588957055, - "grad_norm": 1.9486448764801025, - "learning_rate": 8.608627980436765e-07, - "loss": 0.0135, - "step": 1187 - }, - { - "epoch": 7.288343558282208, - "grad_norm": 1.3740493059158325, - "learning_rate": 8.572276650241329e-07, - "loss": 0.0061, - "step": 1188 - }, - { - "epoch": 7.294478527607362, - "grad_norm": 1.3352797031402588, - "learning_rate": 8.535986344202057e-07, - "loss": 0.0051, - "step": 1189 - }, - { - "epoch": 7.300613496932515, - "grad_norm": 1.0336774587631226, - "learning_rate": 8.499757197126732e-07, - "loss": 0.0052, - "step": 1190 - }, - { - "epoch": 7.306748466257669, - "grad_norm": 1.1450837850570679, - "learning_rate": 8.463589343595976e-07, - "loss": 0.0111, - "step": 1191 - }, - { - "epoch": 7.3128834355828225, - "grad_norm": 2.504876136779785, - "learning_rate": 8.427482917962734e-07, - "loss": 0.0279, - "step": 1192 - }, - { - "epoch": 7.319018404907975, - "grad_norm": 1.569841980934143, - "learning_rate": 8.391438054351725e-07, - "loss": 0.0105, - "step": 1193 - }, - { - "epoch": 7.325153374233129, - "grad_norm": 1.218538761138916, - "learning_rate": 8.355454886659026e-07, - "loss": 0.0028, - "step": 1194 - }, - { - "epoch": 7.331288343558282, - "grad_norm": 2.084049940109253, - "learning_rate": 8.319533548551492e-07, - "loss": 0.0102, - "step": 1195 - }, - { - "epoch": 7.337423312883436, - "grad_norm": 2.326167345046997, - "learning_rate": 8.28367417346633e-07, - "loss": 0.0396, - "step": 1196 - }, - { - "epoch": 7.343558282208589, - "grad_norm": 1.2704310417175293, - "learning_rate": 8.247876894610568e-07, - "loss": 0.006, - "step": 1197 - }, - { - "epoch": 7.3496932515337425, - "grad_norm": 1.358012318611145, - "learning_rate": 8.212141844960544e-07, - "loss": 0.0075, - "step": 1198 - }, - { - "epoch": 7.355828220858895, - "grad_norm": 1.5145729780197144, - "learning_rate": 8.17646915726146e-07, - "loss": 0.0042, - "step": 1199 - }, - { - "epoch": 7.361963190184049, - "grad_norm": 1.203041911125183, - "learning_rate": 8.140858964026849e-07, - "loss": 0.0032, - "step": 1200 - }, - { - "epoch": 7.368098159509202, - "grad_norm": 3.031280279159546, - "learning_rate": 8.105311397538085e-07, - "loss": 0.032, - "step": 1201 - }, - { - "epoch": 7.374233128834356, - "grad_norm": 1.416698694229126, - "learning_rate": 8.069826589843929e-07, - "loss": 0.0185, - "step": 1202 - }, - { - "epoch": 7.38036809815951, - "grad_norm": 0.9656457901000977, - "learning_rate": 8.034404672759977e-07, - "loss": 0.0034, - "step": 1203 - }, - { - "epoch": 7.386503067484663, - "grad_norm": 1.7239291667938232, - "learning_rate": 7.99904577786823e-07, - "loss": 0.034, - "step": 1204 - }, - { - "epoch": 7.392638036809816, - "grad_norm": 1.1560636758804321, - "learning_rate": 7.963750036516585e-07, - "loss": 0.005, - "step": 1205 - }, - { - "epoch": 7.398773006134969, - "grad_norm": 1.057456374168396, - "learning_rate": 7.928517579818312e-07, - "loss": 0.0073, - "step": 1206 - }, - { - "epoch": 7.404907975460123, - "grad_norm": 1.4066674709320068, - "learning_rate": 7.893348538651635e-07, - "loss": 0.015, - "step": 1207 - }, - { - "epoch": 7.411042944785276, - "grad_norm": 1.1061445474624634, - "learning_rate": 7.858243043659161e-07, - "loss": 0.004, - "step": 1208 - }, - { - "epoch": 7.41717791411043, - "grad_norm": 0.9575282335281372, - "learning_rate": 7.823201225247496e-07, - "loss": 0.003, - "step": 1209 - }, - { - "epoch": 7.423312883435583, - "grad_norm": 1.3790507316589355, - "learning_rate": 7.788223213586677e-07, - "loss": 0.0096, - "step": 1210 - }, - { - "epoch": 7.429447852760736, - "grad_norm": 1.1366883516311646, - "learning_rate": 7.753309138609705e-07, - "loss": 0.006, - "step": 1211 - }, - { - "epoch": 7.435582822085889, - "grad_norm": 2.2659928798675537, - "learning_rate": 7.71845913001211e-07, - "loss": 0.0074, - "step": 1212 - }, - { - "epoch": 7.441717791411043, - "grad_norm": 1.2541831731796265, - "learning_rate": 7.683673317251392e-07, - "loss": 0.0051, - "step": 1213 - }, - { - "epoch": 7.447852760736196, - "grad_norm": 1.5959513187408447, - "learning_rate": 7.648951829546619e-07, - "loss": 0.0271, - "step": 1214 - }, - { - "epoch": 7.45398773006135, - "grad_norm": 1.368452548980713, - "learning_rate": 7.6142947958779e-07, - "loss": 0.0155, - "step": 1215 - }, - { - "epoch": 7.460122699386503, - "grad_norm": 1.1851825714111328, - "learning_rate": 7.579702344985899e-07, - "loss": 0.0032, - "step": 1216 - }, - { - "epoch": 7.466257668711656, - "grad_norm": 1.419812560081482, - "learning_rate": 7.545174605371403e-07, - "loss": 0.0037, - "step": 1217 - }, - { - "epoch": 7.47239263803681, - "grad_norm": 1.0817372798919678, - "learning_rate": 7.510711705294782e-07, - "loss": 0.0064, - "step": 1218 - }, - { - "epoch": 7.478527607361963, - "grad_norm": 1.0459797382354736, - "learning_rate": 7.476313772775578e-07, - "loss": 0.0055, - "step": 1219 - }, - { - "epoch": 7.484662576687117, - "grad_norm": 1.4481663703918457, - "learning_rate": 7.441980935591986e-07, - "loss": 0.0049, - "step": 1220 - }, - { - "epoch": 7.49079754601227, - "grad_norm": 1.7337101697921753, - "learning_rate": 7.407713321280377e-07, - "loss": 0.0123, - "step": 1221 - }, - { - "epoch": 7.4969325153374236, - "grad_norm": 1.3378303050994873, - "learning_rate": 7.373511057134855e-07, - "loss": 0.0056, - "step": 1222 - }, - { - "epoch": 7.5030674846625764, - "grad_norm": 2.4353835582733154, - "learning_rate": 7.339374270206772e-07, - "loss": 0.0155, - "step": 1223 - }, - { - "epoch": 7.50920245398773, - "grad_norm": 2.2856571674346924, - "learning_rate": 7.305303087304227e-07, - "loss": 0.0303, - "step": 1224 - }, - { - "epoch": 7.515337423312883, - "grad_norm": 1.0627055168151855, - "learning_rate": 7.271297634991651e-07, - "loss": 0.0018, - "step": 1225 - }, - { - "epoch": 7.521472392638037, - "grad_norm": 1.2120238542556763, - "learning_rate": 7.237358039589271e-07, - "loss": 0.0064, - "step": 1226 - }, - { - "epoch": 7.52760736196319, - "grad_norm": 1.1861765384674072, - "learning_rate": 7.203484427172702e-07, - "loss": 0.0025, - "step": 1227 - }, - { - "epoch": 7.533742331288344, - "grad_norm": 1.6700332164764404, - "learning_rate": 7.169676923572447e-07, - "loss": 0.0067, - "step": 1228 - }, - { - "epoch": 7.539877300613497, - "grad_norm": 1.4527982473373413, - "learning_rate": 7.135935654373416e-07, - "loss": 0.0082, - "step": 1229 - }, - { - "epoch": 7.54601226993865, - "grad_norm": 1.1425046920776367, - "learning_rate": 7.102260744914499e-07, - "loss": 0.0042, - "step": 1230 - }, - { - "epoch": 7.552147239263804, - "grad_norm": 2.0762295722961426, - "learning_rate": 7.068652320288081e-07, - "loss": 0.0374, - "step": 1231 - }, - { - "epoch": 7.558282208588957, - "grad_norm": 1.2008321285247803, - "learning_rate": 7.035110505339546e-07, - "loss": 0.0022, - "step": 1232 - }, - { - "epoch": 7.564417177914111, - "grad_norm": 1.262100338935852, - "learning_rate": 7.001635424666878e-07, - "loss": 0.006, - "step": 1233 - }, - { - "epoch": 7.570552147239264, - "grad_norm": 1.8173811435699463, - "learning_rate": 6.968227202620137e-07, - "loss": 0.0137, - "step": 1234 - }, - { - "epoch": 7.576687116564417, - "grad_norm": 1.6977999210357666, - "learning_rate": 6.934885963301033e-07, - "loss": 0.0216, - "step": 1235 - }, - { - "epoch": 7.58282208588957, - "grad_norm": 0.7084318399429321, - "learning_rate": 6.901611830562469e-07, - "loss": 0.0027, - "step": 1236 - }, - { - "epoch": 7.588957055214724, - "grad_norm": 2.0332374572753906, - "learning_rate": 6.868404928008035e-07, - "loss": 0.0391, - "step": 1237 - }, - { - "epoch": 7.595092024539877, - "grad_norm": 1.235734224319458, - "learning_rate": 6.835265378991613e-07, - "loss": 0.0053, - "step": 1238 - }, - { - "epoch": 7.601226993865031, - "grad_norm": 2.687920331954956, - "learning_rate": 6.802193306616858e-07, - "loss": 0.0395, - "step": 1239 - }, - { - "epoch": 7.6073619631901845, - "grad_norm": 1.4211101531982422, - "learning_rate": 6.769188833736781e-07, - "loss": 0.0055, - "step": 1240 - }, - { - "epoch": 7.613496932515337, - "grad_norm": 2.4542644023895264, - "learning_rate": 6.736252082953307e-07, - "loss": 0.0072, - "step": 1241 - }, - { - "epoch": 7.61963190184049, - "grad_norm": 1.2946943044662476, - "learning_rate": 6.703383176616743e-07, - "loss": 0.0046, - "step": 1242 - }, - { - "epoch": 7.625766871165644, - "grad_norm": 3.8073277473449707, - "learning_rate": 6.670582236825421e-07, - "loss": 0.0742, - "step": 1243 - }, - { - "epoch": 7.631901840490798, - "grad_norm": 1.4291348457336426, - "learning_rate": 6.637849385425157e-07, - "loss": 0.0069, - "step": 1244 - }, - { - "epoch": 7.638036809815951, - "grad_norm": 1.1767655611038208, - "learning_rate": 6.605184744008866e-07, - "loss": 0.0031, - "step": 1245 - }, - { - "epoch": 7.644171779141105, - "grad_norm": 1.837077260017395, - "learning_rate": 6.572588433916082e-07, - "loss": 0.0316, - "step": 1246 - }, - { - "epoch": 7.6503067484662575, - "grad_norm": 1.9157041311264038, - "learning_rate": 6.540060576232488e-07, - "loss": 0.0472, - "step": 1247 - }, - { - "epoch": 7.656441717791411, - "grad_norm": 1.7347630262374878, - "learning_rate": 6.507601291789515e-07, - "loss": 0.0059, - "step": 1248 - }, - { - "epoch": 7.662576687116564, - "grad_norm": 0.9757588505744934, - "learning_rate": 6.475210701163828e-07, - "loss": 0.0023, - "step": 1249 - }, - { - "epoch": 7.668711656441718, - "grad_norm": 1.9460281133651733, - "learning_rate": 6.442888924676951e-07, - "loss": 0.0207, - "step": 1250 - }, - { - "epoch": 7.674846625766871, - "grad_norm": 0.7517938613891602, - "learning_rate": 6.410636082394772e-07, - "loss": 0.002, - "step": 1251 - }, - { - "epoch": 7.680981595092025, - "grad_norm": 1.0631566047668457, - "learning_rate": 6.378452294127091e-07, - "loss": 0.0038, - "step": 1252 - }, - { - "epoch": 7.6871165644171775, - "grad_norm": 0.9524463415145874, - "learning_rate": 6.346337679427214e-07, - "loss": 0.0024, - "step": 1253 - }, - { - "epoch": 7.693251533742331, - "grad_norm": 1.3653123378753662, - "learning_rate": 6.314292357591489e-07, - "loss": 0.0027, - "step": 1254 - }, - { - "epoch": 7.699386503067485, - "grad_norm": 1.2446377277374268, - "learning_rate": 6.282316447658837e-07, - "loss": 0.0048, - "step": 1255 - }, - { - "epoch": 7.705521472392638, - "grad_norm": 1.716244101524353, - "learning_rate": 6.250410068410367e-07, - "loss": 0.0064, - "step": 1256 - }, - { - "epoch": 7.711656441717792, - "grad_norm": 1.7151219844818115, - "learning_rate": 6.218573338368869e-07, - "loss": 0.0056, - "step": 1257 - }, - { - "epoch": 7.717791411042945, - "grad_norm": 1.8013248443603516, - "learning_rate": 6.186806375798429e-07, - "loss": 0.0073, - "step": 1258 - }, - { - "epoch": 7.723926380368098, - "grad_norm": 1.051620602607727, - "learning_rate": 6.155109298703968e-07, - "loss": 0.0043, - "step": 1259 - }, - { - "epoch": 7.730061349693251, - "grad_norm": 1.5731337070465088, - "learning_rate": 6.123482224830787e-07, - "loss": 0.0108, - "step": 1260 - }, - { - "epoch": 7.736196319018405, - "grad_norm": 2.232144832611084, - "learning_rate": 6.091925271664156e-07, - "loss": 0.0337, - "step": 1261 - }, - { - "epoch": 7.742331288343558, - "grad_norm": 1.072678565979004, - "learning_rate": 6.060438556428877e-07, - "loss": 0.0019, - "step": 1262 - }, - { - "epoch": 7.748466257668712, - "grad_norm": 2.3631110191345215, - "learning_rate": 6.02902219608881e-07, - "loss": 0.0089, - "step": 1263 - }, - { - "epoch": 7.754601226993865, - "grad_norm": 1.1171438694000244, - "learning_rate": 5.997676307346504e-07, - "loss": 0.0045, - "step": 1264 - }, - { - "epoch": 7.7607361963190185, - "grad_norm": 0.7839979529380798, - "learning_rate": 5.966401006642689e-07, - "loss": 0.0028, - "step": 1265 - }, - { - "epoch": 7.766871165644172, - "grad_norm": 1.5938968658447266, - "learning_rate": 5.93519641015591e-07, - "loss": 0.009, - "step": 1266 - }, - { - "epoch": 7.773006134969325, - "grad_norm": 1.2980104684829712, - "learning_rate": 5.904062633802066e-07, - "loss": 0.0168, - "step": 1267 - }, - { - "epoch": 7.779141104294479, - "grad_norm": 1.177626371383667, - "learning_rate": 5.872999793233952e-07, - "loss": 0.0029, - "step": 1268 - }, - { - "epoch": 7.785276073619632, - "grad_norm": 2.0138931274414062, - "learning_rate": 5.842008003840891e-07, - "loss": 0.015, - "step": 1269 - }, - { - "epoch": 7.791411042944786, - "grad_norm": 1.7204387187957764, - "learning_rate": 5.811087380748245e-07, - "loss": 0.011, - "step": 1270 - }, - { - "epoch": 7.7975460122699385, - "grad_norm": 1.506241798400879, - "learning_rate": 5.780238038817035e-07, - "loss": 0.0057, - "step": 1271 - }, - { - "epoch": 7.803680981595092, - "grad_norm": 2.0950393676757812, - "learning_rate": 5.74946009264348e-07, - "loss": 0.0131, - "step": 1272 - }, - { - "epoch": 7.809815950920245, - "grad_norm": 2.1451432704925537, - "learning_rate": 5.71875365655859e-07, - "loss": 0.0088, - "step": 1273 - }, - { - "epoch": 7.815950920245399, - "grad_norm": 0.9690236449241638, - "learning_rate": 5.688118844627746e-07, - "loss": 0.0033, - "step": 1274 - }, - { - "epoch": 7.822085889570552, - "grad_norm": 1.5690608024597168, - "learning_rate": 5.657555770650241e-07, - "loss": 0.0206, - "step": 1275 - }, - { - "epoch": 7.828220858895706, - "grad_norm": 1.8220988512039185, - "learning_rate": 5.627064548158903e-07, - "loss": 0.0096, - "step": 1276 - }, - { - "epoch": 7.8343558282208585, - "grad_norm": 2.3800559043884277, - "learning_rate": 5.596645290419653e-07, - "loss": 0.008, - "step": 1277 - }, - { - "epoch": 7.840490797546012, - "grad_norm": 0.7775714993476868, - "learning_rate": 5.566298110431068e-07, - "loss": 0.0016, - "step": 1278 - }, - { - "epoch": 7.846625766871165, - "grad_norm": 1.1196876764297485, - "learning_rate": 5.536023120924e-07, - "loss": 0.0033, - "step": 1279 - }, - { - "epoch": 7.852760736196319, - "grad_norm": 1.3722344636917114, - "learning_rate": 5.505820434361108e-07, - "loss": 0.0084, - "step": 1280 - }, - { - "epoch": 7.858895705521473, - "grad_norm": 1.2068676948547363, - "learning_rate": 5.47569016293649e-07, - "loss": 0.0049, - "step": 1281 - }, - { - "epoch": 7.865030674846626, - "grad_norm": 1.096085548400879, - "learning_rate": 5.445632418575239e-07, - "loss": 0.0019, - "step": 1282 - }, - { - "epoch": 7.871165644171779, - "grad_norm": 1.3178106546401978, - "learning_rate": 5.415647312933015e-07, - "loss": 0.0062, - "step": 1283 - }, - { - "epoch": 7.877300613496932, - "grad_norm": 1.2884724140167236, - "learning_rate": 5.385734957395664e-07, - "loss": 0.0081, - "step": 1284 - }, - { - "epoch": 7.883435582822086, - "grad_norm": 0.9866589307785034, - "learning_rate": 5.355895463078789e-07, - "loss": 0.0048, - "step": 1285 - }, - { - "epoch": 7.889570552147239, - "grad_norm": 1.5396437644958496, - "learning_rate": 5.326128940827313e-07, - "loss": 0.0088, - "step": 1286 - }, - { - "epoch": 7.895705521472393, - "grad_norm": 1.1183607578277588, - "learning_rate": 5.296435501215116e-07, - "loss": 0.0043, - "step": 1287 - }, - { - "epoch": 7.901840490797546, - "grad_norm": 1.5337073802947998, - "learning_rate": 5.266815254544572e-07, - "loss": 0.0099, - "step": 1288 - }, - { - "epoch": 7.9079754601226995, - "grad_norm": 1.8188867568969727, - "learning_rate": 5.237268310846183e-07, - "loss": 0.0086, - "step": 1289 - }, - { - "epoch": 7.914110429447852, - "grad_norm": 1.972072720527649, - "learning_rate": 5.207794779878156e-07, - "loss": 0.0442, - "step": 1290 - }, - { - "epoch": 7.920245398773006, - "grad_norm": 1.1226261854171753, - "learning_rate": 5.178394771125969e-07, - "loss": 0.0071, - "step": 1291 - }, - { - "epoch": 7.92638036809816, - "grad_norm": 1.5612869262695312, - "learning_rate": 5.149068393802009e-07, - "loss": 0.0192, - "step": 1292 - }, - { - "epoch": 7.932515337423313, - "grad_norm": 1.1532280445098877, - "learning_rate": 5.119815756845123e-07, - "loss": 0.0032, - "step": 1293 - }, - { - "epoch": 7.938650306748467, - "grad_norm": 1.8807255029678345, - "learning_rate": 5.090636968920252e-07, - "loss": 0.0139, - "step": 1294 - }, - { - "epoch": 7.9447852760736195, - "grad_norm": 1.3027002811431885, - "learning_rate": 5.061532138418013e-07, - "loss": 0.0071, - "step": 1295 - }, - { - "epoch": 7.950920245398773, - "grad_norm": 1.584154486656189, - "learning_rate": 5.032501373454266e-07, - "loss": 0.0056, - "step": 1296 - }, - { - "epoch": 7.957055214723926, - "grad_norm": 1.7631733417510986, - "learning_rate": 5.003544781869762e-07, - "loss": 0.0239, - "step": 1297 - }, - { - "epoch": 7.96319018404908, - "grad_norm": 1.9462637901306152, - "learning_rate": 4.974662471229727e-07, - "loss": 0.0336, - "step": 1298 - }, - { - "epoch": 7.969325153374233, - "grad_norm": 1.9697695970535278, - "learning_rate": 4.945854548823425e-07, - "loss": 0.0049, - "step": 1299 - }, - { - "epoch": 7.975460122699387, - "grad_norm": 1.066036581993103, - "learning_rate": 4.917121121663823e-07, - "loss": 0.0103, - "step": 1300 - }, - { - "epoch": 7.9815950920245395, - "grad_norm": 1.0865890979766846, - "learning_rate": 4.888462296487129e-07, - "loss": 0.0036, - "step": 1301 - }, - { - "epoch": 7.987730061349693, - "grad_norm": 1.7804820537567139, - "learning_rate": 4.859878179752448e-07, - "loss": 0.0119, - "step": 1302 - }, - { - "epoch": 7.993865030674847, - "grad_norm": 2.735875129699707, - "learning_rate": 4.83136887764136e-07, - "loss": 0.0365, - "step": 1303 - }, - { - "epoch": 8.0, - "grad_norm": 1.316243290901184, - "learning_rate": 4.802934496057527e-07, - "loss": 0.0046, - "step": 1304 - }, - { - "epoch": 8.006134969325153, - "grad_norm": 2.192969560623169, - "learning_rate": 4.774575140626317e-07, - "loss": 0.0235, - "step": 1305 - }, - { - "epoch": 8.012269938650308, - "grad_norm": 0.9257994890213013, - "learning_rate": 4.746290916694368e-07, - "loss": 0.0029, - "step": 1306 - }, - { - "epoch": 8.01840490797546, - "grad_norm": 0.6933830380439758, - "learning_rate": 4.71808192932926e-07, - "loss": 0.0019, - "step": 1307 - }, - { - "epoch": 8.024539877300613, - "grad_norm": 0.4838462173938751, - "learning_rate": 4.6899482833190765e-07, - "loss": 0.0024, - "step": 1308 - }, - { - "epoch": 8.030674846625766, - "grad_norm": 1.1725589036941528, - "learning_rate": 4.661890083172019e-07, - "loss": 0.0166, - "step": 1309 - }, - { - "epoch": 8.036809815950921, - "grad_norm": 0.7732264399528503, - "learning_rate": 4.633907433116053e-07, - "loss": 0.0047, - "step": 1310 - }, - { - "epoch": 8.042944785276074, - "grad_norm": 0.6369810700416565, - "learning_rate": 4.6060004370984763e-07, - "loss": 0.0013, - "step": 1311 - }, - { - "epoch": 8.049079754601227, - "grad_norm": 0.6437183618545532, - "learning_rate": 4.5781691987855676e-07, - "loss": 0.0016, - "step": 1312 - }, - { - "epoch": 8.05521472392638, - "grad_norm": 0.40145647525787354, - "learning_rate": 4.5504138215621915e-07, - "loss": 0.0026, - "step": 1313 - }, - { - "epoch": 8.061349693251534, - "grad_norm": 1.1000946760177612, - "learning_rate": 4.5227344085313873e-07, - "loss": 0.002, - "step": 1314 - }, - { - "epoch": 8.067484662576687, - "grad_norm": 1.4580782651901245, - "learning_rate": 4.495131062514038e-07, - "loss": 0.0299, - "step": 1315 - }, - { - "epoch": 8.07361963190184, - "grad_norm": 0.9026187062263489, - "learning_rate": 4.467603886048452e-07, - "loss": 0.003, - "step": 1316 - }, - { - "epoch": 8.079754601226995, - "grad_norm": 1.2969629764556885, - "learning_rate": 4.440152981389972e-07, - "loss": 0.0129, - "step": 1317 - }, - { - "epoch": 8.085889570552148, - "grad_norm": 0.837665319442749, - "learning_rate": 4.412778450510641e-07, - "loss": 0.0086, - "step": 1318 - }, - { - "epoch": 8.0920245398773, - "grad_norm": 0.3426748216152191, - "learning_rate": 4.3854803950987736e-07, - "loss": 0.002, - "step": 1319 - }, - { - "epoch": 8.098159509202453, - "grad_norm": 0.8508721590042114, - "learning_rate": 4.358258916558611e-07, - "loss": 0.0016, - "step": 1320 - }, - { - "epoch": 8.104294478527608, - "grad_norm": 1.2476134300231934, - "learning_rate": 4.331114116009938e-07, - "loss": 0.0156, - "step": 1321 - }, - { - "epoch": 8.110429447852761, - "grad_norm": 1.036689281463623, - "learning_rate": 4.3040460942876896e-07, - "loss": 0.0021, - "step": 1322 - }, - { - "epoch": 8.116564417177914, - "grad_norm": 0.7747099995613098, - "learning_rate": 4.277054951941609e-07, - "loss": 0.0021, - "step": 1323 - }, - { - "epoch": 8.122699386503067, - "grad_norm": 1.2793506383895874, - "learning_rate": 4.250140789235829e-07, - "loss": 0.0036, - "step": 1324 - }, - { - "epoch": 8.128834355828221, - "grad_norm": 1.5389785766601562, - "learning_rate": 4.223303706148549e-07, - "loss": 0.0031, - "step": 1325 - }, - { - "epoch": 8.134969325153374, - "grad_norm": 1.549869179725647, - "learning_rate": 4.196543802371641e-07, - "loss": 0.0102, - "step": 1326 - }, - { - "epoch": 8.141104294478527, - "grad_norm": 0.862311065196991, - "learning_rate": 4.1698611773102525e-07, - "loss": 0.0023, - "step": 1327 - }, - { - "epoch": 8.14723926380368, - "grad_norm": 1.0216046571731567, - "learning_rate": 4.14325593008249e-07, - "loss": 0.0074, - "step": 1328 - }, - { - "epoch": 8.153374233128835, - "grad_norm": 0.8307499289512634, - "learning_rate": 4.1167281595190206e-07, - "loss": 0.0017, - "step": 1329 - }, - { - "epoch": 8.159509202453988, - "grad_norm": 0.5344944596290588, - "learning_rate": 4.090277964162692e-07, - "loss": 0.0013, - "step": 1330 - }, - { - "epoch": 8.16564417177914, - "grad_norm": 0.8608856201171875, - "learning_rate": 4.063905442268201e-07, - "loss": 0.0014, - "step": 1331 - }, - { - "epoch": 8.171779141104295, - "grad_norm": 0.33019620180130005, - "learning_rate": 4.037610691801694e-07, - "loss": 0.0009, - "step": 1332 - }, - { - "epoch": 8.177914110429448, - "grad_norm": 0.6515982747077942, - "learning_rate": 4.011393810440431e-07, - "loss": 0.0022, - "step": 1333 - }, - { - "epoch": 8.184049079754601, - "grad_norm": 0.9144461750984192, - "learning_rate": 3.985254895572413e-07, - "loss": 0.0024, - "step": 1334 - }, - { - "epoch": 8.190184049079754, - "grad_norm": 0.4078105390071869, - "learning_rate": 3.959194044296011e-07, - "loss": 0.0011, - "step": 1335 - }, - { - "epoch": 8.196319018404909, - "grad_norm": 0.7559608817100525, - "learning_rate": 3.9332113534196194e-07, - "loss": 0.0028, - "step": 1336 - }, - { - "epoch": 8.202453987730062, - "grad_norm": 1.3025604486465454, - "learning_rate": 3.907306919461279e-07, - "loss": 0.0228, - "step": 1337 - }, - { - "epoch": 8.208588957055214, - "grad_norm": 0.6984004974365234, - "learning_rate": 3.8814808386483385e-07, - "loss": 0.0027, - "step": 1338 - }, - { - "epoch": 8.214723926380367, - "grad_norm": 1.161498785018921, - "learning_rate": 3.855733206917095e-07, - "loss": 0.0037, - "step": 1339 - }, - { - "epoch": 8.220858895705522, - "grad_norm": 0.5357164740562439, - "learning_rate": 3.8300641199124024e-07, - "loss": 0.0011, - "step": 1340 - }, - { - "epoch": 8.226993865030675, - "grad_norm": 0.8089649677276611, - "learning_rate": 3.80447367298738e-07, - "loss": 0.0008, - "step": 1341 - }, - { - "epoch": 8.233128834355828, - "grad_norm": 0.4289240539073944, - "learning_rate": 3.77896196120299e-07, - "loss": 0.0012, - "step": 1342 - }, - { - "epoch": 8.239263803680982, - "grad_norm": 0.8666973114013672, - "learning_rate": 3.7535290793277364e-07, - "loss": 0.0047, - "step": 1343 - }, - { - "epoch": 8.245398773006135, - "grad_norm": 0.6841573715209961, - "learning_rate": 3.7281751218372965e-07, - "loss": 0.0007, - "step": 1344 - }, - { - "epoch": 8.251533742331288, - "grad_norm": 0.5588045716285706, - "learning_rate": 3.7029001829141457e-07, - "loss": 0.0018, - "step": 1345 - }, - { - "epoch": 8.257668711656441, - "grad_norm": 1.7257133722305298, - "learning_rate": 3.677704356447254e-07, - "loss": 0.0213, - "step": 1346 - }, - { - "epoch": 8.263803680981596, - "grad_norm": 0.2352600246667862, - "learning_rate": 3.6525877360316875e-07, - "loss": 0.0009, - "step": 1347 - }, - { - "epoch": 8.269938650306749, - "grad_norm": 0.9622183442115784, - "learning_rate": 3.627550414968303e-07, - "loss": 0.0132, - "step": 1348 - }, - { - "epoch": 8.276073619631902, - "grad_norm": 0.5367354154586792, - "learning_rate": 3.6025924862633814e-07, - "loss": 0.0006, - "step": 1349 - }, - { - "epoch": 8.282208588957054, - "grad_norm": 1.5134315490722656, - "learning_rate": 3.577714042628272e-07, - "loss": 0.01, - "step": 1350 - }, - { - "epoch": 8.28834355828221, - "grad_norm": 1.5052622556686401, - "learning_rate": 3.5529151764790715e-07, - "loss": 0.0031, - "step": 1351 - }, - { - "epoch": 8.294478527607362, - "grad_norm": 0.8776562809944153, - "learning_rate": 3.5281959799362775e-07, - "loss": 0.0053, - "step": 1352 - }, - { - "epoch": 8.300613496932515, - "grad_norm": 0.7919799089431763, - "learning_rate": 3.503556544824413e-07, - "loss": 0.0021, - "step": 1353 - }, - { - "epoch": 8.30674846625767, - "grad_norm": 0.7141364216804504, - "learning_rate": 3.4789969626717377e-07, - "loss": 0.0019, - "step": 1354 - }, - { - "epoch": 8.312883435582823, - "grad_norm": 1.7783756256103516, - "learning_rate": 3.454517324709858e-07, - "loss": 0.0019, - "step": 1355 - }, - { - "epoch": 8.319018404907975, - "grad_norm": 0.9534929394721985, - "learning_rate": 3.43011772187343e-07, - "loss": 0.0011, - "step": 1356 - }, - { - "epoch": 8.325153374233128, - "grad_norm": 0.4383384585380554, - "learning_rate": 3.405798244799799e-07, - "loss": 0.0006, - "step": 1357 - }, - { - "epoch": 8.331288343558283, - "grad_norm": 0.8582566976547241, - "learning_rate": 3.3815589838286535e-07, - "loss": 0.002, - "step": 1358 - }, - { - "epoch": 8.337423312883436, - "grad_norm": 0.8288223743438721, - "learning_rate": 3.3574000290017174e-07, - "loss": 0.002, - "step": 1359 - }, - { - "epoch": 8.343558282208589, - "grad_norm": 1.2074549198150635, - "learning_rate": 3.3333214700623976e-07, - "loss": 0.0153, - "step": 1360 - }, - { - "epoch": 8.349693251533742, - "grad_norm": 0.5359098315238953, - "learning_rate": 3.3093233964554464e-07, - "loss": 0.0014, - "step": 1361 - }, - { - "epoch": 8.355828220858896, - "grad_norm": 1.6650397777557373, - "learning_rate": 3.2854058973266547e-07, - "loss": 0.0107, - "step": 1362 - }, - { - "epoch": 8.36196319018405, - "grad_norm": 1.1784273386001587, - "learning_rate": 3.261569061522474e-07, - "loss": 0.0197, - "step": 1363 - }, - { - "epoch": 8.368098159509202, - "grad_norm": 0.6566861271858215, - "learning_rate": 3.237812977589738e-07, - "loss": 0.0009, - "step": 1364 - }, - { - "epoch": 8.374233128834355, - "grad_norm": 0.9043551683425903, - "learning_rate": 3.2141377337753105e-07, - "loss": 0.0026, - "step": 1365 - }, - { - "epoch": 8.38036809815951, - "grad_norm": 2.205872058868408, - "learning_rate": 3.190543418025749e-07, - "loss": 0.0533, - "step": 1366 - }, - { - "epoch": 8.386503067484663, - "grad_norm": 0.2918683886528015, - "learning_rate": 3.167030117986994e-07, - "loss": 0.0007, - "step": 1367 - }, - { - "epoch": 8.392638036809815, - "grad_norm": 0.5370535850524902, - "learning_rate": 3.143597921004027e-07, - "loss": 0.001, - "step": 1368 - }, - { - "epoch": 8.39877300613497, - "grad_norm": 1.353083610534668, - "learning_rate": 3.120246914120564e-07, - "loss": 0.002, - "step": 1369 - }, - { - "epoch": 8.404907975460123, - "grad_norm": 0.644607424736023, - "learning_rate": 3.096977184078731e-07, - "loss": 0.0025, - "step": 1370 - }, - { - "epoch": 8.411042944785276, - "grad_norm": 0.7351365089416504, - "learning_rate": 3.0737888173187067e-07, - "loss": 0.0014, - "step": 1371 - }, - { - "epoch": 8.417177914110429, - "grad_norm": 1.161787748336792, - "learning_rate": 3.050681899978464e-07, - "loss": 0.0149, - "step": 1372 - }, - { - "epoch": 8.423312883435583, - "grad_norm": 1.7568200826644897, - "learning_rate": 3.0276565178933847e-07, - "loss": 0.0178, - "step": 1373 - }, - { - "epoch": 8.429447852760736, - "grad_norm": 0.73989337682724, - "learning_rate": 3.004712756595993e-07, - "loss": 0.0053, - "step": 1374 - }, - { - "epoch": 8.43558282208589, - "grad_norm": 1.8425425291061401, - "learning_rate": 2.9818507013156085e-07, - "loss": 0.0013, - "step": 1375 - }, - { - "epoch": 8.441717791411042, - "grad_norm": 0.6374561786651611, - "learning_rate": 2.9590704369780313e-07, - "loss": 0.0039, - "step": 1376 - }, - { - "epoch": 8.447852760736197, - "grad_norm": 0.708151638507843, - "learning_rate": 2.9363720482052436e-07, - "loss": 0.0025, - "step": 1377 - }, - { - "epoch": 8.45398773006135, - "grad_norm": 1.2846306562423706, - "learning_rate": 2.91375561931507e-07, - "loss": 0.0033, - "step": 1378 - }, - { - "epoch": 8.460122699386503, - "grad_norm": 0.347720742225647, - "learning_rate": 2.89122123432089e-07, - "loss": 0.0006, - "step": 1379 - }, - { - "epoch": 8.466257668711656, - "grad_norm": 0.9626922607421875, - "learning_rate": 2.868768976931313e-07, - "loss": 0.001, - "step": 1380 - }, - { - "epoch": 8.47239263803681, - "grad_norm": 0.26909729838371277, - "learning_rate": 2.8463989305498596e-07, - "loss": 0.0008, - "step": 1381 - }, - { - "epoch": 8.478527607361963, - "grad_norm": 0.8750791549682617, - "learning_rate": 2.824111178274669e-07, - "loss": 0.0025, - "step": 1382 - }, - { - "epoch": 8.484662576687116, - "grad_norm": 1.1124992370605469, - "learning_rate": 2.801905802898183e-07, - "loss": 0.0031, - "step": 1383 - }, - { - "epoch": 8.49079754601227, - "grad_norm": 0.4871549904346466, - "learning_rate": 2.779782886906829e-07, - "loss": 0.0013, - "step": 1384 - }, - { - "epoch": 8.496932515337424, - "grad_norm": 0.5207282900810242, - "learning_rate": 2.7577425124807324e-07, - "loss": 0.0013, - "step": 1385 - }, - { - "epoch": 8.503067484662576, - "grad_norm": 1.8369935750961304, - "learning_rate": 2.7357847614933876e-07, - "loss": 0.0031, - "step": 1386 - }, - { - "epoch": 8.50920245398773, - "grad_norm": 0.6390517354011536, - "learning_rate": 2.713909715511384e-07, - "loss": 0.0045, - "step": 1387 - }, - { - "epoch": 8.515337423312884, - "grad_norm": 0.8618245124816895, - "learning_rate": 2.692117455794077e-07, - "loss": 0.0017, - "step": 1388 - }, - { - "epoch": 8.521472392638037, - "grad_norm": 0.8506134152412415, - "learning_rate": 2.6704080632932895e-07, - "loss": 0.0014, - "step": 1389 - }, - { - "epoch": 8.52760736196319, - "grad_norm": 0.42547252774238586, - "learning_rate": 2.6487816186530263e-07, - "loss": 0.002, - "step": 1390 - }, - { - "epoch": 8.533742331288344, - "grad_norm": 0.6425843834877014, - "learning_rate": 2.6272382022091704e-07, - "loss": 0.0028, - "step": 1391 - }, - { - "epoch": 8.539877300613497, - "grad_norm": 0.8287162780761719, - "learning_rate": 2.6057778939891614e-07, - "loss": 0.011, - "step": 1392 - }, - { - "epoch": 8.54601226993865, - "grad_norm": 1.0402963161468506, - "learning_rate": 2.584400773711737e-07, - "loss": 0.0037, - "step": 1393 - }, - { - "epoch": 8.552147239263803, - "grad_norm": 0.9785431623458862, - "learning_rate": 2.5631069207865926e-07, - "loss": 0.0023, - "step": 1394 - }, - { - "epoch": 8.558282208588958, - "grad_norm": 1.2661131620407104, - "learning_rate": 2.541896414314132e-07, - "loss": 0.0053, - "step": 1395 - }, - { - "epoch": 8.56441717791411, - "grad_norm": 0.2662440240383148, - "learning_rate": 2.520769333085141e-07, - "loss": 0.0008, - "step": 1396 - }, - { - "epoch": 8.570552147239264, - "grad_norm": 0.628510594367981, - "learning_rate": 2.4997257555805064e-07, - "loss": 0.001, - "step": 1397 - }, - { - "epoch": 8.576687116564417, - "grad_norm": 1.08578622341156, - "learning_rate": 2.4787657599709276e-07, - "loss": 0.0041, - "step": 1398 - }, - { - "epoch": 8.582822085889571, - "grad_norm": 0.8213603496551514, - "learning_rate": 2.4578894241166135e-07, - "loss": 0.0029, - "step": 1399 - }, - { - "epoch": 8.588957055214724, - "grad_norm": 0.5261257886886597, - "learning_rate": 2.4370968255670093e-07, - "loss": 0.001, - "step": 1400 - }, - { - "epoch": 8.595092024539877, - "grad_norm": 0.18139345943927765, - "learning_rate": 2.4163880415604913e-07, - "loss": 0.0005, - "step": 1401 - }, - { - "epoch": 8.60122699386503, - "grad_norm": 0.8317165970802307, - "learning_rate": 2.395763149024102e-07, - "loss": 0.0034, - "step": 1402 - }, - { - "epoch": 8.607361963190185, - "grad_norm": 1.272074580192566, - "learning_rate": 2.3752222245732454e-07, - "loss": 0.0036, - "step": 1403 - }, - { - "epoch": 8.613496932515337, - "grad_norm": 0.5556488633155823, - "learning_rate": 2.3547653445114032e-07, - "loss": 0.0013, - "step": 1404 - }, - { - "epoch": 8.61963190184049, - "grad_norm": 0.6546408534049988, - "learning_rate": 2.334392584829867e-07, - "loss": 0.0008, - "step": 1405 - }, - { - "epoch": 8.625766871165645, - "grad_norm": 2.021836996078491, - "learning_rate": 2.3141040212074445e-07, - "loss": 0.0198, - "step": 1406 - }, - { - "epoch": 8.631901840490798, - "grad_norm": 0.6017210483551025, - "learning_rate": 2.293899729010171e-07, - "loss": 0.0033, - "step": 1407 - }, - { - "epoch": 8.63803680981595, - "grad_norm": 0.315134733915329, - "learning_rate": 2.2737797832910498e-07, - "loss": 0.0007, - "step": 1408 - }, - { - "epoch": 8.644171779141104, - "grad_norm": 0.7090817093849182, - "learning_rate": 2.2537442587897474e-07, - "loss": 0.0045, - "step": 1409 - }, - { - "epoch": 8.650306748466258, - "grad_norm": 0.26951614022254944, - "learning_rate": 2.2337932299323434e-07, - "loss": 0.001, - "step": 1410 - }, - { - "epoch": 8.656441717791411, - "grad_norm": 0.21670447289943695, - "learning_rate": 2.2139267708310457e-07, - "loss": 0.0005, - "step": 1411 - }, - { - "epoch": 8.662576687116564, - "grad_norm": 1.070379376411438, - "learning_rate": 2.194144955283886e-07, - "loss": 0.0022, - "step": 1412 - }, - { - "epoch": 8.668711656441717, - "grad_norm": 0.7644438147544861, - "learning_rate": 2.1744478567744947e-07, - "loss": 0.0023, - "step": 1413 - }, - { - "epoch": 8.674846625766872, - "grad_norm": 1.053305983543396, - "learning_rate": 2.154835548471798e-07, - "loss": 0.0027, - "step": 1414 - }, - { - "epoch": 8.680981595092025, - "grad_norm": 0.5719135403633118, - "learning_rate": 2.1353081032297356e-07, - "loss": 0.0015, - "step": 1415 - }, - { - "epoch": 8.687116564417177, - "grad_norm": 0.3360785245895386, - "learning_rate": 2.1158655935870325e-07, - "loss": 0.0025, - "step": 1416 - }, - { - "epoch": 8.69325153374233, - "grad_norm": 0.867242693901062, - "learning_rate": 2.0965080917668744e-07, - "loss": 0.002, - "step": 1417 - }, - { - "epoch": 8.699386503067485, - "grad_norm": 1.1389360427856445, - "learning_rate": 2.077235669676689e-07, - "loss": 0.0023, - "step": 1418 - }, - { - "epoch": 8.705521472392638, - "grad_norm": 0.31157732009887695, - "learning_rate": 2.0580483989078525e-07, - "loss": 0.0005, - "step": 1419 - }, - { - "epoch": 8.71165644171779, - "grad_norm": 1.328353762626648, - "learning_rate": 2.0389463507354211e-07, - "loss": 0.0122, - "step": 1420 - }, - { - "epoch": 8.717791411042946, - "grad_norm": 0.13456307351589203, - "learning_rate": 2.0199295961178893e-07, - "loss": 0.0005, - "step": 1421 - }, - { - "epoch": 8.723926380368098, - "grad_norm": 0.7963683605194092, - "learning_rate": 2.000998205696894e-07, - "loss": 0.004, - "step": 1422 - }, - { - "epoch": 8.730061349693251, - "grad_norm": 0.1814875602722168, - "learning_rate": 1.9821522497969813e-07, - "loss": 0.0004, - "step": 1423 - }, - { - "epoch": 8.736196319018404, - "grad_norm": 0.4806751012802124, - "learning_rate": 1.9633917984253294e-07, - "loss": 0.001, - "step": 1424 - }, - { - "epoch": 8.742331288343559, - "grad_norm": 0.6554126143455505, - "learning_rate": 1.944716921271489e-07, - "loss": 0.0019, - "step": 1425 - }, - { - "epoch": 8.748466257668712, - "grad_norm": 0.7839532494544983, - "learning_rate": 1.9261276877071354e-07, - "loss": 0.0055, - "step": 1426 - }, - { - "epoch": 8.754601226993865, - "grad_norm": 1.1153522729873657, - "learning_rate": 1.9076241667857988e-07, - "loss": 0.0048, - "step": 1427 - }, - { - "epoch": 8.76073619631902, - "grad_norm": 1.4735853672027588, - "learning_rate": 1.8892064272426042e-07, - "loss": 0.0079, - "step": 1428 - }, - { - "epoch": 8.766871165644172, - "grad_norm": 0.9770727157592773, - "learning_rate": 1.8708745374940469e-07, - "loss": 0.0013, - "step": 1429 - }, - { - "epoch": 8.773006134969325, - "grad_norm": 1.5710560083389282, - "learning_rate": 1.8526285656376873e-07, - "loss": 0.0046, - "step": 1430 - }, - { - "epoch": 8.779141104294478, - "grad_norm": 0.9026464819908142, - "learning_rate": 1.8344685794519507e-07, - "loss": 0.006, - "step": 1431 - }, - { - "epoch": 8.785276073619633, - "grad_norm": 1.2195831537246704, - "learning_rate": 1.8163946463958276e-07, - "loss": 0.0094, - "step": 1432 - }, - { - "epoch": 8.791411042944786, - "grad_norm": 0.31636637449264526, - "learning_rate": 1.7984068336086652e-07, - "loss": 0.0009, - "step": 1433 - }, - { - "epoch": 8.797546012269938, - "grad_norm": 0.5591960549354553, - "learning_rate": 1.780505207909894e-07, - "loss": 0.0014, - "step": 1434 - }, - { - "epoch": 8.803680981595091, - "grad_norm": 0.5905728340148926, - "learning_rate": 1.7626898357987782e-07, - "loss": 0.0013, - "step": 1435 - }, - { - "epoch": 8.809815950920246, - "grad_norm": 1.0983483791351318, - "learning_rate": 1.744960783454186e-07, - "loss": 0.0024, - "step": 1436 - }, - { - "epoch": 8.815950920245399, - "grad_norm": 0.7398350238800049, - "learning_rate": 1.727318116734328e-07, - "loss": 0.0015, - "step": 1437 - }, - { - "epoch": 8.822085889570552, - "grad_norm": 0.4621620774269104, - "learning_rate": 1.7097619011765127e-07, - "loss": 0.0017, - "step": 1438 - }, - { - "epoch": 8.828220858895705, - "grad_norm": 0.8077200055122375, - "learning_rate": 1.6922922019969145e-07, - "loss": 0.0009, - "step": 1439 - }, - { - "epoch": 8.83435582822086, - "grad_norm": 0.7134829163551331, - "learning_rate": 1.6749090840903233e-07, - "loss": 0.0013, - "step": 1440 - }, - { - "epoch": 8.840490797546012, - "grad_norm": 1.2837457656860352, - "learning_rate": 1.6576126120299046e-07, - "loss": 0.0029, - "step": 1441 - }, - { - "epoch": 8.846625766871165, - "grad_norm": 0.8713163137435913, - "learning_rate": 1.6404028500669633e-07, - "loss": 0.0034, - "step": 1442 - }, - { - "epoch": 8.85276073619632, - "grad_norm": 0.5622571706771851, - "learning_rate": 1.6232798621306918e-07, - "loss": 0.0022, - "step": 1443 - }, - { - "epoch": 8.858895705521473, - "grad_norm": 2.460902214050293, - "learning_rate": 1.606243711827951e-07, - "loss": 0.0329, - "step": 1444 - }, - { - "epoch": 8.865030674846626, - "grad_norm": 1.5952033996582031, - "learning_rate": 1.5892944624430334e-07, - "loss": 0.0092, - "step": 1445 - }, - { - "epoch": 8.871165644171779, - "grad_norm": 0.16087445616722107, - "learning_rate": 1.5724321769374023e-07, - "loss": 0.0005, - "step": 1446 - }, - { - "epoch": 8.877300613496933, - "grad_norm": 0.33085283637046814, - "learning_rate": 1.5556569179494857e-07, - "loss": 0.0005, - "step": 1447 - }, - { - "epoch": 8.883435582822086, - "grad_norm": 0.15866753458976746, - "learning_rate": 1.538968747794431e-07, - "loss": 0.0004, - "step": 1448 - }, - { - "epoch": 8.889570552147239, - "grad_norm": 1.0744353532791138, - "learning_rate": 1.5223677284638805e-07, - "loss": 0.0046, - "step": 1449 - }, - { - "epoch": 8.895705521472392, - "grad_norm": 0.8372928500175476, - "learning_rate": 1.5058539216257356e-07, - "loss": 0.0048, - "step": 1450 - }, - { - "epoch": 8.901840490797547, - "grad_norm": 1.0015332698822021, - "learning_rate": 1.4894273886239208e-07, - "loss": 0.0027, - "step": 1451 - }, - { - "epoch": 8.9079754601227, - "grad_norm": 1.1478570699691772, - "learning_rate": 1.473088190478178e-07, - "loss": 0.0134, - "step": 1452 - }, - { - "epoch": 8.914110429447852, - "grad_norm": 0.8685131669044495, - "learning_rate": 1.4568363878838087e-07, - "loss": 0.0024, - "step": 1453 - }, - { - "epoch": 8.920245398773005, - "grad_norm": 0.46051493287086487, - "learning_rate": 1.4406720412114828e-07, - "loss": 0.0019, - "step": 1454 - }, - { - "epoch": 8.92638036809816, - "grad_norm": 0.75945645570755, - "learning_rate": 1.4245952105069905e-07, - "loss": 0.0015, - "step": 1455 - }, - { - "epoch": 8.932515337423313, - "grad_norm": 1.2880934476852417, - "learning_rate": 1.4086059554910186e-07, - "loss": 0.0045, - "step": 1456 - }, - { - "epoch": 8.938650306748466, - "grad_norm": 0.2242523580789566, - "learning_rate": 1.3927043355589476e-07, - "loss": 0.0011, - "step": 1457 - }, - { - "epoch": 8.94478527607362, - "grad_norm": 1.0341970920562744, - "learning_rate": 1.3768904097806153e-07, - "loss": 0.0019, - "step": 1458 - }, - { - "epoch": 8.950920245398773, - "grad_norm": 0.8955618739128113, - "learning_rate": 1.361164236900092e-07, - "loss": 0.0027, - "step": 1459 - }, - { - "epoch": 8.957055214723926, - "grad_norm": 1.3581833839416504, - "learning_rate": 1.3455258753354932e-07, - "loss": 0.0048, - "step": 1460 - }, - { - "epoch": 8.963190184049079, - "grad_norm": 1.5094419717788696, - "learning_rate": 1.3299753831787193e-07, - "loss": 0.0011, - "step": 1461 - }, - { - "epoch": 8.969325153374234, - "grad_norm": 0.5978104472160339, - "learning_rate": 1.3145128181952737e-07, - "loss": 0.0018, - "step": 1462 - }, - { - "epoch": 8.975460122699387, - "grad_norm": 0.7072922587394714, - "learning_rate": 1.2991382378240325e-07, - "loss": 0.0032, - "step": 1463 - }, - { - "epoch": 8.98159509202454, - "grad_norm": 0.5541467666625977, - "learning_rate": 1.2838516991770355e-07, - "loss": 0.001, - "step": 1464 - }, - { - "epoch": 8.987730061349692, - "grad_norm": 0.6946907043457031, - "learning_rate": 1.2686532590392763e-07, - "loss": 0.0024, - "step": 1465 - }, - { - "epoch": 8.993865030674847, - "grad_norm": 0.3228455185890198, - "learning_rate": 1.2535429738684822e-07, - "loss": 0.0007, - "step": 1466 - }, - { - "epoch": 9.0, - "grad_norm": 2.4403252601623535, - "learning_rate": 1.238520899794915e-07, - "loss": 0.0245, - "step": 1467 - }, - { - "epoch": 9.006134969325153, - "grad_norm": 2.5279674530029297, - "learning_rate": 1.223587092621162e-07, - "loss": 0.0006, - "step": 1468 - }, - { - "epoch": 9.012269938650308, - "grad_norm": 0.08804622292518616, - "learning_rate": 1.2087416078219144e-07, - "loss": 0.0005, - "step": 1469 - }, - { - "epoch": 9.01840490797546, - "grad_norm": 0.11985688656568527, - "learning_rate": 1.1939845005437823e-07, - "loss": 0.0006, - "step": 1470 - }, - { - "epoch": 9.024539877300613, - "grad_norm": 0.08172235637903214, - "learning_rate": 1.1793158256050708e-07, - "loss": 0.0004, - "step": 1471 - }, - { - "epoch": 9.030674846625766, - "grad_norm": 0.14893503487110138, - "learning_rate": 1.1647356374955926e-07, - "loss": 0.0005, - "step": 1472 - }, - { - "epoch": 9.036809815950921, - "grad_norm": 0.1922188401222229, - "learning_rate": 1.1502439903764539e-07, - "loss": 0.0012, - "step": 1473 - }, - { - "epoch": 9.042944785276074, - "grad_norm": 0.2091587781906128, - "learning_rate": 1.1358409380798547e-07, - "loss": 0.0004, - "step": 1474 - }, - { - "epoch": 9.049079754601227, - "grad_norm": 0.3777543008327484, - "learning_rate": 1.1215265341089021e-07, - "loss": 0.0031, - "step": 1475 - }, - { - "epoch": 9.05521472392638, - "grad_norm": 0.12114719301462173, - "learning_rate": 1.1073008316373812e-07, - "loss": 0.0004, - "step": 1476 - }, - { - "epoch": 9.061349693251534, - "grad_norm": 0.7613732218742371, - "learning_rate": 1.093163883509596e-07, - "loss": 0.0056, - "step": 1477 - }, - { - "epoch": 9.067484662576687, - "grad_norm": 0.11271879076957703, - "learning_rate": 1.0791157422401499e-07, - "loss": 0.0006, - "step": 1478 - }, - { - "epoch": 9.07361963190184, - "grad_norm": 0.5275444984436035, - "learning_rate": 1.0651564600137443e-07, - "loss": 0.0013, - "step": 1479 - }, - { - "epoch": 9.079754601226995, - "grad_norm": 0.0763268992304802, - "learning_rate": 1.051286088685008e-07, - "loss": 0.0004, - "step": 1480 - }, - { - "epoch": 9.085889570552148, - "grad_norm": 0.5255539417266846, - "learning_rate": 1.0375046797782868e-07, - "loss": 0.0047, - "step": 1481 - }, - { - "epoch": 9.0920245398773, - "grad_norm": 0.0961274728178978, - "learning_rate": 1.0238122844874576e-07, - "loss": 0.0004, - "step": 1482 - }, - { - "epoch": 9.098159509202453, - "grad_norm": 0.18914999067783356, - "learning_rate": 1.0102089536757398e-07, - "loss": 0.0011, - "step": 1483 - }, - { - "epoch": 9.104294478527608, - "grad_norm": 0.14239318668842316, - "learning_rate": 9.966947378754949e-08, - "loss": 0.0011, - "step": 1484 - }, - { - "epoch": 9.110429447852761, - "grad_norm": 0.12115265429019928, - "learning_rate": 9.83269687288066e-08, - "loss": 0.0007, - "step": 1485 - }, - { - "epoch": 9.116564417177914, - "grad_norm": 0.12038591504096985, - "learning_rate": 9.699338517835611e-08, - "loss": 0.0005, - "step": 1486 - }, - { - "epoch": 9.122699386503067, - "grad_norm": 0.07863178849220276, - "learning_rate": 9.566872809006783e-08, - "loss": 0.0004, - "step": 1487 - }, - { - "epoch": 9.128834355828221, - "grad_norm": 0.19755667448043823, - "learning_rate": 9.435300238465339e-08, - "loss": 0.0007, - "step": 1488 - }, - { - "epoch": 9.134969325153374, - "grad_norm": 0.08695468306541443, - "learning_rate": 9.30462129496465e-08, - "loss": 0.0003, - "step": 1489 - }, - { - "epoch": 9.141104294478527, - "grad_norm": 0.22066617012023926, - "learning_rate": 9.174836463938464e-08, - "loss": 0.0011, - "step": 1490 - }, - { - "epoch": 9.14723926380368, - "grad_norm": 0.15969769656658173, - "learning_rate": 9.045946227499298e-08, - "loss": 0.0012, - "step": 1491 - }, - { - "epoch": 9.153374233128835, - "grad_norm": 0.31097984313964844, - "learning_rate": 8.917951064436382e-08, - "loss": 0.0015, - "step": 1492 - }, - { - "epoch": 9.159509202453988, - "grad_norm": 0.15080022811889648, - "learning_rate": 8.790851450214106e-08, - "loss": 0.0009, - "step": 1493 - }, - { - "epoch": 9.16564417177914, - "grad_norm": 0.11880502849817276, - "learning_rate": 8.664647856970076e-08, - "loss": 0.0007, - "step": 1494 - }, - { - "epoch": 9.171779141104295, - "grad_norm": 0.6681945323944092, - "learning_rate": 8.539340753513508e-08, - "loss": 0.0046, - "step": 1495 - }, - { - "epoch": 9.177914110429448, - "grad_norm": 1.5142796039581299, - "learning_rate": 8.414930605323445e-08, - "loss": 0.0442, - "step": 1496 - }, - { - "epoch": 9.184049079754601, - "grad_norm": 0.36349135637283325, - "learning_rate": 8.291417874546875e-08, - "loss": 0.0019, - "step": 1497 - }, - { - "epoch": 9.190184049079754, - "grad_norm": 0.5278675556182861, - "learning_rate": 8.168803019997312e-08, - "loss": 0.0009, - "step": 1498 - }, - { - "epoch": 9.196319018404909, - "grad_norm": 0.08181502670049667, - "learning_rate": 8.047086497152801e-08, - "loss": 0.0004, - "step": 1499 - }, - { - "epoch": 9.202453987730062, - "grad_norm": 0.22418726980686188, - "learning_rate": 7.926268758154416e-08, - "loss": 0.0014, - "step": 1500 - }, - { - "epoch": 9.208588957055214, - "grad_norm": 0.27877968549728394, - "learning_rate": 7.806350251804484e-08, - "loss": 0.001, - "step": 1501 - }, - { - "epoch": 9.214723926380367, - "grad_norm": 0.3604774475097656, - "learning_rate": 7.687331423564925e-08, - "loss": 0.0006, - "step": 1502 - }, - { - "epoch": 9.220858895705522, - "grad_norm": 0.09796755015850067, - "learning_rate": 7.569212715555663e-08, - "loss": 0.0005, - "step": 1503 - }, - { - "epoch": 9.226993865030675, - "grad_norm": 0.12454013526439667, - "learning_rate": 7.451994566552989e-08, - "loss": 0.0006, - "step": 1504 - }, - { - "epoch": 9.233128834355828, - "grad_norm": 0.13127478957176208, - "learning_rate": 7.335677411987734e-08, - "loss": 0.0006, - "step": 1505 - }, - { - "epoch": 9.239263803680982, - "grad_norm": 0.68902587890625, - "learning_rate": 7.220261683943935e-08, - "loss": 0.0037, - "step": 1506 - }, - { - "epoch": 9.245398773006135, - "grad_norm": 0.3021928071975708, - "learning_rate": 7.105747811156999e-08, - "loss": 0.001, - "step": 1507 - }, - { - "epoch": 9.251533742331288, - "grad_norm": 0.16254237294197083, - "learning_rate": 6.992136219012263e-08, - "loss": 0.0008, - "step": 1508 - }, - { - "epoch": 9.257668711656441, - "grad_norm": 0.22068247199058533, - "learning_rate": 6.879427329543414e-08, - "loss": 0.001, - "step": 1509 - }, - { - "epoch": 9.263803680981596, - "grad_norm": 0.20256245136260986, - "learning_rate": 6.76762156143071e-08, - "loss": 0.0014, - "step": 1510 - }, - { - "epoch": 9.269938650306749, - "grad_norm": 0.06691748648881912, - "learning_rate": 6.6567193299997e-08, - "loss": 0.0003, - "step": 1511 - }, - { - "epoch": 9.276073619631902, - "grad_norm": 0.12188060581684113, - "learning_rate": 6.546721047219568e-08, - "loss": 0.0003, - "step": 1512 - }, - { - "epoch": 9.282208588957054, - "grad_norm": 0.11017973721027374, - "learning_rate": 6.437627121701456e-08, - "loss": 0.0007, - "step": 1513 - }, - { - "epoch": 9.28834355828221, - "grad_norm": 0.08906184136867523, - "learning_rate": 6.329437958697282e-08, - "loss": 0.0005, - "step": 1514 - }, - { - "epoch": 9.294478527607362, - "grad_norm": 0.10575949400663376, - "learning_rate": 6.222153960097871e-08, - "loss": 0.0004, - "step": 1515 - }, - { - "epoch": 9.300613496932515, - "grad_norm": 0.07783909887075424, - "learning_rate": 6.115775524431711e-08, - "loss": 0.0004, - "step": 1516 - }, - { - "epoch": 9.30674846625767, - "grad_norm": 0.22752316296100616, - "learning_rate": 6.010303046863397e-08, - "loss": 0.0008, - "step": 1517 - }, - { - "epoch": 9.312883435582823, - "grad_norm": 0.4781089425086975, - "learning_rate": 5.905736919192107e-08, - "loss": 0.0044, - "step": 1518 - }, - { - "epoch": 9.319018404907975, - "grad_norm": 1.2014552354812622, - "learning_rate": 5.8020775298502085e-08, - "loss": 0.0016, - "step": 1519 - }, - { - "epoch": 9.325153374233128, - "grad_norm": 0.11146771907806396, - "learning_rate": 5.699325263901878e-08, - "loss": 0.0004, - "step": 1520 - }, - { - "epoch": 9.331288343558283, - "grad_norm": 0.21041418612003326, - "learning_rate": 5.597480503041486e-08, - "loss": 0.0016, - "step": 1521 - }, - { - "epoch": 9.337423312883436, - "grad_norm": 0.1907602846622467, - "learning_rate": 5.496543625592321e-08, - "loss": 0.0006, - "step": 1522 - }, - { - "epoch": 9.343558282208589, - "grad_norm": 0.7976323962211609, - "learning_rate": 5.396515006505204e-08, - "loss": 0.001, - "step": 1523 - }, - { - "epoch": 9.349693251533742, - "grad_norm": 0.10006821155548096, - "learning_rate": 5.297395017357015e-08, - "loss": 0.0004, - "step": 1524 - }, - { - "epoch": 9.355828220858896, - "grad_norm": 0.09137666970491409, - "learning_rate": 5.199184026349308e-08, - "loss": 0.0005, - "step": 1525 - }, - { - "epoch": 9.36196319018405, - "grad_norm": 0.5621616244316101, - "learning_rate": 5.1018823983070285e-08, - "loss": 0.0014, - "step": 1526 - }, - { - "epoch": 9.368098159509202, - "grad_norm": 0.12934303283691406, - "learning_rate": 5.005490494677051e-08, - "loss": 0.0009, - "step": 1527 - }, - { - "epoch": 9.374233128834355, - "grad_norm": 0.13988590240478516, - "learning_rate": 4.91000867352695e-08, - "loss": 0.0006, - "step": 1528 - }, - { - "epoch": 9.38036809815951, - "grad_norm": 0.19421879947185516, - "learning_rate": 4.815437289543562e-08, - "loss": 0.0006, - "step": 1529 - }, - { - "epoch": 9.386503067484663, - "grad_norm": 0.278499960899353, - "learning_rate": 4.7217766940317326e-08, - "loss": 0.0018, - "step": 1530 - }, - { - "epoch": 9.392638036809815, - "grad_norm": 0.12389005720615387, - "learning_rate": 4.629027234912986e-08, - "loss": 0.0007, - "step": 1531 - }, - { - "epoch": 9.39877300613497, - "grad_norm": 0.1303948014974594, - "learning_rate": 4.5371892567243336e-08, - "loss": 0.0004, - "step": 1532 - }, - { - "epoch": 9.404907975460123, - "grad_norm": 1.117344856262207, - "learning_rate": 4.4462631006167714e-08, - "loss": 0.0169, - "step": 1533 - }, - { - "epoch": 9.411042944785276, - "grad_norm": 0.1710042506456375, - "learning_rate": 4.356249104354199e-08, - "loss": 0.0005, - "step": 1534 - }, - { - "epoch": 9.417177914110429, - "grad_norm": 0.37792330980300903, - "learning_rate": 4.267147602312116e-08, - "loss": 0.0018, - "step": 1535 - }, - { - "epoch": 9.423312883435583, - "grad_norm": 0.42278361320495605, - "learning_rate": 4.178958925476401e-08, - "loss": 0.0028, - "step": 1536 - }, - { - "epoch": 9.429447852760736, - "grad_norm": 0.9310070872306824, - "learning_rate": 4.0916834014420036e-08, - "loss": 0.0124, - "step": 1537 - }, - { - "epoch": 9.43558282208589, - "grad_norm": 0.9287325739860535, - "learning_rate": 4.0053213544118116e-08, - "loss": 0.0131, - "step": 1538 - }, - { - "epoch": 9.441717791411042, - "grad_norm": 0.2695760130882263, - "learning_rate": 3.919873105195371e-08, - "loss": 0.0014, - "step": 1539 - }, - { - "epoch": 9.447852760736197, - "grad_norm": 0.2679222524166107, - "learning_rate": 3.8353389712078583e-08, - "loss": 0.0012, - "step": 1540 - }, - { - "epoch": 9.45398773006135, - "grad_norm": 0.7153877019882202, - "learning_rate": 3.7517192664685844e-08, - "loss": 0.0102, - "step": 1541 - }, - { - "epoch": 9.460122699386503, - "grad_norm": 0.19710485637187958, - "learning_rate": 3.6690143016002155e-08, - "loss": 0.0006, - "step": 1542 - }, - { - "epoch": 9.466257668711656, - "grad_norm": 0.4529936611652374, - "learning_rate": 3.587224383827331e-08, - "loss": 0.0035, - "step": 1543 - }, - { - "epoch": 9.47239263803681, - "grad_norm": 0.22579027712345123, - "learning_rate": 3.506349816975368e-08, - "loss": 0.0015, - "step": 1544 - }, - { - "epoch": 9.478527607361963, - "grad_norm": 0.08603110164403915, - "learning_rate": 3.426390901469595e-08, - "loss": 0.0004, - "step": 1545 - }, - { - "epoch": 9.484662576687116, - "grad_norm": 0.19130398333072662, - "learning_rate": 3.347347934333778e-08, - "loss": 0.0015, - "step": 1546 - }, - { - "epoch": 9.49079754601227, - "grad_norm": 0.8941642045974731, - "learning_rate": 3.2692212091893215e-08, - "loss": 0.012, - "step": 1547 - }, - { - "epoch": 9.496932515337424, - "grad_norm": 0.09985413402318954, - "learning_rate": 3.192011016253965e-08, - "loss": 0.0003, - "step": 1548 - }, - { - "epoch": 9.503067484662576, - "grad_norm": 0.15109865367412567, - "learning_rate": 3.115717642340893e-08, - "loss": 0.0004, - "step": 1549 - }, - { - "epoch": 9.50920245398773, - "grad_norm": 0.15993481874465942, - "learning_rate": 3.040341370857486e-08, - "loss": 0.0008, - "step": 1550 - }, - { - "epoch": 9.515337423312884, - "grad_norm": 0.13720917701721191, - "learning_rate": 2.9658824818044328e-08, - "loss": 0.0005, - "step": 1551 - }, - { - "epoch": 9.521472392638037, - "grad_norm": 0.06803212314844131, - "learning_rate": 2.8923412517745662e-08, - "loss": 0.0002, - "step": 1552 - }, - { - "epoch": 9.52760736196319, - "grad_norm": 0.08404265344142914, - "learning_rate": 2.819717953951917e-08, - "loss": 0.0004, - "step": 1553 - }, - { - "epoch": 9.533742331288344, - "grad_norm": 0.5321061015129089, - "learning_rate": 2.7480128581106602e-08, - "loss": 0.0065, - "step": 1554 - }, - { - "epoch": 9.539877300613497, - "grad_norm": 0.5247214436531067, - "learning_rate": 2.6772262306141438e-08, - "loss": 0.0018, - "step": 1555 - }, - { - "epoch": 9.54601226993865, - "grad_norm": 0.2725308835506439, - "learning_rate": 2.607358334413779e-08, - "loss": 0.0006, - "step": 1556 - }, - { - "epoch": 9.552147239263803, - "grad_norm": 0.5589315295219421, - "learning_rate": 2.5384094290482886e-08, - "loss": 0.0006, - "step": 1557 - }, - { - "epoch": 9.558282208588958, - "grad_norm": 0.6117374897003174, - "learning_rate": 2.4703797706425725e-08, - "loss": 0.0068, - "step": 1558 - }, - { - "epoch": 9.56441717791411, - "grad_norm": 0.3439452648162842, - "learning_rate": 2.4032696119067332e-08, - "loss": 0.0014, - "step": 1559 - }, - { - "epoch": 9.570552147239264, - "grad_norm": 0.1743037849664688, - "learning_rate": 2.337079202135273e-08, - "loss": 0.0011, - "step": 1560 - }, - { - "epoch": 9.576687116564417, - "grad_norm": 0.6570950746536255, - "learning_rate": 2.2718087872060925e-08, - "loss": 0.0025, - "step": 1561 - }, - { - "epoch": 9.582822085889571, - "grad_norm": 0.25107917189598083, - "learning_rate": 2.207458609579549e-08, - "loss": 0.0021, - "step": 1562 - }, - { - "epoch": 9.588957055214724, - "grad_norm": 0.13917990028858185, - "learning_rate": 2.144028908297624e-08, - "loss": 0.0007, - "step": 1563 - }, - { - "epoch": 9.595092024539877, - "grad_norm": 0.22606755793094635, - "learning_rate": 2.081519918982977e-08, - "loss": 0.0014, - "step": 1564 - }, - { - "epoch": 9.60122699386503, - "grad_norm": 0.4116940200328827, - "learning_rate": 2.019931873838088e-08, - "loss": 0.0008, - "step": 1565 - }, - { - "epoch": 9.607361963190185, - "grad_norm": 0.10428212583065033, - "learning_rate": 1.9592650016444503e-08, - "loss": 0.0005, - "step": 1566 - }, - { - "epoch": 9.613496932515337, - "grad_norm": 0.0740678682923317, - "learning_rate": 1.8995195277616284e-08, - "loss": 0.0004, - "step": 1567 - }, - { - "epoch": 9.61963190184049, - "grad_norm": 0.07690935581922531, - "learning_rate": 1.8406956741264247e-08, - "loss": 0.0004, - "step": 1568 - }, - { - "epoch": 9.625766871165645, - "grad_norm": 0.14602801203727722, - "learning_rate": 1.7827936592521856e-08, - "loss": 0.0014, - "step": 1569 - }, - { - "epoch": 9.631901840490798, - "grad_norm": 0.4051103889942169, - "learning_rate": 1.7258136982278296e-08, - "loss": 0.0009, - "step": 1570 - }, - { - "epoch": 9.63803680981595, - "grad_norm": 0.331938236951828, - "learning_rate": 1.6697560027171543e-08, - "loss": 0.0019, - "step": 1571 - }, - { - "epoch": 9.644171779141104, - "grad_norm": 0.6029168367385864, - "learning_rate": 1.6146207809579762e-08, - "loss": 0.0072, - "step": 1572 - }, - { - "epoch": 9.650306748466258, - "grad_norm": 0.2004910558462143, - "learning_rate": 1.5604082377614072e-08, - "loss": 0.001, - "step": 1573 - }, - { - "epoch": 9.656441717791411, - "grad_norm": 0.33825960755348206, - "learning_rate": 1.507118574511135e-08, - "loss": 0.0017, - "step": 1574 - }, - { - "epoch": 9.662576687116564, - "grad_norm": 0.7193265557289124, - "learning_rate": 1.454751989162506e-08, - "loss": 0.0106, - "step": 1575 - }, - { - "epoch": 9.668711656441717, - "grad_norm": 0.1846141815185547, - "learning_rate": 1.4033086762419989e-08, - "loss": 0.0004, - "step": 1576 - }, - { - "epoch": 9.674846625766872, - "grad_norm": 0.2692915201187134, - "learning_rate": 1.3527888268463907e-08, - "loss": 0.002, - "step": 1577 - }, - { - "epoch": 9.680981595092025, - "grad_norm": 0.19888080656528473, - "learning_rate": 1.303192628642036e-08, - "loss": 0.0007, - "step": 1578 - }, - { - "epoch": 9.687116564417177, - "grad_norm": 0.09299040585756302, - "learning_rate": 1.2545202658642008e-08, - "loss": 0.0004, - "step": 1579 - }, - { - "epoch": 9.69325153374233, - "grad_norm": 0.12221895903348923, - "learning_rate": 1.2067719193163962e-08, - "loss": 0.0005, - "step": 1580 - }, - { - "epoch": 9.699386503067485, - "grad_norm": 0.9425249099731445, - "learning_rate": 1.1599477663696845e-08, - "loss": 0.0062, - "step": 1581 - }, - { - "epoch": 9.705521472392638, - "grad_norm": 0.1449192315340042, - "learning_rate": 1.1140479809619576e-08, - "loss": 0.0005, - "step": 1582 - }, - { - "epoch": 9.71165644171779, - "grad_norm": 0.2106281816959381, - "learning_rate": 1.069072733597465e-08, - "loss": 0.0007, - "step": 1583 - }, - { - "epoch": 9.717791411042946, - "grad_norm": 0.06777317076921463, - "learning_rate": 1.025022191346009e-08, - "loss": 0.0003, - "step": 1584 - }, - { - "epoch": 9.723926380368098, - "grad_norm": 0.6169402003288269, - "learning_rate": 9.818965178423345e-09, - "loss": 0.0083, - "step": 1585 - }, - { - "epoch": 9.730061349693251, - "grad_norm": 0.18353499472141266, - "learning_rate": 9.396958732856843e-09, - "loss": 0.001, - "step": 1586 - }, - { - "epoch": 9.736196319018404, - "grad_norm": 0.18419475853443146, - "learning_rate": 8.984204144389941e-09, - "loss": 0.0006, - "step": 1587 - }, - { - "epoch": 9.742331288343559, - "grad_norm": 0.8530840277671814, - "learning_rate": 8.580702946284491e-09, - "loss": 0.0109, - "step": 1588 - }, - { - "epoch": 9.748466257668712, - "grad_norm": 0.6887766122817993, - "learning_rate": 8.186456637428453e-09, - "loss": 0.0016, - "step": 1589 - }, - { - "epoch": 9.754601226993865, - "grad_norm": 0.1355009824037552, - "learning_rate": 7.801466682331172e-09, - "loss": 0.0004, - "step": 1590 - }, - { - "epoch": 9.76073619631902, - "grad_norm": 1.123541235923767, - "learning_rate": 7.425734511117e-09, - "loss": 0.008, - "step": 1591 - }, - { - "epoch": 9.766871165644172, - "grad_norm": 0.6276746988296509, - "learning_rate": 7.059261519520022e-09, - "loss": 0.003, - "step": 1592 - }, - { - "epoch": 9.773006134969325, - "grad_norm": 0.5775916576385498, - "learning_rate": 6.702049068879613e-09, - "loss": 0.0009, - "step": 1593 - }, - { - "epoch": 9.779141104294478, - "grad_norm": 0.44135305285453796, - "learning_rate": 6.354098486135163e-09, - "loss": 0.0049, - "step": 1594 - }, - { - "epoch": 9.785276073619633, - "grad_norm": 0.06254208087921143, - "learning_rate": 6.015411063820253e-09, - "loss": 0.0003, - "step": 1595 - }, - { - "epoch": 9.791411042944786, - "grad_norm": 0.19917500019073486, - "learning_rate": 5.685988060059045e-09, - "loss": 0.001, - "step": 1596 - }, - { - "epoch": 9.797546012269938, - "grad_norm": 0.25946539640426636, - "learning_rate": 5.36583069856128e-09, - "loss": 0.0012, - "step": 1597 - }, - { - "epoch": 9.803680981595091, - "grad_norm": 0.11085817962884903, - "learning_rate": 5.054940168617018e-09, - "loss": 0.0005, - "step": 1598 - }, - { - "epoch": 9.809815950920246, - "grad_norm": 0.07764281332492828, - "learning_rate": 4.753317625093013e-09, - "loss": 0.0002, - "step": 1599 - }, - { - "epoch": 9.815950920245399, - "grad_norm": 0.13678377866744995, - "learning_rate": 4.4609641884285625e-09, - "loss": 0.0007, - "step": 1600 - }, - { - "epoch": 9.822085889570552, - "grad_norm": 0.07325509935617447, - "learning_rate": 4.17788094463023e-09, - "loss": 0.0004, - "step": 1601 - }, - { - "epoch": 9.828220858895705, - "grad_norm": 0.745182991027832, - "learning_rate": 3.904068945269346e-09, - "loss": 0.0006, - "step": 1602 - }, - { - "epoch": 9.83435582822086, - "grad_norm": 0.23189403116703033, - "learning_rate": 3.639529207476733e-09, - "loss": 0.0015, - "step": 1603 - }, - { - "epoch": 9.840490797546012, - "grad_norm": 0.06897032260894775, - "learning_rate": 3.384262713939379e-09, - "loss": 0.0004, - "step": 1604 - }, - { - "epoch": 9.846625766871165, - "grad_norm": 0.0821717157959938, - "learning_rate": 3.1382704128973818e-09, - "loss": 0.0004, - "step": 1605 - }, - { - "epoch": 9.85276073619632, - "grad_norm": 0.8984095454216003, - "learning_rate": 2.9015532181397854e-09, - "loss": 0.0007, - "step": 1606 - }, - { - "epoch": 9.858895705521473, - "grad_norm": 0.2612057626247406, - "learning_rate": 2.674112009000973e-09, - "loss": 0.0021, - "step": 1607 - }, - { - "epoch": 9.865030674846626, - "grad_norm": 0.10079237073659897, - "learning_rate": 2.4559476303584463e-09, - "loss": 0.0004, - "step": 1608 - }, - { - "epoch": 9.871165644171779, - "grad_norm": 0.15463407337665558, - "learning_rate": 2.2470608926283833e-09, - "loss": 0.0004, - "step": 1609 - }, - { - "epoch": 9.877300613496933, - "grad_norm": 0.3247759938240051, - "learning_rate": 2.0474525717639747e-09, - "loss": 0.0008, - "step": 1610 - }, - { - "epoch": 9.883435582822086, - "grad_norm": 0.5771990418434143, - "learning_rate": 1.857123409250705e-09, - "loss": 0.0035, - "step": 1611 - }, - { - "epoch": 9.889570552147239, - "grad_norm": 0.6151068210601807, - "learning_rate": 1.6760741121057966e-09, - "loss": 0.008, - "step": 1612 - }, - { - "epoch": 9.895705521472392, - "grad_norm": 0.6173699498176575, - "learning_rate": 1.504305352874047e-09, - "loss": 0.0009, - "step": 1613 - }, - { - "epoch": 9.901840490797547, - "grad_norm": 0.07602877169847488, - "learning_rate": 1.3418177696256086e-09, - "loss": 0.0003, - "step": 1614 - }, - { - "epoch": 9.9079754601227, - "grad_norm": 0.11126144230365753, - "learning_rate": 1.1886119659543227e-09, - "loss": 0.0005, - "step": 1615 - }, - { - "epoch": 9.914110429447852, - "grad_norm": 0.13721120357513428, - "learning_rate": 1.0446885109746673e-09, - "loss": 0.0008, - "step": 1616 - }, - { - "epoch": 9.920245398773005, - "grad_norm": 0.3714880049228668, - "learning_rate": 9.100479393195361e-10, - "loss": 0.0033, - "step": 1617 - }, - { - "epoch": 9.92638036809816, - "grad_norm": 0.18466363847255707, - "learning_rate": 7.846907511394052e-10, - "loss": 0.0009, - "step": 1618 - }, - { - "epoch": 9.932515337423313, - "grad_norm": 0.20958846807479858, - "learning_rate": 6.686174120990042e-10, - "loss": 0.0012, - "step": 1619 - }, - { - "epoch": 9.938650306748466, - "grad_norm": 0.6355168223381042, - "learning_rate": 5.618283533767588e-10, - "loss": 0.007, - "step": 1620 - }, - { - "epoch": 9.94478527607362, - "grad_norm": 0.10833138227462769, - "learning_rate": 4.6432397166285e-10, - "loss": 0.0004, - "step": 1621 - }, - { - "epoch": 9.950920245398773, - "grad_norm": 0.3573082685470581, - "learning_rate": 3.7610462915699255e-10, - "loss": 0.0015, - "step": 1622 - }, - { - "epoch": 9.957055214723926, - "grad_norm": 0.21547436714172363, - "learning_rate": 2.9717065356815733e-10, - "loss": 0.0007, - "step": 1623 - }, - { - "epoch": 9.963190184049079, - "grad_norm": 0.30022335052490234, - "learning_rate": 2.2752233811262901e-10, - "loss": 0.0025, - "step": 1624 - }, - { - "epoch": 9.969325153374234, - "grad_norm": 0.1985897272825241, - "learning_rate": 1.6715994151400572e-10, - "loss": 0.0015, - "step": 1625 - }, - { - "epoch": 9.975460122699387, - "grad_norm": 0.20799656212329865, - "learning_rate": 1.160836880001459e-10, - "loss": 0.001, - "step": 1626 - }, - { - "epoch": 9.98159509202454, - "grad_norm": 0.5943353176116943, - "learning_rate": 7.429376730483385e-11, - "loss": 0.0046, - "step": 1627 - }, - { - "epoch": 9.987730061349692, - "grad_norm": 0.1584414541721344, - "learning_rate": 4.179033466500393e-11, - "loss": 0.0006, - "step": 1628 - }, - { - "epoch": 9.993865030674847, - "grad_norm": 0.2899409830570221, - "learning_rate": 1.8573510821295882e-11, - "loss": 0.0016, - "step": 1629 - }, - { - "epoch": 10.0, - "grad_norm": 0.528587281703949, - "learning_rate": 4.643382017499587e-12, - "loss": 0.0036, - "step": 1630 - }, - { - "epoch": 10.0, - "step": 1630, - "total_flos": 4.036761107572982e+17, - "train_loss": 0.23243108529037226, - "train_runtime": 7070.6548, - "train_samples_per_second": 0.921, - "train_steps_per_second": 0.231 - } - ], - "logging_steps": 1, - "max_steps": 1630, - "num_input_tokens_seen": 0, - "num_train_epochs": 10, - "save_steps": 206, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": true - }, - "attributes": {} - } - }, - "total_flos": 4.036761107572982e+17, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/metallama3_8b/limo_filtered_correct/training_loss.png b/metallama3_8b/limo_filtered_correct/training_loss.png deleted file mode 100644 index 905d5a084341cd60bdf11aeb486967f19c7667b9..0000000000000000000000000000000000000000 Binary files a/metallama3_8b/limo_filtered_correct/training_loss.png and /dev/null differ diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/chat_template.jinja b/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/chat_template.jinja deleted file mode 100644 index 39bd0c9f7fe30aea14eda194fee17703da4a4dbf..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/chat_template.jinja +++ /dev/null @@ -1,5 +0,0 @@ -{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> - -'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> - -' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/config.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/config.json deleted file mode 100644 index ec5612543540085e09eed37e81b17ae51d1a6973..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 128000, - "eos_token_id": 128009, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 8192, - "mlp_bias": false, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 500000.0, - "tie_word_embeddings": false, - "torch_dtype": "float32", - "transformers_version": "4.55.0", - "use_cache": false, - "vocab_size": 128256 -} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/generation_config.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/generation_config.json deleted file mode 100644 index f53ccb516e57388491adda6b9950bcfa872e93ae..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/generation_config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 128000, - "eos_token_id": 128009, - "transformers_version": "4.55.0", - "use_cache": false -} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/model-00001-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/model-00001-of-00007.safetensors deleted file mode 100644 index c1bbdcebe303525fc7afc034d862051be6bf1f6c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/model-00001-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a6fb3a114acf5a1cacf6f39791a2fe1e6a059a7068b6565a77f50614a8f9756d -size 4886466168 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/model-00002-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/model-00002-of-00007.safetensors deleted file mode 100644 index 25494dfaadb920b9082b37e302481b53da8582b1..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/model-00002-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d56de5f7353956fd3773ed610827a8a99f51d0191db79394ab3c67ccc4b26bb8 -size 4832007448 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/model-00003-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/model-00003-of-00007.safetensors deleted file mode 100644 index de5a908237aa46185be2aae7e3e6a0fd36949fe5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/model-00003-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4ba8ce1f4d07d0e1f94eb50860d20177e2563881d96d382908d53bbb2fbfe130 -size 4999813112 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/model-00004-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/model-00004-of-00007.safetensors deleted file mode 100644 index 35d7b185339498be5d907996a48bbaa0b9b94d42..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/model-00004-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b2469116b3ec3746c103e6f00ef16126a1111a798730b2e670417ac526733ce1 -size 4999813128 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/model-00005-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/model-00005-of-00007.safetensors deleted file mode 100644 index 66f16fbba47e560e46f12551d67a94dbe961e6cb..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/model-00005-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f35c40532ff45bc61f4f4754f5c1b3a07a9d9d7b64e48d5a96bdfca9ee4ec6b2 -size 4832007496 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/model-00006-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/model-00006-of-00007.safetensors deleted file mode 100644 index 0d264ee93b9e988bcdf6f4659f784cf88da23e7c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/model-00006-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:981edb51b52e94129e888e9971f423caf71a7a8d2961f38feaedb3f5f6f50efe -size 4999813120 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/model-00007-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/model-00007-of-00007.safetensors deleted file mode 100644 index e83f0816adad60d99eb98c92aeddf8f5821e3dd6..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/model-00007-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a8117facef4a9eb4ef4e4f4e0c003d848f97e9299344b2760f34aeeb2913d55a -size 2571158184 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/model.safetensors.index.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/model.safetensors.index.json deleted file mode 100644 index 30d31d54f352f0c71ad48745af612a088822fa48..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/model.safetensors.index.json +++ /dev/null @@ -1,299 +0,0 @@ -{ - "metadata": { - "total_parameters": 2007565312, - "total_size": 32121044992 - }, - "weight_map": { - "lm_head.weight": "model-00007-of-00007.safetensors", - "model.embed_tokens.weight": "model-00001-of-00007.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.norm.weight": "model-00007-of-00007.safetensors" - } -} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/rng_state_0.pth b/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/rng_state_0.pth deleted file mode 100644 index c54ea122b283c04f6b60c1eedefeb301763a8f9f..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:418a5f105ae834c3075024076916b2a9475918fe034c12d0dd5b6d91f1aba467 -size 15024 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/rng_state_1.pth b/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/rng_state_1.pth deleted file mode 100644 index ea57ead2533e587fe50f62107d7cb32945fe1354..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6e07ace389d24bc1307b74f42a1e7b8f0117b0db853e2df64ff3f15cb92916a2 -size 15024 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/rng_state_2.pth b/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/rng_state_2.pth deleted file mode 100644 index 4689a9445d07528dc4fd91011a7f034c11773a68..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:da6a990f346d7014dffb28fa2bc7d3b890bd3c53712503fce3656da48d3d6e50 -size 15024 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/rng_state_3.pth b/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/rng_state_3.pth deleted file mode 100644 index 919b5e43a96a9afdeb196f402142bc3aab67f247..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e95f356ca38179b05993f55daece0223e96fa10b9a1b9ea2102a739211333f63 -size 15024 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/scheduler.pt b/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/scheduler.pt deleted file mode 100644 index 7d4cc97512de831638bc8f6a5dd997d5bf09b3fc..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:70150f4174d98120dc65bc23f414a746e2e0799de0ced291e2790e75f33bfaf3 -size 1064 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/special_tokens_map.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/special_tokens_map.json deleted file mode 100644 index 14daf4588e61b4e4983af0fccaba4d5500c0977c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/special_tokens_map.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "additional_special_tokens": [ - { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } - ], - "bos_token": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": "<|eot_id|>" -} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/tokenizer.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/tokenizer.json deleted file mode 100644 index 172311123ab62378f1f6d90f3068a676b7d939ed..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c1dcab308e7cf5970ea38815e0a62887d705c5b436f869ca27a5dcdd40c36a6 -size 17210148 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/tokenizer_config.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/tokenizer_config.json deleted file mode 100644 index 6739fcd129e717b71b64001dcb25a03c143d66f5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/tokenizer_config.json +++ /dev/null @@ -1,2076 +0,0 @@ -{ - "added_tokens_decoder": { - "128000": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128001": { - "content": "<|end_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128002": { - "content": "<|reserved_special_token_0|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128003": { - "content": "<|reserved_special_token_1|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128004": { - "content": "<|reserved_special_token_2|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128005": { - "content": "<|reserved_special_token_3|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128006": { - "content": "<|start_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128007": { - "content": "<|end_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128008": { - "content": "<|reserved_special_token_4|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128009": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128010": { - "content": "<|reserved_special_token_5|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128011": { - "content": "<|reserved_special_token_6|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128012": { - "content": "<|reserved_special_token_7|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128013": { - "content": "<|reserved_special_token_8|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128014": { - "content": "<|reserved_special_token_9|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128015": { - "content": "<|reserved_special_token_10|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128016": { - "content": "<|reserved_special_token_11|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128017": { - "content": "<|reserved_special_token_12|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128018": { - "content": "<|reserved_special_token_13|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128019": { - "content": "<|reserved_special_token_14|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128020": { - "content": "<|reserved_special_token_15|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128021": { - "content": "<|reserved_special_token_16|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128022": { - "content": "<|reserved_special_token_17|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128023": { - "content": "<|reserved_special_token_18|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128024": { - "content": "<|reserved_special_token_19|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128025": { - "content": "<|reserved_special_token_20|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128026": { - "content": "<|reserved_special_token_21|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128027": { - "content": "<|reserved_special_token_22|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128028": { - "content": "<|reserved_special_token_23|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128029": { - "content": "<|reserved_special_token_24|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128030": { - "content": "<|reserved_special_token_25|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128031": { - "content": "<|reserved_special_token_26|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128032": { - "content": "<|reserved_special_token_27|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128033": { - "content": "<|reserved_special_token_28|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128034": { - "content": "<|reserved_special_token_29|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128035": { - "content": "<|reserved_special_token_30|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128036": { - "content": "<|reserved_special_token_31|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128037": { - "content": "<|reserved_special_token_32|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128038": { - "content": "<|reserved_special_token_33|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128039": { - "content": "<|reserved_special_token_34|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128040": { - "content": "<|reserved_special_token_35|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128041": { - "content": "<|reserved_special_token_36|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128042": { - "content": "<|reserved_special_token_37|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128043": { - "content": "<|reserved_special_token_38|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128044": { - "content": "<|reserved_special_token_39|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128045": { - "content": "<|reserved_special_token_40|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128046": { - "content": "<|reserved_special_token_41|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128047": { - "content": "<|reserved_special_token_42|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128048": { - "content": "<|reserved_special_token_43|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128049": { - "content": "<|reserved_special_token_44|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128050": { - "content": "<|reserved_special_token_45|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128051": { - "content": "<|reserved_special_token_46|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128052": { - "content": "<|reserved_special_token_47|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128053": { - "content": "<|reserved_special_token_48|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128054": { - "content": "<|reserved_special_token_49|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128055": { - "content": "<|reserved_special_token_50|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128056": { - "content": "<|reserved_special_token_51|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128057": { - "content": "<|reserved_special_token_52|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128058": { - "content": "<|reserved_special_token_53|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128059": { - "content": "<|reserved_special_token_54|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128060": { - "content": "<|reserved_special_token_55|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128061": { - "content": "<|reserved_special_token_56|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128062": { - "content": "<|reserved_special_token_57|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128063": { - "content": "<|reserved_special_token_58|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128064": { - "content": "<|reserved_special_token_59|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128065": { - "content": "<|reserved_special_token_60|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128066": { - "content": "<|reserved_special_token_61|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128067": { - "content": "<|reserved_special_token_62|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128068": { - "content": "<|reserved_special_token_63|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128069": { - "content": "<|reserved_special_token_64|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128070": { - "content": "<|reserved_special_token_65|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128071": { - "content": "<|reserved_special_token_66|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128072": { - "content": "<|reserved_special_token_67|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128073": { - "content": "<|reserved_special_token_68|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128074": { - "content": "<|reserved_special_token_69|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128075": { - "content": "<|reserved_special_token_70|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128076": { - "content": "<|reserved_special_token_71|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128077": { - "content": "<|reserved_special_token_72|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128078": { - "content": "<|reserved_special_token_73|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128079": { - "content": "<|reserved_special_token_74|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128080": { - "content": "<|reserved_special_token_75|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128081": { - "content": "<|reserved_special_token_76|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128082": { - "content": "<|reserved_special_token_77|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128083": { - "content": "<|reserved_special_token_78|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128084": { - "content": "<|reserved_special_token_79|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128085": { - "content": "<|reserved_special_token_80|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128086": { - "content": "<|reserved_special_token_81|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128087": { - "content": "<|reserved_special_token_82|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128088": { - "content": "<|reserved_special_token_83|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128089": { - "content": "<|reserved_special_token_84|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128090": { - "content": "<|reserved_special_token_85|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128091": { - "content": "<|reserved_special_token_86|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128092": { - "content": "<|reserved_special_token_87|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128093": { - "content": "<|reserved_special_token_88|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128094": { - "content": "<|reserved_special_token_89|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128095": { - "content": "<|reserved_special_token_90|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128096": { - "content": "<|reserved_special_token_91|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128097": { - "content": "<|reserved_special_token_92|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128098": { - "content": "<|reserved_special_token_93|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128099": { - "content": "<|reserved_special_token_94|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128100": { - "content": "<|reserved_special_token_95|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128101": { - "content": "<|reserved_special_token_96|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128102": { - "content": "<|reserved_special_token_97|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128103": { - "content": "<|reserved_special_token_98|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128104": { - "content": "<|reserved_special_token_99|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128105": { - "content": "<|reserved_special_token_100|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128106": { - "content": "<|reserved_special_token_101|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128107": { - "content": "<|reserved_special_token_102|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128108": { - "content": "<|reserved_special_token_103|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128109": { - "content": "<|reserved_special_token_104|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128110": { - "content": "<|reserved_special_token_105|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128111": { - "content": "<|reserved_special_token_106|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128112": { - "content": "<|reserved_special_token_107|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128113": { - "content": "<|reserved_special_token_108|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128114": { - "content": "<|reserved_special_token_109|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128115": { - "content": "<|reserved_special_token_110|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128116": { - "content": "<|reserved_special_token_111|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128117": { - "content": "<|reserved_special_token_112|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128118": { - "content": "<|reserved_special_token_113|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128119": { - "content": "<|reserved_special_token_114|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128120": { - "content": "<|reserved_special_token_115|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128121": { - "content": "<|reserved_special_token_116|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128122": { - "content": "<|reserved_special_token_117|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128123": { - "content": "<|reserved_special_token_118|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128124": { - "content": "<|reserved_special_token_119|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128125": { - "content": "<|reserved_special_token_120|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128126": { - "content": "<|reserved_special_token_121|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128127": { - "content": "<|reserved_special_token_122|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128128": { - "content": "<|reserved_special_token_123|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128129": { - "content": "<|reserved_special_token_124|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128130": { - "content": "<|reserved_special_token_125|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128131": { - "content": "<|reserved_special_token_126|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128132": { - "content": "<|reserved_special_token_127|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128133": { - "content": "<|reserved_special_token_128|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128134": { - "content": "<|reserved_special_token_129|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128135": { - "content": "<|reserved_special_token_130|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128136": { - "content": "<|reserved_special_token_131|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128137": { - "content": "<|reserved_special_token_132|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128138": { - "content": "<|reserved_special_token_133|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128139": { - "content": "<|reserved_special_token_134|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128140": { - "content": "<|reserved_special_token_135|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128141": { - "content": "<|reserved_special_token_136|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128142": { - "content": "<|reserved_special_token_137|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128143": { - "content": "<|reserved_special_token_138|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128144": { - "content": "<|reserved_special_token_139|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128145": { - "content": "<|reserved_special_token_140|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128146": { - "content": "<|reserved_special_token_141|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128147": { - "content": "<|reserved_special_token_142|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128148": { - "content": "<|reserved_special_token_143|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128149": { - "content": "<|reserved_special_token_144|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128150": { - "content": "<|reserved_special_token_145|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128151": { - "content": "<|reserved_special_token_146|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128152": { - "content": "<|reserved_special_token_147|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128153": { - "content": "<|reserved_special_token_148|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128154": { - "content": "<|reserved_special_token_149|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128155": { - "content": "<|reserved_special_token_150|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128156": { - "content": "<|reserved_special_token_151|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128157": { - "content": "<|reserved_special_token_152|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128158": { - "content": "<|reserved_special_token_153|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128159": { - "content": "<|reserved_special_token_154|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128160": { - "content": "<|reserved_special_token_155|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128161": { - "content": "<|reserved_special_token_156|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128162": { - "content": "<|reserved_special_token_157|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128163": { - "content": "<|reserved_special_token_158|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128164": { - "content": "<|reserved_special_token_159|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128165": { - "content": "<|reserved_special_token_160|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128166": { - "content": "<|reserved_special_token_161|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128167": { - "content": "<|reserved_special_token_162|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128168": { - "content": "<|reserved_special_token_163|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128169": { - "content": "<|reserved_special_token_164|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128170": { - "content": "<|reserved_special_token_165|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128171": { - "content": "<|reserved_special_token_166|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128172": { - "content": "<|reserved_special_token_167|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128173": { - "content": "<|reserved_special_token_168|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128174": { - "content": "<|reserved_special_token_169|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128175": { - "content": "<|reserved_special_token_170|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128176": { - "content": "<|reserved_special_token_171|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128177": { - "content": "<|reserved_special_token_172|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128178": { - "content": "<|reserved_special_token_173|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128179": { - "content": "<|reserved_special_token_174|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128180": { - "content": "<|reserved_special_token_175|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128181": { - "content": "<|reserved_special_token_176|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128182": { - "content": "<|reserved_special_token_177|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128183": { - "content": "<|reserved_special_token_178|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128184": { - "content": "<|reserved_special_token_179|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128185": { - "content": "<|reserved_special_token_180|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128186": { - "content": "<|reserved_special_token_181|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128187": { - "content": "<|reserved_special_token_182|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128188": { - "content": "<|reserved_special_token_183|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128189": { - "content": "<|reserved_special_token_184|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128190": { - "content": "<|reserved_special_token_185|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128191": { - "content": "<|reserved_special_token_186|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128192": { - "content": "<|reserved_special_token_187|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128193": { - "content": "<|reserved_special_token_188|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128194": { - "content": "<|reserved_special_token_189|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128195": { - "content": "<|reserved_special_token_190|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128196": { - "content": "<|reserved_special_token_191|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128197": { - "content": "<|reserved_special_token_192|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128198": { - "content": "<|reserved_special_token_193|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128199": { - "content": "<|reserved_special_token_194|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128200": { - "content": "<|reserved_special_token_195|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128201": { - "content": "<|reserved_special_token_196|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128202": { - "content": "<|reserved_special_token_197|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128203": { - "content": "<|reserved_special_token_198|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128204": { - "content": "<|reserved_special_token_199|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128205": { - "content": "<|reserved_special_token_200|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128206": { - "content": "<|reserved_special_token_201|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128207": { - "content": "<|reserved_special_token_202|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128208": { - "content": "<|reserved_special_token_203|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128209": { - "content": "<|reserved_special_token_204|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128210": { - "content": "<|reserved_special_token_205|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128211": { - "content": "<|reserved_special_token_206|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128212": { - "content": "<|reserved_special_token_207|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128213": { - "content": "<|reserved_special_token_208|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128214": { - "content": "<|reserved_special_token_209|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128215": { - "content": "<|reserved_special_token_210|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128216": { - "content": "<|reserved_special_token_211|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128217": { - "content": "<|reserved_special_token_212|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128218": { - "content": "<|reserved_special_token_213|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128219": { - "content": "<|reserved_special_token_214|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128220": { - "content": "<|reserved_special_token_215|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128221": { - "content": "<|reserved_special_token_216|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128222": { - "content": "<|reserved_special_token_217|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128223": { - "content": "<|reserved_special_token_218|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128224": { - "content": "<|reserved_special_token_219|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128225": { - "content": "<|reserved_special_token_220|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128226": { - "content": "<|reserved_special_token_221|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128227": { - "content": "<|reserved_special_token_222|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128228": { - "content": "<|reserved_special_token_223|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128229": { - "content": "<|reserved_special_token_224|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128230": { - "content": "<|reserved_special_token_225|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128231": { - "content": "<|reserved_special_token_226|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128232": { - "content": "<|reserved_special_token_227|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128233": { - "content": "<|reserved_special_token_228|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128234": { - "content": "<|reserved_special_token_229|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128235": { - "content": "<|reserved_special_token_230|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128236": { - "content": "<|reserved_special_token_231|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128237": { - "content": "<|reserved_special_token_232|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128238": { - "content": "<|reserved_special_token_233|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128239": { - "content": "<|reserved_special_token_234|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128240": { - "content": "<|reserved_special_token_235|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128241": { - "content": "<|reserved_special_token_236|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128242": { - "content": "<|reserved_special_token_237|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128243": { - "content": "<|reserved_special_token_238|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128244": { - "content": "<|reserved_special_token_239|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128245": { - "content": "<|reserved_special_token_240|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128246": { - "content": "<|reserved_special_token_241|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128247": { - "content": "<|reserved_special_token_242|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128248": { - "content": "<|reserved_special_token_243|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128249": { - "content": "<|reserved_special_token_244|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128250": { - "content": "<|reserved_special_token_245|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128251": { - "content": "<|reserved_special_token_246|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128252": { - "content": "<|reserved_special_token_247|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128253": { - "content": "<|reserved_special_token_248|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128254": { - "content": "<|reserved_special_token_249|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128255": { - "content": "<|reserved_special_token_250|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128256": { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - } - }, - "additional_special_tokens": [ - "<|eom_id|>" - ], - "bos_token": "<|begin_of_text|>", - "clean_up_tokenization_spaces": true, - "eos_token": "<|eot_id|>", - "extra_special_tokens": {}, - "model_input_names": [ - "input_ids", - "attention_mask" - ], - "model_max_length": 1000000000000000019884624838656, - "pad_token": "<|eot_id|>", - "padding_side": "right", - "split_special_tokens": false, - "tokenizer_class": "PreTrainedTokenizerFast" -} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/trainer_state.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/trainer_state.json deleted file mode 100644 index ee5ec43a657370cb1c978cde27484c565f4a94d6..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-1110/trainer_state.json +++ /dev/null @@ -1,7804 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 6.0, - "eval_steps": 500, - "global_step": 1110, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.005405405405405406, - "grad_norm": 72.60939025878906, - "learning_rate": 5e-06, - "loss": 2.9165, - "step": 1 - }, - { - "epoch": 0.010810810810810811, - "grad_norm": 29.01830291748047, - "learning_rate": 4.999996395324314e-06, - "loss": 1.9314, - "step": 2 - }, - { - "epoch": 0.016216216216216217, - "grad_norm": 21.44908332824707, - "learning_rate": 4.99998558130765e-06, - "loss": 1.5709, - "step": 3 - }, - { - "epoch": 0.021621621621621623, - "grad_norm": 4.490907669067383, - "learning_rate": 4.999967557981192e-06, - "loss": 0.8099, - "step": 4 - }, - { - "epoch": 0.02702702702702703, - "grad_norm": 4.000796794891357, - "learning_rate": 4.999942325396917e-06, - "loss": 0.9021, - "step": 5 - }, - { - "epoch": 0.032432432432432434, - "grad_norm": 18.513282775878906, - "learning_rate": 4.999909883627588e-06, - "loss": 1.7972, - "step": 6 - }, - { - "epoch": 0.03783783783783784, - "grad_norm": 3.5735981464385986, - "learning_rate": 4.999870232766757e-06, - "loss": 1.4306, - "step": 7 - }, - { - "epoch": 0.043243243243243246, - "grad_norm": 3.1145193576812744, - "learning_rate": 4.9998233729287696e-06, - "loss": 1.051, - "step": 8 - }, - { - "epoch": 0.04864864864864865, - "grad_norm": 3.856376886367798, - "learning_rate": 4.999769304248755e-06, - "loss": 0.8089, - "step": 9 - }, - { - "epoch": 0.05405405405405406, - "grad_norm": 4.05589485168457, - "learning_rate": 4.9997080268826344e-06, - "loss": 1.0999, - "step": 10 - }, - { - "epoch": 0.05945945945945946, - "grad_norm": 13.784229278564453, - "learning_rate": 4.9996395410071165e-06, - "loss": 1.2831, - "step": 11 - }, - { - "epoch": 0.06486486486486487, - "grad_norm": 6.079237937927246, - "learning_rate": 4.999563846819696e-06, - "loss": 1.2874, - "step": 12 - }, - { - "epoch": 0.07027027027027027, - "grad_norm": 4.5971245765686035, - "learning_rate": 4.999480944538655e-06, - "loss": 0.96, - "step": 13 - }, - { - "epoch": 0.07567567567567568, - "grad_norm": 4.916017532348633, - "learning_rate": 4.999390834403063e-06, - "loss": 0.9869, - "step": 14 - }, - { - "epoch": 0.08108108108108109, - "grad_norm": 3.2311055660247803, - "learning_rate": 4.999293516672773e-06, - "loss": 0.9293, - "step": 15 - }, - { - "epoch": 0.08648648648648649, - "grad_norm": 3.3040921688079834, - "learning_rate": 4.9991889916284255e-06, - "loss": 0.8914, - "step": 16 - }, - { - "epoch": 0.0918918918918919, - "grad_norm": 3.794267416000366, - "learning_rate": 4.999077259571442e-06, - "loss": 1.0176, - "step": 17 - }, - { - "epoch": 0.0972972972972973, - "grad_norm": 4.788509845733643, - "learning_rate": 4.998958320824031e-06, - "loss": 1.0259, - "step": 18 - }, - { - "epoch": 0.10270270270270271, - "grad_norm": 10.027527809143066, - "learning_rate": 4.998832175729179e-06, - "loss": 1.3356, - "step": 19 - }, - { - "epoch": 0.10810810810810811, - "grad_norm": 4.612483978271484, - "learning_rate": 4.998698824650656e-06, - "loss": 1.4486, - "step": 20 - }, - { - "epoch": 0.11351351351351352, - "grad_norm": 3.8676936626434326, - "learning_rate": 4.998558267973014e-06, - "loss": 0.8372, - "step": 21 - }, - { - "epoch": 0.11891891891891893, - "grad_norm": 2.9611001014709473, - "learning_rate": 4.998410506101579e-06, - "loss": 0.7931, - "step": 22 - }, - { - "epoch": 0.12432432432432433, - "grad_norm": 5.508745193481445, - "learning_rate": 4.9982555394624595e-06, - "loss": 1.3022, - "step": 23 - }, - { - "epoch": 0.12972972972972974, - "grad_norm": 3.434845209121704, - "learning_rate": 4.998093368502539e-06, - "loss": 0.9739, - "step": 24 - }, - { - "epoch": 0.13513513513513514, - "grad_norm": 4.736802101135254, - "learning_rate": 4.9979239936894765e-06, - "loss": 1.1154, - "step": 25 - }, - { - "epoch": 0.14054054054054055, - "grad_norm": 3.69411039352417, - "learning_rate": 4.997747415511705e-06, - "loss": 0.7543, - "step": 26 - }, - { - "epoch": 0.14594594594594595, - "grad_norm": 2.8646645545959473, - "learning_rate": 4.997563634478428e-06, - "loss": 0.7278, - "step": 27 - }, - { - "epoch": 0.15135135135135136, - "grad_norm": 6.56904935836792, - "learning_rate": 4.997372651119626e-06, - "loss": 0.8167, - "step": 28 - }, - { - "epoch": 0.15675675675675677, - "grad_norm": 2.955914258956909, - "learning_rate": 4.997174465986044e-06, - "loss": 0.8031, - "step": 29 - }, - { - "epoch": 0.16216216216216217, - "grad_norm": 2.5714259147644043, - "learning_rate": 4.996969079649196e-06, - "loss": 0.689, - "step": 30 - }, - { - "epoch": 0.16756756756756758, - "grad_norm": 3.5165364742279053, - "learning_rate": 4.996756492701362e-06, - "loss": 0.8059, - "step": 31 - }, - { - "epoch": 0.17297297297297298, - "grad_norm": 3.2861921787261963, - "learning_rate": 4.996536705755591e-06, - "loss": 0.9658, - "step": 32 - }, - { - "epoch": 0.1783783783783784, - "grad_norm": 2.962470531463623, - "learning_rate": 4.996309719445687e-06, - "loss": 0.8349, - "step": 33 - }, - { - "epoch": 0.1837837837837838, - "grad_norm": 2.7694804668426514, - "learning_rate": 4.996075534426223e-06, - "loss": 0.8287, - "step": 34 - }, - { - "epoch": 0.1891891891891892, - "grad_norm": 3.405071258544922, - "learning_rate": 4.995834151372526e-06, - "loss": 1.1211, - "step": 35 - }, - { - "epoch": 0.1945945945945946, - "grad_norm": 2.8680710792541504, - "learning_rate": 4.995585570980685e-06, - "loss": 1.0841, - "step": 36 - }, - { - "epoch": 0.2, - "grad_norm": 3.341021776199341, - "learning_rate": 4.995329793967537e-06, - "loss": 0.6182, - "step": 37 - }, - { - "epoch": 0.20540540540540542, - "grad_norm": 3.0639379024505615, - "learning_rate": 4.9950668210706795e-06, - "loss": 0.7647, - "step": 38 - }, - { - "epoch": 0.21081081081081082, - "grad_norm": 3.225759983062744, - "learning_rate": 4.994796653048457e-06, - "loss": 0.8691, - "step": 39 - }, - { - "epoch": 0.21621621621621623, - "grad_norm": 4.56926155090332, - "learning_rate": 4.994519290679965e-06, - "loss": 1.0404, - "step": 40 - }, - { - "epoch": 0.22162162162162163, - "grad_norm": 4.871571063995361, - "learning_rate": 4.994234734765043e-06, - "loss": 1.1877, - "step": 41 - }, - { - "epoch": 0.22702702702702704, - "grad_norm": 3.672215700149536, - "learning_rate": 4.993942986124278e-06, - "loss": 0.959, - "step": 42 - }, - { - "epoch": 0.23243243243243245, - "grad_norm": 3.184683322906494, - "learning_rate": 4.9936440455989975e-06, - "loss": 0.9249, - "step": 43 - }, - { - "epoch": 0.23783783783783785, - "grad_norm": 2.7092034816741943, - "learning_rate": 4.993337914051266e-06, - "loss": 0.6899, - "step": 44 - }, - { - "epoch": 0.24324324324324326, - "grad_norm": 3.153764486312866, - "learning_rate": 4.99302459236389e-06, - "loss": 0.9075, - "step": 45 - }, - { - "epoch": 0.24864864864864866, - "grad_norm": 3.3629748821258545, - "learning_rate": 4.992704081440407e-06, - "loss": 0.785, - "step": 46 - }, - { - "epoch": 0.25405405405405407, - "grad_norm": 4.478365898132324, - "learning_rate": 4.992376382205088e-06, - "loss": 1.008, - "step": 47 - }, - { - "epoch": 0.2594594594594595, - "grad_norm": 3.4001641273498535, - "learning_rate": 4.992041495602932e-06, - "loss": 0.7751, - "step": 48 - }, - { - "epoch": 0.2648648648648649, - "grad_norm": 2.522662878036499, - "learning_rate": 4.991699422599664e-06, - "loss": 0.9022, - "step": 49 - }, - { - "epoch": 0.2702702702702703, - "grad_norm": 2.764458179473877, - "learning_rate": 4.991350164181735e-06, - "loss": 0.8801, - "step": 50 - }, - { - "epoch": 0.2756756756756757, - "grad_norm": 2.814859628677368, - "learning_rate": 4.990993721356317e-06, - "loss": 0.7045, - "step": 51 - }, - { - "epoch": 0.2810810810810811, - "grad_norm": 2.441311836242676, - "learning_rate": 4.990630095151296e-06, - "loss": 0.7312, - "step": 52 - }, - { - "epoch": 0.2864864864864865, - "grad_norm": 2.4443013668060303, - "learning_rate": 4.9902592866152765e-06, - "loss": 0.9609, - "step": 53 - }, - { - "epoch": 0.2918918918918919, - "grad_norm": 2.2934701442718506, - "learning_rate": 4.989881296817575e-06, - "loss": 0.5753, - "step": 54 - }, - { - "epoch": 0.2972972972972973, - "grad_norm": 2.6286847591400146, - "learning_rate": 4.989496126848215e-06, - "loss": 0.5118, - "step": 55 - }, - { - "epoch": 0.3027027027027027, - "grad_norm": 3.6817069053649902, - "learning_rate": 4.989103777817928e-06, - "loss": 1.1261, - "step": 56 - }, - { - "epoch": 0.3081081081081081, - "grad_norm": 3.011197566986084, - "learning_rate": 4.988704250858145e-06, - "loss": 0.7823, - "step": 57 - }, - { - "epoch": 0.31351351351351353, - "grad_norm": 2.5490806102752686, - "learning_rate": 4.988297547121e-06, - "loss": 0.6019, - "step": 58 - }, - { - "epoch": 0.31891891891891894, - "grad_norm": 3.0803146362304688, - "learning_rate": 4.98788366777932e-06, - "loss": 0.825, - "step": 59 - }, - { - "epoch": 0.32432432432432434, - "grad_norm": 3.015730619430542, - "learning_rate": 4.987462614026625e-06, - "loss": 0.7667, - "step": 60 - }, - { - "epoch": 0.32972972972972975, - "grad_norm": 2.5371594429016113, - "learning_rate": 4.987034387077126e-06, - "loss": 0.8051, - "step": 61 - }, - { - "epoch": 0.33513513513513515, - "grad_norm": 2.6414010524749756, - "learning_rate": 4.986598988165718e-06, - "loss": 0.6895, - "step": 62 - }, - { - "epoch": 0.34054054054054056, - "grad_norm": 3.065131187438965, - "learning_rate": 4.9861564185479785e-06, - "loss": 0.9268, - "step": 63 - }, - { - "epoch": 0.34594594594594597, - "grad_norm": 2.5708694458007812, - "learning_rate": 4.985706679500163e-06, - "loss": 0.9854, - "step": 64 - }, - { - "epoch": 0.35135135135135137, - "grad_norm": 2.768915891647339, - "learning_rate": 4.9852497723192025e-06, - "loss": 0.8083, - "step": 65 - }, - { - "epoch": 0.3567567567567568, - "grad_norm": 2.567901849746704, - "learning_rate": 4.9847856983227e-06, - "loss": 0.9098, - "step": 66 - }, - { - "epoch": 0.3621621621621622, - "grad_norm": 2.5766549110412598, - "learning_rate": 4.984314458848923e-06, - "loss": 0.8881, - "step": 67 - }, - { - "epoch": 0.3675675675675676, - "grad_norm": 2.9778389930725098, - "learning_rate": 4.983836055256804e-06, - "loss": 0.9877, - "step": 68 - }, - { - "epoch": 0.372972972972973, - "grad_norm": 2.7225165367126465, - "learning_rate": 4.983350488925935e-06, - "loss": 0.8282, - "step": 69 - }, - { - "epoch": 0.3783783783783784, - "grad_norm": 2.702287197113037, - "learning_rate": 4.982857761256564e-06, - "loss": 1.1756, - "step": 70 - }, - { - "epoch": 0.3837837837837838, - "grad_norm": 2.9815568923950195, - "learning_rate": 4.982357873669589e-06, - "loss": 0.8114, - "step": 71 - }, - { - "epoch": 0.3891891891891892, - "grad_norm": 3.27150297164917, - "learning_rate": 4.981850827606556e-06, - "loss": 0.6763, - "step": 72 - }, - { - "epoch": 0.3945945945945946, - "grad_norm": 2.568423271179199, - "learning_rate": 4.981336624529655e-06, - "loss": 0.9372, - "step": 73 - }, - { - "epoch": 0.4, - "grad_norm": 2.621175527572632, - "learning_rate": 4.980815265921714e-06, - "loss": 1.0155, - "step": 74 - }, - { - "epoch": 0.40540540540540543, - "grad_norm": 2.62827205657959, - "learning_rate": 4.980286753286196e-06, - "loss": 0.949, - "step": 75 - }, - { - "epoch": 0.41081081081081083, - "grad_norm": 2.9462146759033203, - "learning_rate": 4.979751088147192e-06, - "loss": 1.0134, - "step": 76 - }, - { - "epoch": 0.41621621621621624, - "grad_norm": 2.814852714538574, - "learning_rate": 4.979208272049425e-06, - "loss": 0.9722, - "step": 77 - }, - { - "epoch": 0.42162162162162165, - "grad_norm": 4.177679538726807, - "learning_rate": 4.978658306558235e-06, - "loss": 1.2259, - "step": 78 - }, - { - "epoch": 0.42702702702702705, - "grad_norm": 2.813084125518799, - "learning_rate": 4.978101193259578e-06, - "loss": 0.834, - "step": 79 - }, - { - "epoch": 0.43243243243243246, - "grad_norm": 2.71824049949646, - "learning_rate": 4.977536933760025e-06, - "loss": 0.6151, - "step": 80 - }, - { - "epoch": 0.43783783783783786, - "grad_norm": 4.992153167724609, - "learning_rate": 4.976965529686755e-06, - "loss": 1.0475, - "step": 81 - }, - { - "epoch": 0.44324324324324327, - "grad_norm": 2.4810822010040283, - "learning_rate": 4.976386982687548e-06, - "loss": 0.8324, - "step": 82 - }, - { - "epoch": 0.4486486486486487, - "grad_norm": 4.509149074554443, - "learning_rate": 4.9758012944307845e-06, - "loss": 0.997, - "step": 83 - }, - { - "epoch": 0.4540540540540541, - "grad_norm": 3.114325761795044, - "learning_rate": 4.975208466605436e-06, - "loss": 1.2024, - "step": 84 - }, - { - "epoch": 0.4594594594594595, - "grad_norm": 3.297091007232666, - "learning_rate": 4.974608500921064e-06, - "loss": 0.9146, - "step": 85 - }, - { - "epoch": 0.4648648648648649, - "grad_norm": 2.824475049972534, - "learning_rate": 4.974001399107816e-06, - "loss": 0.7181, - "step": 86 - }, - { - "epoch": 0.4702702702702703, - "grad_norm": 20.262290954589844, - "learning_rate": 4.973387162916415e-06, - "loss": 0.8599, - "step": 87 - }, - { - "epoch": 0.4756756756756757, - "grad_norm": 4.015744686126709, - "learning_rate": 4.972765794118158e-06, - "loss": 0.6081, - "step": 88 - }, - { - "epoch": 0.4810810810810811, - "grad_norm": 2.8033058643341064, - "learning_rate": 4.9721372945049114e-06, - "loss": 0.8764, - "step": 89 - }, - { - "epoch": 0.4864864864864865, - "grad_norm": 5.271846294403076, - "learning_rate": 4.971501665889107e-06, - "loss": 0.8622, - "step": 90 - }, - { - "epoch": 0.4918918918918919, - "grad_norm": 2.557264804840088, - "learning_rate": 4.9708589101037306e-06, - "loss": 0.5523, - "step": 91 - }, - { - "epoch": 0.4972972972972973, - "grad_norm": 4.342173099517822, - "learning_rate": 4.970209029002325e-06, - "loss": 0.8922, - "step": 92 - }, - { - "epoch": 0.5027027027027027, - "grad_norm": 2.950364351272583, - "learning_rate": 4.969552024458977e-06, - "loss": 0.9455, - "step": 93 - }, - { - "epoch": 0.5081081081081081, - "grad_norm": 2.6453042030334473, - "learning_rate": 4.968887898368318e-06, - "loss": 0.8342, - "step": 94 - }, - { - "epoch": 0.5135135135135135, - "grad_norm": 3.486766815185547, - "learning_rate": 4.968216652645515e-06, - "loss": 0.8476, - "step": 95 - }, - { - "epoch": 0.518918918918919, - "grad_norm": 2.884152889251709, - "learning_rate": 4.967538289226268e-06, - "loss": 0.8879, - "step": 96 - }, - { - "epoch": 0.5243243243243243, - "grad_norm": 2.4130594730377197, - "learning_rate": 4.966852810066798e-06, - "loss": 0.7114, - "step": 97 - }, - { - "epoch": 0.5297297297297298, - "grad_norm": 3.182410955429077, - "learning_rate": 4.9661602171438524e-06, - "loss": 0.6757, - "step": 98 - }, - { - "epoch": 0.5351351351351351, - "grad_norm": 2.5027542114257812, - "learning_rate": 4.965460512454687e-06, - "loss": 0.8029, - "step": 99 - }, - { - "epoch": 0.5405405405405406, - "grad_norm": 2.3096024990081787, - "learning_rate": 4.964753698017071e-06, - "loss": 0.842, - "step": 100 - }, - { - "epoch": 0.5459459459459459, - "grad_norm": 2.875657081604004, - "learning_rate": 4.964039775869271e-06, - "loss": 0.6339, - "step": 101 - }, - { - "epoch": 0.5513513513513514, - "grad_norm": 2.505406141281128, - "learning_rate": 4.963318748070056e-06, - "loss": 0.7743, - "step": 102 - }, - { - "epoch": 0.5567567567567567, - "grad_norm": 3.552562713623047, - "learning_rate": 4.9625906166986815e-06, - "loss": 0.926, - "step": 103 - }, - { - "epoch": 0.5621621621621622, - "grad_norm": 2.717942476272583, - "learning_rate": 4.961855383854889e-06, - "loss": 0.7037, - "step": 104 - }, - { - "epoch": 0.5675675675675675, - "grad_norm": 2.5049386024475098, - "learning_rate": 4.961113051658901e-06, - "loss": 0.561, - "step": 105 - }, - { - "epoch": 0.572972972972973, - "grad_norm": 2.3112900257110596, - "learning_rate": 4.96036362225141e-06, - "loss": 0.7316, - "step": 106 - }, - { - "epoch": 0.5783783783783784, - "grad_norm": 2.470257520675659, - "learning_rate": 4.959607097793575e-06, - "loss": 0.6426, - "step": 107 - }, - { - "epoch": 0.5837837837837838, - "grad_norm": 3.8040788173675537, - "learning_rate": 4.9588434804670176e-06, - "loss": 1.0044, - "step": 108 - }, - { - "epoch": 0.5891891891891892, - "grad_norm": 3.143547296524048, - "learning_rate": 4.958072772473812e-06, - "loss": 0.9219, - "step": 109 - }, - { - "epoch": 0.5945945945945946, - "grad_norm": 3.5052590370178223, - "learning_rate": 4.9572949760364795e-06, - "loss": 0.6056, - "step": 110 - }, - { - "epoch": 0.6, - "grad_norm": 3.064009428024292, - "learning_rate": 4.9565100933979835e-06, - "loss": 0.6346, - "step": 111 - }, - { - "epoch": 0.6054054054054054, - "grad_norm": 2.694610595703125, - "learning_rate": 4.9557181268217225e-06, - "loss": 0.9856, - "step": 112 - }, - { - "epoch": 0.6108108108108108, - "grad_norm": 2.5885775089263916, - "learning_rate": 4.954919078591521e-06, - "loss": 0.8669, - "step": 113 - }, - { - "epoch": 0.6162162162162163, - "grad_norm": 2.593609571456909, - "learning_rate": 4.954112951011628e-06, - "loss": 0.7201, - "step": 114 - }, - { - "epoch": 0.6216216216216216, - "grad_norm": 3.3045759201049805, - "learning_rate": 4.9532997464067065e-06, - "loss": 0.9095, - "step": 115 - }, - { - "epoch": 0.6270270270270271, - "grad_norm": 2.8144869804382324, - "learning_rate": 4.952479467121828e-06, - "loss": 1.0213, - "step": 116 - }, - { - "epoch": 0.6324324324324324, - "grad_norm": 2.5460312366485596, - "learning_rate": 4.951652115522463e-06, - "loss": 1.1154, - "step": 117 - }, - { - "epoch": 0.6378378378378379, - "grad_norm": 2.795137405395508, - "learning_rate": 4.950817693994481e-06, - "loss": 0.691, - "step": 118 - }, - { - "epoch": 0.6432432432432432, - "grad_norm": 2.4979195594787598, - "learning_rate": 4.949976204944135e-06, - "loss": 0.7224, - "step": 119 - }, - { - "epoch": 0.6486486486486487, - "grad_norm": 3.3131983280181885, - "learning_rate": 4.949127650798063e-06, - "loss": 0.9256, - "step": 120 - }, - { - "epoch": 0.654054054054054, - "grad_norm": 2.9060285091400146, - "learning_rate": 4.948272034003275e-06, - "loss": 0.6892, - "step": 121 - }, - { - "epoch": 0.6594594594594595, - "grad_norm": 3.695594549179077, - "learning_rate": 4.947409357027148e-06, - "loss": 0.5878, - "step": 122 - }, - { - "epoch": 0.6648648648648648, - "grad_norm": 3.1250460147857666, - "learning_rate": 4.9465396223574165e-06, - "loss": 0.9904, - "step": 123 - }, - { - "epoch": 0.6702702702702703, - "grad_norm": 4.024891376495361, - "learning_rate": 4.945662832502172e-06, - "loss": 1.1592, - "step": 124 - }, - { - "epoch": 0.6756756756756757, - "grad_norm": 2.6886494159698486, - "learning_rate": 4.944778989989847e-06, - "loss": 1.0041, - "step": 125 - }, - { - "epoch": 0.6810810810810811, - "grad_norm": 2.366912841796875, - "learning_rate": 4.943888097369216e-06, - "loss": 0.7045, - "step": 126 - }, - { - "epoch": 0.6864864864864865, - "grad_norm": 2.394932270050049, - "learning_rate": 4.942990157209381e-06, - "loss": 0.6685, - "step": 127 - }, - { - "epoch": 0.6918918918918919, - "grad_norm": 2.61933970451355, - "learning_rate": 4.9420851720997674e-06, - "loss": 0.8812, - "step": 128 - }, - { - "epoch": 0.6972972972972973, - "grad_norm": 2.7395646572113037, - "learning_rate": 4.94117314465012e-06, - "loss": 1.3014, - "step": 129 - }, - { - "epoch": 0.7027027027027027, - "grad_norm": 3.065484046936035, - "learning_rate": 4.940254077490487e-06, - "loss": 0.6978, - "step": 130 - }, - { - "epoch": 0.7081081081081081, - "grad_norm": 2.895038366317749, - "learning_rate": 4.939327973271222e-06, - "loss": 0.6249, - "step": 131 - }, - { - "epoch": 0.7135135135135136, - "grad_norm": 3.1773312091827393, - "learning_rate": 4.9383948346629665e-06, - "loss": 0.6423, - "step": 132 - }, - { - "epoch": 0.7189189189189189, - "grad_norm": 2.2378008365631104, - "learning_rate": 4.937454664356652e-06, - "loss": 0.7193, - "step": 133 - }, - { - "epoch": 0.7243243243243244, - "grad_norm": 2.5673701763153076, - "learning_rate": 4.9365074650634855e-06, - "loss": 0.7065, - "step": 134 - }, - { - "epoch": 0.7297297297297297, - "grad_norm": 2.7348387241363525, - "learning_rate": 4.9355532395149445e-06, - "loss": 1.0046, - "step": 135 - }, - { - "epoch": 0.7351351351351352, - "grad_norm": 2.391741991043091, - "learning_rate": 4.9345919904627655e-06, - "loss": 0.6771, - "step": 136 - }, - { - "epoch": 0.7405405405405405, - "grad_norm": 2.2096705436706543, - "learning_rate": 4.933623720678944e-06, - "loss": 0.6589, - "step": 137 - }, - { - "epoch": 0.745945945945946, - "grad_norm": 3.0840072631835938, - "learning_rate": 4.932648432955718e-06, - "loss": 0.8755, - "step": 138 - }, - { - "epoch": 0.7513513513513513, - "grad_norm": 2.4970428943634033, - "learning_rate": 4.931666130105564e-06, - "loss": 0.6685, - "step": 139 - }, - { - "epoch": 0.7567567567567568, - "grad_norm": 4.315455436706543, - "learning_rate": 4.930676814961189e-06, - "loss": 0.8101, - "step": 140 - }, - { - "epoch": 0.7621621621621621, - "grad_norm": 5.388065814971924, - "learning_rate": 4.92968049037552e-06, - "loss": 0.8193, - "step": 141 - }, - { - "epoch": 0.7675675675675676, - "grad_norm": 2.6107139587402344, - "learning_rate": 4.9286771592217005e-06, - "loss": 0.7852, - "step": 142 - }, - { - "epoch": 0.772972972972973, - "grad_norm": 3.936556577682495, - "learning_rate": 4.927666824393076e-06, - "loss": 1.0388, - "step": 143 - }, - { - "epoch": 0.7783783783783784, - "grad_norm": 2.74424409866333, - "learning_rate": 4.926649488803191e-06, - "loss": 0.8266, - "step": 144 - }, - { - "epoch": 0.7837837837837838, - "grad_norm": 2.8998451232910156, - "learning_rate": 4.925625155385776e-06, - "loss": 0.4895, - "step": 145 - }, - { - "epoch": 0.7891891891891892, - "grad_norm": 3.0631520748138428, - "learning_rate": 4.924593827094743e-06, - "loss": 0.8759, - "step": 146 - }, - { - "epoch": 0.7945945945945946, - "grad_norm": 3.233267307281494, - "learning_rate": 4.923555506904176e-06, - "loss": 0.701, - "step": 147 - }, - { - "epoch": 0.8, - "grad_norm": 2.87701416015625, - "learning_rate": 4.922510197808321e-06, - "loss": 1.1327, - "step": 148 - }, - { - "epoch": 0.8054054054054054, - "grad_norm": 3.650576114654541, - "learning_rate": 4.921457902821578e-06, - "loss": 0.7587, - "step": 149 - }, - { - "epoch": 0.8108108108108109, - "grad_norm": 3.232112407684326, - "learning_rate": 4.920398624978493e-06, - "loss": 1.2158, - "step": 150 - }, - { - "epoch": 0.8162162162162162, - "grad_norm": 2.468384027481079, - "learning_rate": 4.919332367333748e-06, - "loss": 0.6852, - "step": 151 - }, - { - "epoch": 0.8216216216216217, - "grad_norm": 2.5947415828704834, - "learning_rate": 4.918259132962154e-06, - "loss": 0.6611, - "step": 152 - }, - { - "epoch": 0.827027027027027, - "grad_norm": 3.0171427726745605, - "learning_rate": 4.917178924958638e-06, - "loss": 0.7327, - "step": 153 - }, - { - "epoch": 0.8324324324324325, - "grad_norm": 3.293184518814087, - "learning_rate": 4.916091746438243e-06, - "loss": 0.8528, - "step": 154 - }, - { - "epoch": 0.8378378378378378, - "grad_norm": 4.0570969581604, - "learning_rate": 4.9149976005361085e-06, - "loss": 0.9141, - "step": 155 - }, - { - "epoch": 0.8432432432432433, - "grad_norm": 2.8782784938812256, - "learning_rate": 4.913896490407467e-06, - "loss": 1.1132, - "step": 156 - }, - { - "epoch": 0.8486486486486486, - "grad_norm": 2.5671517848968506, - "learning_rate": 4.912788419227635e-06, - "loss": 0.7587, - "step": 157 - }, - { - "epoch": 0.8540540540540541, - "grad_norm": 2.9445390701293945, - "learning_rate": 4.911673390192002e-06, - "loss": 0.9227, - "step": 158 - }, - { - "epoch": 0.8594594594594595, - "grad_norm": 2.472595453262329, - "learning_rate": 4.910551406516023e-06, - "loss": 0.8154, - "step": 159 - }, - { - "epoch": 0.8648648648648649, - "grad_norm": 2.5233397483825684, - "learning_rate": 4.909422471435207e-06, - "loss": 0.9897, - "step": 160 - }, - { - "epoch": 0.8702702702702703, - "grad_norm": 3.3919546604156494, - "learning_rate": 4.90828658820511e-06, - "loss": 0.6162, - "step": 161 - }, - { - "epoch": 0.8756756756756757, - "grad_norm": 3.060908555984497, - "learning_rate": 4.907143760101325e-06, - "loss": 0.5734, - "step": 162 - }, - { - "epoch": 0.8810810810810811, - "grad_norm": 3.4584782123565674, - "learning_rate": 4.905993990419472e-06, - "loss": 0.8328, - "step": 163 - }, - { - "epoch": 0.8864864864864865, - "grad_norm": 2.936570644378662, - "learning_rate": 4.904837282475187e-06, - "loss": 0.6787, - "step": 164 - }, - { - "epoch": 0.8918918918918919, - "grad_norm": 2.564837694168091, - "learning_rate": 4.9036736396041165e-06, - "loss": 0.9658, - "step": 165 - }, - { - "epoch": 0.8972972972972973, - "grad_norm": 3.2509360313415527, - "learning_rate": 4.902503065161905e-06, - "loss": 0.7899, - "step": 166 - }, - { - "epoch": 0.9027027027027027, - "grad_norm": 2.9730329513549805, - "learning_rate": 4.901325562524185e-06, - "loss": 0.9476, - "step": 167 - }, - { - "epoch": 0.9081081081081082, - "grad_norm": 3.044980049133301, - "learning_rate": 4.900141135086569e-06, - "loss": 0.7589, - "step": 168 - }, - { - "epoch": 0.9135135135135135, - "grad_norm": 3.030585527420044, - "learning_rate": 4.898949786264638e-06, - "loss": 0.6724, - "step": 169 - }, - { - "epoch": 0.918918918918919, - "grad_norm": 2.249122142791748, - "learning_rate": 4.897751519493933e-06, - "loss": 0.6968, - "step": 170 - }, - { - "epoch": 0.9243243243243243, - "grad_norm": 2.9816982746124268, - "learning_rate": 4.896546338229945e-06, - "loss": 0.7984, - "step": 171 - }, - { - "epoch": 0.9297297297297298, - "grad_norm": 2.415736675262451, - "learning_rate": 4.8953342459481034e-06, - "loss": 0.6109, - "step": 172 - }, - { - "epoch": 0.9351351351351351, - "grad_norm": 2.740518808364868, - "learning_rate": 4.894115246143768e-06, - "loss": 0.8126, - "step": 173 - }, - { - "epoch": 0.9405405405405406, - "grad_norm": 2.7610201835632324, - "learning_rate": 4.892889342332218e-06, - "loss": 0.6862, - "step": 174 - }, - { - "epoch": 0.9459459459459459, - "grad_norm": 3.057025194168091, - "learning_rate": 4.891656538048642e-06, - "loss": 0.9895, - "step": 175 - }, - { - "epoch": 0.9513513513513514, - "grad_norm": 2.569751262664795, - "learning_rate": 4.890416836848128e-06, - "loss": 0.8481, - "step": 176 - }, - { - "epoch": 0.9567567567567568, - "grad_norm": 2.4443397521972656, - "learning_rate": 4.889170242305652e-06, - "loss": 0.6478, - "step": 177 - }, - { - "epoch": 0.9621621621621622, - "grad_norm": 2.5009846687316895, - "learning_rate": 4.887916758016069e-06, - "loss": 0.9714, - "step": 178 - }, - { - "epoch": 0.9675675675675676, - "grad_norm": 3.101975202560425, - "learning_rate": 4.886656387594104e-06, - "loss": 1.1264, - "step": 179 - }, - { - "epoch": 0.972972972972973, - "grad_norm": 2.6144704818725586, - "learning_rate": 4.885389134674338e-06, - "loss": 0.7664, - "step": 180 - }, - { - "epoch": 0.9783783783783784, - "grad_norm": 2.5834381580352783, - "learning_rate": 4.884115002911197e-06, - "loss": 0.6131, - "step": 181 - }, - { - "epoch": 0.9837837837837838, - "grad_norm": 2.5378055572509766, - "learning_rate": 4.88283399597895e-06, - "loss": 0.8733, - "step": 182 - }, - { - "epoch": 0.9891891891891892, - "grad_norm": 2.4095377922058105, - "learning_rate": 4.881546117571686e-06, - "loss": 0.643, - "step": 183 - }, - { - "epoch": 0.9945945945945946, - "grad_norm": 2.9554507732391357, - "learning_rate": 4.8802513714033135e-06, - "loss": 0.7287, - "step": 184 - }, - { - "epoch": 1.0, - "grad_norm": 2.8279213905334473, - "learning_rate": 4.878949761207545e-06, - "loss": 0.9927, - "step": 185 - }, - { - "epoch": 1.0054054054054054, - "grad_norm": 2.9361412525177, - "learning_rate": 4.8776412907378845e-06, - "loss": 0.66, - "step": 186 - }, - { - "epoch": 1.0108108108108107, - "grad_norm": 3.392244338989258, - "learning_rate": 4.876325963767623e-06, - "loss": 0.594, - "step": 187 - }, - { - "epoch": 1.0162162162162163, - "grad_norm": 2.6276044845581055, - "learning_rate": 4.875003784089822e-06, - "loss": 0.5825, - "step": 188 - }, - { - "epoch": 1.0216216216216216, - "grad_norm": 2.2875545024871826, - "learning_rate": 4.873674755517305e-06, - "loss": 0.6594, - "step": 189 - }, - { - "epoch": 1.027027027027027, - "grad_norm": 2.8086795806884766, - "learning_rate": 4.872338881882645e-06, - "loss": 0.7536, - "step": 190 - }, - { - "epoch": 1.0324324324324325, - "grad_norm": 2.3685200214385986, - "learning_rate": 4.870996167038154e-06, - "loss": 0.4849, - "step": 191 - }, - { - "epoch": 1.037837837837838, - "grad_norm": 3.0264766216278076, - "learning_rate": 4.869646614855877e-06, - "loss": 0.3771, - "step": 192 - }, - { - "epoch": 1.0432432432432432, - "grad_norm": 4.335122108459473, - "learning_rate": 4.868290229227567e-06, - "loss": 0.8545, - "step": 193 - }, - { - "epoch": 1.0486486486486486, - "grad_norm": 3.442172050476074, - "learning_rate": 4.866927014064692e-06, - "loss": 0.3698, - "step": 194 - }, - { - "epoch": 1.054054054054054, - "grad_norm": 3.326539993286133, - "learning_rate": 4.86555697329841e-06, - "loss": 0.8468, - "step": 195 - }, - { - "epoch": 1.0594594594594595, - "grad_norm": 3.0372447967529297, - "learning_rate": 4.864180110879562e-06, - "loss": 0.8232, - "step": 196 - }, - { - "epoch": 1.0648648648648649, - "grad_norm": 2.955343008041382, - "learning_rate": 4.862796430778663e-06, - "loss": 0.4097, - "step": 197 - }, - { - "epoch": 1.0702702702702702, - "grad_norm": 2.4095399379730225, - "learning_rate": 4.861405936985889e-06, - "loss": 0.6746, - "step": 198 - }, - { - "epoch": 1.0756756756756758, - "grad_norm": 2.763500452041626, - "learning_rate": 4.860008633511059e-06, - "loss": 0.6605, - "step": 199 - }, - { - "epoch": 1.0810810810810811, - "grad_norm": 2.6751155853271484, - "learning_rate": 4.8586045243836384e-06, - "loss": 0.471, - "step": 200 - }, - { - "epoch": 1.0864864864864865, - "grad_norm": 3.3507862091064453, - "learning_rate": 4.857193613652711e-06, - "loss": 0.7665, - "step": 201 - }, - { - "epoch": 1.0918918918918918, - "grad_norm": 3.3064827919006348, - "learning_rate": 4.8557759053869775e-06, - "loss": 0.6436, - "step": 202 - }, - { - "epoch": 1.0972972972972972, - "grad_norm": 2.571828603744507, - "learning_rate": 4.854351403674741e-06, - "loss": 0.4642, - "step": 203 - }, - { - "epoch": 1.1027027027027028, - "grad_norm": 2.883220911026001, - "learning_rate": 4.852920112623895e-06, - "loss": 0.5737, - "step": 204 - }, - { - "epoch": 1.1081081081081081, - "grad_norm": 3.026144027709961, - "learning_rate": 4.851482036361912e-06, - "loss": 0.7302, - "step": 205 - }, - { - "epoch": 1.1135135135135135, - "grad_norm": 2.6689612865448, - "learning_rate": 4.850037179035829e-06, - "loss": 0.5229, - "step": 206 - }, - { - "epoch": 1.118918918918919, - "grad_norm": 2.4019956588745117, - "learning_rate": 4.8485855448122425e-06, - "loss": 0.5529, - "step": 207 - }, - { - "epoch": 1.1243243243243244, - "grad_norm": 2.3546230792999268, - "learning_rate": 4.847127137877286e-06, - "loss": 0.3635, - "step": 208 - }, - { - "epoch": 1.1297297297297297, - "grad_norm": 2.999096393585205, - "learning_rate": 4.8456619624366285e-06, - "loss": 0.8149, - "step": 209 - }, - { - "epoch": 1.135135135135135, - "grad_norm": 10.072900772094727, - "learning_rate": 4.844190022715456e-06, - "loss": 0.8333, - "step": 210 - }, - { - "epoch": 1.1405405405405404, - "grad_norm": 2.222123384475708, - "learning_rate": 4.84271132295846e-06, - "loss": 0.3717, - "step": 211 - }, - { - "epoch": 1.145945945945946, - "grad_norm": 2.8751113414764404, - "learning_rate": 4.841225867429826e-06, - "loss": 0.5994, - "step": 212 - }, - { - "epoch": 1.1513513513513514, - "grad_norm": 2.9580111503601074, - "learning_rate": 4.839733660413224e-06, - "loss": 0.8382, - "step": 213 - }, - { - "epoch": 1.1567567567567567, - "grad_norm": 4.628892421722412, - "learning_rate": 4.838234706211792e-06, - "loss": 0.818, - "step": 214 - }, - { - "epoch": 1.1621621621621623, - "grad_norm": 2.5103509426116943, - "learning_rate": 4.836729009148124e-06, - "loss": 0.4267, - "step": 215 - }, - { - "epoch": 1.1675675675675676, - "grad_norm": 2.6093738079071045, - "learning_rate": 4.835216573564261e-06, - "loss": 0.3472, - "step": 216 - }, - { - "epoch": 1.172972972972973, - "grad_norm": 3.0792338848114014, - "learning_rate": 4.833697403821672e-06, - "loss": 0.6323, - "step": 217 - }, - { - "epoch": 1.1783783783783783, - "grad_norm": 2.845163345336914, - "learning_rate": 4.8321715043012516e-06, - "loss": 0.6831, - "step": 218 - }, - { - "epoch": 1.1837837837837837, - "grad_norm": 3.0433948040008545, - "learning_rate": 4.830638879403296e-06, - "loss": 0.3682, - "step": 219 - }, - { - "epoch": 1.1891891891891893, - "grad_norm": 2.6533594131469727, - "learning_rate": 4.8290995335475e-06, - "loss": 0.4154, - "step": 220 - }, - { - "epoch": 1.1945945945945946, - "grad_norm": 2.9271352291107178, - "learning_rate": 4.827553471172935e-06, - "loss": 0.3991, - "step": 221 - }, - { - "epoch": 1.2, - "grad_norm": 2.9243528842926025, - "learning_rate": 4.826000696738045e-06, - "loss": 0.4538, - "step": 222 - }, - { - "epoch": 1.2054054054054055, - "grad_norm": 2.537332534790039, - "learning_rate": 4.824441214720629e-06, - "loss": 0.7692, - "step": 223 - }, - { - "epoch": 1.2108108108108109, - "grad_norm": 3.9193246364593506, - "learning_rate": 4.8228750296178275e-06, - "loss": 0.6038, - "step": 224 - }, - { - "epoch": 1.2162162162162162, - "grad_norm": 2.6646728515625, - "learning_rate": 4.821302145946113e-06, - "loss": 0.4147, - "step": 225 - }, - { - "epoch": 1.2216216216216216, - "grad_norm": 2.6519482135772705, - "learning_rate": 4.819722568241274e-06, - "loss": 0.5398, - "step": 226 - }, - { - "epoch": 1.227027027027027, - "grad_norm": 2.2018048763275146, - "learning_rate": 4.818136301058401e-06, - "loss": 0.3864, - "step": 227 - }, - { - "epoch": 1.2324324324324325, - "grad_norm": 2.5660712718963623, - "learning_rate": 4.816543348971879e-06, - "loss": 0.5712, - "step": 228 - }, - { - "epoch": 1.2378378378378379, - "grad_norm": 3.237663745880127, - "learning_rate": 4.814943716575368e-06, - "loss": 0.662, - "step": 229 - }, - { - "epoch": 1.2432432432432432, - "grad_norm": 2.5570430755615234, - "learning_rate": 4.813337408481793e-06, - "loss": 0.8661, - "step": 230 - }, - { - "epoch": 1.2486486486486488, - "grad_norm": 2.9231269359588623, - "learning_rate": 4.811724429323329e-06, - "loss": 0.9218, - "step": 231 - }, - { - "epoch": 1.2540540540540541, - "grad_norm": 3.637084722518921, - "learning_rate": 4.810104783751389e-06, - "loss": 0.5597, - "step": 232 - }, - { - "epoch": 1.2594594594594595, - "grad_norm": 3.0218842029571533, - "learning_rate": 4.8084784764366125e-06, - "loss": 0.4786, - "step": 233 - }, - { - "epoch": 1.2648648648648648, - "grad_norm": 2.770214080810547, - "learning_rate": 4.806845512068846e-06, - "loss": 0.5219, - "step": 234 - }, - { - "epoch": 1.2702702702702702, - "grad_norm": 3.093053102493286, - "learning_rate": 4.805205895357137e-06, - "loss": 0.643, - "step": 235 - }, - { - "epoch": 1.2756756756756757, - "grad_norm": 2.6373348236083984, - "learning_rate": 4.803559631029713e-06, - "loss": 0.5858, - "step": 236 - }, - { - "epoch": 1.281081081081081, - "grad_norm": 2.452030897140503, - "learning_rate": 4.801906723833973e-06, - "loss": 0.4185, - "step": 237 - }, - { - "epoch": 1.2864864864864864, - "grad_norm": 2.72564697265625, - "learning_rate": 4.8002471785364734e-06, - "loss": 0.4917, - "step": 238 - }, - { - "epoch": 1.291891891891892, - "grad_norm": 3.0389158725738525, - "learning_rate": 4.798580999922913e-06, - "loss": 0.645, - "step": 239 - }, - { - "epoch": 1.2972972972972974, - "grad_norm": 3.7002289295196533, - "learning_rate": 4.796908192798117e-06, - "loss": 0.5378, - "step": 240 - }, - { - "epoch": 1.3027027027027027, - "grad_norm": 2.1876111030578613, - "learning_rate": 4.7952287619860276e-06, - "loss": 0.5197, - "step": 241 - }, - { - "epoch": 1.308108108108108, - "grad_norm": 3.903337240219116, - "learning_rate": 4.793542712329689e-06, - "loss": 1.0226, - "step": 242 - }, - { - "epoch": 1.3135135135135134, - "grad_norm": 2.3623552322387695, - "learning_rate": 4.791850048691228e-06, - "loss": 0.5502, - "step": 243 - }, - { - "epoch": 1.318918918918919, - "grad_norm": 3.0669031143188477, - "learning_rate": 4.79015077595185e-06, - "loss": 0.6976, - "step": 244 - }, - { - "epoch": 1.3243243243243243, - "grad_norm": 3.1480472087860107, - "learning_rate": 4.788444899011816e-06, - "loss": 0.4795, - "step": 245 - }, - { - "epoch": 1.3297297297297297, - "grad_norm": 3.7051920890808105, - "learning_rate": 4.786732422790432e-06, - "loss": 0.6526, - "step": 246 - }, - { - "epoch": 1.3351351351351353, - "grad_norm": 3.4358389377593994, - "learning_rate": 4.785013352226036e-06, - "loss": 0.5551, - "step": 247 - }, - { - "epoch": 1.3405405405405406, - "grad_norm": 2.3789355754852295, - "learning_rate": 4.7832876922759805e-06, - "loss": 0.3151, - "step": 248 - }, - { - "epoch": 1.345945945945946, - "grad_norm": 2.4843716621398926, - "learning_rate": 4.781555447916622e-06, - "loss": 0.6713, - "step": 249 - }, - { - "epoch": 1.3513513513513513, - "grad_norm": 3.0176303386688232, - "learning_rate": 4.779816624143302e-06, - "loss": 0.437, - "step": 250 - }, - { - "epoch": 1.3567567567567567, - "grad_norm": 2.868350028991699, - "learning_rate": 4.77807122597034e-06, - "loss": 0.7632, - "step": 251 - }, - { - "epoch": 1.3621621621621622, - "grad_norm": 2.4629738330841064, - "learning_rate": 4.776319258431009e-06, - "loss": 0.4894, - "step": 252 - }, - { - "epoch": 1.3675675675675676, - "grad_norm": 2.798297882080078, - "learning_rate": 4.77456072657753e-06, - "loss": 0.4456, - "step": 253 - }, - { - "epoch": 1.372972972972973, - "grad_norm": 3.2977547645568848, - "learning_rate": 4.772795635481053e-06, - "loss": 0.5381, - "step": 254 - }, - { - "epoch": 1.3783783783783785, - "grad_norm": 4.1061906814575195, - "learning_rate": 4.77102399023164e-06, - "loss": 1.0302, - "step": 255 - }, - { - "epoch": 1.3837837837837839, - "grad_norm": 3.943284511566162, - "learning_rate": 4.769245795938261e-06, - "loss": 0.4875, - "step": 256 - }, - { - "epoch": 1.3891891891891892, - "grad_norm": 2.6420533657073975, - "learning_rate": 4.767461057728763e-06, - "loss": 0.4923, - "step": 257 - }, - { - "epoch": 1.3945945945945946, - "grad_norm": 3.3152263164520264, - "learning_rate": 4.76566978074987e-06, - "loss": 0.6699, - "step": 258 - }, - { - "epoch": 1.4, - "grad_norm": 2.6928882598876953, - "learning_rate": 4.7638719701671586e-06, - "loss": 0.6117, - "step": 259 - }, - { - "epoch": 1.4054054054054055, - "grad_norm": 2.706597328186035, - "learning_rate": 4.762067631165049e-06, - "loss": 0.8534, - "step": 260 - }, - { - "epoch": 1.4108108108108108, - "grad_norm": 2.9912848472595215, - "learning_rate": 4.760256768946787e-06, - "loss": 0.5057, - "step": 261 - }, - { - "epoch": 1.4162162162162162, - "grad_norm": 2.7098443508148193, - "learning_rate": 4.758439388734429e-06, - "loss": 0.7286, - "step": 262 - }, - { - "epoch": 1.4216216216216218, - "grad_norm": 3.1288092136383057, - "learning_rate": 4.7566154957688276e-06, - "loss": 0.9827, - "step": 263 - }, - { - "epoch": 1.427027027027027, - "grad_norm": 3.0505919456481934, - "learning_rate": 4.754785095309617e-06, - "loss": 0.7042, - "step": 264 - }, - { - "epoch": 1.4324324324324325, - "grad_norm": 2.6800339221954346, - "learning_rate": 4.752948192635199e-06, - "loss": 0.5179, - "step": 265 - }, - { - "epoch": 1.4378378378378378, - "grad_norm": 2.2246861457824707, - "learning_rate": 4.751104793042722e-06, - "loss": 0.8527, - "step": 266 - }, - { - "epoch": 1.4432432432432432, - "grad_norm": 2.4242751598358154, - "learning_rate": 4.7492549018480725e-06, - "loss": 0.5627, - "step": 267 - }, - { - "epoch": 1.4486486486486487, - "grad_norm": 2.763244152069092, - "learning_rate": 4.747398524385858e-06, - "loss": 0.8981, - "step": 268 - }, - { - "epoch": 1.454054054054054, - "grad_norm": 2.856595993041992, - "learning_rate": 4.745535666009389e-06, - "loss": 0.5455, - "step": 269 - }, - { - "epoch": 1.4594594594594594, - "grad_norm": 2.4168624877929688, - "learning_rate": 4.743666332090664e-06, - "loss": 0.4348, - "step": 270 - }, - { - "epoch": 1.464864864864865, - "grad_norm": 2.5408060550689697, - "learning_rate": 4.74179052802036e-06, - "loss": 0.5524, - "step": 271 - }, - { - "epoch": 1.4702702702702704, - "grad_norm": 2.6216673851013184, - "learning_rate": 4.739908259207807e-06, - "loss": 0.7469, - "step": 272 - }, - { - "epoch": 1.4756756756756757, - "grad_norm": 5.397300720214844, - "learning_rate": 4.738019531080981e-06, - "loss": 0.7216, - "step": 273 - }, - { - "epoch": 1.481081081081081, - "grad_norm": 3.3481080532073975, - "learning_rate": 4.7361243490864825e-06, - "loss": 0.7527, - "step": 274 - }, - { - "epoch": 1.4864864864864864, - "grad_norm": 2.7943873405456543, - "learning_rate": 4.734222718689527e-06, - "loss": 0.7437, - "step": 275 - }, - { - "epoch": 1.491891891891892, - "grad_norm": 2.206890344619751, - "learning_rate": 4.732314645373922e-06, - "loss": 0.5187, - "step": 276 - }, - { - "epoch": 1.4972972972972973, - "grad_norm": 2.76442813873291, - "learning_rate": 4.730400134642055e-06, - "loss": 0.7186, - "step": 277 - }, - { - "epoch": 1.5027027027027027, - "grad_norm": 3.4754087924957275, - "learning_rate": 4.728479192014879e-06, - "loss": 0.9655, - "step": 278 - }, - { - "epoch": 1.5081081081081082, - "grad_norm": 2.923779249191284, - "learning_rate": 4.726551823031895e-06, - "loss": 0.6251, - "step": 279 - }, - { - "epoch": 1.5135135135135136, - "grad_norm": 3.1142773628234863, - "learning_rate": 4.7246180332511335e-06, - "loss": 0.4805, - "step": 280 - }, - { - "epoch": 1.518918918918919, - "grad_norm": 2.3477070331573486, - "learning_rate": 4.722677828249142e-06, - "loss": 1.0939, - "step": 281 - }, - { - "epoch": 1.5243243243243243, - "grad_norm": 2.8418569564819336, - "learning_rate": 4.720731213620972e-06, - "loss": 0.9485, - "step": 282 - }, - { - "epoch": 1.5297297297297296, - "grad_norm": 2.462710380554199, - "learning_rate": 4.718778194980152e-06, - "loss": 0.5805, - "step": 283 - }, - { - "epoch": 1.535135135135135, - "grad_norm": 3.2379209995269775, - "learning_rate": 4.7168187779586805e-06, - "loss": 0.77, - "step": 284 - }, - { - "epoch": 1.5405405405405406, - "grad_norm": 3.0701661109924316, - "learning_rate": 4.71485296820701e-06, - "loss": 0.5932, - "step": 285 - }, - { - "epoch": 1.545945945945946, - "grad_norm": 4.099547386169434, - "learning_rate": 4.7128807713940245e-06, - "loss": 0.6296, - "step": 286 - }, - { - "epoch": 1.5513513513513515, - "grad_norm": 2.5529167652130127, - "learning_rate": 4.710902193207028e-06, - "loss": 0.6201, - "step": 287 - }, - { - "epoch": 1.5567567567567568, - "grad_norm": 2.794926881790161, - "learning_rate": 4.708917239351727e-06, - "loss": 0.5682, - "step": 288 - }, - { - "epoch": 1.5621621621621622, - "grad_norm": 3.2522501945495605, - "learning_rate": 4.706925915552214e-06, - "loss": 0.8877, - "step": 289 - }, - { - "epoch": 1.5675675675675675, - "grad_norm": 2.811847448348999, - "learning_rate": 4.704928227550949e-06, - "loss": 0.6521, - "step": 290 - }, - { - "epoch": 1.572972972972973, - "grad_norm": 2.7060673236846924, - "learning_rate": 4.702924181108745e-06, - "loss": 0.4929, - "step": 291 - }, - { - "epoch": 1.5783783783783782, - "grad_norm": 2.5009031295776367, - "learning_rate": 4.700913782004755e-06, - "loss": 0.4515, - "step": 292 - }, - { - "epoch": 1.5837837837837838, - "grad_norm": 2.6722700595855713, - "learning_rate": 4.698897036036446e-06, - "loss": 0.5477, - "step": 293 - }, - { - "epoch": 1.5891891891891892, - "grad_norm": 3.3333957195281982, - "learning_rate": 4.696873949019591e-06, - "loss": 0.9589, - "step": 294 - }, - { - "epoch": 1.5945945945945947, - "grad_norm": 2.4862897396087646, - "learning_rate": 4.694844526788248e-06, - "loss": 0.4425, - "step": 295 - }, - { - "epoch": 1.6, - "grad_norm": 2.78708553314209, - "learning_rate": 4.692808775194745e-06, - "loss": 0.4899, - "step": 296 - }, - { - "epoch": 1.6054054054054054, - "grad_norm": 2.9121289253234863, - "learning_rate": 4.690766700109659e-06, - "loss": 0.4884, - "step": 297 - }, - { - "epoch": 1.6108108108108108, - "grad_norm": 4.692054271697998, - "learning_rate": 4.688718307421807e-06, - "loss": 0.8977, - "step": 298 - }, - { - "epoch": 1.6162162162162161, - "grad_norm": 3.1290926933288574, - "learning_rate": 4.686663603038222e-06, - "loss": 0.6833, - "step": 299 - }, - { - "epoch": 1.6216216216216215, - "grad_norm": 3.5091123580932617, - "learning_rate": 4.6846025928841365e-06, - "loss": 0.9141, - "step": 300 - }, - { - "epoch": 1.627027027027027, - "grad_norm": 2.5466184616088867, - "learning_rate": 4.6825352829029705e-06, - "loss": 0.5121, - "step": 301 - }, - { - "epoch": 1.6324324324324324, - "grad_norm": 2.7833092212677, - "learning_rate": 4.68046167905631e-06, - "loss": 0.5399, - "step": 302 - }, - { - "epoch": 1.637837837837838, - "grad_norm": 3.05135440826416, - "learning_rate": 4.678381787323889e-06, - "loss": 0.7921, - "step": 303 - }, - { - "epoch": 1.6432432432432433, - "grad_norm": 2.2391726970672607, - "learning_rate": 4.676295613703577e-06, - "loss": 0.7178, - "step": 304 - }, - { - "epoch": 1.6486486486486487, - "grad_norm": 2.3654022216796875, - "learning_rate": 4.674203164211357e-06, - "loss": 0.7162, - "step": 305 - }, - { - "epoch": 1.654054054054054, - "grad_norm": 2.436009645462036, - "learning_rate": 4.67210444488131e-06, - "loss": 0.6539, - "step": 306 - }, - { - "epoch": 1.6594594594594594, - "grad_norm": 2.6034209728240967, - "learning_rate": 4.669999461765599e-06, - "loss": 0.7214, - "step": 307 - }, - { - "epoch": 1.6648648648648647, - "grad_norm": 2.804229497909546, - "learning_rate": 4.6678882209344474e-06, - "loss": 0.7451, - "step": 308 - }, - { - "epoch": 1.6702702702702703, - "grad_norm": 2.6239655017852783, - "learning_rate": 4.665770728476127e-06, - "loss": 0.6464, - "step": 309 - }, - { - "epoch": 1.6756756756756757, - "grad_norm": 2.9320099353790283, - "learning_rate": 4.663646990496939e-06, - "loss": 0.6669, - "step": 310 - }, - { - "epoch": 1.6810810810810812, - "grad_norm": 3.09713077545166, - "learning_rate": 4.661517013121189e-06, - "loss": 0.8972, - "step": 311 - }, - { - "epoch": 1.6864864864864866, - "grad_norm": 3.6576132774353027, - "learning_rate": 4.659380802491181e-06, - "loss": 0.6286, - "step": 312 - }, - { - "epoch": 1.691891891891892, - "grad_norm": 2.9320433139801025, - "learning_rate": 4.6572383647671915e-06, - "loss": 0.3631, - "step": 313 - }, - { - "epoch": 1.6972972972972973, - "grad_norm": 3.399357557296753, - "learning_rate": 4.655089706127457e-06, - "loss": 0.5682, - "step": 314 - }, - { - "epoch": 1.7027027027027026, - "grad_norm": 2.7667412757873535, - "learning_rate": 4.652934832768148e-06, - "loss": 0.5457, - "step": 315 - }, - { - "epoch": 1.708108108108108, - "grad_norm": 2.3023321628570557, - "learning_rate": 4.650773750903363e-06, - "loss": 0.6601, - "step": 316 - }, - { - "epoch": 1.7135135135135136, - "grad_norm": 2.6584670543670654, - "learning_rate": 4.6486064667651005e-06, - "loss": 0.5882, - "step": 317 - }, - { - "epoch": 1.718918918918919, - "grad_norm": 5.528168678283691, - "learning_rate": 4.646432986603245e-06, - "loss": 0.7628, - "step": 318 - }, - { - "epoch": 1.7243243243243245, - "grad_norm": 3.054884195327759, - "learning_rate": 4.644253316685552e-06, - "loss": 0.6877, - "step": 319 - }, - { - "epoch": 1.7297297297297298, - "grad_norm": 3.2672388553619385, - "learning_rate": 4.6420674632976205e-06, - "loss": 0.7026, - "step": 320 - }, - { - "epoch": 1.7351351351351352, - "grad_norm": 3.109384536743164, - "learning_rate": 4.639875432742886e-06, - "loss": 0.5236, - "step": 321 - }, - { - "epoch": 1.7405405405405405, - "grad_norm": 3.3593883514404297, - "learning_rate": 4.6376772313425975e-06, - "loss": 0.6463, - "step": 322 - }, - { - "epoch": 1.7459459459459459, - "grad_norm": 2.6352698802948, - "learning_rate": 4.635472865435795e-06, - "loss": 0.6903, - "step": 323 - }, - { - "epoch": 1.7513513513513512, - "grad_norm": 2.751690149307251, - "learning_rate": 4.6332623413792995e-06, - "loss": 0.7342, - "step": 324 - }, - { - "epoch": 1.7567567567567568, - "grad_norm": 2.670915126800537, - "learning_rate": 4.6310456655476874e-06, - "loss": 0.4302, - "step": 325 - }, - { - "epoch": 1.7621621621621621, - "grad_norm": 2.7648138999938965, - "learning_rate": 4.6288228443332786e-06, - "loss": 0.5108, - "step": 326 - }, - { - "epoch": 1.7675675675675677, - "grad_norm": 2.7451536655426025, - "learning_rate": 4.626593884146111e-06, - "loss": 0.7646, - "step": 327 - }, - { - "epoch": 1.772972972972973, - "grad_norm": 2.4656403064727783, - "learning_rate": 4.624358791413928e-06, - "loss": 0.5529, - "step": 328 - }, - { - "epoch": 1.7783783783783784, - "grad_norm": 2.5987517833709717, - "learning_rate": 4.622117572582159e-06, - "loss": 0.609, - "step": 329 - }, - { - "epoch": 1.7837837837837838, - "grad_norm": 3.3843371868133545, - "learning_rate": 4.619870234113894e-06, - "loss": 0.9146, - "step": 330 - }, - { - "epoch": 1.7891891891891891, - "grad_norm": 2.3542068004608154, - "learning_rate": 4.617616782489878e-06, - "loss": 0.6887, - "step": 331 - }, - { - "epoch": 1.7945945945945945, - "grad_norm": 2.2049715518951416, - "learning_rate": 4.615357224208477e-06, - "loss": 0.505, - "step": 332 - }, - { - "epoch": 1.8, - "grad_norm": 2.453920364379883, - "learning_rate": 4.613091565785674e-06, - "loss": 0.8384, - "step": 333 - }, - { - "epoch": 1.8054054054054054, - "grad_norm": 2.5751583576202393, - "learning_rate": 4.610819813755038e-06, - "loss": 0.5512, - "step": 334 - }, - { - "epoch": 1.810810810810811, - "grad_norm": 2.524075984954834, - "learning_rate": 4.608541974667714e-06, - "loss": 0.4877, - "step": 335 - }, - { - "epoch": 1.8162162162162163, - "grad_norm": 2.2856955528259277, - "learning_rate": 4.606258055092397e-06, - "loss": 0.5583, - "step": 336 - }, - { - "epoch": 1.8216216216216217, - "grad_norm": 2.2773683071136475, - "learning_rate": 4.603968061615321e-06, - "loss": 0.5421, - "step": 337 - }, - { - "epoch": 1.827027027027027, - "grad_norm": 4.085512161254883, - "learning_rate": 4.601672000840231e-06, - "loss": 0.942, - "step": 338 - }, - { - "epoch": 1.8324324324324324, - "grad_norm": 2.3710968494415283, - "learning_rate": 4.5993698793883715e-06, - "loss": 0.3773, - "step": 339 - }, - { - "epoch": 1.8378378378378377, - "grad_norm": 2.745534658432007, - "learning_rate": 4.597061703898462e-06, - "loss": 0.9694, - "step": 340 - }, - { - "epoch": 1.8432432432432433, - "grad_norm": 2.463207244873047, - "learning_rate": 4.594747481026685e-06, - "loss": 0.4667, - "step": 341 - }, - { - "epoch": 1.8486486486486486, - "grad_norm": 2.7216601371765137, - "learning_rate": 4.592427217446656e-06, - "loss": 0.4267, - "step": 342 - }, - { - "epoch": 1.8540540540540542, - "grad_norm": 2.545664072036743, - "learning_rate": 4.590100919849413e-06, - "loss": 0.9245, - "step": 343 - }, - { - "epoch": 1.8594594594594596, - "grad_norm": 3.692840337753296, - "learning_rate": 4.587768594943396e-06, - "loss": 0.7502, - "step": 344 - }, - { - "epoch": 1.864864864864865, - "grad_norm": 2.993229627609253, - "learning_rate": 4.585430249454426e-06, - "loss": 0.4689, - "step": 345 - }, - { - "epoch": 1.8702702702702703, - "grad_norm": 2.162867546081543, - "learning_rate": 4.583085890125682e-06, - "loss": 0.6188, - "step": 346 - }, - { - "epoch": 1.8756756756756756, - "grad_norm": 2.2169792652130127, - "learning_rate": 4.5807355237176896e-06, - "loss": 0.6352, - "step": 347 - }, - { - "epoch": 1.881081081081081, - "grad_norm": 3.978985548019409, - "learning_rate": 4.578379157008296e-06, - "loss": 0.464, - "step": 348 - }, - { - "epoch": 1.8864864864864865, - "grad_norm": 2.236682653427124, - "learning_rate": 4.57601679679265e-06, - "loss": 0.5943, - "step": 349 - }, - { - "epoch": 1.8918918918918919, - "grad_norm": 2.528754472732544, - "learning_rate": 4.573648449883188e-06, - "loss": 0.6949, - "step": 350 - }, - { - "epoch": 1.8972972972972975, - "grad_norm": 2.7673721313476562, - "learning_rate": 4.571274123109606e-06, - "loss": 0.4333, - "step": 351 - }, - { - "epoch": 1.9027027027027028, - "grad_norm": 2.698012351989746, - "learning_rate": 4.568893823318847e-06, - "loss": 0.6796, - "step": 352 - }, - { - "epoch": 1.9081081081081082, - "grad_norm": 2.9640560150146484, - "learning_rate": 4.566507557375077e-06, - "loss": 0.6139, - "step": 353 - }, - { - "epoch": 1.9135135135135135, - "grad_norm": 2.417628526687622, - "learning_rate": 4.5641153321596684e-06, - "loss": 0.4515, - "step": 354 - }, - { - "epoch": 1.9189189189189189, - "grad_norm": 2.676739454269409, - "learning_rate": 4.56171715457118e-06, - "loss": 0.8426, - "step": 355 - }, - { - "epoch": 1.9243243243243242, - "grad_norm": 2.8428189754486084, - "learning_rate": 4.559313031525331e-06, - "loss": 0.5806, - "step": 356 - }, - { - "epoch": 1.9297297297297298, - "grad_norm": 2.6817944049835205, - "learning_rate": 4.55690296995499e-06, - "loss": 0.5927, - "step": 357 - }, - { - "epoch": 1.9351351351351351, - "grad_norm": 3.5939931869506836, - "learning_rate": 4.554486976810149e-06, - "loss": 0.9986, - "step": 358 - }, - { - "epoch": 1.9405405405405407, - "grad_norm": 2.86688494682312, - "learning_rate": 4.552065059057906e-06, - "loss": 0.6813, - "step": 359 - }, - { - "epoch": 1.945945945945946, - "grad_norm": 2.9295246601104736, - "learning_rate": 4.549637223682441e-06, - "loss": 1.0832, - "step": 360 - }, - { - "epoch": 1.9513513513513514, - "grad_norm": 2.6939451694488525, - "learning_rate": 4.547203477685005e-06, - "loss": 0.7377, - "step": 361 - }, - { - "epoch": 1.9567567567567568, - "grad_norm": 2.226055145263672, - "learning_rate": 4.544763828083888e-06, - "loss": 0.5412, - "step": 362 - }, - { - "epoch": 1.962162162162162, - "grad_norm": 2.490187406539917, - "learning_rate": 4.542318281914405e-06, - "loss": 0.6955, - "step": 363 - }, - { - "epoch": 1.9675675675675675, - "grad_norm": 2.9241302013397217, - "learning_rate": 4.53986684622888e-06, - "loss": 0.6774, - "step": 364 - }, - { - "epoch": 1.972972972972973, - "grad_norm": 2.988084554672241, - "learning_rate": 4.537409528096615e-06, - "loss": 0.5832, - "step": 365 - }, - { - "epoch": 1.9783783783783784, - "grad_norm": 2.9380626678466797, - "learning_rate": 4.534946334603879e-06, - "loss": 0.606, - "step": 366 - }, - { - "epoch": 1.983783783783784, - "grad_norm": 2.667588710784912, - "learning_rate": 4.532477272853882e-06, - "loss": 0.4991, - "step": 367 - }, - { - "epoch": 1.9891891891891893, - "grad_norm": 2.9711899757385254, - "learning_rate": 4.530002349966759e-06, - "loss": 0.4442, - "step": 368 - }, - { - "epoch": 1.9945945945945946, - "grad_norm": 3.443957805633545, - "learning_rate": 4.5275215730795445e-06, - "loss": 0.6566, - "step": 369 - }, - { - "epoch": 2.0, - "grad_norm": 3.590317487716675, - "learning_rate": 4.525034949346156e-06, - "loss": 0.5687, - "step": 370 - }, - { - "epoch": 2.0054054054054054, - "grad_norm": 3.678600549697876, - "learning_rate": 4.522542485937369e-06, - "loss": 0.4458, - "step": 371 - }, - { - "epoch": 2.0108108108108107, - "grad_norm": 3.803563356399536, - "learning_rate": 4.5200441900408045e-06, - "loss": 0.4418, - "step": 372 - }, - { - "epoch": 2.016216216216216, - "grad_norm": 2.9187233448028564, - "learning_rate": 4.517540068860898e-06, - "loss": 0.7057, - "step": 373 - }, - { - "epoch": 2.0216216216216214, - "grad_norm": 2.693603515625, - "learning_rate": 4.515030129618884e-06, - "loss": 0.4491, - "step": 374 - }, - { - "epoch": 2.027027027027027, - "grad_norm": 2.3883047103881836, - "learning_rate": 4.512514379552779e-06, - "loss": 0.3571, - "step": 375 - }, - { - "epoch": 2.0324324324324325, - "grad_norm": 4.558557033538818, - "learning_rate": 4.509992825917352e-06, - "loss": 0.5056, - "step": 376 - }, - { - "epoch": 2.037837837837838, - "grad_norm": 3.9574761390686035, - "learning_rate": 4.507465475984109e-06, - "loss": 0.6834, - "step": 377 - }, - { - "epoch": 2.0432432432432432, - "grad_norm": 5.34630012512207, - "learning_rate": 4.504932337041272e-06, - "loss": 0.6726, - "step": 378 - }, - { - "epoch": 2.0486486486486486, - "grad_norm": 3.198740243911743, - "learning_rate": 4.502393416393757e-06, - "loss": 0.4032, - "step": 379 - }, - { - "epoch": 2.054054054054054, - "grad_norm": 3.347480297088623, - "learning_rate": 4.4998487213631515e-06, - "loss": 0.5442, - "step": 380 - }, - { - "epoch": 2.0594594594594593, - "grad_norm": 3.940531015396118, - "learning_rate": 4.497298259287696e-06, - "loss": 0.6181, - "step": 381 - }, - { - "epoch": 2.064864864864865, - "grad_norm": 3.0910496711730957, - "learning_rate": 4.494742037522261e-06, - "loss": 0.3829, - "step": 382 - }, - { - "epoch": 2.0702702702702704, - "grad_norm": 4.060451984405518, - "learning_rate": 4.4921800634383295e-06, - "loss": 0.4953, - "step": 383 - }, - { - "epoch": 2.075675675675676, - "grad_norm": 3.1667511463165283, - "learning_rate": 4.4896123444239655e-06, - "loss": 0.3254, - "step": 384 - }, - { - "epoch": 2.081081081081081, - "grad_norm": 3.0239670276641846, - "learning_rate": 4.487038887883809e-06, - "loss": 0.555, - "step": 385 - }, - { - "epoch": 2.0864864864864865, - "grad_norm": 2.8815383911132812, - "learning_rate": 4.484459701239038e-06, - "loss": 0.665, - "step": 386 - }, - { - "epoch": 2.091891891891892, - "grad_norm": 3.615537166595459, - "learning_rate": 4.481874791927358e-06, - "loss": 0.2652, - "step": 387 - }, - { - "epoch": 2.097297297297297, - "grad_norm": 3.407407283782959, - "learning_rate": 4.479284167402977e-06, - "loss": 0.3811, - "step": 388 - }, - { - "epoch": 2.1027027027027025, - "grad_norm": 2.6651623249053955, - "learning_rate": 4.476687835136585e-06, - "loss": 0.2463, - "step": 389 - }, - { - "epoch": 2.108108108108108, - "grad_norm": 3.5145862102508545, - "learning_rate": 4.47408580261533e-06, - "loss": 0.5507, - "step": 390 - }, - { - "epoch": 2.1135135135135137, - "grad_norm": 3.0952725410461426, - "learning_rate": 4.471478077342798e-06, - "loss": 0.288, - "step": 391 - }, - { - "epoch": 2.118918918918919, - "grad_norm": 2.634775400161743, - "learning_rate": 4.468864666838994e-06, - "loss": 0.5169, - "step": 392 - }, - { - "epoch": 2.1243243243243244, - "grad_norm": 3.7388594150543213, - "learning_rate": 4.4662455786403125e-06, - "loss": 0.3327, - "step": 393 - }, - { - "epoch": 2.1297297297297297, - "grad_norm": 3.8197360038757324, - "learning_rate": 4.463620820299528e-06, - "loss": 0.3877, - "step": 394 - }, - { - "epoch": 2.135135135135135, - "grad_norm": 3.0073485374450684, - "learning_rate": 4.4609903993857606e-06, - "loss": 0.5425, - "step": 395 - }, - { - "epoch": 2.1405405405405404, - "grad_norm": 2.6923868656158447, - "learning_rate": 4.458354323484462e-06, - "loss": 0.5257, - "step": 396 - }, - { - "epoch": 2.145945945945946, - "grad_norm": 3.2151331901550293, - "learning_rate": 4.45571260019739e-06, - "loss": 0.3914, - "step": 397 - }, - { - "epoch": 2.1513513513513516, - "grad_norm": 3.4031248092651367, - "learning_rate": 4.453065237142592e-06, - "loss": 0.3455, - "step": 398 - }, - { - "epoch": 2.156756756756757, - "grad_norm": 3.012275457382202, - "learning_rate": 4.4504122419543745e-06, - "loss": 0.4652, - "step": 399 - }, - { - "epoch": 2.1621621621621623, - "grad_norm": 3.3084208965301514, - "learning_rate": 4.4477536222832865e-06, - "loss": 0.6343, - "step": 400 - }, - { - "epoch": 2.1675675675675676, - "grad_norm": 3.115206241607666, - "learning_rate": 4.445089385796099e-06, - "loss": 0.6975, - "step": 401 - }, - { - "epoch": 2.172972972972973, - "grad_norm": 2.893930435180664, - "learning_rate": 4.442419540175778e-06, - "loss": 0.5779, - "step": 402 - }, - { - "epoch": 2.1783783783783783, - "grad_norm": 3.0549168586730957, - "learning_rate": 4.439744093121465e-06, - "loss": 0.4541, - "step": 403 - }, - { - "epoch": 2.1837837837837837, - "grad_norm": 3.1189024448394775, - "learning_rate": 4.437063052348457e-06, - "loss": 0.4078, - "step": 404 - }, - { - "epoch": 2.189189189189189, - "grad_norm": 6.644659042358398, - "learning_rate": 4.434376425588179e-06, - "loss": 0.6759, - "step": 405 - }, - { - "epoch": 2.1945945945945944, - "grad_norm": 2.807554006576538, - "learning_rate": 4.431684220588163e-06, - "loss": 0.2938, - "step": 406 - }, - { - "epoch": 2.2, - "grad_norm": 3.6900999546051025, - "learning_rate": 4.428986445112034e-06, - "loss": 0.676, - "step": 407 - }, - { - "epoch": 2.2054054054054055, - "grad_norm": 2.0721664428710938, - "learning_rate": 4.426283106939474e-06, - "loss": 0.1859, - "step": 408 - }, - { - "epoch": 2.210810810810811, - "grad_norm": 2.953388214111328, - "learning_rate": 4.423574213866209e-06, - "loss": 0.2955, - "step": 409 - }, - { - "epoch": 2.2162162162162162, - "grad_norm": 3.049050807952881, - "learning_rate": 4.420859773703985e-06, - "loss": 0.2262, - "step": 410 - }, - { - "epoch": 2.2216216216216216, - "grad_norm": 3.319796323776245, - "learning_rate": 4.418139794280542e-06, - "loss": 0.2273, - "step": 411 - }, - { - "epoch": 2.227027027027027, - "grad_norm": 2.4133522510528564, - "learning_rate": 4.415414283439595e-06, - "loss": 0.3282, - "step": 412 - }, - { - "epoch": 2.2324324324324323, - "grad_norm": 2.9842193126678467, - "learning_rate": 4.4126832490408116e-06, - "loss": 0.3651, - "step": 413 - }, - { - "epoch": 2.237837837837838, - "grad_norm": 2.759531259536743, - "learning_rate": 4.409946698959784e-06, - "loss": 0.4052, - "step": 414 - }, - { - "epoch": 2.2432432432432434, - "grad_norm": 3.045485019683838, - "learning_rate": 4.4072046410880145e-06, - "loss": 0.4638, - "step": 415 - }, - { - "epoch": 2.2486486486486488, - "grad_norm": 3.0058295726776123, - "learning_rate": 4.404457083332887e-06, - "loss": 0.517, - "step": 416 - }, - { - "epoch": 2.254054054054054, - "grad_norm": 3.025688409805298, - "learning_rate": 4.401704033617643e-06, - "loss": 0.6902, - "step": 417 - }, - { - "epoch": 2.2594594594594595, - "grad_norm": 3.3047802448272705, - "learning_rate": 4.398945499881366e-06, - "loss": 0.3552, - "step": 418 - }, - { - "epoch": 2.264864864864865, - "grad_norm": 3.0683655738830566, - "learning_rate": 4.396181490078949e-06, - "loss": 0.286, - "step": 419 - }, - { - "epoch": 2.27027027027027, - "grad_norm": 3.627681016921997, - "learning_rate": 4.393412012181082e-06, - "loss": 0.4036, - "step": 420 - }, - { - "epoch": 2.2756756756756755, - "grad_norm": 4.552238941192627, - "learning_rate": 4.390637074174219e-06, - "loss": 0.8037, - "step": 421 - }, - { - "epoch": 2.281081081081081, - "grad_norm": 2.8688855171203613, - "learning_rate": 4.387856684060561e-06, - "loss": 0.2553, - "step": 422 - }, - { - "epoch": 2.2864864864864867, - "grad_norm": 4.21850061416626, - "learning_rate": 4.385070849858033e-06, - "loss": 0.6222, - "step": 423 - }, - { - "epoch": 2.291891891891892, - "grad_norm": 3.038433790206909, - "learning_rate": 4.382279579600257e-06, - "loss": 0.5326, - "step": 424 - }, - { - "epoch": 2.2972972972972974, - "grad_norm": 3.297300338745117, - "learning_rate": 4.379482881336532e-06, - "loss": 0.5515, - "step": 425 - }, - { - "epoch": 2.3027027027027027, - "grad_norm": 7.162952423095703, - "learning_rate": 4.376680763131811e-06, - "loss": 0.6948, - "step": 426 - }, - { - "epoch": 2.308108108108108, - "grad_norm": 3.2403595447540283, - "learning_rate": 4.373873233066676e-06, - "loss": 0.2947, - "step": 427 - }, - { - "epoch": 2.3135135135135134, - "grad_norm": 3.2969906330108643, - "learning_rate": 4.371060299237315e-06, - "loss": 0.2261, - "step": 428 - }, - { - "epoch": 2.3189189189189188, - "grad_norm": 2.669058322906494, - "learning_rate": 4.368241969755499e-06, - "loss": 0.5398, - "step": 429 - }, - { - "epoch": 2.3243243243243246, - "grad_norm": 2.7643518447875977, - "learning_rate": 4.36541825274856e-06, - "loss": 0.3301, - "step": 430 - }, - { - "epoch": 2.32972972972973, - "grad_norm": 3.6037657260894775, - "learning_rate": 4.3625891563593635e-06, - "loss": 0.6064, - "step": 431 - }, - { - "epoch": 2.3351351351351353, - "grad_norm": 2.8805618286132812, - "learning_rate": 4.35975468874629e-06, - "loss": 0.3897, - "step": 432 - }, - { - "epoch": 2.3405405405405406, - "grad_norm": 2.642402172088623, - "learning_rate": 4.356914858083211e-06, - "loss": 0.271, - "step": 433 - }, - { - "epoch": 2.345945945945946, - "grad_norm": 2.916337490081787, - "learning_rate": 4.354069672559458e-06, - "loss": 0.3681, - "step": 434 - }, - { - "epoch": 2.3513513513513513, - "grad_norm": 3.3312325477600098, - "learning_rate": 4.35121914037981e-06, - "loss": 0.298, - "step": 435 - }, - { - "epoch": 2.3567567567567567, - "grad_norm": 2.980583906173706, - "learning_rate": 4.348363269764462e-06, - "loss": 0.3618, - "step": 436 - }, - { - "epoch": 2.362162162162162, - "grad_norm": 3.5010197162628174, - "learning_rate": 4.345502068949003e-06, - "loss": 0.8972, - "step": 437 - }, - { - "epoch": 2.3675675675675674, - "grad_norm": 2.7187814712524414, - "learning_rate": 4.342635546184394e-06, - "loss": 0.3939, - "step": 438 - }, - { - "epoch": 2.372972972972973, - "grad_norm": 2.8368170261383057, - "learning_rate": 4.339763709736944e-06, - "loss": 0.5462, - "step": 439 - }, - { - "epoch": 2.3783783783783785, - "grad_norm": 2.6989636421203613, - "learning_rate": 4.336886567888283e-06, - "loss": 0.5932, - "step": 440 - }, - { - "epoch": 2.383783783783784, - "grad_norm": 3.2514829635620117, - "learning_rate": 4.334004128935342e-06, - "loss": 0.4622, - "step": 441 - }, - { - "epoch": 2.389189189189189, - "grad_norm": 5.242766857147217, - "learning_rate": 4.331116401190327e-06, - "loss": 0.5997, - "step": 442 - }, - { - "epoch": 2.3945945945945946, - "grad_norm": 3.492724657058716, - "learning_rate": 4.328223392980696e-06, - "loss": 0.3072, - "step": 443 - }, - { - "epoch": 2.4, - "grad_norm": 4.074132442474365, - "learning_rate": 4.325325112649134e-06, - "loss": 0.5338, - "step": 444 - }, - { - "epoch": 2.4054054054054053, - "grad_norm": 2.7208468914031982, - "learning_rate": 4.322421568553529e-06, - "loss": 0.3266, - "step": 445 - }, - { - "epoch": 2.410810810810811, - "grad_norm": 2.929180383682251, - "learning_rate": 4.3195127690669494e-06, - "loss": 0.4064, - "step": 446 - }, - { - "epoch": 2.4162162162162164, - "grad_norm": 2.848353624343872, - "learning_rate": 4.3165987225776186e-06, - "loss": 0.3856, - "step": 447 - }, - { - "epoch": 2.4216216216216218, - "grad_norm": 3.946488618850708, - "learning_rate": 4.313679437488889e-06, - "loss": 0.4261, - "step": 448 - }, - { - "epoch": 2.427027027027027, - "grad_norm": 5.781888961791992, - "learning_rate": 4.310754922219223e-06, - "loss": 0.4943, - "step": 449 - }, - { - "epoch": 2.4324324324324325, - "grad_norm": 2.8406941890716553, - "learning_rate": 4.307825185202164e-06, - "loss": 0.2874, - "step": 450 - }, - { - "epoch": 2.437837837837838, - "grad_norm": 3.2017335891723633, - "learning_rate": 4.3048902348863116e-06, - "loss": 0.4218, - "step": 451 - }, - { - "epoch": 2.443243243243243, - "grad_norm": 3.8355906009674072, - "learning_rate": 4.301950079735303e-06, - "loss": 0.4204, - "step": 452 - }, - { - "epoch": 2.4486486486486485, - "grad_norm": 4.783357620239258, - "learning_rate": 4.299004728227782e-06, - "loss": 0.5593, - "step": 453 - }, - { - "epoch": 2.454054054054054, - "grad_norm": 3.014080762863159, - "learning_rate": 4.2960541888573774e-06, - "loss": 0.4187, - "step": 454 - }, - { - "epoch": 2.4594594594594597, - "grad_norm": 3.5906598567962646, - "learning_rate": 4.29309847013268e-06, - "loss": 0.4193, - "step": 455 - }, - { - "epoch": 2.464864864864865, - "grad_norm": 3.9043331146240234, - "learning_rate": 4.290137580577216e-06, - "loss": 0.7035, - "step": 456 - }, - { - "epoch": 2.4702702702702704, - "grad_norm": 3.139753580093384, - "learning_rate": 4.287171528729423e-06, - "loss": 0.5877, - "step": 457 - }, - { - "epoch": 2.4756756756756757, - "grad_norm": 2.9091074466705322, - "learning_rate": 4.284200323142623e-06, - "loss": 0.5309, - "step": 458 - }, - { - "epoch": 2.481081081081081, - "grad_norm": 3.1253795623779297, - "learning_rate": 4.281223972385004e-06, - "loss": 0.448, - "step": 459 - }, - { - "epoch": 2.4864864864864864, - "grad_norm": 2.65510892868042, - "learning_rate": 4.27824248503959e-06, - "loss": 0.4453, - "step": 460 - }, - { - "epoch": 2.4918918918918918, - "grad_norm": 3.2135510444641113, - "learning_rate": 4.275255869704214e-06, - "loss": 0.5582, - "step": 461 - }, - { - "epoch": 2.4972972972972975, - "grad_norm": 2.452545404434204, - "learning_rate": 4.272264134991503e-06, - "loss": 0.423, - "step": 462 - }, - { - "epoch": 2.5027027027027025, - "grad_norm": 2.6370208263397217, - "learning_rate": 4.269267289528843e-06, - "loss": 0.271, - "step": 463 - }, - { - "epoch": 2.5081081081081082, - "grad_norm": 3.31266450881958, - "learning_rate": 4.266265341958356e-06, - "loss": 0.6459, - "step": 464 - }, - { - "epoch": 2.5135135135135136, - "grad_norm": 3.2743148803710938, - "learning_rate": 4.263258300936882e-06, - "loss": 0.2959, - "step": 465 - }, - { - "epoch": 2.518918918918919, - "grad_norm": 2.883549690246582, - "learning_rate": 4.260246175135948e-06, - "loss": 0.3418, - "step": 466 - }, - { - "epoch": 2.5243243243243243, - "grad_norm": 2.7019498348236084, - "learning_rate": 4.257228973241742e-06, - "loss": 0.3459, - "step": 467 - }, - { - "epoch": 2.5297297297297296, - "grad_norm": 3.8166959285736084, - "learning_rate": 4.254206703955092e-06, - "loss": 0.4769, - "step": 468 - }, - { - "epoch": 2.535135135135135, - "grad_norm": 3.264763593673706, - "learning_rate": 4.251179375991438e-06, - "loss": 0.6487, - "step": 469 - }, - { - "epoch": 2.5405405405405403, - "grad_norm": 2.7936933040618896, - "learning_rate": 4.248146998080808e-06, - "loss": 0.5547, - "step": 470 - }, - { - "epoch": 2.545945945945946, - "grad_norm": 3.21852707862854, - "learning_rate": 4.2451095789677945e-06, - "loss": 0.2965, - "step": 471 - }, - { - "epoch": 2.5513513513513515, - "grad_norm": 3.4528985023498535, - "learning_rate": 4.242067127411525e-06, - "loss": 0.3831, - "step": 472 - }, - { - "epoch": 2.556756756756757, - "grad_norm": 4.317023754119873, - "learning_rate": 4.239019652185642e-06, - "loss": 0.1756, - "step": 473 - }, - { - "epoch": 2.562162162162162, - "grad_norm": 3.677452325820923, - "learning_rate": 4.2359671620782725e-06, - "loss": 0.5136, - "step": 474 - }, - { - "epoch": 2.5675675675675675, - "grad_norm": 3.7563393115997314, - "learning_rate": 4.232909665892005e-06, - "loss": 0.6554, - "step": 475 - }, - { - "epoch": 2.572972972972973, - "grad_norm": 3.5125508308410645, - "learning_rate": 4.229847172443866e-06, - "loss": 0.3804, - "step": 476 - }, - { - "epoch": 2.5783783783783782, - "grad_norm": 2.8835806846618652, - "learning_rate": 4.2267796905652926e-06, - "loss": 0.3338, - "step": 477 - }, - { - "epoch": 2.583783783783784, - "grad_norm": 3.2136261463165283, - "learning_rate": 4.223707229102105e-06, - "loss": 0.6163, - "step": 478 - }, - { - "epoch": 2.589189189189189, - "grad_norm": 3.467475175857544, - "learning_rate": 4.220629796914487e-06, - "loss": 0.3005, - "step": 479 - }, - { - "epoch": 2.5945945945945947, - "grad_norm": 3.597490072250366, - "learning_rate": 4.217547402876954e-06, - "loss": 0.56, - "step": 480 - }, - { - "epoch": 2.6, - "grad_norm": 3.2377140522003174, - "learning_rate": 4.214460055878329e-06, - "loss": 0.4512, - "step": 481 - }, - { - "epoch": 2.6054054054054054, - "grad_norm": 2.577746868133545, - "learning_rate": 4.211367764821722e-06, - "loss": 0.3074, - "step": 482 - }, - { - "epoch": 2.610810810810811, - "grad_norm": 3.6584155559539795, - "learning_rate": 4.208270538624497e-06, - "loss": 0.6752, - "step": 483 - }, - { - "epoch": 2.616216216216216, - "grad_norm": 2.602778434753418, - "learning_rate": 4.205168386218251e-06, - "loss": 0.2347, - "step": 484 - }, - { - "epoch": 2.6216216216216215, - "grad_norm": 3.587503433227539, - "learning_rate": 4.2020613165487865e-06, - "loss": 0.5189, - "step": 485 - }, - { - "epoch": 2.627027027027027, - "grad_norm": 3.9341986179351807, - "learning_rate": 4.198949338576086e-06, - "loss": 0.7739, - "step": 486 - }, - { - "epoch": 2.6324324324324326, - "grad_norm": 2.9211957454681396, - "learning_rate": 4.1958324612742875e-06, - "loss": 0.3495, - "step": 487 - }, - { - "epoch": 2.637837837837838, - "grad_norm": 3.29193115234375, - "learning_rate": 4.1927106936316564e-06, - "loss": 0.2257, - "step": 488 - }, - { - "epoch": 2.6432432432432433, - "grad_norm": 3.3687057495117188, - "learning_rate": 4.189584044650559e-06, - "loss": 0.6708, - "step": 489 - }, - { - "epoch": 2.6486486486486487, - "grad_norm": 3.096428155899048, - "learning_rate": 4.186452523347441e-06, - "loss": 0.3126, - "step": 490 - }, - { - "epoch": 2.654054054054054, - "grad_norm": 3.0865559577941895, - "learning_rate": 4.183316138752799e-06, - "loss": 0.4219, - "step": 491 - }, - { - "epoch": 2.6594594594594594, - "grad_norm": 3.389827013015747, - "learning_rate": 4.180174899911149e-06, - "loss": 0.3937, - "step": 492 - }, - { - "epoch": 2.6648648648648647, - "grad_norm": 3.044360637664795, - "learning_rate": 4.177028815881012e-06, - "loss": 0.4098, - "step": 493 - }, - { - "epoch": 2.6702702702702705, - "grad_norm": 2.813094139099121, - "learning_rate": 4.173877895734875e-06, - "loss": 0.3597, - "step": 494 - }, - { - "epoch": 2.6756756756756754, - "grad_norm": 2.4037158489227295, - "learning_rate": 4.1707221485591764e-06, - "loss": 0.3284, - "step": 495 - }, - { - "epoch": 2.6810810810810812, - "grad_norm": 3.049436092376709, - "learning_rate": 4.167561583454272e-06, - "loss": 0.257, - "step": 496 - }, - { - "epoch": 2.6864864864864866, - "grad_norm": 3.458923816680908, - "learning_rate": 4.164396209534411e-06, - "loss": 0.1819, - "step": 497 - }, - { - "epoch": 2.691891891891892, - "grad_norm": 3.3084232807159424, - "learning_rate": 4.161226035927711e-06, - "loss": 0.7109, - "step": 498 - }, - { - "epoch": 2.6972972972972973, - "grad_norm": 3.034550189971924, - "learning_rate": 4.15805107177613e-06, - "loss": 0.6297, - "step": 499 - }, - { - "epoch": 2.7027027027027026, - "grad_norm": 3.5786449909210205, - "learning_rate": 4.15487132623544e-06, - "loss": 0.5195, - "step": 500 - }, - { - "epoch": 2.708108108108108, - "grad_norm": 3.4477646350860596, - "learning_rate": 4.151686808475204e-06, - "loss": 0.2528, - "step": 501 - }, - { - "epoch": 2.7135135135135133, - "grad_norm": 3.0256869792938232, - "learning_rate": 4.148497527678744e-06, - "loss": 0.5013, - "step": 502 - }, - { - "epoch": 2.718918918918919, - "grad_norm": 2.875121593475342, - "learning_rate": 4.145303493043118e-06, - "loss": 0.4109, - "step": 503 - }, - { - "epoch": 2.7243243243243245, - "grad_norm": 2.7204222679138184, - "learning_rate": 4.1421047137790935e-06, - "loss": 0.3197, - "step": 504 - }, - { - "epoch": 2.72972972972973, - "grad_norm": 3.350482702255249, - "learning_rate": 4.13890119911112e-06, - "loss": 0.6369, - "step": 505 - }, - { - "epoch": 2.735135135135135, - "grad_norm": 3.096774101257324, - "learning_rate": 4.135692958277303e-06, - "loss": 0.4581, - "step": 506 - }, - { - "epoch": 2.7405405405405405, - "grad_norm": 2.8896536827087402, - "learning_rate": 4.132480000529375e-06, - "loss": 0.6217, - "step": 507 - }, - { - "epoch": 2.745945945945946, - "grad_norm": 2.643932580947876, - "learning_rate": 4.129262335132676e-06, - "loss": 0.4951, - "step": 508 - }, - { - "epoch": 2.7513513513513512, - "grad_norm": 2.6077864170074463, - "learning_rate": 4.126039971366114e-06, - "loss": 0.2185, - "step": 509 - }, - { - "epoch": 2.756756756756757, - "grad_norm": 2.531507968902588, - "learning_rate": 4.122812918522154e-06, - "loss": 0.5428, - "step": 510 - }, - { - "epoch": 2.762162162162162, - "grad_norm": 4.125836372375488, - "learning_rate": 4.119581185906776e-06, - "loss": 0.5466, - "step": 511 - }, - { - "epoch": 2.7675675675675677, - "grad_norm": 2.9921016693115234, - "learning_rate": 4.1163447828394595e-06, - "loss": 0.3803, - "step": 512 - }, - { - "epoch": 2.772972972972973, - "grad_norm": 2.9517931938171387, - "learning_rate": 4.113103718653152e-06, - "loss": 0.2722, - "step": 513 - }, - { - "epoch": 2.7783783783783784, - "grad_norm": 2.8333382606506348, - "learning_rate": 4.10985800269424e-06, - "loss": 0.333, - "step": 514 - }, - { - "epoch": 2.7837837837837838, - "grad_norm": 2.94168758392334, - "learning_rate": 4.106607644322529e-06, - "loss": 0.2186, - "step": 515 - }, - { - "epoch": 2.789189189189189, - "grad_norm": 3.2743892669677734, - "learning_rate": 4.103352652911207e-06, - "loss": 0.6365, - "step": 516 - }, - { - "epoch": 2.7945945945945945, - "grad_norm": 4.692770004272461, - "learning_rate": 4.100093037846825e-06, - "loss": 0.7261, - "step": 517 - }, - { - "epoch": 2.8, - "grad_norm": 3.2157247066497803, - "learning_rate": 4.0968288085292675e-06, - "loss": 0.2767, - "step": 518 - }, - { - "epoch": 2.8054054054054056, - "grad_norm": 3.196887731552124, - "learning_rate": 4.093559974371725e-06, - "loss": 0.4743, - "step": 519 - }, - { - "epoch": 2.810810810810811, - "grad_norm": 2.406752586364746, - "learning_rate": 4.090286544800667e-06, - "loss": 0.3789, - "step": 520 - }, - { - "epoch": 2.8162162162162163, - "grad_norm": 3.1769447326660156, - "learning_rate": 4.087008529255815e-06, - "loss": 0.6252, - "step": 521 - }, - { - "epoch": 2.8216216216216217, - "grad_norm": 3.068370819091797, - "learning_rate": 4.083725937190115e-06, - "loss": 0.3467, - "step": 522 - }, - { - "epoch": 2.827027027027027, - "grad_norm": 3.2665855884552, - "learning_rate": 4.0804387780697114e-06, - "loss": 0.3857, - "step": 523 - }, - { - "epoch": 2.8324324324324324, - "grad_norm": 3.368759870529175, - "learning_rate": 4.077147061373918e-06, - "loss": 0.4679, - "step": 524 - }, - { - "epoch": 2.8378378378378377, - "grad_norm": 3.989163875579834, - "learning_rate": 4.073850796595192e-06, - "loss": 0.2439, - "step": 525 - }, - { - "epoch": 2.8432432432432435, - "grad_norm": 3.6244685649871826, - "learning_rate": 4.070549993239106e-06, - "loss": 0.435, - "step": 526 - }, - { - "epoch": 2.8486486486486484, - "grad_norm": 3.585151195526123, - "learning_rate": 4.06724466082432e-06, - "loss": 0.5022, - "step": 527 - }, - { - "epoch": 2.854054054054054, - "grad_norm": 3.2420976161956787, - "learning_rate": 4.063934808882555e-06, - "loss": 0.4282, - "step": 528 - }, - { - "epoch": 2.8594594594594596, - "grad_norm": 3.1674294471740723, - "learning_rate": 4.0606204469585656e-06, - "loss": 0.3436, - "step": 529 - }, - { - "epoch": 2.864864864864865, - "grad_norm": 2.6856706142425537, - "learning_rate": 4.057301584610112e-06, - "loss": 0.3889, - "step": 530 - }, - { - "epoch": 2.8702702702702703, - "grad_norm": 3.0438942909240723, - "learning_rate": 4.053978231407931e-06, - "loss": 0.4828, - "step": 531 - }, - { - "epoch": 2.8756756756756756, - "grad_norm": 3.3561246395111084, - "learning_rate": 4.0506503969357115e-06, - "loss": 0.5814, - "step": 532 - }, - { - "epoch": 2.881081081081081, - "grad_norm": 2.5318350791931152, - "learning_rate": 4.047318090790065e-06, - "loss": 0.4768, - "step": 533 - }, - { - "epoch": 2.8864864864864863, - "grad_norm": 2.587224006652832, - "learning_rate": 4.043981322580498e-06, - "loss": 0.4262, - "step": 534 - }, - { - "epoch": 2.891891891891892, - "grad_norm": 2.73926043510437, - "learning_rate": 4.040640101929384e-06, - "loss": 0.421, - "step": 535 - }, - { - "epoch": 2.8972972972972975, - "grad_norm": 3.53908371925354, - "learning_rate": 4.037294438471936e-06, - "loss": 0.4019, - "step": 536 - }, - { - "epoch": 2.902702702702703, - "grad_norm": 3.0980448722839355, - "learning_rate": 4.033944341856181e-06, - "loss": 0.4322, - "step": 537 - }, - { - "epoch": 2.908108108108108, - "grad_norm": 2.9265666007995605, - "learning_rate": 4.030589821742926e-06, - "loss": 0.3841, - "step": 538 - }, - { - "epoch": 2.9135135135135135, - "grad_norm": 3.4082043170928955, - "learning_rate": 4.0272308878057385e-06, - "loss": 0.7083, - "step": 539 - }, - { - "epoch": 2.918918918918919, - "grad_norm": 3.297515630722046, - "learning_rate": 4.023867549730912e-06, - "loss": 0.5688, - "step": 540 - }, - { - "epoch": 2.924324324324324, - "grad_norm": 3.0538225173950195, - "learning_rate": 4.020499817217441e-06, - "loss": 0.5979, - "step": 541 - }, - { - "epoch": 2.92972972972973, - "grad_norm": 3.1792757511138916, - "learning_rate": 4.017127699976992e-06, - "loss": 0.5034, - "step": 542 - }, - { - "epoch": 2.935135135135135, - "grad_norm": 3.1574482917785645, - "learning_rate": 4.013751207733877e-06, - "loss": 0.6656, - "step": 543 - }, - { - "epoch": 2.9405405405405407, - "grad_norm": 2.523123264312744, - "learning_rate": 4.010370350225023e-06, - "loss": 0.2789, - "step": 544 - }, - { - "epoch": 2.945945945945946, - "grad_norm": 3.1950793266296387, - "learning_rate": 4.006985137199945e-06, - "loss": 0.2163, - "step": 545 - }, - { - "epoch": 2.9513513513513514, - "grad_norm": 3.2089648246765137, - "learning_rate": 4.00359557842072e-06, - "loss": 0.4179, - "step": 546 - }, - { - "epoch": 2.9567567567567568, - "grad_norm": 3.852578639984131, - "learning_rate": 4.000201683661958e-06, - "loss": 0.4683, - "step": 547 - }, - { - "epoch": 2.962162162162162, - "grad_norm": 2.7612597942352295, - "learning_rate": 3.996803462710766e-06, - "loss": 0.3506, - "step": 548 - }, - { - "epoch": 2.9675675675675675, - "grad_norm": 4.811823844909668, - "learning_rate": 3.993400925366736e-06, - "loss": 0.6582, - "step": 549 - }, - { - "epoch": 2.972972972972973, - "grad_norm": 3.0135858058929443, - "learning_rate": 3.989994081441902e-06, - "loss": 0.504, - "step": 550 - }, - { - "epoch": 2.9783783783783786, - "grad_norm": 2.710277795791626, - "learning_rate": 3.986582940760717e-06, - "loss": 0.7362, - "step": 551 - }, - { - "epoch": 2.983783783783784, - "grad_norm": 3.175443649291992, - "learning_rate": 3.983167513160025e-06, - "loss": 0.4116, - "step": 552 - }, - { - "epoch": 2.9891891891891893, - "grad_norm": 3.101109743118286, - "learning_rate": 3.979747808489036e-06, - "loss": 0.2188, - "step": 553 - }, - { - "epoch": 2.9945945945945946, - "grad_norm": 3.2320079803466797, - "learning_rate": 3.976323836609289e-06, - "loss": 0.7558, - "step": 554 - }, - { - "epoch": 3.0, - "grad_norm": 3.6071934700012207, - "learning_rate": 3.9728956073946305e-06, - "loss": 0.6491, - "step": 555 - }, - { - "epoch": 3.0054054054054054, - "grad_norm": 3.1119353771209717, - "learning_rate": 3.969463130731183e-06, - "loss": 0.1625, - "step": 556 - }, - { - "epoch": 3.0108108108108107, - "grad_norm": 3.0440328121185303, - "learning_rate": 3.966026416517321e-06, - "loss": 0.311, - "step": 557 - }, - { - "epoch": 3.016216216216216, - "grad_norm": 4.069122791290283, - "learning_rate": 3.962585474663636e-06, - "loss": 0.5299, - "step": 558 - }, - { - "epoch": 3.0216216216216214, - "grad_norm": 2.878645896911621, - "learning_rate": 3.959140315092911e-06, - "loss": 0.2718, - "step": 559 - }, - { - "epoch": 3.027027027027027, - "grad_norm": 3.526695966720581, - "learning_rate": 3.955690947740092e-06, - "loss": 0.2954, - "step": 560 - }, - { - "epoch": 3.0324324324324325, - "grad_norm": 3.25087308883667, - "learning_rate": 3.95223738255226e-06, - "loss": 0.2388, - "step": 561 - }, - { - "epoch": 3.037837837837838, - "grad_norm": 3.5467700958251953, - "learning_rate": 3.9487796294886015e-06, - "loss": 0.2014, - "step": 562 - }, - { - "epoch": 3.0432432432432432, - "grad_norm": 4.397517681121826, - "learning_rate": 3.945317698520379e-06, - "loss": 0.2102, - "step": 563 - }, - { - "epoch": 3.0486486486486486, - "grad_norm": 3.7297182083129883, - "learning_rate": 3.941851599630903e-06, - "loss": 0.499, - "step": 564 - }, - { - "epoch": 3.054054054054054, - "grad_norm": 4.417158603668213, - "learning_rate": 3.938381342815503e-06, - "loss": 0.3392, - "step": 565 - }, - { - "epoch": 3.0594594594594593, - "grad_norm": 4.6037421226501465, - "learning_rate": 3.934906938081499e-06, - "loss": 0.1942, - "step": 566 - }, - { - "epoch": 3.064864864864865, - "grad_norm": 3.5600531101226807, - "learning_rate": 3.931428395448174e-06, - "loss": 0.1753, - "step": 567 - }, - { - "epoch": 3.0702702702702704, - "grad_norm": 2.868013381958008, - "learning_rate": 3.927945724946743e-06, - "loss": 0.2959, - "step": 568 - }, - { - "epoch": 3.075675675675676, - "grad_norm": 3.5543227195739746, - "learning_rate": 3.924458936620322e-06, - "loss": 0.4625, - "step": 569 - }, - { - "epoch": 3.081081081081081, - "grad_norm": 8.972922325134277, - "learning_rate": 3.920968040523904e-06, - "loss": 0.2571, - "step": 570 - }, - { - "epoch": 3.0864864864864865, - "grad_norm": 3.037388324737549, - "learning_rate": 3.917473046724329e-06, - "loss": 0.1438, - "step": 571 - }, - { - "epoch": 3.091891891891892, - "grad_norm": 3.3261702060699463, - "learning_rate": 3.9139739653002525e-06, - "loss": 0.3572, - "step": 572 - }, - { - "epoch": 3.097297297297297, - "grad_norm": 2.425293207168579, - "learning_rate": 3.910470806342117e-06, - "loss": 0.165, - "step": 573 - }, - { - "epoch": 3.1027027027027025, - "grad_norm": 3.5718603134155273, - "learning_rate": 3.9069635799521245e-06, - "loss": 0.3209, - "step": 574 - }, - { - "epoch": 3.108108108108108, - "grad_norm": 3.8211171627044678, - "learning_rate": 3.903452296244204e-06, - "loss": 0.1976, - "step": 575 - }, - { - "epoch": 3.1135135135135137, - "grad_norm": 5.944535255432129, - "learning_rate": 3.899936965343989e-06, - "loss": 0.6074, - "step": 576 - }, - { - "epoch": 3.118918918918919, - "grad_norm": 6.603860378265381, - "learning_rate": 3.89641759738878e-06, - "loss": 0.4051, - "step": 577 - }, - { - "epoch": 3.1243243243243244, - "grad_norm": 6.712981700897217, - "learning_rate": 3.892894202527523e-06, - "loss": 0.3787, - "step": 578 - }, - { - "epoch": 3.1297297297297297, - "grad_norm": 3.267186403274536, - "learning_rate": 3.8893667909207735e-06, - "loss": 0.0927, - "step": 579 - }, - { - "epoch": 3.135135135135135, - "grad_norm": 4.476837158203125, - "learning_rate": 3.88583537274067e-06, - "loss": 0.4706, - "step": 580 - }, - { - "epoch": 3.1405405405405404, - "grad_norm": 4.272335052490234, - "learning_rate": 3.8822999581709085e-06, - "loss": 0.3949, - "step": 581 - }, - { - "epoch": 3.145945945945946, - "grad_norm": 3.6685309410095215, - "learning_rate": 3.878760557406708e-06, - "loss": 0.1971, - "step": 582 - }, - { - "epoch": 3.1513513513513516, - "grad_norm": 3.9899449348449707, - "learning_rate": 3.875217180654779e-06, - "loss": 0.5156, - "step": 583 - }, - { - "epoch": 3.156756756756757, - "grad_norm": 3.866804361343384, - "learning_rate": 3.871669838133303e-06, - "loss": 0.3552, - "step": 584 - }, - { - "epoch": 3.1621621621621623, - "grad_norm": 3.565648317337036, - "learning_rate": 3.868118540071894e-06, - "loss": 0.4369, - "step": 585 - }, - { - "epoch": 3.1675675675675676, - "grad_norm": 3.5073986053466797, - "learning_rate": 3.8645632967115755e-06, - "loss": 0.3694, - "step": 586 - }, - { - "epoch": 3.172972972972973, - "grad_norm": 3.7636868953704834, - "learning_rate": 3.861004118304746e-06, - "loss": 0.3404, - "step": 587 - }, - { - "epoch": 3.1783783783783783, - "grad_norm": 2.940094232559204, - "learning_rate": 3.857441015115154e-06, - "loss": 0.3086, - "step": 588 - }, - { - "epoch": 3.1837837837837837, - "grad_norm": 3.727414608001709, - "learning_rate": 3.8538739974178635e-06, - "loss": 0.253, - "step": 589 - }, - { - "epoch": 3.189189189189189, - "grad_norm": 3.5140156745910645, - "learning_rate": 3.850303075499227e-06, - "loss": 0.2436, - "step": 590 - }, - { - "epoch": 3.1945945945945944, - "grad_norm": 3.545952558517456, - "learning_rate": 3.84672825965686e-06, - "loss": 0.328, - "step": 591 - }, - { - "epoch": 3.2, - "grad_norm": 3.534240484237671, - "learning_rate": 3.843149560199601e-06, - "loss": 0.2687, - "step": 592 - }, - { - "epoch": 3.2054054054054055, - "grad_norm": 2.8464927673339844, - "learning_rate": 3.839566987447492e-06, - "loss": 0.1417, - "step": 593 - }, - { - "epoch": 3.210810810810811, - "grad_norm": 4.138559818267822, - "learning_rate": 3.835980551731743e-06, - "loss": 0.2106, - "step": 594 - }, - { - "epoch": 3.2162162162162162, - "grad_norm": 2.917670249938965, - "learning_rate": 3.8323902633947045e-06, - "loss": 0.3154, - "step": 595 - }, - { - "epoch": 3.2216216216216216, - "grad_norm": 3.029660224914551, - "learning_rate": 3.828796132789835e-06, - "loss": 0.1218, - "step": 596 - }, - { - "epoch": 3.227027027027027, - "grad_norm": 3.2845771312713623, - "learning_rate": 3.825198170281677e-06, - "loss": 0.1336, - "step": 597 - }, - { - "epoch": 3.2324324324324323, - "grad_norm": 3.1375670433044434, - "learning_rate": 3.821596386245819e-06, - "loss": 0.2518, - "step": 598 - }, - { - "epoch": 3.237837837837838, - "grad_norm": 3.0021941661834717, - "learning_rate": 3.817990791068874e-06, - "loss": 0.2762, - "step": 599 - }, - { - "epoch": 3.2432432432432434, - "grad_norm": 4.141000747680664, - "learning_rate": 3.81438139514844e-06, - "loss": 0.2722, - "step": 600 - }, - { - "epoch": 3.2486486486486488, - "grad_norm": 3.9065279960632324, - "learning_rate": 3.8107682088930797e-06, - "loss": 0.3542, - "step": 601 - }, - { - "epoch": 3.254054054054054, - "grad_norm": 3.718417167663574, - "learning_rate": 3.807151242722286e-06, - "loss": 0.344, - "step": 602 - }, - { - "epoch": 3.2594594594594595, - "grad_norm": 4.013717174530029, - "learning_rate": 3.8035305070664484e-06, - "loss": 0.1625, - "step": 603 - }, - { - "epoch": 3.264864864864865, - "grad_norm": 3.348888397216797, - "learning_rate": 3.7999060123668318e-06, - "loss": 0.2925, - "step": 604 - }, - { - "epoch": 3.27027027027027, - "grad_norm": 3.496079206466675, - "learning_rate": 3.7962777690755364e-06, - "loss": 0.1523, - "step": 605 - }, - { - "epoch": 3.2756756756756755, - "grad_norm": 3.07607102394104, - "learning_rate": 3.792645787655476e-06, - "loss": 0.1674, - "step": 606 - }, - { - "epoch": 3.281081081081081, - "grad_norm": 3.4036154747009277, - "learning_rate": 3.7890100785803425e-06, - "loss": 0.2856, - "step": 607 - }, - { - "epoch": 3.2864864864864867, - "grad_norm": 6.092559337615967, - "learning_rate": 3.785370652334577e-06, - "loss": 0.1094, - "step": 608 - }, - { - "epoch": 3.291891891891892, - "grad_norm": 3.9322001934051514, - "learning_rate": 3.7817275194133403e-06, - "loss": 0.2611, - "step": 609 - }, - { - "epoch": 3.2972972972972974, - "grad_norm": 3.189563274383545, - "learning_rate": 3.778080690322483e-06, - "loss": 0.1315, - "step": 610 - }, - { - "epoch": 3.3027027027027027, - "grad_norm": 4.304934024810791, - "learning_rate": 3.774430175578514e-06, - "loss": 0.1686, - "step": 611 - }, - { - "epoch": 3.308108108108108, - "grad_norm": 2.9030067920684814, - "learning_rate": 3.7707759857085706e-06, - "loss": 0.4642, - "step": 612 - }, - { - "epoch": 3.3135135135135134, - "grad_norm": 3.7485930919647217, - "learning_rate": 3.7671181312503886e-06, - "loss": 0.1987, - "step": 613 - }, - { - "epoch": 3.3189189189189188, - "grad_norm": 3.4700896739959717, - "learning_rate": 3.763456622752271e-06, - "loss": 0.3307, - "step": 614 - }, - { - "epoch": 3.3243243243243246, - "grad_norm": 3.0079376697540283, - "learning_rate": 3.7597914707730583e-06, - "loss": 0.1731, - "step": 615 - }, - { - "epoch": 3.32972972972973, - "grad_norm": 3.155235767364502, - "learning_rate": 3.7561226858820984e-06, - "loss": 0.2003, - "step": 616 - }, - { - "epoch": 3.3351351351351353, - "grad_norm": 3.847895622253418, - "learning_rate": 3.7524502786592143e-06, - "loss": 0.4014, - "step": 617 - }, - { - "epoch": 3.3405405405405406, - "grad_norm": 2.7505502700805664, - "learning_rate": 3.7487742596946753e-06, - "loss": 0.205, - "step": 618 - }, - { - "epoch": 3.345945945945946, - "grad_norm": 3.654529571533203, - "learning_rate": 3.7450946395891674e-06, - "loss": 0.2932, - "step": 619 - }, - { - "epoch": 3.3513513513513513, - "grad_norm": 2.9763967990875244, - "learning_rate": 3.7414114289537593e-06, - "loss": 0.2748, - "step": 620 - }, - { - "epoch": 3.3567567567567567, - "grad_norm": 3.889683961868286, - "learning_rate": 3.7377246384098763e-06, - "loss": 0.3665, - "step": 621 - }, - { - "epoch": 3.362162162162162, - "grad_norm": 4.193166732788086, - "learning_rate": 3.7340342785892645e-06, - "loss": 0.3453, - "step": 622 - }, - { - "epoch": 3.3675675675675674, - "grad_norm": 3.4371488094329834, - "learning_rate": 3.7303403601339646e-06, - "loss": 0.473, - "step": 623 - }, - { - "epoch": 3.372972972972973, - "grad_norm": 3.6939027309417725, - "learning_rate": 3.726642893696279e-06, - "loss": 0.3017, - "step": 624 - }, - { - "epoch": 3.3783783783783785, - "grad_norm": 4.904304504394531, - "learning_rate": 3.7229418899387414e-06, - "loss": 0.4841, - "step": 625 - }, - { - "epoch": 3.383783783783784, - "grad_norm": 3.6373438835144043, - "learning_rate": 3.719237359534087e-06, - "loss": 0.3879, - "step": 626 - }, - { - "epoch": 3.389189189189189, - "grad_norm": 3.403676986694336, - "learning_rate": 3.71552931316522e-06, - "loss": 0.3876, - "step": 627 - }, - { - "epoch": 3.3945945945945946, - "grad_norm": 3.2292237281799316, - "learning_rate": 3.7118177615251834e-06, - "loss": 0.4491, - "step": 628 - }, - { - "epoch": 3.4, - "grad_norm": 3.317850351333618, - "learning_rate": 3.70810271531713e-06, - "loss": 0.3763, - "step": 629 - }, - { - "epoch": 3.4054054054054053, - "grad_norm": 3.664735794067383, - "learning_rate": 3.7043841852542884e-06, - "loss": 0.4171, - "step": 630 - }, - { - "epoch": 3.410810810810811, - "grad_norm": 3.781569242477417, - "learning_rate": 3.700662182059936e-06, - "loss": 0.2445, - "step": 631 - }, - { - "epoch": 3.4162162162162164, - "grad_norm": 2.878260850906372, - "learning_rate": 3.696936716467363e-06, - "loss": 0.1347, - "step": 632 - }, - { - "epoch": 3.4216216216216218, - "grad_norm": 2.8670761585235596, - "learning_rate": 3.693207799219846e-06, - "loss": 0.2822, - "step": 633 - }, - { - "epoch": 3.427027027027027, - "grad_norm": 3.9338245391845703, - "learning_rate": 3.689475441070615e-06, - "loss": 0.3425, - "step": 634 - }, - { - "epoch": 3.4324324324324325, - "grad_norm": 3.3172149658203125, - "learning_rate": 3.685739652782822e-06, - "loss": 0.3315, - "step": 635 - }, - { - "epoch": 3.437837837837838, - "grad_norm": 3.9986648559570312, - "learning_rate": 3.682000445129512e-06, - "loss": 0.1841, - "step": 636 - }, - { - "epoch": 3.443243243243243, - "grad_norm": 3.4503986835479736, - "learning_rate": 3.6782578288935896e-06, - "loss": 0.3151, - "step": 637 - }, - { - "epoch": 3.4486486486486485, - "grad_norm": 3.8826167583465576, - "learning_rate": 3.6745118148677882e-06, - "loss": 0.1272, - "step": 638 - }, - { - "epoch": 3.454054054054054, - "grad_norm": 3.0585904121398926, - "learning_rate": 3.6707624138546414e-06, - "loss": 0.2436, - "step": 639 - }, - { - "epoch": 3.4594594594594597, - "grad_norm": 3.8409557342529297, - "learning_rate": 3.6670096366664477e-06, - "loss": 0.6321, - "step": 640 - }, - { - "epoch": 3.464864864864865, - "grad_norm": 3.7260093688964844, - "learning_rate": 3.663253494125244e-06, - "loss": 0.1262, - "step": 641 - }, - { - "epoch": 3.4702702702702704, - "grad_norm": 3.195587396621704, - "learning_rate": 3.6594939970627706e-06, - "loss": 0.2669, - "step": 642 - }, - { - "epoch": 3.4756756756756757, - "grad_norm": 2.565070629119873, - "learning_rate": 3.655731156320441e-06, - "loss": 0.1228, - "step": 643 - }, - { - "epoch": 3.481081081081081, - "grad_norm": 3.745422124862671, - "learning_rate": 3.651964982749312e-06, - "loss": 0.1759, - "step": 644 - }, - { - "epoch": 3.4864864864864864, - "grad_norm": 4.96168327331543, - "learning_rate": 3.648195487210051e-06, - "loss": 0.5677, - "step": 645 - }, - { - "epoch": 3.4918918918918918, - "grad_norm": 3.514446496963501, - "learning_rate": 3.644422680572906e-06, - "loss": 0.1874, - "step": 646 - }, - { - "epoch": 3.4972972972972975, - "grad_norm": 3.1427719593048096, - "learning_rate": 3.640646573717671e-06, - "loss": 0.3225, - "step": 647 - }, - { - "epoch": 3.5027027027027025, - "grad_norm": 3.32208514213562, - "learning_rate": 3.63686717753366e-06, - "loss": 0.102, - "step": 648 - }, - { - "epoch": 3.5081081081081082, - "grad_norm": 3.409299373626709, - "learning_rate": 3.6330845029196697e-06, - "loss": 0.1585, - "step": 649 - }, - { - "epoch": 3.5135135135135136, - "grad_norm": 2.827052116394043, - "learning_rate": 3.629298560783952e-06, - "loss": 0.3046, - "step": 650 - }, - { - "epoch": 3.518918918918919, - "grad_norm": 3.541518211364746, - "learning_rate": 3.6255093620441835e-06, - "loss": 0.2037, - "step": 651 - }, - { - "epoch": 3.5243243243243243, - "grad_norm": 3.067040205001831, - "learning_rate": 3.6217169176274293e-06, - "loss": 0.1784, - "step": 652 - }, - { - "epoch": 3.5297297297297296, - "grad_norm": 4.001040935516357, - "learning_rate": 3.6179212384701146e-06, - "loss": 0.1974, - "step": 653 - }, - { - "epoch": 3.535135135135135, - "grad_norm": 4.03037691116333, - "learning_rate": 3.6141223355179946e-06, - "loss": 0.2161, - "step": 654 - }, - { - "epoch": 3.5405405405405403, - "grad_norm": 3.303591728210449, - "learning_rate": 3.610320219726118e-06, - "loss": 0.1487, - "step": 655 - }, - { - "epoch": 3.545945945945946, - "grad_norm": 4.183008193969727, - "learning_rate": 3.606514902058802e-06, - "loss": 0.2231, - "step": 656 - }, - { - "epoch": 3.5513513513513515, - "grad_norm": 4.2100300788879395, - "learning_rate": 3.602706393489594e-06, - "loss": 0.5068, - "step": 657 - }, - { - "epoch": 3.556756756756757, - "grad_norm": 4.521003246307373, - "learning_rate": 3.598894705001246e-06, - "loss": 0.4621, - "step": 658 - }, - { - "epoch": 3.562162162162162, - "grad_norm": 3.452348470687866, - "learning_rate": 3.5950798475856783e-06, - "loss": 0.285, - "step": 659 - }, - { - "epoch": 3.5675675675675675, - "grad_norm": 3.468987464904785, - "learning_rate": 3.5912618322439487e-06, - "loss": 0.4277, - "step": 660 - }, - { - "epoch": 3.572972972972973, - "grad_norm": 3.431551933288574, - "learning_rate": 3.587440669986224e-06, - "loss": 0.1993, - "step": 661 - }, - { - "epoch": 3.5783783783783782, - "grad_norm": 3.017648220062256, - "learning_rate": 3.5836163718317453e-06, - "loss": 0.272, - "step": 662 - }, - { - "epoch": 3.583783783783784, - "grad_norm": 3.837244987487793, - "learning_rate": 3.5797889488087946e-06, - "loss": 0.6019, - "step": 663 - }, - { - "epoch": 3.589189189189189, - "grad_norm": 3.221762180328369, - "learning_rate": 3.575958411954668e-06, - "loss": 0.3603, - "step": 664 - }, - { - "epoch": 3.5945945945945947, - "grad_norm": 4.279484272003174, - "learning_rate": 3.5721247723156393e-06, - "loss": 0.4656, - "step": 665 - }, - { - "epoch": 3.6, - "grad_norm": 3.723459243774414, - "learning_rate": 3.5682880409469316e-06, - "loss": 0.2466, - "step": 666 - }, - { - "epoch": 3.6054054054054054, - "grad_norm": 2.7260632514953613, - "learning_rate": 3.564448228912682e-06, - "loss": 0.1848, - "step": 667 - }, - { - "epoch": 3.610810810810811, - "grad_norm": 3.6656649112701416, - "learning_rate": 3.5606053472859124e-06, - "loss": 0.4968, - "step": 668 - }, - { - "epoch": 3.616216216216216, - "grad_norm": 4.570294380187988, - "learning_rate": 3.556759407148496e-06, - "loss": 0.316, - "step": 669 - }, - { - "epoch": 3.6216216216216215, - "grad_norm": 3.174433946609497, - "learning_rate": 3.5529104195911258e-06, - "loss": 0.2232, - "step": 670 - }, - { - "epoch": 3.627027027027027, - "grad_norm": 4.481954574584961, - "learning_rate": 3.549058395713285e-06, - "loss": 0.4435, - "step": 671 - }, - { - "epoch": 3.6324324324324326, - "grad_norm": 3.8758301734924316, - "learning_rate": 3.54520334662321e-06, - "loss": 0.1455, - "step": 672 - }, - { - "epoch": 3.637837837837838, - "grad_norm": 3.1699628829956055, - "learning_rate": 3.5413452834378626e-06, - "loss": 0.3037, - "step": 673 - }, - { - "epoch": 3.6432432432432433, - "grad_norm": 3.8971962928771973, - "learning_rate": 3.5374842172828953e-06, - "loss": 0.4309, - "step": 674 - }, - { - "epoch": 3.6486486486486487, - "grad_norm": 3.3087549209594727, - "learning_rate": 3.533620159292621e-06, - "loss": 0.383, - "step": 675 - }, - { - "epoch": 3.654054054054054, - "grad_norm": 2.9413082599639893, - "learning_rate": 3.529753120609982e-06, - "loss": 0.1963, - "step": 676 - }, - { - "epoch": 3.6594594594594594, - "grad_norm": 3.309837818145752, - "learning_rate": 3.5258831123865136e-06, - "loss": 0.1922, - "step": 677 - }, - { - "epoch": 3.6648648648648647, - "grad_norm": 4.124879360198975, - "learning_rate": 3.5220101457823147e-06, - "loss": 0.5589, - "step": 678 - }, - { - "epoch": 3.6702702702702705, - "grad_norm": 3.2587103843688965, - "learning_rate": 3.5181342319660174e-06, - "loss": 0.1757, - "step": 679 - }, - { - "epoch": 3.6756756756756754, - "grad_norm": 4.179666042327881, - "learning_rate": 3.5142553821147498e-06, - "loss": 0.1208, - "step": 680 - }, - { - "epoch": 3.6810810810810812, - "grad_norm": 3.4041192531585693, - "learning_rate": 3.5103736074141106e-06, - "loss": 0.2416, - "step": 681 - }, - { - "epoch": 3.6864864864864866, - "grad_norm": 4.982706546783447, - "learning_rate": 3.5064889190581293e-06, - "loss": 0.3841, - "step": 682 - }, - { - "epoch": 3.691891891891892, - "grad_norm": 3.5895309448242188, - "learning_rate": 3.5026013282492406e-06, - "loss": 0.3723, - "step": 683 - }, - { - "epoch": 3.6972972972972973, - "grad_norm": 3.4824306964874268, - "learning_rate": 3.498710846198247e-06, - "loss": 0.4403, - "step": 684 - }, - { - "epoch": 3.7027027027027026, - "grad_norm": 3.501023054122925, - "learning_rate": 3.494817484124289e-06, - "loss": 0.2813, - "step": 685 - }, - { - "epoch": 3.708108108108108, - "grad_norm": 3.934908151626587, - "learning_rate": 3.490921253254813e-06, - "loss": 0.4287, - "step": 686 - }, - { - "epoch": 3.7135135135135133, - "grad_norm": 3.24141526222229, - "learning_rate": 3.487022164825539e-06, - "loss": 0.234, - "step": 687 - }, - { - "epoch": 3.718918918918919, - "grad_norm": 3.3419880867004395, - "learning_rate": 3.4831202300804246e-06, - "loss": 0.2135, - "step": 688 - }, - { - "epoch": 3.7243243243243245, - "grad_norm": 3.923778772354126, - "learning_rate": 3.479215460271638e-06, - "loss": 0.2725, - "step": 689 - }, - { - "epoch": 3.72972972972973, - "grad_norm": 3.2432096004486084, - "learning_rate": 3.475307866659522e-06, - "loss": 0.228, - "step": 690 - }, - { - "epoch": 3.735135135135135, - "grad_norm": 3.0307705402374268, - "learning_rate": 3.4713974605125634e-06, - "loss": 0.0985, - "step": 691 - }, - { - "epoch": 3.7405405405405405, - "grad_norm": 2.778942346572876, - "learning_rate": 3.4674842531073587e-06, - "loss": 0.2137, - "step": 692 - }, - { - "epoch": 3.745945945945946, - "grad_norm": 3.711315155029297, - "learning_rate": 3.4635682557285833e-06, - "loss": 0.1707, - "step": 693 - }, - { - "epoch": 3.7513513513513512, - "grad_norm": 3.165668487548828, - "learning_rate": 3.459649479668956e-06, - "loss": 0.3021, - "step": 694 - }, - { - "epoch": 3.756756756756757, - "grad_norm": 3.7491254806518555, - "learning_rate": 3.4557279362292117e-06, - "loss": 0.3457, - "step": 695 - }, - { - "epoch": 3.762162162162162, - "grad_norm": 3.271603584289551, - "learning_rate": 3.451803636718064e-06, - "loss": 0.1193, - "step": 696 - }, - { - "epoch": 3.7675675675675677, - "grad_norm": 3.872382402420044, - "learning_rate": 3.447876592452174e-06, - "loss": 0.2261, - "step": 697 - }, - { - "epoch": 3.772972972972973, - "grad_norm": 4.634008407592773, - "learning_rate": 3.4439468147561196e-06, - "loss": 0.5042, - "step": 698 - }, - { - "epoch": 3.7783783783783784, - "grad_norm": 3.6930148601531982, - "learning_rate": 3.440014314962358e-06, - "loss": 0.3481, - "step": 699 - }, - { - "epoch": 3.7837837837837838, - "grad_norm": 4.709466457366943, - "learning_rate": 3.4360791044112e-06, - "loss": 0.2317, - "step": 700 - }, - { - "epoch": 3.789189189189189, - "grad_norm": 4.37923002243042, - "learning_rate": 3.432141194450772e-06, - "loss": 0.395, - "step": 701 - }, - { - "epoch": 3.7945945945945945, - "grad_norm": 3.1600489616394043, - "learning_rate": 3.4282005964369836e-06, - "loss": 0.1767, - "step": 702 - }, - { - "epoch": 3.8, - "grad_norm": 3.9799487590789795, - "learning_rate": 3.424257321733497e-06, - "loss": 0.2146, - "step": 703 - }, - { - "epoch": 3.8054054054054056, - "grad_norm": 2.79176664352417, - "learning_rate": 3.4203113817116955e-06, - "loss": 0.1534, - "step": 704 - }, - { - "epoch": 3.810810810810811, - "grad_norm": 3.0024254322052, - "learning_rate": 3.4163627877506434e-06, - "loss": 0.2513, - "step": 705 - }, - { - "epoch": 3.8162162162162163, - "grad_norm": 2.924475908279419, - "learning_rate": 3.4124115512370636e-06, - "loss": 0.4154, - "step": 706 - }, - { - "epoch": 3.8216216216216217, - "grad_norm": 3.2713992595672607, - "learning_rate": 3.408457683565295e-06, - "loss": 0.1822, - "step": 707 - }, - { - "epoch": 3.827027027027027, - "grad_norm": 3.094003438949585, - "learning_rate": 3.4045011961372675e-06, - "loss": 0.3589, - "step": 708 - }, - { - "epoch": 3.8324324324324324, - "grad_norm": 3.423858404159546, - "learning_rate": 3.4005421003624637e-06, - "loss": 0.4615, - "step": 709 - }, - { - "epoch": 3.8378378378378377, - "grad_norm": 2.038792848587036, - "learning_rate": 3.3965804076578896e-06, - "loss": 0.1001, - "step": 710 - }, - { - "epoch": 3.8432432432432435, - "grad_norm": 2.6447055339813232, - "learning_rate": 3.392616129448039e-06, - "loss": 0.2788, - "step": 711 - }, - { - "epoch": 3.8486486486486484, - "grad_norm": 3.546876907348633, - "learning_rate": 3.3886492771648593e-06, - "loss": 0.2663, - "step": 712 - }, - { - "epoch": 3.854054054054054, - "grad_norm": 2.9587066173553467, - "learning_rate": 3.384679862247726e-06, - "loss": 0.3497, - "step": 713 - }, - { - "epoch": 3.8594594594594596, - "grad_norm": 3.7122113704681396, - "learning_rate": 3.3807078961434013e-06, - "loss": 0.3613, - "step": 714 - }, - { - "epoch": 3.864864864864865, - "grad_norm": 3.157294988632202, - "learning_rate": 3.376733390306004e-06, - "loss": 0.0783, - "step": 715 - }, - { - "epoch": 3.8702702702702703, - "grad_norm": 3.564279317855835, - "learning_rate": 3.372756356196979e-06, - "loss": 0.1617, - "step": 716 - }, - { - "epoch": 3.8756756756756756, - "grad_norm": 4.231864929199219, - "learning_rate": 3.3687768052850595e-06, - "loss": 0.6444, - "step": 717 - }, - { - "epoch": 3.881081081081081, - "grad_norm": 5.480365753173828, - "learning_rate": 3.364794749046239e-06, - "loss": 0.4858, - "step": 718 - }, - { - "epoch": 3.8864864864864863, - "grad_norm": 3.428140878677368, - "learning_rate": 3.3608101989637333e-06, - "loss": 0.3103, - "step": 719 - }, - { - "epoch": 3.891891891891892, - "grad_norm": 3.521989345550537, - "learning_rate": 3.356823166527952e-06, - "loss": 0.2501, - "step": 720 - }, - { - "epoch": 3.8972972972972975, - "grad_norm": 3.287081718444824, - "learning_rate": 3.352833663236463e-06, - "loss": 0.18, - "step": 721 - }, - { - "epoch": 3.902702702702703, - "grad_norm": 3.323146104812622, - "learning_rate": 3.348841700593956e-06, - "loss": 0.12, - "step": 722 - }, - { - "epoch": 3.908108108108108, - "grad_norm": 3.516693115234375, - "learning_rate": 3.3448472901122187e-06, - "loss": 0.2618, - "step": 723 - }, - { - "epoch": 3.9135135135135135, - "grad_norm": 3.8109545707702637, - "learning_rate": 3.340850443310092e-06, - "loss": 0.3689, - "step": 724 - }, - { - "epoch": 3.918918918918919, - "grad_norm": 3.8335933685302734, - "learning_rate": 3.336851171713447e-06, - "loss": 0.2195, - "step": 725 - }, - { - "epoch": 3.924324324324324, - "grad_norm": 3.9054670333862305, - "learning_rate": 3.3328494868551444e-06, - "loss": 0.2602, - "step": 726 - }, - { - "epoch": 3.92972972972973, - "grad_norm": 3.1380631923675537, - "learning_rate": 3.3288454002750046e-06, - "loss": 0.1561, - "step": 727 - }, - { - "epoch": 3.935135135135135, - "grad_norm": 4.304198741912842, - "learning_rate": 3.3248389235197764e-06, - "loss": 0.4469, - "step": 728 - }, - { - "epoch": 3.9405405405405407, - "grad_norm": 3.3321573734283447, - "learning_rate": 3.3208300681430967e-06, - "loss": 0.2246, - "step": 729 - }, - { - "epoch": 3.945945945945946, - "grad_norm": 3.89400315284729, - "learning_rate": 3.3168188457054656e-06, - "loss": 0.2743, - "step": 730 - }, - { - "epoch": 3.9513513513513514, - "grad_norm": 3.393209934234619, - "learning_rate": 3.312805267774209e-06, - "loss": 0.551, - "step": 731 - }, - { - "epoch": 3.9567567567567568, - "grad_norm": 3.711652994155884, - "learning_rate": 3.3087893459234423e-06, - "loss": 0.3522, - "step": 732 - }, - { - "epoch": 3.962162162162162, - "grad_norm": 3.6701200008392334, - "learning_rate": 3.304771091734043e-06, - "loss": 0.3084, - "step": 733 - }, - { - "epoch": 3.9675675675675675, - "grad_norm": 3.1742889881134033, - "learning_rate": 3.300750516793614e-06, - "loss": 0.3406, - "step": 734 - }, - { - "epoch": 3.972972972972973, - "grad_norm": 4.000397682189941, - "learning_rate": 3.2967276326964504e-06, - "loss": 0.3463, - "step": 735 - }, - { - "epoch": 3.9783783783783786, - "grad_norm": 3.7932708263397217, - "learning_rate": 3.2927024510435057e-06, - "loss": 0.3758, - "step": 736 - }, - { - "epoch": 3.983783783783784, - "grad_norm": 3.6258292198181152, - "learning_rate": 3.2886749834423587e-06, - "loss": 0.3328, - "step": 737 - }, - { - "epoch": 3.9891891891891893, - "grad_norm": 4.628194332122803, - "learning_rate": 3.284645241507183e-06, - "loss": 0.6213, - "step": 738 - }, - { - "epoch": 3.9945945945945946, - "grad_norm": 4.173697471618652, - "learning_rate": 3.280613236858707e-06, - "loss": 0.2463, - "step": 739 - }, - { - "epoch": 4.0, - "grad_norm": 2.9315719604492188, - "learning_rate": 3.2765789811241865e-06, - "loss": 0.3501, - "step": 740 - }, - { - "epoch": 4.005405405405406, - "grad_norm": 3.7292938232421875, - "learning_rate": 3.272542485937369e-06, - "loss": 0.1753, - "step": 741 - }, - { - "epoch": 4.010810810810811, - "grad_norm": 3.627298593521118, - "learning_rate": 3.2685037629384587e-06, - "loss": 0.0722, - "step": 742 - }, - { - "epoch": 4.0162162162162165, - "grad_norm": 3.7558975219726562, - "learning_rate": 3.264462823774085e-06, - "loss": 0.2475, - "step": 743 - }, - { - "epoch": 4.021621621621621, - "grad_norm": 2.991217851638794, - "learning_rate": 3.260419680097268e-06, - "loss": 0.1163, - "step": 744 - }, - { - "epoch": 4.027027027027027, - "grad_norm": 3.315901517868042, - "learning_rate": 3.2563743435673855e-06, - "loss": 0.1325, - "step": 745 - }, - { - "epoch": 4.032432432432432, - "grad_norm": 2.9405429363250732, - "learning_rate": 3.252326825850139e-06, - "loss": 0.0466, - "step": 746 - }, - { - "epoch": 4.037837837837838, - "grad_norm": 4.078726291656494, - "learning_rate": 3.2482771386175173e-06, - "loss": 0.1861, - "step": 747 - }, - { - "epoch": 4.043243243243243, - "grad_norm": 3.6752545833587646, - "learning_rate": 3.24422529354777e-06, - "loss": 0.1637, - "step": 748 - }, - { - "epoch": 4.048648648648649, - "grad_norm": 4.471213340759277, - "learning_rate": 3.2401713023253646e-06, - "loss": 0.1379, - "step": 749 - }, - { - "epoch": 4.054054054054054, - "grad_norm": 4.609938144683838, - "learning_rate": 3.2361151766409628e-06, - "loss": 0.1099, - "step": 750 - }, - { - "epoch": 4.059459459459459, - "grad_norm": 3.7480030059814453, - "learning_rate": 3.232056928191376e-06, - "loss": 0.1422, - "step": 751 - }, - { - "epoch": 4.064864864864865, - "grad_norm": 4.23753547668457, - "learning_rate": 3.2279965686795424e-06, - "loss": 0.2716, - "step": 752 - }, - { - "epoch": 4.07027027027027, - "grad_norm": 4.59039306640625, - "learning_rate": 3.2239341098144833e-06, - "loss": 0.3849, - "step": 753 - }, - { - "epoch": 4.075675675675676, - "grad_norm": 2.9332475662231445, - "learning_rate": 3.219869563311277e-06, - "loss": 0.0768, - "step": 754 - }, - { - "epoch": 4.081081081081081, - "grad_norm": 3.8387272357940674, - "learning_rate": 3.2158029408910213e-06, - "loss": 0.112, - "step": 755 - }, - { - "epoch": 4.0864864864864865, - "grad_norm": 2.5676164627075195, - "learning_rate": 3.2117342542807995e-06, - "loss": 0.1054, - "step": 756 - }, - { - "epoch": 4.091891891891892, - "grad_norm": 3.4695913791656494, - "learning_rate": 3.207663515213648e-06, - "loss": 0.1754, - "step": 757 - }, - { - "epoch": 4.097297297297297, - "grad_norm": 3.531060218811035, - "learning_rate": 3.2035907354285234e-06, - "loss": 0.191, - "step": 758 - }, - { - "epoch": 4.102702702702703, - "grad_norm": 3.8944122791290283, - "learning_rate": 3.1995159266702648e-06, - "loss": 0.1083, - "step": 759 - }, - { - "epoch": 4.108108108108108, - "grad_norm": 3.572751998901367, - "learning_rate": 3.1954391006895635e-06, - "loss": 0.0609, - "step": 760 - }, - { - "epoch": 4.113513513513514, - "grad_norm": 3.533867120742798, - "learning_rate": 3.191360269242928e-06, - "loss": 0.049, - "step": 761 - }, - { - "epoch": 4.118918918918919, - "grad_norm": 3.742013454437256, - "learning_rate": 3.18727944409265e-06, - "loss": 0.1642, - "step": 762 - }, - { - "epoch": 4.124324324324324, - "grad_norm": 3.918525457382202, - "learning_rate": 3.1831966370067714e-06, - "loss": 0.1513, - "step": 763 - }, - { - "epoch": 4.12972972972973, - "grad_norm": 4.906899929046631, - "learning_rate": 3.1791118597590467e-06, - "loss": 0.3276, - "step": 764 - }, - { - "epoch": 4.135135135135135, - "grad_norm": 5.704930305480957, - "learning_rate": 3.1750251241289148e-06, - "loss": 0.4011, - "step": 765 - }, - { - "epoch": 4.140540540540541, - "grad_norm": 4.278724193572998, - "learning_rate": 3.1709364419014615e-06, - "loss": 0.2274, - "step": 766 - }, - { - "epoch": 4.145945945945946, - "grad_norm": 3.7831263542175293, - "learning_rate": 3.166845824867384e-06, - "loss": 0.118, - "step": 767 - }, - { - "epoch": 4.151351351351352, - "grad_norm": 3.6355350017547607, - "learning_rate": 3.162753284822962e-06, - "loss": 0.1109, - "step": 768 - }, - { - "epoch": 4.1567567567567565, - "grad_norm": 4.063662052154541, - "learning_rate": 3.1586588335700176e-06, - "loss": 0.1754, - "step": 769 - }, - { - "epoch": 4.162162162162162, - "grad_norm": 3.404348611831665, - "learning_rate": 3.1545624829158873e-06, - "loss": 0.1155, - "step": 770 - }, - { - "epoch": 4.167567567567567, - "grad_norm": 2.7452480792999268, - "learning_rate": 3.1504642446733828e-06, - "loss": 0.0635, - "step": 771 - }, - { - "epoch": 4.172972972972973, - "grad_norm": 2.4755163192749023, - "learning_rate": 3.146364130660761e-06, - "loss": 0.1068, - "step": 772 - }, - { - "epoch": 4.178378378378379, - "grad_norm": 3.0338311195373535, - "learning_rate": 3.142262152701685e-06, - "loss": 0.0637, - "step": 773 - }, - { - "epoch": 4.183783783783784, - "grad_norm": 4.566886901855469, - "learning_rate": 3.138158322625197e-06, - "loss": 0.2703, - "step": 774 - }, - { - "epoch": 4.1891891891891895, - "grad_norm": 4.614205360412598, - "learning_rate": 3.1340526522656765e-06, - "loss": 0.2769, - "step": 775 - }, - { - "epoch": 4.194594594594594, - "grad_norm": 3.4197700023651123, - "learning_rate": 3.1299451534628134e-06, - "loss": 0.1192, - "step": 776 - }, - { - "epoch": 4.2, - "grad_norm": 3.2838752269744873, - "learning_rate": 3.1258358380615674e-06, - "loss": 0.1244, - "step": 777 - }, - { - "epoch": 4.205405405405405, - "grad_norm": 4.484423637390137, - "learning_rate": 3.121724717912138e-06, - "loss": 0.2819, - "step": 778 - }, - { - "epoch": 4.210810810810811, - "grad_norm": 2.6898670196533203, - "learning_rate": 3.1176118048699283e-06, - "loss": 0.1018, - "step": 779 - }, - { - "epoch": 4.216216216216216, - "grad_norm": 3.3304710388183594, - "learning_rate": 3.113497110795514e-06, - "loss": 0.1842, - "step": 780 - }, - { - "epoch": 4.221621621621622, - "grad_norm": 3.29425311088562, - "learning_rate": 3.1093806475546046e-06, - "loss": 0.2299, - "step": 781 - }, - { - "epoch": 4.227027027027027, - "grad_norm": 3.0818686485290527, - "learning_rate": 3.1052624270180116e-06, - "loss": 0.1397, - "step": 782 - }, - { - "epoch": 4.232432432432432, - "grad_norm": 4.569559097290039, - "learning_rate": 3.1011424610616153e-06, - "loss": 0.2236, - "step": 783 - }, - { - "epoch": 4.237837837837838, - "grad_norm": 3.2377943992614746, - "learning_rate": 3.097020761566328e-06, - "loss": 0.1417, - "step": 784 - }, - { - "epoch": 4.243243243243243, - "grad_norm": 5.442404270172119, - "learning_rate": 3.092897340418062e-06, - "loss": 0.1317, - "step": 785 - }, - { - "epoch": 4.248648648648649, - "grad_norm": 4.14007568359375, - "learning_rate": 3.088772209507694e-06, - "loss": 0.1869, - "step": 786 - }, - { - "epoch": 4.254054054054054, - "grad_norm": 3.024740695953369, - "learning_rate": 3.0846453807310317e-06, - "loss": 0.0967, - "step": 787 - }, - { - "epoch": 4.2594594594594595, - "grad_norm": 3.463261365890503, - "learning_rate": 3.080516865988778e-06, - "loss": 0.0731, - "step": 788 - }, - { - "epoch": 4.264864864864865, - "grad_norm": 3.398139715194702, - "learning_rate": 3.076386677186498e-06, - "loss": 0.1912, - "step": 789 - }, - { - "epoch": 4.27027027027027, - "grad_norm": 3.934204339981079, - "learning_rate": 3.0722548262345854e-06, - "loss": 0.2133, - "step": 790 - }, - { - "epoch": 4.275675675675676, - "grad_norm": 5.5322041511535645, - "learning_rate": 3.0681213250482255e-06, - "loss": 0.4454, - "step": 791 - }, - { - "epoch": 4.281081081081081, - "grad_norm": 5.381092071533203, - "learning_rate": 3.0639861855473637e-06, - "loss": 0.3645, - "step": 792 - }, - { - "epoch": 4.286486486486487, - "grad_norm": 4.104682445526123, - "learning_rate": 3.05984941965667e-06, - "loss": 0.1331, - "step": 793 - }, - { - "epoch": 4.291891891891892, - "grad_norm": 3.032749652862549, - "learning_rate": 3.055711039305503e-06, - "loss": 0.0863, - "step": 794 - }, - { - "epoch": 4.297297297297297, - "grad_norm": 3.1181957721710205, - "learning_rate": 3.051571056427879e-06, - "loss": 0.1988, - "step": 795 - }, - { - "epoch": 4.302702702702703, - "grad_norm": 4.8824944496154785, - "learning_rate": 3.047429482962433e-06, - "loss": 0.2307, - "step": 796 - }, - { - "epoch": 4.308108108108108, - "grad_norm": 3.5564794540405273, - "learning_rate": 3.0432863308523903e-06, - "loss": 0.1614, - "step": 797 - }, - { - "epoch": 4.313513513513514, - "grad_norm": 2.928267240524292, - "learning_rate": 3.039141612045525e-06, - "loss": 0.0683, - "step": 798 - }, - { - "epoch": 4.318918918918919, - "grad_norm": 2.846242666244507, - "learning_rate": 3.034995338494131e-06, - "loss": 0.1784, - "step": 799 - }, - { - "epoch": 4.324324324324325, - "grad_norm": 2.8273985385894775, - "learning_rate": 3.0308475221549868e-06, - "loss": 0.0451, - "step": 800 - }, - { - "epoch": 4.3297297297297295, - "grad_norm": 3.0229880809783936, - "learning_rate": 3.026698174989316e-06, - "loss": 0.0618, - "step": 801 - }, - { - "epoch": 4.335135135135135, - "grad_norm": 3.555338144302368, - "learning_rate": 3.0225473089627617e-06, - "loss": 0.1529, - "step": 802 - }, - { - "epoch": 4.34054054054054, - "grad_norm": 3.7206318378448486, - "learning_rate": 3.0183949360453442e-06, - "loss": 0.4177, - "step": 803 - }, - { - "epoch": 4.345945945945946, - "grad_norm": 4.038993835449219, - "learning_rate": 3.014241068211428e-06, - "loss": 0.1394, - "step": 804 - }, - { - "epoch": 4.351351351351352, - "grad_norm": 3.723766565322876, - "learning_rate": 3.0100857174396926e-06, - "loss": 0.04, - "step": 805 - }, - { - "epoch": 4.356756756756757, - "grad_norm": 4.745445728302002, - "learning_rate": 3.0059288957130893e-06, - "loss": 0.2705, - "step": 806 - }, - { - "epoch": 4.3621621621621625, - "grad_norm": 3.245249032974243, - "learning_rate": 3.001770615018815e-06, - "loss": 0.2208, - "step": 807 - }, - { - "epoch": 4.367567567567567, - "grad_norm": 4.631863594055176, - "learning_rate": 2.9976108873482725e-06, - "loss": 0.2068, - "step": 808 - }, - { - "epoch": 4.372972972972973, - "grad_norm": 3.4944963455200195, - "learning_rate": 2.9934497246970357e-06, - "loss": 0.1253, - "step": 809 - }, - { - "epoch": 4.378378378378378, - "grad_norm": 3.393252372741699, - "learning_rate": 2.989287139064819e-06, - "loss": 0.1721, - "step": 810 - }, - { - "epoch": 4.383783783783784, - "grad_norm": 3.2354531288146973, - "learning_rate": 2.9851231424554385e-06, - "loss": 0.134, - "step": 811 - }, - { - "epoch": 4.389189189189189, - "grad_norm": 3.8997225761413574, - "learning_rate": 2.9809577468767813e-06, - "loss": 0.0818, - "step": 812 - }, - { - "epoch": 4.394594594594595, - "grad_norm": 3.4745192527770996, - "learning_rate": 2.9767909643407676e-06, - "loss": 0.1797, - "step": 813 - }, - { - "epoch": 4.4, - "grad_norm": 2.8166556358337402, - "learning_rate": 2.9726228068633155e-06, - "loss": 0.145, - "step": 814 - }, - { - "epoch": 4.405405405405405, - "grad_norm": 3.4947283267974854, - "learning_rate": 2.9684532864643123e-06, - "loss": 0.079, - "step": 815 - }, - { - "epoch": 4.410810810810811, - "grad_norm": 3.8058624267578125, - "learning_rate": 2.9642824151675702e-06, - "loss": 0.1763, - "step": 816 - }, - { - "epoch": 4.416216216216216, - "grad_norm": 3.161440134048462, - "learning_rate": 2.9601102050008016e-06, - "loss": 0.2654, - "step": 817 - }, - { - "epoch": 4.421621621621622, - "grad_norm": 2.7620294094085693, - "learning_rate": 2.955936667995578e-06, - "loss": 0.0779, - "step": 818 - }, - { - "epoch": 4.427027027027027, - "grad_norm": 3.2293593883514404, - "learning_rate": 2.9517618161872974e-06, - "loss": 0.0587, - "step": 819 - }, - { - "epoch": 4.4324324324324325, - "grad_norm": 2.753647565841675, - "learning_rate": 2.9475856616151487e-06, - "loss": 0.0835, - "step": 820 - }, - { - "epoch": 4.437837837837838, - "grad_norm": 3.744755744934082, - "learning_rate": 2.9434082163220773e-06, - "loss": 0.1748, - "step": 821 - }, - { - "epoch": 4.443243243243243, - "grad_norm": 3.5458850860595703, - "learning_rate": 2.9392294923547543e-06, - "loss": 0.119, - "step": 822 - }, - { - "epoch": 4.448648648648649, - "grad_norm": 4.037010192871094, - "learning_rate": 2.9350495017635334e-06, - "loss": 0.1535, - "step": 823 - }, - { - "epoch": 4.454054054054054, - "grad_norm": 3.704439401626587, - "learning_rate": 2.9308682566024228e-06, - "loss": 0.2561, - "step": 824 - }, - { - "epoch": 4.45945945945946, - "grad_norm": 2.9537882804870605, - "learning_rate": 2.92668576892905e-06, - "loss": 0.2024, - "step": 825 - }, - { - "epoch": 4.464864864864865, - "grad_norm": 3.1923575401306152, - "learning_rate": 2.9225020508046233e-06, - "loss": 0.0436, - "step": 826 - }, - { - "epoch": 4.47027027027027, - "grad_norm": 3.304884195327759, - "learning_rate": 2.9183171142939002e-06, - "loss": 0.1636, - "step": 827 - }, - { - "epoch": 4.475675675675676, - "grad_norm": 3.5481832027435303, - "learning_rate": 2.9141309714651528e-06, - "loss": 0.0962, - "step": 828 - }, - { - "epoch": 4.481081081081081, - "grad_norm": 4.0650153160095215, - "learning_rate": 2.9099436343901306e-06, - "loss": 0.2129, - "step": 829 - }, - { - "epoch": 4.486486486486487, - "grad_norm": 4.274670124053955, - "learning_rate": 2.9057551151440266e-06, - "loss": 0.2872, - "step": 830 - }, - { - "epoch": 4.491891891891892, - "grad_norm": 4.45655632019043, - "learning_rate": 2.9015654258054433e-06, - "loss": 0.3254, - "step": 831 - }, - { - "epoch": 4.4972972972972975, - "grad_norm": 3.2205746173858643, - "learning_rate": 2.8973745784563596e-06, - "loss": 0.1417, - "step": 832 - }, - { - "epoch": 4.5027027027027025, - "grad_norm": 3.994489908218384, - "learning_rate": 2.8931825851820904e-06, - "loss": 0.2513, - "step": 833 - }, - { - "epoch": 4.508108108108108, - "grad_norm": 2.8250539302825928, - "learning_rate": 2.8889894580712574e-06, - "loss": 0.1785, - "step": 834 - }, - { - "epoch": 4.513513513513513, - "grad_norm": 3.526552200317383, - "learning_rate": 2.884795209215751e-06, - "loss": 0.2853, - "step": 835 - }, - { - "epoch": 4.518918918918919, - "grad_norm": 3.8975565433502197, - "learning_rate": 2.880599850710696e-06, - "loss": 0.2947, - "step": 836 - }, - { - "epoch": 4.524324324324324, - "grad_norm": 2.86104154586792, - "learning_rate": 2.8764033946544197e-06, - "loss": 0.177, - "step": 837 - }, - { - "epoch": 4.52972972972973, - "grad_norm": 3.967454433441162, - "learning_rate": 2.8722058531484105e-06, - "loss": 0.2786, - "step": 838 - }, - { - "epoch": 4.535135135135135, - "grad_norm": 3.9122490882873535, - "learning_rate": 2.86800723829729e-06, - "loss": 0.1881, - "step": 839 - }, - { - "epoch": 4.54054054054054, - "grad_norm": 3.9732089042663574, - "learning_rate": 2.8638075622087747e-06, - "loss": 0.3541, - "step": 840 - }, - { - "epoch": 4.545945945945946, - "grad_norm": 3.7056405544281006, - "learning_rate": 2.8596068369936386e-06, - "loss": 0.3094, - "step": 841 - }, - { - "epoch": 4.551351351351351, - "grad_norm": 3.5056777000427246, - "learning_rate": 2.8554050747656862e-06, - "loss": 0.1162, - "step": 842 - }, - { - "epoch": 4.556756756756757, - "grad_norm": 3.1131439208984375, - "learning_rate": 2.851202287641709e-06, - "loss": 0.1079, - "step": 843 - }, - { - "epoch": 4.562162162162162, - "grad_norm": 3.6517693996429443, - "learning_rate": 2.8469984877414525e-06, - "loss": 0.4462, - "step": 844 - }, - { - "epoch": 4.5675675675675675, - "grad_norm": 3.0627806186676025, - "learning_rate": 2.842793687187588e-06, - "loss": 0.0851, - "step": 845 - }, - { - "epoch": 4.572972972972973, - "grad_norm": 4.0370893478393555, - "learning_rate": 2.8385878981056663e-06, - "loss": 0.1268, - "step": 846 - }, - { - "epoch": 4.578378378378378, - "grad_norm": 3.486156463623047, - "learning_rate": 2.8343811326240944e-06, - "loss": 0.3187, - "step": 847 - }, - { - "epoch": 4.583783783783784, - "grad_norm": 2.4388604164123535, - "learning_rate": 2.830173402874091e-06, - "loss": 0.1315, - "step": 848 - }, - { - "epoch": 4.589189189189189, - "grad_norm": 3.5970475673675537, - "learning_rate": 2.8259647209896573e-06, - "loss": 0.301, - "step": 849 - }, - { - "epoch": 4.594594594594595, - "grad_norm": 3.657775402069092, - "learning_rate": 2.821755099107541e-06, - "loss": 0.1478, - "step": 850 - }, - { - "epoch": 4.6, - "grad_norm": 3.2040653228759766, - "learning_rate": 2.817544549367197e-06, - "loss": 0.2029, - "step": 851 - }, - { - "epoch": 4.605405405405405, - "grad_norm": 2.778747081756592, - "learning_rate": 2.813333083910761e-06, - "loss": 0.0549, - "step": 852 - }, - { - "epoch": 4.610810810810811, - "grad_norm": 3.661921977996826, - "learning_rate": 2.8091207148830046e-06, - "loss": 0.1508, - "step": 853 - }, - { - "epoch": 4.616216216216216, - "grad_norm": 2.7028398513793945, - "learning_rate": 2.8049074544313094e-06, - "loss": 0.1094, - "step": 854 - }, - { - "epoch": 4.621621621621622, - "grad_norm": 3.3319056034088135, - "learning_rate": 2.8006933147056236e-06, - "loss": 0.0799, - "step": 855 - }, - { - "epoch": 4.627027027027027, - "grad_norm": 3.3194944858551025, - "learning_rate": 2.7964783078584336e-06, - "loss": 0.123, - "step": 856 - }, - { - "epoch": 4.632432432432433, - "grad_norm": 2.4618616104125977, - "learning_rate": 2.792262446044725e-06, - "loss": 0.0692, - "step": 857 - }, - { - "epoch": 4.6378378378378375, - "grad_norm": 4.007084846496582, - "learning_rate": 2.788045741421949e-06, - "loss": 0.1596, - "step": 858 - }, - { - "epoch": 4.643243243243243, - "grad_norm": 2.6852214336395264, - "learning_rate": 2.78382820614999e-06, - "loss": 0.047, - "step": 859 - }, - { - "epoch": 4.648648648648649, - "grad_norm": 3.249666690826416, - "learning_rate": 2.779609852391123e-06, - "loss": 0.1561, - "step": 860 - }, - { - "epoch": 4.654054054054054, - "grad_norm": 7.2313337326049805, - "learning_rate": 2.775390692309987e-06, - "loss": 0.2157, - "step": 861 - }, - { - "epoch": 4.65945945945946, - "grad_norm": 3.1866044998168945, - "learning_rate": 2.7711707380735443e-06, - "loss": 0.0782, - "step": 862 - }, - { - "epoch": 4.664864864864865, - "grad_norm": 3.714812755584717, - "learning_rate": 2.766950001851049e-06, - "loss": 0.2994, - "step": 863 - }, - { - "epoch": 4.6702702702702705, - "grad_norm": 3.0355515480041504, - "learning_rate": 2.7627284958140084e-06, - "loss": 0.109, - "step": 864 - }, - { - "epoch": 4.675675675675675, - "grad_norm": 2.8177638053894043, - "learning_rate": 2.7585062321361517e-06, - "loss": 0.2557, - "step": 865 - }, - { - "epoch": 4.681081081081081, - "grad_norm": 3.7162227630615234, - "learning_rate": 2.75428322299339e-06, - "loss": 0.0413, - "step": 866 - }, - { - "epoch": 4.686486486486486, - "grad_norm": 3.008643627166748, - "learning_rate": 2.7500594805637882e-06, - "loss": 0.0402, - "step": 867 - }, - { - "epoch": 4.691891891891892, - "grad_norm": 3.1683881282806396, - "learning_rate": 2.745835017027522e-06, - "loss": 0.1481, - "step": 868 - }, - { - "epoch": 4.697297297297297, - "grad_norm": 3.2899327278137207, - "learning_rate": 2.74160984456685e-06, - "loss": 0.2242, - "step": 869 - }, - { - "epoch": 4.702702702702703, - "grad_norm": 5.386324882507324, - "learning_rate": 2.737383975366071e-06, - "loss": 0.4693, - "step": 870 - }, - { - "epoch": 4.708108108108108, - "grad_norm": 3.0007741451263428, - "learning_rate": 2.7331574216114963e-06, - "loss": 0.1353, - "step": 871 - }, - { - "epoch": 4.713513513513513, - "grad_norm": 2.7533962726593018, - "learning_rate": 2.728930195491411e-06, - "loss": 0.157, - "step": 872 - }, - { - "epoch": 4.718918918918919, - "grad_norm": 3.349351167678833, - "learning_rate": 2.724702309196038e-06, - "loss": 0.1863, - "step": 873 - }, - { - "epoch": 4.724324324324324, - "grad_norm": 3.2562623023986816, - "learning_rate": 2.720473774917505e-06, - "loss": 0.2874, - "step": 874 - }, - { - "epoch": 4.72972972972973, - "grad_norm": 3.4865262508392334, - "learning_rate": 2.716244604849807e-06, - "loss": 0.1021, - "step": 875 - }, - { - "epoch": 4.735135135135135, - "grad_norm": 3.793647289276123, - "learning_rate": 2.7120148111887732e-06, - "loss": 0.1046, - "step": 876 - }, - { - "epoch": 4.7405405405405405, - "grad_norm": 3.8841137886047363, - "learning_rate": 2.707784406132032e-06, - "loss": 0.0971, - "step": 877 - }, - { - "epoch": 4.745945945945946, - "grad_norm": 3.45615816116333, - "learning_rate": 2.703553401878972e-06, - "loss": 0.0507, - "step": 878 - }, - { - "epoch": 4.751351351351351, - "grad_norm": 3.578495502471924, - "learning_rate": 2.6993218106307146e-06, - "loss": 0.0616, - "step": 879 - }, - { - "epoch": 4.756756756756757, - "grad_norm": 4.271491527557373, - "learning_rate": 2.6950896445900685e-06, - "loss": 0.0908, - "step": 880 - }, - { - "epoch": 4.762162162162162, - "grad_norm": 3.889042615890503, - "learning_rate": 2.690856915961504e-06, - "loss": 0.2426, - "step": 881 - }, - { - "epoch": 4.767567567567568, - "grad_norm": 3.8519232273101807, - "learning_rate": 2.686623636951112e-06, - "loss": 0.1881, - "step": 882 - }, - { - "epoch": 4.772972972972973, - "grad_norm": 3.819518804550171, - "learning_rate": 2.6823898197665703e-06, - "loss": 0.1385, - "step": 883 - }, - { - "epoch": 4.778378378378378, - "grad_norm": 4.091328144073486, - "learning_rate": 2.6781554766171104e-06, - "loss": 0.2913, - "step": 884 - }, - { - "epoch": 4.783783783783784, - "grad_norm": 2.60793399810791, - "learning_rate": 2.673920619713478e-06, - "loss": 0.0874, - "step": 885 - }, - { - "epoch": 4.789189189189189, - "grad_norm": 4.59322452545166, - "learning_rate": 2.6696852612679024e-06, - "loss": 0.2703, - "step": 886 - }, - { - "epoch": 4.794594594594595, - "grad_norm": 3.4631619453430176, - "learning_rate": 2.6654494134940586e-06, - "loss": 0.121, - "step": 887 - }, - { - "epoch": 4.8, - "grad_norm": 3.8556058406829834, - "learning_rate": 2.6612130886070313e-06, - "loss": 0.1853, - "step": 888 - }, - { - "epoch": 4.805405405405406, - "grad_norm": 2.932152271270752, - "learning_rate": 2.6569762988232838e-06, - "loss": 0.0533, - "step": 889 - }, - { - "epoch": 4.8108108108108105, - "grad_norm": 4.647441387176514, - "learning_rate": 2.652739056360618e-06, - "loss": 0.3178, - "step": 890 - }, - { - "epoch": 4.816216216216216, - "grad_norm": 4.682106018066406, - "learning_rate": 2.648501373438142e-06, - "loss": 0.1735, - "step": 891 - }, - { - "epoch": 4.821621621621622, - "grad_norm": 3.1454825401306152, - "learning_rate": 2.644263262276234e-06, - "loss": 0.062, - "step": 892 - }, - { - "epoch": 4.827027027027027, - "grad_norm": 3.579653739929199, - "learning_rate": 2.640024735096507e-06, - "loss": 0.1336, - "step": 893 - }, - { - "epoch": 4.832432432432433, - "grad_norm": 2.558265447616577, - "learning_rate": 2.6357858041217733e-06, - "loss": 0.1404, - "step": 894 - }, - { - "epoch": 4.837837837837838, - "grad_norm": 2.3879470825195312, - "learning_rate": 2.6315464815760104e-06, - "loss": 0.0373, - "step": 895 - }, - { - "epoch": 4.8432432432432435, - "grad_norm": 4.418992042541504, - "learning_rate": 2.6273067796843242e-06, - "loss": 0.3068, - "step": 896 - }, - { - "epoch": 4.848648648648648, - "grad_norm": 3.08585786819458, - "learning_rate": 2.6230667106729157e-06, - "loss": 0.2221, - "step": 897 - }, - { - "epoch": 4.854054054054054, - "grad_norm": 2.9488885402679443, - "learning_rate": 2.618826286769043e-06, - "loss": 0.1431, - "step": 898 - }, - { - "epoch": 4.859459459459459, - "grad_norm": 4.123927116394043, - "learning_rate": 2.614585520200989e-06, - "loss": 0.196, - "step": 899 - }, - { - "epoch": 4.864864864864865, - "grad_norm": 4.289125919342041, - "learning_rate": 2.6103444231980233e-06, - "loss": 0.2509, - "step": 900 - }, - { - "epoch": 4.87027027027027, - "grad_norm": 3.0358095169067383, - "learning_rate": 2.606103007990371e-06, - "loss": 0.0747, - "step": 901 - }, - { - "epoch": 4.875675675675676, - "grad_norm": 3.6471376419067383, - "learning_rate": 2.601861286809172e-06, - "loss": 0.0494, - "step": 902 - }, - { - "epoch": 4.881081081081081, - "grad_norm": 3.424712896347046, - "learning_rate": 2.5976192718864497e-06, - "loss": 0.0901, - "step": 903 - }, - { - "epoch": 4.886486486486486, - "grad_norm": 4.047586441040039, - "learning_rate": 2.593376975455075e-06, - "loss": 0.0465, - "step": 904 - }, - { - "epoch": 4.891891891891892, - "grad_norm": 4.448032379150391, - "learning_rate": 2.5891344097487294e-06, - "loss": 0.0616, - "step": 905 - }, - { - "epoch": 4.897297297297297, - "grad_norm": 3.3522684574127197, - "learning_rate": 2.584891587001872e-06, - "loss": 0.087, - "step": 906 - }, - { - "epoch": 4.902702702702703, - "grad_norm": 2.979238986968994, - "learning_rate": 2.580648519449704e-06, - "loss": 0.053, - "step": 907 - }, - { - "epoch": 4.908108108108108, - "grad_norm": 6.049450397491455, - "learning_rate": 2.5764052193281287e-06, - "loss": 0.2707, - "step": 908 - }, - { - "epoch": 4.9135135135135135, - "grad_norm": 6.647163391113281, - "learning_rate": 2.5721616988737254e-06, - "loss": 0.3679, - "step": 909 - }, - { - "epoch": 4.918918918918919, - "grad_norm": 3.764979839324951, - "learning_rate": 2.567917970323704e-06, - "loss": 0.1929, - "step": 910 - }, - { - "epoch": 4.924324324324324, - "grad_norm": 3.5592362880706787, - "learning_rate": 2.5636740459158776e-06, - "loss": 0.2461, - "step": 911 - }, - { - "epoch": 4.92972972972973, - "grad_norm": 4.4554762840271, - "learning_rate": 2.559429937888624e-06, - "loss": 0.2484, - "step": 912 - }, - { - "epoch": 4.935135135135135, - "grad_norm": 3.358375072479248, - "learning_rate": 2.5551856584808483e-06, - "loss": 0.1886, - "step": 913 - }, - { - "epoch": 4.940540540540541, - "grad_norm": 3.5831756591796875, - "learning_rate": 2.5509412199319515e-06, - "loss": 0.1789, - "step": 914 - }, - { - "epoch": 4.945945945945946, - "grad_norm": 2.4555728435516357, - "learning_rate": 2.5466966344817927e-06, - "loss": 0.1072, - "step": 915 - }, - { - "epoch": 4.951351351351351, - "grad_norm": 4.581109046936035, - "learning_rate": 2.542451914370656e-06, - "loss": 0.2624, - "step": 916 - }, - { - "epoch": 4.956756756756757, - "grad_norm": 2.9763975143432617, - "learning_rate": 2.538207071839213e-06, - "loss": 0.0639, - "step": 917 - }, - { - "epoch": 4.962162162162162, - "grad_norm": 3.516282796859741, - "learning_rate": 2.533962119128487e-06, - "loss": 0.1281, - "step": 918 - }, - { - "epoch": 4.967567567567568, - "grad_norm": 3.0369791984558105, - "learning_rate": 2.529717068479821e-06, - "loss": 0.1771, - "step": 919 - }, - { - "epoch": 4.972972972972973, - "grad_norm": 2.998521327972412, - "learning_rate": 2.5254719321348392e-06, - "loss": 0.2582, - "step": 920 - }, - { - "epoch": 4.978378378378379, - "grad_norm": 3.002901792526245, - "learning_rate": 2.5212267223354143e-06, - "loss": 0.3016, - "step": 921 - }, - { - "epoch": 4.9837837837837835, - "grad_norm": 3.564932346343994, - "learning_rate": 2.5169814513236296e-06, - "loss": 0.2775, - "step": 922 - }, - { - "epoch": 4.989189189189189, - "grad_norm": 3.726227283477783, - "learning_rate": 2.5127361313417447e-06, - "loss": 0.1246, - "step": 923 - }, - { - "epoch": 4.994594594594595, - "grad_norm": 4.766391754150391, - "learning_rate": 2.508490774632162e-06, - "loss": 0.1732, - "step": 924 - }, - { - "epoch": 5.0, - "grad_norm": 2.9859752655029297, - "learning_rate": 2.5042453934373874e-06, - "loss": 0.1107, - "step": 925 - }, - { - "epoch": 5.005405405405406, - "grad_norm": 3.4388909339904785, - "learning_rate": 2.5e-06, - "loss": 0.1074, - "step": 926 - }, - { - "epoch": 5.010810810810811, - "grad_norm": 2.959311008453369, - "learning_rate": 2.4957546065626134e-06, - "loss": 0.0752, - "step": 927 - }, - { - "epoch": 5.0162162162162165, - "grad_norm": 2.047055959701538, - "learning_rate": 2.491509225367839e-06, - "loss": 0.0313, - "step": 928 - }, - { - "epoch": 5.021621621621621, - "grad_norm": 2.310882329940796, - "learning_rate": 2.487263868658256e-06, - "loss": 0.0851, - "step": 929 - }, - { - "epoch": 5.027027027027027, - "grad_norm": 2.3032779693603516, - "learning_rate": 2.483018548676371e-06, - "loss": 0.0443, - "step": 930 - }, - { - "epoch": 5.032432432432432, - "grad_norm": 3.521470785140991, - "learning_rate": 2.478773277664587e-06, - "loss": 0.056, - "step": 931 - }, - { - "epoch": 5.037837837837838, - "grad_norm": 3.8374359607696533, - "learning_rate": 2.4745280678651616e-06, - "loss": 0.1668, - "step": 932 - }, - { - "epoch": 5.043243243243243, - "grad_norm": 3.831840753555298, - "learning_rate": 2.47028293152018e-06, - "loss": 0.0502, - "step": 933 - }, - { - "epoch": 5.048648648648649, - "grad_norm": 3.398419141769409, - "learning_rate": 2.4660378808715147e-06, - "loss": 0.023, - "step": 934 - }, - { - "epoch": 5.054054054054054, - "grad_norm": 3.3384788036346436, - "learning_rate": 2.4617929281607885e-06, - "loss": 0.1418, - "step": 935 - }, - { - "epoch": 5.059459459459459, - "grad_norm": 5.451812744140625, - "learning_rate": 2.457548085629345e-06, - "loss": 0.1167, - "step": 936 - }, - { - "epoch": 5.064864864864865, - "grad_norm": 6.509985446929932, - "learning_rate": 2.4533033655182072e-06, - "loss": 0.0781, - "step": 937 - }, - { - "epoch": 5.07027027027027, - "grad_norm": 4.330167770385742, - "learning_rate": 2.449058780068049e-06, - "loss": 0.0799, - "step": 938 - }, - { - "epoch": 5.075675675675676, - "grad_norm": 3.6900534629821777, - "learning_rate": 2.444814341519152e-06, - "loss": 0.0548, - "step": 939 - }, - { - "epoch": 5.081081081081081, - "grad_norm": 3.347656011581421, - "learning_rate": 2.440570062111376e-06, - "loss": 0.1218, - "step": 940 - }, - { - "epoch": 5.0864864864864865, - "grad_norm": 2.6146252155303955, - "learning_rate": 2.436325954084122e-06, - "loss": 0.0182, - "step": 941 - }, - { - "epoch": 5.091891891891892, - "grad_norm": 2.852694034576416, - "learning_rate": 2.4320820296762964e-06, - "loss": 0.0337, - "step": 942 - }, - { - "epoch": 5.097297297297297, - "grad_norm": 1.9230271577835083, - "learning_rate": 2.4278383011262755e-06, - "loss": 0.0226, - "step": 943 - }, - { - "epoch": 5.102702702702703, - "grad_norm": 2.6784677505493164, - "learning_rate": 2.4235947806718717e-06, - "loss": 0.0207, - "step": 944 - }, - { - "epoch": 5.108108108108108, - "grad_norm": 3.4410207271575928, - "learning_rate": 2.4193514805502972e-06, - "loss": 0.1561, - "step": 945 - }, - { - "epoch": 5.113513513513514, - "grad_norm": 3.165294647216797, - "learning_rate": 2.4151084129981284e-06, - "loss": 0.1727, - "step": 946 - }, - { - "epoch": 5.118918918918919, - "grad_norm": 2.743256092071533, - "learning_rate": 2.4108655902512715e-06, - "loss": 0.1246, - "step": 947 - }, - { - "epoch": 5.124324324324324, - "grad_norm": 3.771273374557495, - "learning_rate": 2.406623024544926e-06, - "loss": 0.1429, - "step": 948 - }, - { - "epoch": 5.12972972972973, - "grad_norm": 3.4866952896118164, - "learning_rate": 2.402380728113551e-06, - "loss": 0.1569, - "step": 949 - }, - { - "epoch": 5.135135135135135, - "grad_norm": 3.5998377799987793, - "learning_rate": 2.3981387131908286e-06, - "loss": 0.1105, - "step": 950 - }, - { - "epoch": 5.140540540540541, - "grad_norm": 6.748101234436035, - "learning_rate": 2.39389699200963e-06, - "loss": 0.3786, - "step": 951 - }, - { - "epoch": 5.145945945945946, - "grad_norm": 4.391526699066162, - "learning_rate": 2.389655576801977e-06, - "loss": 0.0826, - "step": 952 - }, - { - "epoch": 5.151351351351352, - "grad_norm": 4.411531448364258, - "learning_rate": 2.3854144797990123e-06, - "loss": 0.0684, - "step": 953 - }, - { - "epoch": 5.1567567567567565, - "grad_norm": 3.2221450805664062, - "learning_rate": 2.3811737132309584e-06, - "loss": 0.0452, - "step": 954 - }, - { - "epoch": 5.162162162162162, - "grad_norm": 2.926665782928467, - "learning_rate": 2.3769332893270856e-06, - "loss": 0.0465, - "step": 955 - }, - { - "epoch": 5.167567567567567, - "grad_norm": 2.909715414047241, - "learning_rate": 2.372693220315677e-06, - "loss": 0.0551, - "step": 956 - }, - { - "epoch": 5.172972972972973, - "grad_norm": 3.3920676708221436, - "learning_rate": 2.36845351842399e-06, - "loss": 0.0896, - "step": 957 - }, - { - "epoch": 5.178378378378379, - "grad_norm": 2.4355857372283936, - "learning_rate": 2.3642141958782267e-06, - "loss": 0.0565, - "step": 958 - }, - { - "epoch": 5.183783783783784, - "grad_norm": 4.707484722137451, - "learning_rate": 2.3599752649034935e-06, - "loss": 0.1563, - "step": 959 - }, - { - "epoch": 5.1891891891891895, - "grad_norm": 2.0196712017059326, - "learning_rate": 2.3557367377237663e-06, - "loss": 0.0236, - "step": 960 - }, - { - "epoch": 5.194594594594594, - "grad_norm": 2.5355868339538574, - "learning_rate": 2.351498626561858e-06, - "loss": 0.0506, - "step": 961 - }, - { - "epoch": 5.2, - "grad_norm": 3.384859800338745, - "learning_rate": 2.3472609436393827e-06, - "loss": 0.1001, - "step": 962 - }, - { - "epoch": 5.205405405405405, - "grad_norm": 3.557605028152466, - "learning_rate": 2.3430237011767166e-06, - "loss": 0.0951, - "step": 963 - }, - { - "epoch": 5.210810810810811, - "grad_norm": 2.9991750717163086, - "learning_rate": 2.3387869113929695e-06, - "loss": 0.0824, - "step": 964 - }, - { - "epoch": 5.216216216216216, - "grad_norm": 3.3849830627441406, - "learning_rate": 2.3345505865059427e-06, - "loss": 0.0485, - "step": 965 - }, - { - "epoch": 5.221621621621622, - "grad_norm": 3.781913995742798, - "learning_rate": 2.3303147387320985e-06, - "loss": 0.1516, - "step": 966 - }, - { - "epoch": 5.227027027027027, - "grad_norm": 3.5771679878234863, - "learning_rate": 2.3260793802865227e-06, - "loss": 0.1664, - "step": 967 - }, - { - "epoch": 5.232432432432432, - "grad_norm": 3.4213743209838867, - "learning_rate": 2.3218445233828904e-06, - "loss": 0.1127, - "step": 968 - }, - { - "epoch": 5.237837837837838, - "grad_norm": 3.315171003341675, - "learning_rate": 2.31761018023343e-06, - "loss": 0.0445, - "step": 969 - }, - { - "epoch": 5.243243243243243, - "grad_norm": 4.793919563293457, - "learning_rate": 2.3133763630488883e-06, - "loss": 0.1402, - "step": 970 - }, - { - "epoch": 5.248648648648649, - "grad_norm": 2.4062092304229736, - "learning_rate": 2.3091430840384964e-06, - "loss": 0.0332, - "step": 971 - }, - { - "epoch": 5.254054054054054, - "grad_norm": 3.5533835887908936, - "learning_rate": 2.304910355409932e-06, - "loss": 0.1266, - "step": 972 - }, - { - "epoch": 5.2594594594594595, - "grad_norm": 3.447761058807373, - "learning_rate": 2.3006781893692863e-06, - "loss": 0.0281, - "step": 973 - }, - { - "epoch": 5.264864864864865, - "grad_norm": 2.2596893310546875, - "learning_rate": 2.2964465981210283e-06, - "loss": 0.0238, - "step": 974 - }, - { - "epoch": 5.27027027027027, - "grad_norm": 2.9317407608032227, - "learning_rate": 2.2922155938679695e-06, - "loss": 0.0828, - "step": 975 - }, - { - "epoch": 5.275675675675676, - "grad_norm": 4.982219219207764, - "learning_rate": 2.287985188811228e-06, - "loss": 0.1874, - "step": 976 - }, - { - "epoch": 5.281081081081081, - "grad_norm": 2.643747091293335, - "learning_rate": 2.2837553951501935e-06, - "loss": 0.0413, - "step": 977 - }, - { - "epoch": 5.286486486486487, - "grad_norm": 3.7542672157287598, - "learning_rate": 2.279526225082495e-06, - "loss": 0.0909, - "step": 978 - }, - { - "epoch": 5.291891891891892, - "grad_norm": 4.562160015106201, - "learning_rate": 2.275297690803962e-06, - "loss": 0.0798, - "step": 979 - }, - { - "epoch": 5.297297297297297, - "grad_norm": 3.627634048461914, - "learning_rate": 2.271069804508589e-06, - "loss": 0.1456, - "step": 980 - }, - { - "epoch": 5.302702702702703, - "grad_norm": 3.0197503566741943, - "learning_rate": 2.266842578388504e-06, - "loss": 0.085, - "step": 981 - }, - { - "epoch": 5.308108108108108, - "grad_norm": 3.1097187995910645, - "learning_rate": 2.2626160246339303e-06, - "loss": 0.0885, - "step": 982 - }, - { - "epoch": 5.313513513513514, - "grad_norm": 3.504622459411621, - "learning_rate": 2.2583901554331513e-06, - "loss": 0.1543, - "step": 983 - }, - { - "epoch": 5.318918918918919, - "grad_norm": 3.6203200817108154, - "learning_rate": 2.2541649829724783e-06, - "loss": 0.06, - "step": 984 - }, - { - "epoch": 5.324324324324325, - "grad_norm": 3.441621780395508, - "learning_rate": 2.249940519436212e-06, - "loss": 0.0518, - "step": 985 - }, - { - "epoch": 5.3297297297297295, - "grad_norm": 3.5617616176605225, - "learning_rate": 2.2457167770066104e-06, - "loss": 0.1542, - "step": 986 - }, - { - "epoch": 5.335135135135135, - "grad_norm": 2.4165892601013184, - "learning_rate": 2.2414937678638495e-06, - "loss": 0.0338, - "step": 987 - }, - { - "epoch": 5.34054054054054, - "grad_norm": 2.450880289077759, - "learning_rate": 2.2372715041859925e-06, - "loss": 0.0204, - "step": 988 - }, - { - "epoch": 5.345945945945946, - "grad_norm": 3.0658836364746094, - "learning_rate": 2.2330499981489524e-06, - "loss": 0.129, - "step": 989 - }, - { - "epoch": 5.351351351351352, - "grad_norm": 2.368131160736084, - "learning_rate": 2.2288292619264566e-06, - "loss": 0.0307, - "step": 990 - }, - { - "epoch": 5.356756756756757, - "grad_norm": 2.3199515342712402, - "learning_rate": 2.2246093076900145e-06, - "loss": 0.0374, - "step": 991 - }, - { - "epoch": 5.3621621621621625, - "grad_norm": 2.5552587509155273, - "learning_rate": 2.220390147608878e-06, - "loss": 0.0265, - "step": 992 - }, - { - "epoch": 5.367567567567567, - "grad_norm": 3.5336551666259766, - "learning_rate": 2.2161717938500112e-06, - "loss": 0.0468, - "step": 993 - }, - { - "epoch": 5.372972972972973, - "grad_norm": 2.8977596759796143, - "learning_rate": 2.2119542585780513e-06, - "loss": 0.1118, - "step": 994 - }, - { - "epoch": 5.378378378378378, - "grad_norm": 4.2495951652526855, - "learning_rate": 2.2077375539552764e-06, - "loss": 0.2056, - "step": 995 - }, - { - "epoch": 5.383783783783784, - "grad_norm": 3.5974740982055664, - "learning_rate": 2.203521692141568e-06, - "loss": 0.0437, - "step": 996 - }, - { - "epoch": 5.389189189189189, - "grad_norm": 4.290375232696533, - "learning_rate": 2.199306685294377e-06, - "loss": 0.1981, - "step": 997 - }, - { - "epoch": 5.394594594594595, - "grad_norm": 3.3619349002838135, - "learning_rate": 2.1950925455686906e-06, - "loss": 0.0756, - "step": 998 - }, - { - "epoch": 5.4, - "grad_norm": 2.673149585723877, - "learning_rate": 2.1908792851169954e-06, - "loss": 0.0998, - "step": 999 - }, - { - "epoch": 5.405405405405405, - "grad_norm": 2.308863401412964, - "learning_rate": 2.186666916089239e-06, - "loss": 0.0223, - "step": 1000 - }, - { - "epoch": 5.410810810810811, - "grad_norm": 2.606580972671509, - "learning_rate": 2.1824554506328033e-06, - "loss": 0.0489, - "step": 1001 - }, - { - "epoch": 5.416216216216216, - "grad_norm": 1.9544821977615356, - "learning_rate": 2.17824490089246e-06, - "loss": 0.0321, - "step": 1002 - }, - { - "epoch": 5.421621621621622, - "grad_norm": 2.374169111251831, - "learning_rate": 2.174035279010343e-06, - "loss": 0.0167, - "step": 1003 - }, - { - "epoch": 5.427027027027027, - "grad_norm": 2.8189785480499268, - "learning_rate": 2.1698265971259104e-06, - "loss": 0.0588, - "step": 1004 - }, - { - "epoch": 5.4324324324324325, - "grad_norm": 3.0042636394500732, - "learning_rate": 2.1656188673759065e-06, - "loss": 0.0868, - "step": 1005 - }, - { - "epoch": 5.437837837837838, - "grad_norm": 3.351011276245117, - "learning_rate": 2.1614121018943346e-06, - "loss": 0.1131, - "step": 1006 - }, - { - "epoch": 5.443243243243243, - "grad_norm": 1.8294633626937866, - "learning_rate": 2.1572063128124133e-06, - "loss": 0.0285, - "step": 1007 - }, - { - "epoch": 5.448648648648649, - "grad_norm": 2.9738781452178955, - "learning_rate": 2.153001512258548e-06, - "loss": 0.0303, - "step": 1008 - }, - { - "epoch": 5.454054054054054, - "grad_norm": 3.807075023651123, - "learning_rate": 2.1487977123582922e-06, - "loss": 0.3278, - "step": 1009 - }, - { - "epoch": 5.45945945945946, - "grad_norm": 2.4742624759674072, - "learning_rate": 2.144594925234314e-06, - "loss": 0.0346, - "step": 1010 - }, - { - "epoch": 5.464864864864865, - "grad_norm": 2.3810906410217285, - "learning_rate": 2.140393163006362e-06, - "loss": 0.0874, - "step": 1011 - }, - { - "epoch": 5.47027027027027, - "grad_norm": 2.964308738708496, - "learning_rate": 2.1361924377912266e-06, - "loss": 0.0194, - "step": 1012 - }, - { - "epoch": 5.475675675675676, - "grad_norm": 4.374764919281006, - "learning_rate": 2.1319927617027112e-06, - "loss": 0.1193, - "step": 1013 - }, - { - "epoch": 5.481081081081081, - "grad_norm": 2.9093267917633057, - "learning_rate": 2.1277941468515908e-06, - "loss": 0.0331, - "step": 1014 - }, - { - "epoch": 5.486486486486487, - "grad_norm": 3.3543128967285156, - "learning_rate": 2.123596605345582e-06, - "loss": 0.0723, - "step": 1015 - }, - { - "epoch": 5.491891891891892, - "grad_norm": 3.7927865982055664, - "learning_rate": 2.119400149289305e-06, - "loss": 0.0751, - "step": 1016 - }, - { - "epoch": 5.4972972972972975, - "grad_norm": 2.6409950256347656, - "learning_rate": 2.11520479078425e-06, - "loss": 0.0265, - "step": 1017 - }, - { - "epoch": 5.5027027027027025, - "grad_norm": 3.3015005588531494, - "learning_rate": 2.111010541928743e-06, - "loss": 0.1023, - "step": 1018 - }, - { - "epoch": 5.508108108108108, - "grad_norm": 3.591866970062256, - "learning_rate": 2.10681741481791e-06, - "loss": 0.0831, - "step": 1019 - }, - { - "epoch": 5.513513513513513, - "grad_norm": 3.2032251358032227, - "learning_rate": 2.1026254215436408e-06, - "loss": 0.1258, - "step": 1020 - }, - { - "epoch": 5.518918918918919, - "grad_norm": 2.9865293502807617, - "learning_rate": 2.098434574194557e-06, - "loss": 0.0926, - "step": 1021 - }, - { - "epoch": 5.524324324324324, - "grad_norm": 2.4018800258636475, - "learning_rate": 2.094244884855974e-06, - "loss": 0.0306, - "step": 1022 - }, - { - "epoch": 5.52972972972973, - "grad_norm": 3.0807738304138184, - "learning_rate": 2.0900563656098706e-06, - "loss": 0.1374, - "step": 1023 - }, - { - "epoch": 5.535135135135135, - "grad_norm": 3.1328487396240234, - "learning_rate": 2.085869028534848e-06, - "loss": 0.1173, - "step": 1024 - }, - { - "epoch": 5.54054054054054, - "grad_norm": 3.2709290981292725, - "learning_rate": 2.0816828857061e-06, - "loss": 0.146, - "step": 1025 - }, - { - "epoch": 5.545945945945946, - "grad_norm": 4.698089122772217, - "learning_rate": 2.077497949195378e-06, - "loss": 0.1542, - "step": 1026 - }, - { - "epoch": 5.551351351351351, - "grad_norm": 2.902589797973633, - "learning_rate": 2.073314231070951e-06, - "loss": 0.0699, - "step": 1027 - }, - { - "epoch": 5.556756756756757, - "grad_norm": 4.043124198913574, - "learning_rate": 2.069131743397578e-06, - "loss": 0.1429, - "step": 1028 - }, - { - "epoch": 5.562162162162162, - "grad_norm": 3.168281316757202, - "learning_rate": 2.0649504982364674e-06, - "loss": 0.1203, - "step": 1029 - }, - { - "epoch": 5.5675675675675675, - "grad_norm": 2.7638514041900635, - "learning_rate": 2.0607705076452465e-06, - "loss": 0.1078, - "step": 1030 - }, - { - "epoch": 5.572972972972973, - "grad_norm": 3.3716790676116943, - "learning_rate": 2.056591783677923e-06, - "loss": 0.0881, - "step": 1031 - }, - { - "epoch": 5.578378378378378, - "grad_norm": 3.6879029273986816, - "learning_rate": 2.0524143383848525e-06, - "loss": 0.0586, - "step": 1032 - }, - { - "epoch": 5.583783783783784, - "grad_norm": 5.253712177276611, - "learning_rate": 2.048238183812704e-06, - "loss": 0.3671, - "step": 1033 - }, - { - "epoch": 5.589189189189189, - "grad_norm": 3.237152099609375, - "learning_rate": 2.0440633320044224e-06, - "loss": 0.048, - "step": 1034 - }, - { - "epoch": 5.594594594594595, - "grad_norm": 3.8771812915802, - "learning_rate": 2.0398897949991992e-06, - "loss": 0.2091, - "step": 1035 - }, - { - "epoch": 5.6, - "grad_norm": 4.612788200378418, - "learning_rate": 2.0357175848324306e-06, - "loss": 0.1295, - "step": 1036 - }, - { - "epoch": 5.605405405405405, - "grad_norm": 3.0990102291107178, - "learning_rate": 2.031546713535688e-06, - "loss": 0.0504, - "step": 1037 - }, - { - "epoch": 5.610810810810811, - "grad_norm": 4.607776641845703, - "learning_rate": 2.027377193136684e-06, - "loss": 0.1816, - "step": 1038 - }, - { - "epoch": 5.616216216216216, - "grad_norm": 2.6812732219696045, - "learning_rate": 2.0232090356592333e-06, - "loss": 0.0392, - "step": 1039 - }, - { - "epoch": 5.621621621621622, - "grad_norm": 2.9481258392333984, - "learning_rate": 2.0190422531232186e-06, - "loss": 0.0273, - "step": 1040 - }, - { - "epoch": 5.627027027027027, - "grad_norm": 2.7125625610351562, - "learning_rate": 2.014876857544562e-06, - "loss": 0.0672, - "step": 1041 - }, - { - "epoch": 5.632432432432433, - "grad_norm": 3.4124906063079834, - "learning_rate": 2.0107128609351817e-06, - "loss": 0.0749, - "step": 1042 - }, - { - "epoch": 5.6378378378378375, - "grad_norm": 2.9229767322540283, - "learning_rate": 2.006550275302965e-06, - "loss": 0.0713, - "step": 1043 - }, - { - "epoch": 5.643243243243243, - "grad_norm": 3.2177693843841553, - "learning_rate": 2.002389112651728e-06, - "loss": 0.0547, - "step": 1044 - }, - { - "epoch": 5.648648648648649, - "grad_norm": 2.5188214778900146, - "learning_rate": 1.9982293849811852e-06, - "loss": 0.0304, - "step": 1045 - }, - { - "epoch": 5.654054054054054, - "grad_norm": 2.8611507415771484, - "learning_rate": 1.994071104286911e-06, - "loss": 0.0227, - "step": 1046 - }, - { - "epoch": 5.65945945945946, - "grad_norm": 2.2558059692382812, - "learning_rate": 1.9899142825603078e-06, - "loss": 0.0811, - "step": 1047 - }, - { - "epoch": 5.664864864864865, - "grad_norm": 2.3414204120635986, - "learning_rate": 1.9857589317885727e-06, - "loss": 0.0292, - "step": 1048 - }, - { - "epoch": 5.6702702702702705, - "grad_norm": 2.4263527393341064, - "learning_rate": 1.9816050639546566e-06, - "loss": 0.0386, - "step": 1049 - }, - { - "epoch": 5.675675675675675, - "grad_norm": 3.6473093032836914, - "learning_rate": 1.977452691037239e-06, - "loss": 0.1448, - "step": 1050 - }, - { - "epoch": 5.681081081081081, - "grad_norm": 2.8061227798461914, - "learning_rate": 1.973301825010685e-06, - "loss": 0.0451, - "step": 1051 - }, - { - "epoch": 5.686486486486486, - "grad_norm": 2.5342822074890137, - "learning_rate": 1.9691524778450145e-06, - "loss": 0.0708, - "step": 1052 - }, - { - "epoch": 5.691891891891892, - "grad_norm": 2.632966995239258, - "learning_rate": 1.96500466150587e-06, - "loss": 0.0311, - "step": 1053 - }, - { - "epoch": 5.697297297297297, - "grad_norm": 2.9255290031433105, - "learning_rate": 1.960858387954476e-06, - "loss": 0.0728, - "step": 1054 - }, - { - "epoch": 5.702702702702703, - "grad_norm": 3.292577028274536, - "learning_rate": 1.956713669147611e-06, - "loss": 0.1429, - "step": 1055 - }, - { - "epoch": 5.708108108108108, - "grad_norm": 2.7926251888275146, - "learning_rate": 1.9525705170375674e-06, - "loss": 0.0702, - "step": 1056 - }, - { - "epoch": 5.713513513513513, - "grad_norm": 2.432650089263916, - "learning_rate": 1.948428943572121e-06, - "loss": 0.0934, - "step": 1057 - }, - { - "epoch": 5.718918918918919, - "grad_norm": 2.3497097492218018, - "learning_rate": 1.944288960694497e-06, - "loss": 0.0327, - "step": 1058 - }, - { - "epoch": 5.724324324324324, - "grad_norm": 2.9069294929504395, - "learning_rate": 1.9401505803433308e-06, - "loss": 0.1025, - "step": 1059 - }, - { - "epoch": 5.72972972972973, - "grad_norm": 3.2904415130615234, - "learning_rate": 1.9360138144526363e-06, - "loss": 0.0825, - "step": 1060 - }, - { - "epoch": 5.735135135135135, - "grad_norm": 3.0035643577575684, - "learning_rate": 1.9318786749517754e-06, - "loss": 0.164, - "step": 1061 - }, - { - "epoch": 5.7405405405405405, - "grad_norm": 3.595271110534668, - "learning_rate": 1.9277451737654154e-06, - "loss": 0.0574, - "step": 1062 - }, - { - "epoch": 5.745945945945946, - "grad_norm": 3.5074777603149414, - "learning_rate": 1.923613322813503e-06, - "loss": 0.2916, - "step": 1063 - }, - { - "epoch": 5.751351351351351, - "grad_norm": 2.7535500526428223, - "learning_rate": 1.9194831340112228e-06, - "loss": 0.0626, - "step": 1064 - }, - { - "epoch": 5.756756756756757, - "grad_norm": 2.958237886428833, - "learning_rate": 1.915354619268969e-06, - "loss": 0.0544, - "step": 1065 - }, - { - "epoch": 5.762162162162162, - "grad_norm": 2.9726474285125732, - "learning_rate": 1.9112277904923064e-06, - "loss": 0.0145, - "step": 1066 - }, - { - "epoch": 5.767567567567568, - "grad_norm": 2.744746446609497, - "learning_rate": 1.9071026595819387e-06, - "loss": 0.0335, - "step": 1067 - }, - { - "epoch": 5.772972972972973, - "grad_norm": 3.1849920749664307, - "learning_rate": 1.902979238433673e-06, - "loss": 0.1385, - "step": 1068 - }, - { - "epoch": 5.778378378378378, - "grad_norm": 2.9969868659973145, - "learning_rate": 1.8988575389383853e-06, - "loss": 0.0523, - "step": 1069 - }, - { - "epoch": 5.783783783783784, - "grad_norm": 3.8293309211730957, - "learning_rate": 1.8947375729819894e-06, - "loss": 0.171, - "step": 1070 - }, - { - "epoch": 5.789189189189189, - "grad_norm": 2.845538854598999, - "learning_rate": 1.8906193524453964e-06, - "loss": 0.0431, - "step": 1071 - }, - { - "epoch": 5.794594594594595, - "grad_norm": 1.819235920906067, - "learning_rate": 1.886502889204487e-06, - "loss": 0.0157, - "step": 1072 - }, - { - "epoch": 5.8, - "grad_norm": 3.492358684539795, - "learning_rate": 1.882388195130073e-06, - "loss": 0.0892, - "step": 1073 - }, - { - "epoch": 5.805405405405406, - "grad_norm": 2.1627602577209473, - "learning_rate": 1.8782752820878636e-06, - "loss": 0.0376, - "step": 1074 - }, - { - "epoch": 5.8108108108108105, - "grad_norm": 3.6203341484069824, - "learning_rate": 1.8741641619384343e-06, - "loss": 0.1174, - "step": 1075 - }, - { - "epoch": 5.816216216216216, - "grad_norm": 2.3573997020721436, - "learning_rate": 1.8700548465371877e-06, - "loss": 0.0191, - "step": 1076 - }, - { - "epoch": 5.821621621621622, - "grad_norm": 3.5267531871795654, - "learning_rate": 1.8659473477343233e-06, - "loss": 0.1243, - "step": 1077 - }, - { - "epoch": 5.827027027027027, - "grad_norm": 3.5826189517974854, - "learning_rate": 1.8618416773748032e-06, - "loss": 0.1457, - "step": 1078 - }, - { - "epoch": 5.832432432432433, - "grad_norm": 2.7825980186462402, - "learning_rate": 1.8577378472983148e-06, - "loss": 0.0366, - "step": 1079 - }, - { - "epoch": 5.837837837837838, - "grad_norm": 2.7613232135772705, - "learning_rate": 1.8536358693392398e-06, - "loss": 0.065, - "step": 1080 - }, - { - "epoch": 5.8432432432432435, - "grad_norm": 3.1205132007598877, - "learning_rate": 1.8495357553266176e-06, - "loss": 0.1902, - "step": 1081 - }, - { - "epoch": 5.848648648648648, - "grad_norm": 2.7488930225372314, - "learning_rate": 1.8454375170841133e-06, - "loss": 0.0372, - "step": 1082 - }, - { - "epoch": 5.854054054054054, - "grad_norm": 3.496779441833496, - "learning_rate": 1.841341166429983e-06, - "loss": 0.0942, - "step": 1083 - }, - { - "epoch": 5.859459459459459, - "grad_norm": 3.724827527999878, - "learning_rate": 1.8372467151770391e-06, - "loss": 0.2317, - "step": 1084 - }, - { - "epoch": 5.864864864864865, - "grad_norm": 4.659550666809082, - "learning_rate": 1.8331541751326168e-06, - "loss": 0.1935, - "step": 1085 - }, - { - "epoch": 5.87027027027027, - "grad_norm": 4.368297100067139, - "learning_rate": 1.8290635580985395e-06, - "loss": 0.0905, - "step": 1086 - }, - { - "epoch": 5.875675675675676, - "grad_norm": 2.669170618057251, - "learning_rate": 1.8249748758710856e-06, - "loss": 0.0931, - "step": 1087 - }, - { - "epoch": 5.881081081081081, - "grad_norm": 2.9962668418884277, - "learning_rate": 1.8208881402409542e-06, - "loss": 0.0878, - "step": 1088 - }, - { - "epoch": 5.886486486486486, - "grad_norm": 4.08193302154541, - "learning_rate": 1.8168033629932296e-06, - "loss": 0.1317, - "step": 1089 - }, - { - "epoch": 5.891891891891892, - "grad_norm": 3.038261651992798, - "learning_rate": 1.8127205559073507e-06, - "loss": 0.027, - "step": 1090 - }, - { - "epoch": 5.897297297297297, - "grad_norm": 3.1188318729400635, - "learning_rate": 1.8086397307570724e-06, - "loss": 0.0872, - "step": 1091 - }, - { - "epoch": 5.902702702702703, - "grad_norm": 3.2329025268554688, - "learning_rate": 1.8045608993104373e-06, - "loss": 0.0821, - "step": 1092 - }, - { - "epoch": 5.908108108108108, - "grad_norm": 3.268589735031128, - "learning_rate": 1.8004840733297365e-06, - "loss": 0.0327, - "step": 1093 - }, - { - "epoch": 5.9135135135135135, - "grad_norm": 2.68831729888916, - "learning_rate": 1.7964092645714777e-06, - "loss": 0.0497, - "step": 1094 - }, - { - "epoch": 5.918918918918919, - "grad_norm": 2.5666730403900146, - "learning_rate": 1.7923364847863527e-06, - "loss": 0.0307, - "step": 1095 - }, - { - "epoch": 5.924324324324324, - "grad_norm": 4.285571098327637, - "learning_rate": 1.7882657457192015e-06, - "loss": 0.0897, - "step": 1096 - }, - { - "epoch": 5.92972972972973, - "grad_norm": 4.338192939758301, - "learning_rate": 1.784197059108979e-06, - "loss": 0.1545, - "step": 1097 - }, - { - "epoch": 5.935135135135135, - "grad_norm": 3.0083415508270264, - "learning_rate": 1.7801304366887235e-06, - "loss": 0.0509, - "step": 1098 - }, - { - "epoch": 5.940540540540541, - "grad_norm": 5.343819618225098, - "learning_rate": 1.776065890185517e-06, - "loss": 0.0821, - "step": 1099 - }, - { - "epoch": 5.945945945945946, - "grad_norm": 2.2563998699188232, - "learning_rate": 1.7720034313204582e-06, - "loss": 0.0182, - "step": 1100 - }, - { - "epoch": 5.951351351351351, - "grad_norm": 3.2145767211914062, - "learning_rate": 1.7679430718086244e-06, - "loss": 0.1027, - "step": 1101 - }, - { - "epoch": 5.956756756756757, - "grad_norm": 3.159283399581909, - "learning_rate": 1.763884823359038e-06, - "loss": 0.0413, - "step": 1102 - }, - { - "epoch": 5.962162162162162, - "grad_norm": 3.57746958732605, - "learning_rate": 1.759828697674636e-06, - "loss": 0.1079, - "step": 1103 - }, - { - "epoch": 5.967567567567568, - "grad_norm": 2.7590816020965576, - "learning_rate": 1.7557747064522312e-06, - "loss": 0.0952, - "step": 1104 - }, - { - "epoch": 5.972972972972973, - "grad_norm": 4.943508148193359, - "learning_rate": 1.7517228613824836e-06, - "loss": 0.3393, - "step": 1105 - }, - { - "epoch": 5.978378378378379, - "grad_norm": 1.6088807582855225, - "learning_rate": 1.747673174149862e-06, - "loss": 0.0207, - "step": 1106 - }, - { - "epoch": 5.9837837837837835, - "grad_norm": 3.843369483947754, - "learning_rate": 1.743625656432615e-06, - "loss": 0.1708, - "step": 1107 - }, - { - "epoch": 5.989189189189189, - "grad_norm": 2.520202159881592, - "learning_rate": 1.7395803199027325e-06, - "loss": 0.0569, - "step": 1108 - }, - { - "epoch": 5.994594594594595, - "grad_norm": 4.245851993560791, - "learning_rate": 1.7355371762259155e-06, - "loss": 0.0861, - "step": 1109 - }, - { - "epoch": 6.0, - "grad_norm": 2.90023136138916, - "learning_rate": 1.7314962370615423e-06, - "loss": 0.0571, - "step": 1110 - } - ], - "logging_steps": 1, - "max_steps": 1850, - "num_input_tokens_seen": 0, - "num_train_epochs": 10, - "save_steps": 206, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 2.994751443912622e+17, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/chat_template.jinja b/metallama3_8b/limo_filtered_incorrect/checkpoint-185/chat_template.jinja deleted file mode 100644 index 39bd0c9f7fe30aea14eda194fee17703da4a4dbf..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/chat_template.jinja +++ /dev/null @@ -1,5 +0,0 @@ -{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> - -'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> - -' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/config.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-185/config.json deleted file mode 100644 index ec5612543540085e09eed37e81b17ae51d1a6973..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 128000, - "eos_token_id": 128009, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 8192, - "mlp_bias": false, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 500000.0, - "tie_word_embeddings": false, - "torch_dtype": "float32", - "transformers_version": "4.55.0", - "use_cache": false, - "vocab_size": 128256 -} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/generation_config.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-185/generation_config.json deleted file mode 100644 index f53ccb516e57388491adda6b9950bcfa872e93ae..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/generation_config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 128000, - "eos_token_id": 128009, - "transformers_version": "4.55.0", - "use_cache": false -} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/model-00001-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-185/model-00001-of-00007.safetensors deleted file mode 100644 index 995c3e4b0bb2ca62187a0a8beb23a571f23cdf8e..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/model-00001-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a63f3797412130dd706e0a66e8125393bc20f01b0e80f812afc3f7893fe9e653 -size 4886466168 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/model-00002-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-185/model-00002-of-00007.safetensors deleted file mode 100644 index 78ef8cce7696e51d40d6d745511f897b63bc1f86..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/model-00002-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4118addc3fecfab9f1f6681cdbeeac154a45084cde65544b5da37311fcede7f1 -size 4832007448 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/model-00003-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-185/model-00003-of-00007.safetensors deleted file mode 100644 index 020d25258b890924c8009add42c4eaca285aa4ba..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/model-00003-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:45ac47800861415144b221cb5d03ddbcc5210a6a6c95626d66e25da47e663cb3 -size 4999813112 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/model-00004-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-185/model-00004-of-00007.safetensors deleted file mode 100644 index 7c9c8611bb6837e75925b25706e72e3877c38f0d..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/model-00004-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ba6f90434a6121636d28646e1a60986357c0050ff8c0550e7454bc913a640d64 -size 4999813128 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/model-00005-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-185/model-00005-of-00007.safetensors deleted file mode 100644 index 59a5b0b6e50d03f7754fd51335a312d209598115..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/model-00005-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fdec2c45029ded41d1a7fa2b0cc862610f52490452130c5647a5a1531f6cbf13 -size 4832007496 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/model-00006-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-185/model-00006-of-00007.safetensors deleted file mode 100644 index 9550a8352d519dcbb9f4ff7d82becfef2527d2d8..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/model-00006-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fa2e07c94e705e81fd05f2f57d4405030c62231e7c19e84d0c34f4166bcf8b09 -size 4999813120 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/model-00007-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-185/model-00007-of-00007.safetensors deleted file mode 100644 index 6e711e9d91485f2f51db928d6869ee35fc376dcb..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/model-00007-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8484486ecd1ed6a26b70dfe7e51b2b2e2ea3b86a86b4f5ebc81db75cd1b2fc1c -size 2571158184 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/model.safetensors.index.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-185/model.safetensors.index.json deleted file mode 100644 index 30d31d54f352f0c71ad48745af612a088822fa48..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/model.safetensors.index.json +++ /dev/null @@ -1,299 +0,0 @@ -{ - "metadata": { - "total_parameters": 2007565312, - "total_size": 32121044992 - }, - "weight_map": { - "lm_head.weight": "model-00007-of-00007.safetensors", - "model.embed_tokens.weight": "model-00001-of-00007.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.norm.weight": "model-00007-of-00007.safetensors" - } -} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/rng_state_0.pth b/metallama3_8b/limo_filtered_incorrect/checkpoint-185/rng_state_0.pth deleted file mode 100644 index 9c287de26f76b389db025ad109f0595b0b77fd22..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:92cc13315f24c28015d695b6cde08bb1cd6fea4cbc435998485ed6fbe4c91285 -size 15024 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/rng_state_1.pth b/metallama3_8b/limo_filtered_incorrect/checkpoint-185/rng_state_1.pth deleted file mode 100644 index 132db267a0f5617620f48bc8eab9cc37a9aea13a..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f4c154b6a63e0b1f98f7d2847944398f99f1657d35e8eddf7fdf0ae2c24b0552 -size 15024 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/rng_state_2.pth b/metallama3_8b/limo_filtered_incorrect/checkpoint-185/rng_state_2.pth deleted file mode 100644 index e85bf2eceab47cefd59df592648941c61c84eab1..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f784c6a9507b51189f2caffbd178ea9882103b75852e31c15f47fdae6a43af1d -size 15024 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/rng_state_3.pth b/metallama3_8b/limo_filtered_incorrect/checkpoint-185/rng_state_3.pth deleted file mode 100644 index 423bb6c008eeb6875c659dd108c5f003758dbcb9..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:34b023e05bc2d12b91dc436d4922b990d50ec8dc56d40dc3e36b3bb34fc81341 -size 15024 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/scheduler.pt b/metallama3_8b/limo_filtered_incorrect/checkpoint-185/scheduler.pt deleted file mode 100644 index f654465e189dfc97a01ffb57a0ee39690d5412b9..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ca3e0d9eb722cd188c542e1d81e8aaf99e1f9cbd0560af49f372603df303d0d9 -size 1064 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/special_tokens_map.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-185/special_tokens_map.json deleted file mode 100644 index 14daf4588e61b4e4983af0fccaba4d5500c0977c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/special_tokens_map.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "additional_special_tokens": [ - { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } - ], - "bos_token": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": "<|eot_id|>" -} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/tokenizer.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-185/tokenizer.json deleted file mode 100644 index 172311123ab62378f1f6d90f3068a676b7d939ed..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c1dcab308e7cf5970ea38815e0a62887d705c5b436f869ca27a5dcdd40c36a6 -size 17210148 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/tokenizer_config.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-185/tokenizer_config.json deleted file mode 100644 index 6739fcd129e717b71b64001dcb25a03c143d66f5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/tokenizer_config.json +++ /dev/null @@ -1,2076 +0,0 @@ -{ - "added_tokens_decoder": { - "128000": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128001": { - "content": "<|end_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128002": { - "content": "<|reserved_special_token_0|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128003": { - "content": "<|reserved_special_token_1|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128004": { - "content": "<|reserved_special_token_2|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128005": { - "content": "<|reserved_special_token_3|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128006": { - "content": "<|start_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128007": { - "content": "<|end_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128008": { - "content": "<|reserved_special_token_4|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128009": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128010": { - "content": "<|reserved_special_token_5|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128011": { - "content": "<|reserved_special_token_6|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128012": { - "content": "<|reserved_special_token_7|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128013": { - "content": "<|reserved_special_token_8|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128014": { - "content": "<|reserved_special_token_9|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128015": { - "content": "<|reserved_special_token_10|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128016": { - "content": "<|reserved_special_token_11|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128017": { - "content": "<|reserved_special_token_12|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128018": { - "content": "<|reserved_special_token_13|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128019": { - "content": "<|reserved_special_token_14|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128020": { - "content": "<|reserved_special_token_15|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128021": { - "content": "<|reserved_special_token_16|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128022": { - "content": "<|reserved_special_token_17|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128023": { - "content": "<|reserved_special_token_18|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128024": { - "content": "<|reserved_special_token_19|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128025": { - "content": "<|reserved_special_token_20|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128026": { - "content": "<|reserved_special_token_21|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128027": { - "content": "<|reserved_special_token_22|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128028": { - "content": "<|reserved_special_token_23|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128029": { - "content": "<|reserved_special_token_24|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128030": { - "content": "<|reserved_special_token_25|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128031": { - "content": "<|reserved_special_token_26|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128032": { - "content": "<|reserved_special_token_27|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128033": { - "content": "<|reserved_special_token_28|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128034": { - "content": "<|reserved_special_token_29|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128035": { - "content": "<|reserved_special_token_30|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128036": { - "content": "<|reserved_special_token_31|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128037": { - "content": "<|reserved_special_token_32|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128038": { - "content": "<|reserved_special_token_33|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128039": { - "content": "<|reserved_special_token_34|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128040": { - "content": "<|reserved_special_token_35|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128041": { - "content": "<|reserved_special_token_36|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128042": { - "content": "<|reserved_special_token_37|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128043": { - "content": "<|reserved_special_token_38|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128044": { - "content": "<|reserved_special_token_39|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128045": { - "content": "<|reserved_special_token_40|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128046": { - "content": "<|reserved_special_token_41|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128047": { - "content": "<|reserved_special_token_42|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128048": { - "content": "<|reserved_special_token_43|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128049": { - "content": "<|reserved_special_token_44|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128050": { - "content": "<|reserved_special_token_45|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128051": { - "content": "<|reserved_special_token_46|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128052": { - "content": "<|reserved_special_token_47|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128053": { - "content": "<|reserved_special_token_48|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128054": { - "content": "<|reserved_special_token_49|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128055": { - "content": "<|reserved_special_token_50|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128056": { - "content": "<|reserved_special_token_51|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128057": { - "content": "<|reserved_special_token_52|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128058": { - "content": "<|reserved_special_token_53|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128059": { - "content": "<|reserved_special_token_54|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128060": { - "content": "<|reserved_special_token_55|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128061": { - "content": "<|reserved_special_token_56|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128062": { - "content": "<|reserved_special_token_57|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128063": { - "content": "<|reserved_special_token_58|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128064": { - "content": "<|reserved_special_token_59|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128065": { - "content": "<|reserved_special_token_60|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128066": { - "content": "<|reserved_special_token_61|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128067": { - "content": "<|reserved_special_token_62|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128068": { - "content": "<|reserved_special_token_63|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128069": { - "content": "<|reserved_special_token_64|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128070": { - "content": "<|reserved_special_token_65|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128071": { - "content": "<|reserved_special_token_66|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128072": { - "content": "<|reserved_special_token_67|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128073": { - "content": "<|reserved_special_token_68|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128074": { - "content": "<|reserved_special_token_69|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128075": { - "content": "<|reserved_special_token_70|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128076": { - "content": "<|reserved_special_token_71|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128077": { - "content": "<|reserved_special_token_72|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128078": { - "content": "<|reserved_special_token_73|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128079": { - "content": "<|reserved_special_token_74|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128080": { - "content": "<|reserved_special_token_75|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128081": { - "content": "<|reserved_special_token_76|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128082": { - "content": "<|reserved_special_token_77|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128083": { - "content": "<|reserved_special_token_78|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128084": { - "content": "<|reserved_special_token_79|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128085": { - "content": "<|reserved_special_token_80|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128086": { - "content": "<|reserved_special_token_81|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128087": { - "content": "<|reserved_special_token_82|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128088": { - "content": "<|reserved_special_token_83|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128089": { - "content": "<|reserved_special_token_84|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128090": { - "content": "<|reserved_special_token_85|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128091": { - "content": "<|reserved_special_token_86|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128092": { - "content": "<|reserved_special_token_87|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128093": { - "content": "<|reserved_special_token_88|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128094": { - "content": "<|reserved_special_token_89|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128095": { - "content": "<|reserved_special_token_90|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128096": { - "content": "<|reserved_special_token_91|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128097": { - "content": "<|reserved_special_token_92|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128098": { - "content": "<|reserved_special_token_93|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128099": { - "content": "<|reserved_special_token_94|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128100": { - "content": "<|reserved_special_token_95|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128101": { - "content": "<|reserved_special_token_96|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128102": { - "content": "<|reserved_special_token_97|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128103": { - "content": "<|reserved_special_token_98|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128104": { - "content": "<|reserved_special_token_99|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128105": { - "content": "<|reserved_special_token_100|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128106": { - "content": "<|reserved_special_token_101|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128107": { - "content": "<|reserved_special_token_102|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128108": { - "content": "<|reserved_special_token_103|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128109": { - "content": "<|reserved_special_token_104|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128110": { - "content": "<|reserved_special_token_105|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128111": { - "content": "<|reserved_special_token_106|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128112": { - "content": "<|reserved_special_token_107|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128113": { - "content": "<|reserved_special_token_108|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128114": { - "content": "<|reserved_special_token_109|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128115": { - "content": "<|reserved_special_token_110|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128116": { - "content": "<|reserved_special_token_111|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128117": { - "content": "<|reserved_special_token_112|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128118": { - "content": "<|reserved_special_token_113|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128119": { - "content": "<|reserved_special_token_114|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128120": { - "content": "<|reserved_special_token_115|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128121": { - "content": "<|reserved_special_token_116|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128122": { - "content": "<|reserved_special_token_117|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128123": { - "content": "<|reserved_special_token_118|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128124": { - "content": "<|reserved_special_token_119|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128125": { - "content": "<|reserved_special_token_120|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128126": { - "content": "<|reserved_special_token_121|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128127": { - "content": "<|reserved_special_token_122|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128128": { - "content": "<|reserved_special_token_123|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128129": { - "content": "<|reserved_special_token_124|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128130": { - "content": "<|reserved_special_token_125|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128131": { - "content": "<|reserved_special_token_126|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128132": { - "content": "<|reserved_special_token_127|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128133": { - "content": "<|reserved_special_token_128|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128134": { - "content": "<|reserved_special_token_129|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128135": { - "content": "<|reserved_special_token_130|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128136": { - "content": "<|reserved_special_token_131|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128137": { - "content": "<|reserved_special_token_132|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128138": { - "content": "<|reserved_special_token_133|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128139": { - "content": "<|reserved_special_token_134|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128140": { - "content": "<|reserved_special_token_135|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128141": { - "content": "<|reserved_special_token_136|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128142": { - "content": "<|reserved_special_token_137|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128143": { - "content": "<|reserved_special_token_138|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128144": { - "content": "<|reserved_special_token_139|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128145": { - "content": "<|reserved_special_token_140|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128146": { - "content": "<|reserved_special_token_141|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128147": { - "content": "<|reserved_special_token_142|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128148": { - "content": "<|reserved_special_token_143|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128149": { - "content": "<|reserved_special_token_144|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128150": { - "content": "<|reserved_special_token_145|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128151": { - "content": "<|reserved_special_token_146|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128152": { - "content": "<|reserved_special_token_147|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128153": { - "content": "<|reserved_special_token_148|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128154": { - "content": "<|reserved_special_token_149|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128155": { - "content": "<|reserved_special_token_150|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128156": { - "content": "<|reserved_special_token_151|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128157": { - "content": "<|reserved_special_token_152|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128158": { - "content": "<|reserved_special_token_153|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128159": { - "content": "<|reserved_special_token_154|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128160": { - "content": "<|reserved_special_token_155|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128161": { - "content": "<|reserved_special_token_156|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128162": { - "content": "<|reserved_special_token_157|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128163": { - "content": "<|reserved_special_token_158|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128164": { - "content": "<|reserved_special_token_159|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128165": { - "content": "<|reserved_special_token_160|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128166": { - "content": "<|reserved_special_token_161|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128167": { - "content": "<|reserved_special_token_162|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128168": { - "content": "<|reserved_special_token_163|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128169": { - "content": "<|reserved_special_token_164|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128170": { - "content": "<|reserved_special_token_165|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128171": { - "content": "<|reserved_special_token_166|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128172": { - "content": "<|reserved_special_token_167|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128173": { - "content": "<|reserved_special_token_168|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128174": { - "content": "<|reserved_special_token_169|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128175": { - "content": "<|reserved_special_token_170|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128176": { - "content": "<|reserved_special_token_171|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128177": { - "content": "<|reserved_special_token_172|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128178": { - "content": "<|reserved_special_token_173|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128179": { - "content": "<|reserved_special_token_174|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128180": { - "content": "<|reserved_special_token_175|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128181": { - "content": "<|reserved_special_token_176|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128182": { - "content": "<|reserved_special_token_177|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128183": { - "content": "<|reserved_special_token_178|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128184": { - "content": "<|reserved_special_token_179|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128185": { - "content": "<|reserved_special_token_180|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128186": { - "content": "<|reserved_special_token_181|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128187": { - "content": "<|reserved_special_token_182|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128188": { - "content": "<|reserved_special_token_183|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128189": { - "content": "<|reserved_special_token_184|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128190": { - "content": "<|reserved_special_token_185|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128191": { - "content": "<|reserved_special_token_186|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128192": { - "content": "<|reserved_special_token_187|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128193": { - "content": "<|reserved_special_token_188|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128194": { - "content": "<|reserved_special_token_189|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128195": { - "content": "<|reserved_special_token_190|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128196": { - "content": "<|reserved_special_token_191|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128197": { - "content": "<|reserved_special_token_192|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128198": { - "content": "<|reserved_special_token_193|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128199": { - "content": "<|reserved_special_token_194|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128200": { - "content": "<|reserved_special_token_195|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128201": { - "content": "<|reserved_special_token_196|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128202": { - "content": "<|reserved_special_token_197|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128203": { - "content": "<|reserved_special_token_198|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128204": { - "content": "<|reserved_special_token_199|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128205": { - "content": "<|reserved_special_token_200|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128206": { - "content": "<|reserved_special_token_201|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128207": { - "content": "<|reserved_special_token_202|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128208": { - "content": "<|reserved_special_token_203|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128209": { - "content": "<|reserved_special_token_204|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128210": { - "content": "<|reserved_special_token_205|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128211": { - "content": "<|reserved_special_token_206|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128212": { - "content": "<|reserved_special_token_207|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128213": { - "content": "<|reserved_special_token_208|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128214": { - "content": "<|reserved_special_token_209|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128215": { - "content": "<|reserved_special_token_210|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128216": { - "content": "<|reserved_special_token_211|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128217": { - "content": "<|reserved_special_token_212|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128218": { - "content": "<|reserved_special_token_213|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128219": { - "content": "<|reserved_special_token_214|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128220": { - "content": "<|reserved_special_token_215|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128221": { - "content": "<|reserved_special_token_216|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128222": { - "content": "<|reserved_special_token_217|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128223": { - "content": "<|reserved_special_token_218|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128224": { - "content": "<|reserved_special_token_219|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128225": { - "content": "<|reserved_special_token_220|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128226": { - "content": "<|reserved_special_token_221|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128227": { - "content": "<|reserved_special_token_222|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128228": { - "content": "<|reserved_special_token_223|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128229": { - "content": "<|reserved_special_token_224|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128230": { - "content": "<|reserved_special_token_225|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128231": { - "content": "<|reserved_special_token_226|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128232": { - "content": "<|reserved_special_token_227|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128233": { - "content": "<|reserved_special_token_228|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128234": { - "content": "<|reserved_special_token_229|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128235": { - "content": "<|reserved_special_token_230|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128236": { - "content": "<|reserved_special_token_231|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128237": { - "content": "<|reserved_special_token_232|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128238": { - "content": "<|reserved_special_token_233|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128239": { - "content": "<|reserved_special_token_234|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128240": { - "content": "<|reserved_special_token_235|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128241": { - "content": "<|reserved_special_token_236|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128242": { - "content": "<|reserved_special_token_237|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128243": { - "content": "<|reserved_special_token_238|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128244": { - "content": "<|reserved_special_token_239|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128245": { - "content": "<|reserved_special_token_240|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128246": { - "content": "<|reserved_special_token_241|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128247": { - "content": "<|reserved_special_token_242|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128248": { - "content": "<|reserved_special_token_243|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128249": { - "content": "<|reserved_special_token_244|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128250": { - "content": "<|reserved_special_token_245|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128251": { - "content": "<|reserved_special_token_246|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128252": { - "content": "<|reserved_special_token_247|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128253": { - "content": "<|reserved_special_token_248|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128254": { - "content": "<|reserved_special_token_249|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128255": { - "content": "<|reserved_special_token_250|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128256": { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - } - }, - "additional_special_tokens": [ - "<|eom_id|>" - ], - "bos_token": "<|begin_of_text|>", - "clean_up_tokenization_spaces": true, - "eos_token": "<|eot_id|>", - "extra_special_tokens": {}, - "model_input_names": [ - "input_ids", - "attention_mask" - ], - "model_max_length": 1000000000000000019884624838656, - "pad_token": "<|eot_id|>", - "padding_side": "right", - "split_special_tokens": false, - "tokenizer_class": "PreTrainedTokenizerFast" -} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/trainer_state.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-185/trainer_state.json deleted file mode 100644 index c6ed42ff1e9e62408761322ef4f95258eeeece1e..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-185/trainer_state.json +++ /dev/null @@ -1,1329 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 1.0, - "eval_steps": 500, - "global_step": 185, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.005405405405405406, - "grad_norm": 72.60939025878906, - "learning_rate": 5e-06, - "loss": 2.9165, - "step": 1 - }, - { - "epoch": 0.010810810810810811, - "grad_norm": 29.01830291748047, - "learning_rate": 4.999996395324314e-06, - "loss": 1.9314, - "step": 2 - }, - { - "epoch": 0.016216216216216217, - "grad_norm": 21.44908332824707, - "learning_rate": 4.99998558130765e-06, - "loss": 1.5709, - "step": 3 - }, - { - "epoch": 0.021621621621621623, - "grad_norm": 4.490907669067383, - "learning_rate": 4.999967557981192e-06, - "loss": 0.8099, - "step": 4 - }, - { - "epoch": 0.02702702702702703, - "grad_norm": 4.000796794891357, - "learning_rate": 4.999942325396917e-06, - "loss": 0.9021, - "step": 5 - }, - { - "epoch": 0.032432432432432434, - "grad_norm": 18.513282775878906, - "learning_rate": 4.999909883627588e-06, - "loss": 1.7972, - "step": 6 - }, - { - "epoch": 0.03783783783783784, - "grad_norm": 3.5735981464385986, - "learning_rate": 4.999870232766757e-06, - "loss": 1.4306, - "step": 7 - }, - { - "epoch": 0.043243243243243246, - "grad_norm": 3.1145193576812744, - "learning_rate": 4.9998233729287696e-06, - "loss": 1.051, - "step": 8 - }, - { - "epoch": 0.04864864864864865, - "grad_norm": 3.856376886367798, - "learning_rate": 4.999769304248755e-06, - "loss": 0.8089, - "step": 9 - }, - { - "epoch": 0.05405405405405406, - "grad_norm": 4.05589485168457, - "learning_rate": 4.9997080268826344e-06, - "loss": 1.0999, - "step": 10 - }, - { - "epoch": 0.05945945945945946, - "grad_norm": 13.784229278564453, - "learning_rate": 4.9996395410071165e-06, - "loss": 1.2831, - "step": 11 - }, - { - "epoch": 0.06486486486486487, - "grad_norm": 6.079237937927246, - "learning_rate": 4.999563846819696e-06, - "loss": 1.2874, - "step": 12 - }, - { - "epoch": 0.07027027027027027, - "grad_norm": 4.5971245765686035, - "learning_rate": 4.999480944538655e-06, - "loss": 0.96, - "step": 13 - }, - { - "epoch": 0.07567567567567568, - "grad_norm": 4.916017532348633, - "learning_rate": 4.999390834403063e-06, - "loss": 0.9869, - "step": 14 - }, - { - "epoch": 0.08108108108108109, - "grad_norm": 3.2311055660247803, - "learning_rate": 4.999293516672773e-06, - "loss": 0.9293, - "step": 15 - }, - { - "epoch": 0.08648648648648649, - "grad_norm": 3.3040921688079834, - "learning_rate": 4.9991889916284255e-06, - "loss": 0.8914, - "step": 16 - }, - { - "epoch": 0.0918918918918919, - "grad_norm": 3.794267416000366, - "learning_rate": 4.999077259571442e-06, - "loss": 1.0176, - "step": 17 - }, - { - "epoch": 0.0972972972972973, - "grad_norm": 4.788509845733643, - "learning_rate": 4.998958320824031e-06, - "loss": 1.0259, - "step": 18 - }, - { - "epoch": 0.10270270270270271, - "grad_norm": 10.027527809143066, - "learning_rate": 4.998832175729179e-06, - "loss": 1.3356, - "step": 19 - }, - { - "epoch": 0.10810810810810811, - "grad_norm": 4.612483978271484, - "learning_rate": 4.998698824650656e-06, - "loss": 1.4486, - "step": 20 - }, - { - "epoch": 0.11351351351351352, - "grad_norm": 3.8676936626434326, - "learning_rate": 4.998558267973014e-06, - "loss": 0.8372, - "step": 21 - }, - { - "epoch": 0.11891891891891893, - "grad_norm": 2.9611001014709473, - "learning_rate": 4.998410506101579e-06, - "loss": 0.7931, - "step": 22 - }, - { - "epoch": 0.12432432432432433, - "grad_norm": 5.508745193481445, - "learning_rate": 4.9982555394624595e-06, - "loss": 1.3022, - "step": 23 - }, - { - "epoch": 0.12972972972972974, - "grad_norm": 3.434845209121704, - "learning_rate": 4.998093368502539e-06, - "loss": 0.9739, - "step": 24 - }, - { - "epoch": 0.13513513513513514, - "grad_norm": 4.736802101135254, - "learning_rate": 4.9979239936894765e-06, - "loss": 1.1154, - "step": 25 - }, - { - "epoch": 0.14054054054054055, - "grad_norm": 3.69411039352417, - "learning_rate": 4.997747415511705e-06, - "loss": 0.7543, - "step": 26 - }, - { - "epoch": 0.14594594594594595, - "grad_norm": 2.8646645545959473, - "learning_rate": 4.997563634478428e-06, - "loss": 0.7278, - "step": 27 - }, - { - "epoch": 0.15135135135135136, - "grad_norm": 6.56904935836792, - "learning_rate": 4.997372651119626e-06, - "loss": 0.8167, - "step": 28 - }, - { - "epoch": 0.15675675675675677, - "grad_norm": 2.955914258956909, - "learning_rate": 4.997174465986044e-06, - "loss": 0.8031, - "step": 29 - }, - { - "epoch": 0.16216216216216217, - "grad_norm": 2.5714259147644043, - "learning_rate": 4.996969079649196e-06, - "loss": 0.689, - "step": 30 - }, - { - "epoch": 0.16756756756756758, - "grad_norm": 3.5165364742279053, - "learning_rate": 4.996756492701362e-06, - "loss": 0.8059, - "step": 31 - }, - { - "epoch": 0.17297297297297298, - "grad_norm": 3.2861921787261963, - "learning_rate": 4.996536705755591e-06, - "loss": 0.9658, - "step": 32 - }, - { - "epoch": 0.1783783783783784, - "grad_norm": 2.962470531463623, - "learning_rate": 4.996309719445687e-06, - "loss": 0.8349, - "step": 33 - }, - { - "epoch": 0.1837837837837838, - "grad_norm": 2.7694804668426514, - "learning_rate": 4.996075534426223e-06, - "loss": 0.8287, - "step": 34 - }, - { - "epoch": 0.1891891891891892, - "grad_norm": 3.405071258544922, - "learning_rate": 4.995834151372526e-06, - "loss": 1.1211, - "step": 35 - }, - { - "epoch": 0.1945945945945946, - "grad_norm": 2.8680710792541504, - "learning_rate": 4.995585570980685e-06, - "loss": 1.0841, - "step": 36 - }, - { - "epoch": 0.2, - "grad_norm": 3.341021776199341, - "learning_rate": 4.995329793967537e-06, - "loss": 0.6182, - "step": 37 - }, - { - "epoch": 0.20540540540540542, - "grad_norm": 3.0639379024505615, - "learning_rate": 4.9950668210706795e-06, - "loss": 0.7647, - "step": 38 - }, - { - "epoch": 0.21081081081081082, - "grad_norm": 3.225759983062744, - "learning_rate": 4.994796653048457e-06, - "loss": 0.8691, - "step": 39 - }, - { - "epoch": 0.21621621621621623, - "grad_norm": 4.56926155090332, - "learning_rate": 4.994519290679965e-06, - "loss": 1.0404, - "step": 40 - }, - { - "epoch": 0.22162162162162163, - "grad_norm": 4.871571063995361, - "learning_rate": 4.994234734765043e-06, - "loss": 1.1877, - "step": 41 - }, - { - "epoch": 0.22702702702702704, - "grad_norm": 3.672215700149536, - "learning_rate": 4.993942986124278e-06, - "loss": 0.959, - "step": 42 - }, - { - "epoch": 0.23243243243243245, - "grad_norm": 3.184683322906494, - "learning_rate": 4.9936440455989975e-06, - "loss": 0.9249, - "step": 43 - }, - { - "epoch": 0.23783783783783785, - "grad_norm": 2.7092034816741943, - "learning_rate": 4.993337914051266e-06, - "loss": 0.6899, - "step": 44 - }, - { - "epoch": 0.24324324324324326, - "grad_norm": 3.153764486312866, - "learning_rate": 4.99302459236389e-06, - "loss": 0.9075, - "step": 45 - }, - { - "epoch": 0.24864864864864866, - "grad_norm": 3.3629748821258545, - "learning_rate": 4.992704081440407e-06, - "loss": 0.785, - "step": 46 - }, - { - "epoch": 0.25405405405405407, - "grad_norm": 4.478365898132324, - "learning_rate": 4.992376382205088e-06, - "loss": 1.008, - "step": 47 - }, - { - "epoch": 0.2594594594594595, - "grad_norm": 3.4001641273498535, - "learning_rate": 4.992041495602932e-06, - "loss": 0.7751, - "step": 48 - }, - { - "epoch": 0.2648648648648649, - "grad_norm": 2.522662878036499, - "learning_rate": 4.991699422599664e-06, - "loss": 0.9022, - "step": 49 - }, - { - "epoch": 0.2702702702702703, - "grad_norm": 2.764458179473877, - "learning_rate": 4.991350164181735e-06, - "loss": 0.8801, - "step": 50 - }, - { - "epoch": 0.2756756756756757, - "grad_norm": 2.814859628677368, - "learning_rate": 4.990993721356317e-06, - "loss": 0.7045, - "step": 51 - }, - { - "epoch": 0.2810810810810811, - "grad_norm": 2.441311836242676, - "learning_rate": 4.990630095151296e-06, - "loss": 0.7312, - "step": 52 - }, - { - "epoch": 0.2864864864864865, - "grad_norm": 2.4443013668060303, - "learning_rate": 4.9902592866152765e-06, - "loss": 0.9609, - "step": 53 - }, - { - "epoch": 0.2918918918918919, - "grad_norm": 2.2934701442718506, - "learning_rate": 4.989881296817575e-06, - "loss": 0.5753, - "step": 54 - }, - { - "epoch": 0.2972972972972973, - "grad_norm": 2.6286847591400146, - "learning_rate": 4.989496126848215e-06, - "loss": 0.5118, - "step": 55 - }, - { - "epoch": 0.3027027027027027, - "grad_norm": 3.6817069053649902, - "learning_rate": 4.989103777817928e-06, - "loss": 1.1261, - "step": 56 - }, - { - "epoch": 0.3081081081081081, - "grad_norm": 3.011197566986084, - "learning_rate": 4.988704250858145e-06, - "loss": 0.7823, - "step": 57 - }, - { - "epoch": 0.31351351351351353, - "grad_norm": 2.5490806102752686, - "learning_rate": 4.988297547121e-06, - "loss": 0.6019, - "step": 58 - }, - { - "epoch": 0.31891891891891894, - "grad_norm": 3.0803146362304688, - "learning_rate": 4.98788366777932e-06, - "loss": 0.825, - "step": 59 - }, - { - "epoch": 0.32432432432432434, - "grad_norm": 3.015730619430542, - "learning_rate": 4.987462614026625e-06, - "loss": 0.7667, - "step": 60 - }, - { - "epoch": 0.32972972972972975, - "grad_norm": 2.5371594429016113, - "learning_rate": 4.987034387077126e-06, - "loss": 0.8051, - "step": 61 - }, - { - "epoch": 0.33513513513513515, - "grad_norm": 2.6414010524749756, - "learning_rate": 4.986598988165718e-06, - "loss": 0.6895, - "step": 62 - }, - { - "epoch": 0.34054054054054056, - "grad_norm": 3.065131187438965, - "learning_rate": 4.9861564185479785e-06, - "loss": 0.9268, - "step": 63 - }, - { - "epoch": 0.34594594594594597, - "grad_norm": 2.5708694458007812, - "learning_rate": 4.985706679500163e-06, - "loss": 0.9854, - "step": 64 - }, - { - "epoch": 0.35135135135135137, - "grad_norm": 2.768915891647339, - "learning_rate": 4.9852497723192025e-06, - "loss": 0.8083, - "step": 65 - }, - { - "epoch": 0.3567567567567568, - "grad_norm": 2.567901849746704, - "learning_rate": 4.9847856983227e-06, - "loss": 0.9098, - "step": 66 - }, - { - "epoch": 0.3621621621621622, - "grad_norm": 2.5766549110412598, - "learning_rate": 4.984314458848923e-06, - "loss": 0.8881, - "step": 67 - }, - { - "epoch": 0.3675675675675676, - "grad_norm": 2.9778389930725098, - "learning_rate": 4.983836055256804e-06, - "loss": 0.9877, - "step": 68 - }, - { - "epoch": 0.372972972972973, - "grad_norm": 2.7225165367126465, - "learning_rate": 4.983350488925935e-06, - "loss": 0.8282, - "step": 69 - }, - { - "epoch": 0.3783783783783784, - "grad_norm": 2.702287197113037, - "learning_rate": 4.982857761256564e-06, - "loss": 1.1756, - "step": 70 - }, - { - "epoch": 0.3837837837837838, - "grad_norm": 2.9815568923950195, - "learning_rate": 4.982357873669589e-06, - "loss": 0.8114, - "step": 71 - }, - { - "epoch": 0.3891891891891892, - "grad_norm": 3.27150297164917, - "learning_rate": 4.981850827606556e-06, - "loss": 0.6763, - "step": 72 - }, - { - "epoch": 0.3945945945945946, - "grad_norm": 2.568423271179199, - "learning_rate": 4.981336624529655e-06, - "loss": 0.9372, - "step": 73 - }, - { - "epoch": 0.4, - "grad_norm": 2.621175527572632, - "learning_rate": 4.980815265921714e-06, - "loss": 1.0155, - "step": 74 - }, - { - "epoch": 0.40540540540540543, - "grad_norm": 2.62827205657959, - "learning_rate": 4.980286753286196e-06, - "loss": 0.949, - "step": 75 - }, - { - "epoch": 0.41081081081081083, - "grad_norm": 2.9462146759033203, - "learning_rate": 4.979751088147192e-06, - "loss": 1.0134, - "step": 76 - }, - { - "epoch": 0.41621621621621624, - "grad_norm": 2.814852714538574, - "learning_rate": 4.979208272049425e-06, - "loss": 0.9722, - "step": 77 - }, - { - "epoch": 0.42162162162162165, - "grad_norm": 4.177679538726807, - "learning_rate": 4.978658306558235e-06, - "loss": 1.2259, - "step": 78 - }, - { - "epoch": 0.42702702702702705, - "grad_norm": 2.813084125518799, - "learning_rate": 4.978101193259578e-06, - "loss": 0.834, - "step": 79 - }, - { - "epoch": 0.43243243243243246, - "grad_norm": 2.71824049949646, - "learning_rate": 4.977536933760025e-06, - "loss": 0.6151, - "step": 80 - }, - { - "epoch": 0.43783783783783786, - "grad_norm": 4.992153167724609, - "learning_rate": 4.976965529686755e-06, - "loss": 1.0475, - "step": 81 - }, - { - "epoch": 0.44324324324324327, - "grad_norm": 2.4810822010040283, - "learning_rate": 4.976386982687548e-06, - "loss": 0.8324, - "step": 82 - }, - { - "epoch": 0.4486486486486487, - "grad_norm": 4.509149074554443, - "learning_rate": 4.9758012944307845e-06, - "loss": 0.997, - "step": 83 - }, - { - "epoch": 0.4540540540540541, - "grad_norm": 3.114325761795044, - "learning_rate": 4.975208466605436e-06, - "loss": 1.2024, - "step": 84 - }, - { - "epoch": 0.4594594594594595, - "grad_norm": 3.297091007232666, - "learning_rate": 4.974608500921064e-06, - "loss": 0.9146, - "step": 85 - }, - { - "epoch": 0.4648648648648649, - "grad_norm": 2.824475049972534, - "learning_rate": 4.974001399107816e-06, - "loss": 0.7181, - "step": 86 - }, - { - "epoch": 0.4702702702702703, - "grad_norm": 20.262290954589844, - "learning_rate": 4.973387162916415e-06, - "loss": 0.8599, - "step": 87 - }, - { - "epoch": 0.4756756756756757, - "grad_norm": 4.015744686126709, - "learning_rate": 4.972765794118158e-06, - "loss": 0.6081, - "step": 88 - }, - { - "epoch": 0.4810810810810811, - "grad_norm": 2.8033058643341064, - "learning_rate": 4.9721372945049114e-06, - "loss": 0.8764, - "step": 89 - }, - { - "epoch": 0.4864864864864865, - "grad_norm": 5.271846294403076, - "learning_rate": 4.971501665889107e-06, - "loss": 0.8622, - "step": 90 - }, - { - "epoch": 0.4918918918918919, - "grad_norm": 2.557264804840088, - "learning_rate": 4.9708589101037306e-06, - "loss": 0.5523, - "step": 91 - }, - { - "epoch": 0.4972972972972973, - "grad_norm": 4.342173099517822, - "learning_rate": 4.970209029002325e-06, - "loss": 0.8922, - "step": 92 - }, - { - "epoch": 0.5027027027027027, - "grad_norm": 2.950364351272583, - "learning_rate": 4.969552024458977e-06, - "loss": 0.9455, - "step": 93 - }, - { - "epoch": 0.5081081081081081, - "grad_norm": 2.6453042030334473, - "learning_rate": 4.968887898368318e-06, - "loss": 0.8342, - "step": 94 - }, - { - "epoch": 0.5135135135135135, - "grad_norm": 3.486766815185547, - "learning_rate": 4.968216652645515e-06, - "loss": 0.8476, - "step": 95 - }, - { - "epoch": 0.518918918918919, - "grad_norm": 2.884152889251709, - "learning_rate": 4.967538289226268e-06, - "loss": 0.8879, - "step": 96 - }, - { - "epoch": 0.5243243243243243, - "grad_norm": 2.4130594730377197, - "learning_rate": 4.966852810066798e-06, - "loss": 0.7114, - "step": 97 - }, - { - "epoch": 0.5297297297297298, - "grad_norm": 3.182410955429077, - "learning_rate": 4.9661602171438524e-06, - "loss": 0.6757, - "step": 98 - }, - { - "epoch": 0.5351351351351351, - "grad_norm": 2.5027542114257812, - "learning_rate": 4.965460512454687e-06, - "loss": 0.8029, - "step": 99 - }, - { - "epoch": 0.5405405405405406, - "grad_norm": 2.3096024990081787, - "learning_rate": 4.964753698017071e-06, - "loss": 0.842, - "step": 100 - }, - { - "epoch": 0.5459459459459459, - "grad_norm": 2.875657081604004, - "learning_rate": 4.964039775869271e-06, - "loss": 0.6339, - "step": 101 - }, - { - "epoch": 0.5513513513513514, - "grad_norm": 2.505406141281128, - "learning_rate": 4.963318748070056e-06, - "loss": 0.7743, - "step": 102 - }, - { - "epoch": 0.5567567567567567, - "grad_norm": 3.552562713623047, - "learning_rate": 4.9625906166986815e-06, - "loss": 0.926, - "step": 103 - }, - { - "epoch": 0.5621621621621622, - "grad_norm": 2.717942476272583, - "learning_rate": 4.961855383854889e-06, - "loss": 0.7037, - "step": 104 - }, - { - "epoch": 0.5675675675675675, - "grad_norm": 2.5049386024475098, - "learning_rate": 4.961113051658901e-06, - "loss": 0.561, - "step": 105 - }, - { - "epoch": 0.572972972972973, - "grad_norm": 2.3112900257110596, - "learning_rate": 4.96036362225141e-06, - "loss": 0.7316, - "step": 106 - }, - { - "epoch": 0.5783783783783784, - "grad_norm": 2.470257520675659, - "learning_rate": 4.959607097793575e-06, - "loss": 0.6426, - "step": 107 - }, - { - "epoch": 0.5837837837837838, - "grad_norm": 3.8040788173675537, - "learning_rate": 4.9588434804670176e-06, - "loss": 1.0044, - "step": 108 - }, - { - "epoch": 0.5891891891891892, - "grad_norm": 3.143547296524048, - "learning_rate": 4.958072772473812e-06, - "loss": 0.9219, - "step": 109 - }, - { - "epoch": 0.5945945945945946, - "grad_norm": 3.5052590370178223, - "learning_rate": 4.9572949760364795e-06, - "loss": 0.6056, - "step": 110 - }, - { - "epoch": 0.6, - "grad_norm": 3.064009428024292, - "learning_rate": 4.9565100933979835e-06, - "loss": 0.6346, - "step": 111 - }, - { - "epoch": 0.6054054054054054, - "grad_norm": 2.694610595703125, - "learning_rate": 4.9557181268217225e-06, - "loss": 0.9856, - "step": 112 - }, - { - "epoch": 0.6108108108108108, - "grad_norm": 2.5885775089263916, - "learning_rate": 4.954919078591521e-06, - "loss": 0.8669, - "step": 113 - }, - { - "epoch": 0.6162162162162163, - "grad_norm": 2.593609571456909, - "learning_rate": 4.954112951011628e-06, - "loss": 0.7201, - "step": 114 - }, - { - "epoch": 0.6216216216216216, - "grad_norm": 3.3045759201049805, - "learning_rate": 4.9532997464067065e-06, - "loss": 0.9095, - "step": 115 - }, - { - "epoch": 0.6270270270270271, - "grad_norm": 2.8144869804382324, - "learning_rate": 4.952479467121828e-06, - "loss": 1.0213, - "step": 116 - }, - { - "epoch": 0.6324324324324324, - "grad_norm": 2.5460312366485596, - "learning_rate": 4.951652115522463e-06, - "loss": 1.1154, - "step": 117 - }, - { - "epoch": 0.6378378378378379, - "grad_norm": 2.795137405395508, - "learning_rate": 4.950817693994481e-06, - "loss": 0.691, - "step": 118 - }, - { - "epoch": 0.6432432432432432, - "grad_norm": 2.4979195594787598, - "learning_rate": 4.949976204944135e-06, - "loss": 0.7224, - "step": 119 - }, - { - "epoch": 0.6486486486486487, - "grad_norm": 3.3131983280181885, - "learning_rate": 4.949127650798063e-06, - "loss": 0.9256, - "step": 120 - }, - { - "epoch": 0.654054054054054, - "grad_norm": 2.9060285091400146, - "learning_rate": 4.948272034003275e-06, - "loss": 0.6892, - "step": 121 - }, - { - "epoch": 0.6594594594594595, - "grad_norm": 3.695594549179077, - "learning_rate": 4.947409357027148e-06, - "loss": 0.5878, - "step": 122 - }, - { - "epoch": 0.6648648648648648, - "grad_norm": 3.1250460147857666, - "learning_rate": 4.9465396223574165e-06, - "loss": 0.9904, - "step": 123 - }, - { - "epoch": 0.6702702702702703, - "grad_norm": 4.024891376495361, - "learning_rate": 4.945662832502172e-06, - "loss": 1.1592, - "step": 124 - }, - { - "epoch": 0.6756756756756757, - "grad_norm": 2.6886494159698486, - "learning_rate": 4.944778989989847e-06, - "loss": 1.0041, - "step": 125 - }, - { - "epoch": 0.6810810810810811, - "grad_norm": 2.366912841796875, - "learning_rate": 4.943888097369216e-06, - "loss": 0.7045, - "step": 126 - }, - { - "epoch": 0.6864864864864865, - "grad_norm": 2.394932270050049, - "learning_rate": 4.942990157209381e-06, - "loss": 0.6685, - "step": 127 - }, - { - "epoch": 0.6918918918918919, - "grad_norm": 2.61933970451355, - "learning_rate": 4.9420851720997674e-06, - "loss": 0.8812, - "step": 128 - }, - { - "epoch": 0.6972972972972973, - "grad_norm": 2.7395646572113037, - "learning_rate": 4.94117314465012e-06, - "loss": 1.3014, - "step": 129 - }, - { - "epoch": 0.7027027027027027, - "grad_norm": 3.065484046936035, - "learning_rate": 4.940254077490487e-06, - "loss": 0.6978, - "step": 130 - }, - { - "epoch": 0.7081081081081081, - "grad_norm": 2.895038366317749, - "learning_rate": 4.939327973271222e-06, - "loss": 0.6249, - "step": 131 - }, - { - "epoch": 0.7135135135135136, - "grad_norm": 3.1773312091827393, - "learning_rate": 4.9383948346629665e-06, - "loss": 0.6423, - "step": 132 - }, - { - "epoch": 0.7189189189189189, - "grad_norm": 2.2378008365631104, - "learning_rate": 4.937454664356652e-06, - "loss": 0.7193, - "step": 133 - }, - { - "epoch": 0.7243243243243244, - "grad_norm": 2.5673701763153076, - "learning_rate": 4.9365074650634855e-06, - "loss": 0.7065, - "step": 134 - }, - { - "epoch": 0.7297297297297297, - "grad_norm": 2.7348387241363525, - "learning_rate": 4.9355532395149445e-06, - "loss": 1.0046, - "step": 135 - }, - { - "epoch": 0.7351351351351352, - "grad_norm": 2.391741991043091, - "learning_rate": 4.9345919904627655e-06, - "loss": 0.6771, - "step": 136 - }, - { - "epoch": 0.7405405405405405, - "grad_norm": 2.2096705436706543, - "learning_rate": 4.933623720678944e-06, - "loss": 0.6589, - "step": 137 - }, - { - "epoch": 0.745945945945946, - "grad_norm": 3.0840072631835938, - "learning_rate": 4.932648432955718e-06, - "loss": 0.8755, - "step": 138 - }, - { - "epoch": 0.7513513513513513, - "grad_norm": 2.4970428943634033, - "learning_rate": 4.931666130105564e-06, - "loss": 0.6685, - "step": 139 - }, - { - "epoch": 0.7567567567567568, - "grad_norm": 4.315455436706543, - "learning_rate": 4.930676814961189e-06, - "loss": 0.8101, - "step": 140 - }, - { - "epoch": 0.7621621621621621, - "grad_norm": 5.388065814971924, - "learning_rate": 4.92968049037552e-06, - "loss": 0.8193, - "step": 141 - }, - { - "epoch": 0.7675675675675676, - "grad_norm": 2.6107139587402344, - "learning_rate": 4.9286771592217005e-06, - "loss": 0.7852, - "step": 142 - }, - { - "epoch": 0.772972972972973, - "grad_norm": 3.936556577682495, - "learning_rate": 4.927666824393076e-06, - "loss": 1.0388, - "step": 143 - }, - { - "epoch": 0.7783783783783784, - "grad_norm": 2.74424409866333, - "learning_rate": 4.926649488803191e-06, - "loss": 0.8266, - "step": 144 - }, - { - "epoch": 0.7837837837837838, - "grad_norm": 2.8998451232910156, - "learning_rate": 4.925625155385776e-06, - "loss": 0.4895, - "step": 145 - }, - { - "epoch": 0.7891891891891892, - "grad_norm": 3.0631520748138428, - "learning_rate": 4.924593827094743e-06, - "loss": 0.8759, - "step": 146 - }, - { - "epoch": 0.7945945945945946, - "grad_norm": 3.233267307281494, - "learning_rate": 4.923555506904176e-06, - "loss": 0.701, - "step": 147 - }, - { - "epoch": 0.8, - "grad_norm": 2.87701416015625, - "learning_rate": 4.922510197808321e-06, - "loss": 1.1327, - "step": 148 - }, - { - "epoch": 0.8054054054054054, - "grad_norm": 3.650576114654541, - "learning_rate": 4.921457902821578e-06, - "loss": 0.7587, - "step": 149 - }, - { - "epoch": 0.8108108108108109, - "grad_norm": 3.232112407684326, - "learning_rate": 4.920398624978493e-06, - "loss": 1.2158, - "step": 150 - }, - { - "epoch": 0.8162162162162162, - "grad_norm": 2.468384027481079, - "learning_rate": 4.919332367333748e-06, - "loss": 0.6852, - "step": 151 - }, - { - "epoch": 0.8216216216216217, - "grad_norm": 2.5947415828704834, - "learning_rate": 4.918259132962154e-06, - "loss": 0.6611, - "step": 152 - }, - { - "epoch": 0.827027027027027, - "grad_norm": 3.0171427726745605, - "learning_rate": 4.917178924958638e-06, - "loss": 0.7327, - "step": 153 - }, - { - "epoch": 0.8324324324324325, - "grad_norm": 3.293184518814087, - "learning_rate": 4.916091746438243e-06, - "loss": 0.8528, - "step": 154 - }, - { - "epoch": 0.8378378378378378, - "grad_norm": 4.0570969581604, - "learning_rate": 4.9149976005361085e-06, - "loss": 0.9141, - "step": 155 - }, - { - "epoch": 0.8432432432432433, - "grad_norm": 2.8782784938812256, - "learning_rate": 4.913896490407467e-06, - "loss": 1.1132, - "step": 156 - }, - { - "epoch": 0.8486486486486486, - "grad_norm": 2.5671517848968506, - "learning_rate": 4.912788419227635e-06, - "loss": 0.7587, - "step": 157 - }, - { - "epoch": 0.8540540540540541, - "grad_norm": 2.9445390701293945, - "learning_rate": 4.911673390192002e-06, - "loss": 0.9227, - "step": 158 - }, - { - "epoch": 0.8594594594594595, - "grad_norm": 2.472595453262329, - "learning_rate": 4.910551406516023e-06, - "loss": 0.8154, - "step": 159 - }, - { - "epoch": 0.8648648648648649, - "grad_norm": 2.5233397483825684, - "learning_rate": 4.909422471435207e-06, - "loss": 0.9897, - "step": 160 - }, - { - "epoch": 0.8702702702702703, - "grad_norm": 3.3919546604156494, - "learning_rate": 4.90828658820511e-06, - "loss": 0.6162, - "step": 161 - }, - { - "epoch": 0.8756756756756757, - "grad_norm": 3.060908555984497, - "learning_rate": 4.907143760101325e-06, - "loss": 0.5734, - "step": 162 - }, - { - "epoch": 0.8810810810810811, - "grad_norm": 3.4584782123565674, - "learning_rate": 4.905993990419472e-06, - "loss": 0.8328, - "step": 163 - }, - { - "epoch": 0.8864864864864865, - "grad_norm": 2.936570644378662, - "learning_rate": 4.904837282475187e-06, - "loss": 0.6787, - "step": 164 - }, - { - "epoch": 0.8918918918918919, - "grad_norm": 2.564837694168091, - "learning_rate": 4.9036736396041165e-06, - "loss": 0.9658, - "step": 165 - }, - { - "epoch": 0.8972972972972973, - "grad_norm": 3.2509360313415527, - "learning_rate": 4.902503065161905e-06, - "loss": 0.7899, - "step": 166 - }, - { - "epoch": 0.9027027027027027, - "grad_norm": 2.9730329513549805, - "learning_rate": 4.901325562524185e-06, - "loss": 0.9476, - "step": 167 - }, - { - "epoch": 0.9081081081081082, - "grad_norm": 3.044980049133301, - "learning_rate": 4.900141135086569e-06, - "loss": 0.7589, - "step": 168 - }, - { - "epoch": 0.9135135135135135, - "grad_norm": 3.030585527420044, - "learning_rate": 4.898949786264638e-06, - "loss": 0.6724, - "step": 169 - }, - { - "epoch": 0.918918918918919, - "grad_norm": 2.249122142791748, - "learning_rate": 4.897751519493933e-06, - "loss": 0.6968, - "step": 170 - }, - { - "epoch": 0.9243243243243243, - "grad_norm": 2.9816982746124268, - "learning_rate": 4.896546338229945e-06, - "loss": 0.7984, - "step": 171 - }, - { - "epoch": 0.9297297297297298, - "grad_norm": 2.415736675262451, - "learning_rate": 4.8953342459481034e-06, - "loss": 0.6109, - "step": 172 - }, - { - "epoch": 0.9351351351351351, - "grad_norm": 2.740518808364868, - "learning_rate": 4.894115246143768e-06, - "loss": 0.8126, - "step": 173 - }, - { - "epoch": 0.9405405405405406, - "grad_norm": 2.7610201835632324, - "learning_rate": 4.892889342332218e-06, - "loss": 0.6862, - "step": 174 - }, - { - "epoch": 0.9459459459459459, - "grad_norm": 3.057025194168091, - "learning_rate": 4.891656538048642e-06, - "loss": 0.9895, - "step": 175 - }, - { - "epoch": 0.9513513513513514, - "grad_norm": 2.569751262664795, - "learning_rate": 4.890416836848128e-06, - "loss": 0.8481, - "step": 176 - }, - { - "epoch": 0.9567567567567568, - "grad_norm": 2.4443397521972656, - "learning_rate": 4.889170242305652e-06, - "loss": 0.6478, - "step": 177 - }, - { - "epoch": 0.9621621621621622, - "grad_norm": 2.5009846687316895, - "learning_rate": 4.887916758016069e-06, - "loss": 0.9714, - "step": 178 - }, - { - "epoch": 0.9675675675675676, - "grad_norm": 3.101975202560425, - "learning_rate": 4.886656387594104e-06, - "loss": 1.1264, - "step": 179 - }, - { - "epoch": 0.972972972972973, - "grad_norm": 2.6144704818725586, - "learning_rate": 4.885389134674338e-06, - "loss": 0.7664, - "step": 180 - }, - { - "epoch": 0.9783783783783784, - "grad_norm": 2.5834381580352783, - "learning_rate": 4.884115002911197e-06, - "loss": 0.6131, - "step": 181 - }, - { - "epoch": 0.9837837837837838, - "grad_norm": 2.5378055572509766, - "learning_rate": 4.88283399597895e-06, - "loss": 0.8733, - "step": 182 - }, - { - "epoch": 0.9891891891891892, - "grad_norm": 2.4095377922058105, - "learning_rate": 4.881546117571686e-06, - "loss": 0.643, - "step": 183 - }, - { - "epoch": 0.9945945945945946, - "grad_norm": 2.9554507732391357, - "learning_rate": 4.8802513714033135e-06, - "loss": 0.7287, - "step": 184 - }, - { - "epoch": 1.0, - "grad_norm": 2.8279213905334473, - "learning_rate": 4.878949761207545e-06, - "loss": 0.9927, - "step": 185 - } - ], - "logging_steps": 1, - "max_steps": 1850, - "num_input_tokens_seen": 0, - "num_train_epochs": 10, - "save_steps": 206, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 4.993221351650099e+16, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/chat_template.jinja b/metallama3_8b/limo_filtered_incorrect/checkpoint-370/chat_template.jinja deleted file mode 100644 index 39bd0c9f7fe30aea14eda194fee17703da4a4dbf..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/chat_template.jinja +++ /dev/null @@ -1,5 +0,0 @@ -{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> - -'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> - -' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/config.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-370/config.json deleted file mode 100644 index ec5612543540085e09eed37e81b17ae51d1a6973..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 128000, - "eos_token_id": 128009, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 8192, - "mlp_bias": false, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 500000.0, - "tie_word_embeddings": false, - "torch_dtype": "float32", - "transformers_version": "4.55.0", - "use_cache": false, - "vocab_size": 128256 -} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/generation_config.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-370/generation_config.json deleted file mode 100644 index f53ccb516e57388491adda6b9950bcfa872e93ae..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/generation_config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 128000, - "eos_token_id": 128009, - "transformers_version": "4.55.0", - "use_cache": false -} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/model-00001-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-370/model-00001-of-00007.safetensors deleted file mode 100644 index e67068de80bb8ec4188c3f6adfca9516e5ee977f..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/model-00001-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a395101f0879d45a9c5005c1f0d71874db322d82bca4b2667f0ccaa1ce8fc8c8 -size 4886466168 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/model-00002-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-370/model-00002-of-00007.safetensors deleted file mode 100644 index 69b7ffee527361bf3be1f8daebda614025adbda7..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/model-00002-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b41930cc10d126ffeeefe51ac840f4061b8cf7254c73cd54a01e954995de6951 -size 4832007448 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/model-00003-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-370/model-00003-of-00007.safetensors deleted file mode 100644 index 1cc0c84f2f51475dbe2460e1c64f48ca01c4ef33..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/model-00003-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d4d2e9fa29e3a81d36fae6dbeff97d87d766a84a25a0ba2b208299f4afe8bed1 -size 4999813112 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/model-00004-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-370/model-00004-of-00007.safetensors deleted file mode 100644 index c8bc3f0a95c7860cac8e4bd70a003b51b26883e0..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/model-00004-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d54b6d5909a7804a69fcae82a9538df63d6a958c9936e96655640b5fdc490dbd -size 4999813128 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/model-00005-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-370/model-00005-of-00007.safetensors deleted file mode 100644 index 7c3119d8cddcb589018622ace6f8ecc304de5da6..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/model-00005-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:05a076cfa2a65b225cd8192ab06c4c607316707bdb2dfa377bd7de68240a517b -size 4832007496 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/model-00006-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-370/model-00006-of-00007.safetensors deleted file mode 100644 index e1129b7062625a5170904c9ef3d91618cbd1e9e2..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/model-00006-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c5f1db1488632419f303bb264a6f2a442acc65b2e464f6b56639cd5530b2c45b -size 4999813120 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/model-00007-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-370/model-00007-of-00007.safetensors deleted file mode 100644 index 10400ee20b397b4f34f2b3a42995dc474661ce8d..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/model-00007-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:32d78a993463e1a292756f5a76bcbdeca2f2f528e16fe761b54efe35afde79e8 -size 2571158184 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/model.safetensors.index.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-370/model.safetensors.index.json deleted file mode 100644 index 30d31d54f352f0c71ad48745af612a088822fa48..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/model.safetensors.index.json +++ /dev/null @@ -1,299 +0,0 @@ -{ - "metadata": { - "total_parameters": 2007565312, - "total_size": 32121044992 - }, - "weight_map": { - "lm_head.weight": "model-00007-of-00007.safetensors", - "model.embed_tokens.weight": "model-00001-of-00007.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.norm.weight": "model-00007-of-00007.safetensors" - } -} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/rng_state_0.pth b/metallama3_8b/limo_filtered_incorrect/checkpoint-370/rng_state_0.pth deleted file mode 100644 index 37ac50652a3badbfb1bdeaccb8b1934575b584eb..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bbe0d720c4c75a6a04213fa3b64bacbe794718a53e2b56ebb67a1a795014dfad -size 15024 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/rng_state_1.pth b/metallama3_8b/limo_filtered_incorrect/checkpoint-370/rng_state_1.pth deleted file mode 100644 index 0bc3650851dae439677613c9e23a5528de47b679..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:72452d3138d0ca2ff89429e3294a834ae7a68e8596fc757735ca56ae52509d57 -size 15024 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/rng_state_2.pth b/metallama3_8b/limo_filtered_incorrect/checkpoint-370/rng_state_2.pth deleted file mode 100644 index 0e00a6e8b4b743026f68d749a8cb3bdd4b746838..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f36e306fb8ebcf53a167bfd6c9af74db410a269ada1e619e3e816f5269543b9d -size 15024 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/rng_state_3.pth b/metallama3_8b/limo_filtered_incorrect/checkpoint-370/rng_state_3.pth deleted file mode 100644 index 5354141d42e077c356f9ca8c6b12bd7e5e41f2af..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bb47ce0c6f815a6f8302b0e3819b4c2315ca71dae3138d97fdceb765cdd0a039 -size 15024 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/scheduler.pt b/metallama3_8b/limo_filtered_incorrect/checkpoint-370/scheduler.pt deleted file mode 100644 index 58e8893ea42660fa148ccdeb95afdc49883a8d5e..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:654e08f75dc5ac737e2e840469406b912dc2eed4d185fa9df2c87e5993bdec4d -size 1064 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/special_tokens_map.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-370/special_tokens_map.json deleted file mode 100644 index 14daf4588e61b4e4983af0fccaba4d5500c0977c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/special_tokens_map.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "additional_special_tokens": [ - { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } - ], - "bos_token": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": "<|eot_id|>" -} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/tokenizer.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-370/tokenizer.json deleted file mode 100644 index 172311123ab62378f1f6d90f3068a676b7d939ed..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c1dcab308e7cf5970ea38815e0a62887d705c5b436f869ca27a5dcdd40c36a6 -size 17210148 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/tokenizer_config.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-370/tokenizer_config.json deleted file mode 100644 index 6739fcd129e717b71b64001dcb25a03c143d66f5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/tokenizer_config.json +++ /dev/null @@ -1,2076 +0,0 @@ -{ - "added_tokens_decoder": { - "128000": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128001": { - "content": "<|end_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128002": { - "content": "<|reserved_special_token_0|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128003": { - "content": "<|reserved_special_token_1|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128004": { - "content": "<|reserved_special_token_2|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128005": { - "content": "<|reserved_special_token_3|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128006": { - "content": "<|start_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128007": { - "content": "<|end_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128008": { - "content": "<|reserved_special_token_4|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128009": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128010": { - "content": "<|reserved_special_token_5|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128011": { - "content": "<|reserved_special_token_6|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128012": { - "content": "<|reserved_special_token_7|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128013": { - "content": "<|reserved_special_token_8|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128014": { - "content": "<|reserved_special_token_9|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128015": { - "content": "<|reserved_special_token_10|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128016": { - "content": "<|reserved_special_token_11|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128017": { - "content": "<|reserved_special_token_12|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128018": { - "content": "<|reserved_special_token_13|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128019": { - "content": "<|reserved_special_token_14|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128020": { - "content": "<|reserved_special_token_15|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128021": { - "content": "<|reserved_special_token_16|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128022": { - "content": "<|reserved_special_token_17|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128023": { - "content": "<|reserved_special_token_18|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128024": { - "content": "<|reserved_special_token_19|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128025": { - "content": "<|reserved_special_token_20|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128026": { - "content": "<|reserved_special_token_21|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128027": { - "content": "<|reserved_special_token_22|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128028": { - "content": "<|reserved_special_token_23|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128029": { - "content": "<|reserved_special_token_24|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128030": { - "content": "<|reserved_special_token_25|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128031": { - "content": "<|reserved_special_token_26|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128032": { - "content": "<|reserved_special_token_27|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128033": { - "content": "<|reserved_special_token_28|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128034": { - "content": "<|reserved_special_token_29|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128035": { - "content": "<|reserved_special_token_30|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128036": { - "content": "<|reserved_special_token_31|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128037": { - "content": "<|reserved_special_token_32|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128038": { - "content": "<|reserved_special_token_33|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128039": { - "content": "<|reserved_special_token_34|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128040": { - "content": "<|reserved_special_token_35|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128041": { - "content": "<|reserved_special_token_36|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128042": { - "content": "<|reserved_special_token_37|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128043": { - "content": "<|reserved_special_token_38|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128044": { - "content": "<|reserved_special_token_39|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128045": { - "content": "<|reserved_special_token_40|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128046": { - "content": "<|reserved_special_token_41|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128047": { - "content": "<|reserved_special_token_42|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128048": { - "content": "<|reserved_special_token_43|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128049": { - "content": "<|reserved_special_token_44|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128050": { - "content": "<|reserved_special_token_45|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128051": { - "content": "<|reserved_special_token_46|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128052": { - "content": "<|reserved_special_token_47|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128053": { - "content": "<|reserved_special_token_48|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128054": { - "content": "<|reserved_special_token_49|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128055": { - "content": "<|reserved_special_token_50|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128056": { - "content": "<|reserved_special_token_51|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128057": { - "content": "<|reserved_special_token_52|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128058": { - "content": "<|reserved_special_token_53|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128059": { - "content": "<|reserved_special_token_54|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128060": { - "content": "<|reserved_special_token_55|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128061": { - "content": "<|reserved_special_token_56|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128062": { - "content": "<|reserved_special_token_57|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128063": { - "content": "<|reserved_special_token_58|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128064": { - "content": "<|reserved_special_token_59|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128065": { - "content": "<|reserved_special_token_60|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128066": { - "content": "<|reserved_special_token_61|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128067": { - "content": "<|reserved_special_token_62|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128068": { - "content": "<|reserved_special_token_63|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128069": { - "content": "<|reserved_special_token_64|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128070": { - "content": "<|reserved_special_token_65|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128071": { - "content": "<|reserved_special_token_66|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128072": { - "content": "<|reserved_special_token_67|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128073": { - "content": "<|reserved_special_token_68|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128074": { - "content": "<|reserved_special_token_69|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128075": { - "content": "<|reserved_special_token_70|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128076": { - "content": "<|reserved_special_token_71|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128077": { - "content": "<|reserved_special_token_72|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128078": { - "content": "<|reserved_special_token_73|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128079": { - "content": "<|reserved_special_token_74|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128080": { - "content": "<|reserved_special_token_75|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128081": { - "content": "<|reserved_special_token_76|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128082": { - "content": "<|reserved_special_token_77|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128083": { - "content": "<|reserved_special_token_78|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128084": { - "content": "<|reserved_special_token_79|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128085": { - "content": "<|reserved_special_token_80|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128086": { - "content": "<|reserved_special_token_81|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128087": { - "content": "<|reserved_special_token_82|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128088": { - "content": "<|reserved_special_token_83|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128089": { - "content": "<|reserved_special_token_84|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128090": { - "content": "<|reserved_special_token_85|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128091": { - "content": "<|reserved_special_token_86|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128092": { - "content": "<|reserved_special_token_87|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128093": { - "content": "<|reserved_special_token_88|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128094": { - "content": "<|reserved_special_token_89|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128095": { - "content": "<|reserved_special_token_90|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128096": { - "content": "<|reserved_special_token_91|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128097": { - "content": "<|reserved_special_token_92|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128098": { - "content": "<|reserved_special_token_93|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128099": { - "content": "<|reserved_special_token_94|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128100": { - "content": "<|reserved_special_token_95|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128101": { - "content": "<|reserved_special_token_96|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128102": { - "content": "<|reserved_special_token_97|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128103": { - "content": "<|reserved_special_token_98|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128104": { - "content": "<|reserved_special_token_99|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128105": { - "content": "<|reserved_special_token_100|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128106": { - "content": "<|reserved_special_token_101|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128107": { - "content": "<|reserved_special_token_102|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128108": { - "content": "<|reserved_special_token_103|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128109": { - "content": "<|reserved_special_token_104|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128110": { - "content": "<|reserved_special_token_105|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128111": { - "content": "<|reserved_special_token_106|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128112": { - "content": "<|reserved_special_token_107|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128113": { - "content": "<|reserved_special_token_108|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128114": { - "content": "<|reserved_special_token_109|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128115": { - "content": "<|reserved_special_token_110|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128116": { - "content": "<|reserved_special_token_111|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128117": { - "content": "<|reserved_special_token_112|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128118": { - "content": "<|reserved_special_token_113|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128119": { - "content": "<|reserved_special_token_114|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128120": { - "content": "<|reserved_special_token_115|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128121": { - "content": "<|reserved_special_token_116|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128122": { - "content": "<|reserved_special_token_117|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128123": { - "content": "<|reserved_special_token_118|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128124": { - "content": "<|reserved_special_token_119|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128125": { - "content": "<|reserved_special_token_120|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128126": { - "content": "<|reserved_special_token_121|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128127": { - "content": "<|reserved_special_token_122|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128128": { - "content": "<|reserved_special_token_123|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128129": { - "content": "<|reserved_special_token_124|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128130": { - "content": "<|reserved_special_token_125|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128131": { - "content": "<|reserved_special_token_126|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128132": { - "content": "<|reserved_special_token_127|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128133": { - "content": "<|reserved_special_token_128|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128134": { - "content": "<|reserved_special_token_129|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128135": { - "content": "<|reserved_special_token_130|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128136": { - "content": "<|reserved_special_token_131|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128137": { - "content": "<|reserved_special_token_132|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128138": { - "content": "<|reserved_special_token_133|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128139": { - "content": "<|reserved_special_token_134|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128140": { - "content": "<|reserved_special_token_135|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128141": { - "content": "<|reserved_special_token_136|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128142": { - "content": "<|reserved_special_token_137|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128143": { - "content": "<|reserved_special_token_138|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128144": { - "content": "<|reserved_special_token_139|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128145": { - "content": "<|reserved_special_token_140|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128146": { - "content": "<|reserved_special_token_141|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128147": { - "content": "<|reserved_special_token_142|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128148": { - "content": "<|reserved_special_token_143|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128149": { - "content": "<|reserved_special_token_144|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128150": { - "content": "<|reserved_special_token_145|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128151": { - "content": "<|reserved_special_token_146|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128152": { - "content": "<|reserved_special_token_147|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128153": { - "content": "<|reserved_special_token_148|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128154": { - "content": "<|reserved_special_token_149|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128155": { - "content": "<|reserved_special_token_150|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128156": { - "content": "<|reserved_special_token_151|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128157": { - "content": "<|reserved_special_token_152|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128158": { - "content": "<|reserved_special_token_153|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128159": { - "content": "<|reserved_special_token_154|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128160": { - "content": "<|reserved_special_token_155|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128161": { - "content": "<|reserved_special_token_156|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128162": { - "content": "<|reserved_special_token_157|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128163": { - "content": "<|reserved_special_token_158|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128164": { - "content": "<|reserved_special_token_159|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128165": { - "content": "<|reserved_special_token_160|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128166": { - "content": "<|reserved_special_token_161|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128167": { - "content": "<|reserved_special_token_162|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128168": { - "content": "<|reserved_special_token_163|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128169": { - "content": "<|reserved_special_token_164|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128170": { - "content": "<|reserved_special_token_165|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128171": { - "content": "<|reserved_special_token_166|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128172": { - "content": "<|reserved_special_token_167|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128173": { - "content": "<|reserved_special_token_168|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128174": { - "content": "<|reserved_special_token_169|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128175": { - "content": "<|reserved_special_token_170|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128176": { - "content": "<|reserved_special_token_171|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128177": { - "content": "<|reserved_special_token_172|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128178": { - "content": "<|reserved_special_token_173|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128179": { - "content": "<|reserved_special_token_174|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128180": { - "content": "<|reserved_special_token_175|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128181": { - "content": "<|reserved_special_token_176|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128182": { - "content": "<|reserved_special_token_177|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128183": { - "content": "<|reserved_special_token_178|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128184": { - "content": "<|reserved_special_token_179|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128185": { - "content": "<|reserved_special_token_180|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128186": { - "content": "<|reserved_special_token_181|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128187": { - "content": "<|reserved_special_token_182|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128188": { - "content": "<|reserved_special_token_183|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128189": { - "content": "<|reserved_special_token_184|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128190": { - "content": "<|reserved_special_token_185|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128191": { - "content": "<|reserved_special_token_186|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128192": { - "content": "<|reserved_special_token_187|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128193": { - "content": "<|reserved_special_token_188|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128194": { - "content": "<|reserved_special_token_189|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128195": { - "content": "<|reserved_special_token_190|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128196": { - "content": "<|reserved_special_token_191|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128197": { - "content": "<|reserved_special_token_192|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128198": { - "content": "<|reserved_special_token_193|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128199": { - "content": "<|reserved_special_token_194|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128200": { - "content": "<|reserved_special_token_195|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128201": { - "content": "<|reserved_special_token_196|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128202": { - "content": "<|reserved_special_token_197|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128203": { - "content": "<|reserved_special_token_198|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128204": { - "content": "<|reserved_special_token_199|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128205": { - "content": "<|reserved_special_token_200|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128206": { - "content": "<|reserved_special_token_201|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128207": { - "content": "<|reserved_special_token_202|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128208": { - "content": "<|reserved_special_token_203|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128209": { - "content": "<|reserved_special_token_204|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128210": { - "content": "<|reserved_special_token_205|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128211": { - "content": "<|reserved_special_token_206|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128212": { - "content": "<|reserved_special_token_207|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128213": { - "content": "<|reserved_special_token_208|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128214": { - "content": "<|reserved_special_token_209|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128215": { - "content": "<|reserved_special_token_210|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128216": { - "content": "<|reserved_special_token_211|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128217": { - "content": "<|reserved_special_token_212|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128218": { - "content": "<|reserved_special_token_213|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128219": { - "content": "<|reserved_special_token_214|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128220": { - "content": "<|reserved_special_token_215|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128221": { - "content": "<|reserved_special_token_216|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128222": { - "content": "<|reserved_special_token_217|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128223": { - "content": "<|reserved_special_token_218|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128224": { - "content": "<|reserved_special_token_219|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128225": { - "content": "<|reserved_special_token_220|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128226": { - "content": "<|reserved_special_token_221|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128227": { - "content": "<|reserved_special_token_222|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128228": { - "content": "<|reserved_special_token_223|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128229": { - "content": "<|reserved_special_token_224|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128230": { - "content": "<|reserved_special_token_225|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128231": { - "content": "<|reserved_special_token_226|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128232": { - "content": "<|reserved_special_token_227|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128233": { - "content": "<|reserved_special_token_228|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128234": { - "content": "<|reserved_special_token_229|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128235": { - "content": "<|reserved_special_token_230|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128236": { - "content": "<|reserved_special_token_231|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128237": { - "content": "<|reserved_special_token_232|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128238": { - "content": "<|reserved_special_token_233|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128239": { - "content": "<|reserved_special_token_234|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128240": { - "content": "<|reserved_special_token_235|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128241": { - "content": "<|reserved_special_token_236|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128242": { - "content": "<|reserved_special_token_237|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128243": { - "content": "<|reserved_special_token_238|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128244": { - "content": "<|reserved_special_token_239|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128245": { - "content": "<|reserved_special_token_240|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128246": { - "content": "<|reserved_special_token_241|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128247": { - "content": "<|reserved_special_token_242|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128248": { - "content": "<|reserved_special_token_243|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128249": { - "content": "<|reserved_special_token_244|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128250": { - "content": "<|reserved_special_token_245|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128251": { - "content": "<|reserved_special_token_246|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128252": { - "content": "<|reserved_special_token_247|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128253": { - "content": "<|reserved_special_token_248|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128254": { - "content": "<|reserved_special_token_249|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128255": { - "content": "<|reserved_special_token_250|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128256": { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - } - }, - "additional_special_tokens": [ - "<|eom_id|>" - ], - "bos_token": "<|begin_of_text|>", - "clean_up_tokenization_spaces": true, - "eos_token": "<|eot_id|>", - "extra_special_tokens": {}, - "model_input_names": [ - "input_ids", - "attention_mask" - ], - "model_max_length": 1000000000000000019884624838656, - "pad_token": "<|eot_id|>", - "padding_side": "right", - "split_special_tokens": false, - "tokenizer_class": "PreTrainedTokenizerFast" -} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/trainer_state.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-370/trainer_state.json deleted file mode 100644 index e3a3ec16e2a61ab89bce56c0f273b7c43363722c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-370/trainer_state.json +++ /dev/null @@ -1,2624 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 2.0, - "eval_steps": 500, - "global_step": 370, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.005405405405405406, - "grad_norm": 72.60939025878906, - "learning_rate": 5e-06, - "loss": 2.9165, - "step": 1 - }, - { - "epoch": 0.010810810810810811, - "grad_norm": 29.01830291748047, - "learning_rate": 4.999996395324314e-06, - "loss": 1.9314, - "step": 2 - }, - { - "epoch": 0.016216216216216217, - "grad_norm": 21.44908332824707, - "learning_rate": 4.99998558130765e-06, - "loss": 1.5709, - "step": 3 - }, - { - "epoch": 0.021621621621621623, - "grad_norm": 4.490907669067383, - "learning_rate": 4.999967557981192e-06, - "loss": 0.8099, - "step": 4 - }, - { - "epoch": 0.02702702702702703, - "grad_norm": 4.000796794891357, - "learning_rate": 4.999942325396917e-06, - "loss": 0.9021, - "step": 5 - }, - { - "epoch": 0.032432432432432434, - "grad_norm": 18.513282775878906, - "learning_rate": 4.999909883627588e-06, - "loss": 1.7972, - "step": 6 - }, - { - "epoch": 0.03783783783783784, - "grad_norm": 3.5735981464385986, - "learning_rate": 4.999870232766757e-06, - "loss": 1.4306, - "step": 7 - }, - { - "epoch": 0.043243243243243246, - "grad_norm": 3.1145193576812744, - "learning_rate": 4.9998233729287696e-06, - "loss": 1.051, - "step": 8 - }, - { - "epoch": 0.04864864864864865, - "grad_norm": 3.856376886367798, - "learning_rate": 4.999769304248755e-06, - "loss": 0.8089, - "step": 9 - }, - { - "epoch": 0.05405405405405406, - "grad_norm": 4.05589485168457, - "learning_rate": 4.9997080268826344e-06, - "loss": 1.0999, - "step": 10 - }, - { - "epoch": 0.05945945945945946, - "grad_norm": 13.784229278564453, - "learning_rate": 4.9996395410071165e-06, - "loss": 1.2831, - "step": 11 - }, - { - "epoch": 0.06486486486486487, - "grad_norm": 6.079237937927246, - "learning_rate": 4.999563846819696e-06, - "loss": 1.2874, - "step": 12 - }, - { - "epoch": 0.07027027027027027, - "grad_norm": 4.5971245765686035, - "learning_rate": 4.999480944538655e-06, - "loss": 0.96, - "step": 13 - }, - { - "epoch": 0.07567567567567568, - "grad_norm": 4.916017532348633, - "learning_rate": 4.999390834403063e-06, - "loss": 0.9869, - "step": 14 - }, - { - "epoch": 0.08108108108108109, - "grad_norm": 3.2311055660247803, - "learning_rate": 4.999293516672773e-06, - "loss": 0.9293, - "step": 15 - }, - { - "epoch": 0.08648648648648649, - "grad_norm": 3.3040921688079834, - "learning_rate": 4.9991889916284255e-06, - "loss": 0.8914, - "step": 16 - }, - { - "epoch": 0.0918918918918919, - "grad_norm": 3.794267416000366, - "learning_rate": 4.999077259571442e-06, - "loss": 1.0176, - "step": 17 - }, - { - "epoch": 0.0972972972972973, - "grad_norm": 4.788509845733643, - "learning_rate": 4.998958320824031e-06, - "loss": 1.0259, - "step": 18 - }, - { - "epoch": 0.10270270270270271, - "grad_norm": 10.027527809143066, - "learning_rate": 4.998832175729179e-06, - "loss": 1.3356, - "step": 19 - }, - { - "epoch": 0.10810810810810811, - "grad_norm": 4.612483978271484, - "learning_rate": 4.998698824650656e-06, - "loss": 1.4486, - "step": 20 - }, - { - "epoch": 0.11351351351351352, - "grad_norm": 3.8676936626434326, - "learning_rate": 4.998558267973014e-06, - "loss": 0.8372, - "step": 21 - }, - { - "epoch": 0.11891891891891893, - "grad_norm": 2.9611001014709473, - "learning_rate": 4.998410506101579e-06, - "loss": 0.7931, - "step": 22 - }, - { - "epoch": 0.12432432432432433, - "grad_norm": 5.508745193481445, - "learning_rate": 4.9982555394624595e-06, - "loss": 1.3022, - "step": 23 - }, - { - "epoch": 0.12972972972972974, - "grad_norm": 3.434845209121704, - "learning_rate": 4.998093368502539e-06, - "loss": 0.9739, - "step": 24 - }, - { - "epoch": 0.13513513513513514, - "grad_norm": 4.736802101135254, - "learning_rate": 4.9979239936894765e-06, - "loss": 1.1154, - "step": 25 - }, - { - "epoch": 0.14054054054054055, - "grad_norm": 3.69411039352417, - "learning_rate": 4.997747415511705e-06, - "loss": 0.7543, - "step": 26 - }, - { - "epoch": 0.14594594594594595, - "grad_norm": 2.8646645545959473, - "learning_rate": 4.997563634478428e-06, - "loss": 0.7278, - "step": 27 - }, - { - "epoch": 0.15135135135135136, - "grad_norm": 6.56904935836792, - "learning_rate": 4.997372651119626e-06, - "loss": 0.8167, - "step": 28 - }, - { - "epoch": 0.15675675675675677, - "grad_norm": 2.955914258956909, - "learning_rate": 4.997174465986044e-06, - "loss": 0.8031, - "step": 29 - }, - { - "epoch": 0.16216216216216217, - "grad_norm": 2.5714259147644043, - "learning_rate": 4.996969079649196e-06, - "loss": 0.689, - "step": 30 - }, - { - "epoch": 0.16756756756756758, - "grad_norm": 3.5165364742279053, - "learning_rate": 4.996756492701362e-06, - "loss": 0.8059, - "step": 31 - }, - { - "epoch": 0.17297297297297298, - "grad_norm": 3.2861921787261963, - "learning_rate": 4.996536705755591e-06, - "loss": 0.9658, - "step": 32 - }, - { - "epoch": 0.1783783783783784, - "grad_norm": 2.962470531463623, - "learning_rate": 4.996309719445687e-06, - "loss": 0.8349, - "step": 33 - }, - { - "epoch": 0.1837837837837838, - "grad_norm": 2.7694804668426514, - "learning_rate": 4.996075534426223e-06, - "loss": 0.8287, - "step": 34 - }, - { - "epoch": 0.1891891891891892, - "grad_norm": 3.405071258544922, - "learning_rate": 4.995834151372526e-06, - "loss": 1.1211, - "step": 35 - }, - { - "epoch": 0.1945945945945946, - "grad_norm": 2.8680710792541504, - "learning_rate": 4.995585570980685e-06, - "loss": 1.0841, - "step": 36 - }, - { - "epoch": 0.2, - "grad_norm": 3.341021776199341, - "learning_rate": 4.995329793967537e-06, - "loss": 0.6182, - "step": 37 - }, - { - "epoch": 0.20540540540540542, - "grad_norm": 3.0639379024505615, - "learning_rate": 4.9950668210706795e-06, - "loss": 0.7647, - "step": 38 - }, - { - "epoch": 0.21081081081081082, - "grad_norm": 3.225759983062744, - "learning_rate": 4.994796653048457e-06, - "loss": 0.8691, - "step": 39 - }, - { - "epoch": 0.21621621621621623, - "grad_norm": 4.56926155090332, - "learning_rate": 4.994519290679965e-06, - "loss": 1.0404, - "step": 40 - }, - { - "epoch": 0.22162162162162163, - "grad_norm": 4.871571063995361, - "learning_rate": 4.994234734765043e-06, - "loss": 1.1877, - "step": 41 - }, - { - "epoch": 0.22702702702702704, - "grad_norm": 3.672215700149536, - "learning_rate": 4.993942986124278e-06, - "loss": 0.959, - "step": 42 - }, - { - "epoch": 0.23243243243243245, - "grad_norm": 3.184683322906494, - "learning_rate": 4.9936440455989975e-06, - "loss": 0.9249, - "step": 43 - }, - { - "epoch": 0.23783783783783785, - "grad_norm": 2.7092034816741943, - "learning_rate": 4.993337914051266e-06, - "loss": 0.6899, - "step": 44 - }, - { - "epoch": 0.24324324324324326, - "grad_norm": 3.153764486312866, - "learning_rate": 4.99302459236389e-06, - "loss": 0.9075, - "step": 45 - }, - { - "epoch": 0.24864864864864866, - "grad_norm": 3.3629748821258545, - "learning_rate": 4.992704081440407e-06, - "loss": 0.785, - "step": 46 - }, - { - "epoch": 0.25405405405405407, - "grad_norm": 4.478365898132324, - "learning_rate": 4.992376382205088e-06, - "loss": 1.008, - "step": 47 - }, - { - "epoch": 0.2594594594594595, - "grad_norm": 3.4001641273498535, - "learning_rate": 4.992041495602932e-06, - "loss": 0.7751, - "step": 48 - }, - { - "epoch": 0.2648648648648649, - "grad_norm": 2.522662878036499, - "learning_rate": 4.991699422599664e-06, - "loss": 0.9022, - "step": 49 - }, - { - "epoch": 0.2702702702702703, - "grad_norm": 2.764458179473877, - "learning_rate": 4.991350164181735e-06, - "loss": 0.8801, - "step": 50 - }, - { - "epoch": 0.2756756756756757, - "grad_norm": 2.814859628677368, - "learning_rate": 4.990993721356317e-06, - "loss": 0.7045, - "step": 51 - }, - { - "epoch": 0.2810810810810811, - "grad_norm": 2.441311836242676, - "learning_rate": 4.990630095151296e-06, - "loss": 0.7312, - "step": 52 - }, - { - "epoch": 0.2864864864864865, - "grad_norm": 2.4443013668060303, - "learning_rate": 4.9902592866152765e-06, - "loss": 0.9609, - "step": 53 - }, - { - "epoch": 0.2918918918918919, - "grad_norm": 2.2934701442718506, - "learning_rate": 4.989881296817575e-06, - "loss": 0.5753, - "step": 54 - }, - { - "epoch": 0.2972972972972973, - "grad_norm": 2.6286847591400146, - "learning_rate": 4.989496126848215e-06, - "loss": 0.5118, - "step": 55 - }, - { - "epoch": 0.3027027027027027, - "grad_norm": 3.6817069053649902, - "learning_rate": 4.989103777817928e-06, - "loss": 1.1261, - "step": 56 - }, - { - "epoch": 0.3081081081081081, - "grad_norm": 3.011197566986084, - "learning_rate": 4.988704250858145e-06, - "loss": 0.7823, - "step": 57 - }, - { - "epoch": 0.31351351351351353, - "grad_norm": 2.5490806102752686, - "learning_rate": 4.988297547121e-06, - "loss": 0.6019, - "step": 58 - }, - { - "epoch": 0.31891891891891894, - "grad_norm": 3.0803146362304688, - "learning_rate": 4.98788366777932e-06, - "loss": 0.825, - "step": 59 - }, - { - "epoch": 0.32432432432432434, - "grad_norm": 3.015730619430542, - "learning_rate": 4.987462614026625e-06, - "loss": 0.7667, - "step": 60 - }, - { - "epoch": 0.32972972972972975, - "grad_norm": 2.5371594429016113, - "learning_rate": 4.987034387077126e-06, - "loss": 0.8051, - "step": 61 - }, - { - "epoch": 0.33513513513513515, - "grad_norm": 2.6414010524749756, - "learning_rate": 4.986598988165718e-06, - "loss": 0.6895, - "step": 62 - }, - { - "epoch": 0.34054054054054056, - "grad_norm": 3.065131187438965, - "learning_rate": 4.9861564185479785e-06, - "loss": 0.9268, - "step": 63 - }, - { - "epoch": 0.34594594594594597, - "grad_norm": 2.5708694458007812, - "learning_rate": 4.985706679500163e-06, - "loss": 0.9854, - "step": 64 - }, - { - "epoch": 0.35135135135135137, - "grad_norm": 2.768915891647339, - "learning_rate": 4.9852497723192025e-06, - "loss": 0.8083, - "step": 65 - }, - { - "epoch": 0.3567567567567568, - "grad_norm": 2.567901849746704, - "learning_rate": 4.9847856983227e-06, - "loss": 0.9098, - "step": 66 - }, - { - "epoch": 0.3621621621621622, - "grad_norm": 2.5766549110412598, - "learning_rate": 4.984314458848923e-06, - "loss": 0.8881, - "step": 67 - }, - { - "epoch": 0.3675675675675676, - "grad_norm": 2.9778389930725098, - "learning_rate": 4.983836055256804e-06, - "loss": 0.9877, - "step": 68 - }, - { - "epoch": 0.372972972972973, - "grad_norm": 2.7225165367126465, - "learning_rate": 4.983350488925935e-06, - "loss": 0.8282, - "step": 69 - }, - { - "epoch": 0.3783783783783784, - "grad_norm": 2.702287197113037, - "learning_rate": 4.982857761256564e-06, - "loss": 1.1756, - "step": 70 - }, - { - "epoch": 0.3837837837837838, - "grad_norm": 2.9815568923950195, - "learning_rate": 4.982357873669589e-06, - "loss": 0.8114, - "step": 71 - }, - { - "epoch": 0.3891891891891892, - "grad_norm": 3.27150297164917, - "learning_rate": 4.981850827606556e-06, - "loss": 0.6763, - "step": 72 - }, - { - "epoch": 0.3945945945945946, - "grad_norm": 2.568423271179199, - "learning_rate": 4.981336624529655e-06, - "loss": 0.9372, - "step": 73 - }, - { - "epoch": 0.4, - "grad_norm": 2.621175527572632, - "learning_rate": 4.980815265921714e-06, - "loss": 1.0155, - "step": 74 - }, - { - "epoch": 0.40540540540540543, - "grad_norm": 2.62827205657959, - "learning_rate": 4.980286753286196e-06, - "loss": 0.949, - "step": 75 - }, - { - "epoch": 0.41081081081081083, - "grad_norm": 2.9462146759033203, - "learning_rate": 4.979751088147192e-06, - "loss": 1.0134, - "step": 76 - }, - { - "epoch": 0.41621621621621624, - "grad_norm": 2.814852714538574, - "learning_rate": 4.979208272049425e-06, - "loss": 0.9722, - "step": 77 - }, - { - "epoch": 0.42162162162162165, - "grad_norm": 4.177679538726807, - "learning_rate": 4.978658306558235e-06, - "loss": 1.2259, - "step": 78 - }, - { - "epoch": 0.42702702702702705, - "grad_norm": 2.813084125518799, - "learning_rate": 4.978101193259578e-06, - "loss": 0.834, - "step": 79 - }, - { - "epoch": 0.43243243243243246, - "grad_norm": 2.71824049949646, - "learning_rate": 4.977536933760025e-06, - "loss": 0.6151, - "step": 80 - }, - { - "epoch": 0.43783783783783786, - "grad_norm": 4.992153167724609, - "learning_rate": 4.976965529686755e-06, - "loss": 1.0475, - "step": 81 - }, - { - "epoch": 0.44324324324324327, - "grad_norm": 2.4810822010040283, - "learning_rate": 4.976386982687548e-06, - "loss": 0.8324, - "step": 82 - }, - { - "epoch": 0.4486486486486487, - "grad_norm": 4.509149074554443, - "learning_rate": 4.9758012944307845e-06, - "loss": 0.997, - "step": 83 - }, - { - "epoch": 0.4540540540540541, - "grad_norm": 3.114325761795044, - "learning_rate": 4.975208466605436e-06, - "loss": 1.2024, - "step": 84 - }, - { - "epoch": 0.4594594594594595, - "grad_norm": 3.297091007232666, - "learning_rate": 4.974608500921064e-06, - "loss": 0.9146, - "step": 85 - }, - { - "epoch": 0.4648648648648649, - "grad_norm": 2.824475049972534, - "learning_rate": 4.974001399107816e-06, - "loss": 0.7181, - "step": 86 - }, - { - "epoch": 0.4702702702702703, - "grad_norm": 20.262290954589844, - "learning_rate": 4.973387162916415e-06, - "loss": 0.8599, - "step": 87 - }, - { - "epoch": 0.4756756756756757, - "grad_norm": 4.015744686126709, - "learning_rate": 4.972765794118158e-06, - "loss": 0.6081, - "step": 88 - }, - { - "epoch": 0.4810810810810811, - "grad_norm": 2.8033058643341064, - "learning_rate": 4.9721372945049114e-06, - "loss": 0.8764, - "step": 89 - }, - { - "epoch": 0.4864864864864865, - "grad_norm": 5.271846294403076, - "learning_rate": 4.971501665889107e-06, - "loss": 0.8622, - "step": 90 - }, - { - "epoch": 0.4918918918918919, - "grad_norm": 2.557264804840088, - "learning_rate": 4.9708589101037306e-06, - "loss": 0.5523, - "step": 91 - }, - { - "epoch": 0.4972972972972973, - "grad_norm": 4.342173099517822, - "learning_rate": 4.970209029002325e-06, - "loss": 0.8922, - "step": 92 - }, - { - "epoch": 0.5027027027027027, - "grad_norm": 2.950364351272583, - "learning_rate": 4.969552024458977e-06, - "loss": 0.9455, - "step": 93 - }, - { - "epoch": 0.5081081081081081, - "grad_norm": 2.6453042030334473, - "learning_rate": 4.968887898368318e-06, - "loss": 0.8342, - "step": 94 - }, - { - "epoch": 0.5135135135135135, - "grad_norm": 3.486766815185547, - "learning_rate": 4.968216652645515e-06, - "loss": 0.8476, - "step": 95 - }, - { - "epoch": 0.518918918918919, - "grad_norm": 2.884152889251709, - "learning_rate": 4.967538289226268e-06, - "loss": 0.8879, - "step": 96 - }, - { - "epoch": 0.5243243243243243, - "grad_norm": 2.4130594730377197, - "learning_rate": 4.966852810066798e-06, - "loss": 0.7114, - "step": 97 - }, - { - "epoch": 0.5297297297297298, - "grad_norm": 3.182410955429077, - "learning_rate": 4.9661602171438524e-06, - "loss": 0.6757, - "step": 98 - }, - { - "epoch": 0.5351351351351351, - "grad_norm": 2.5027542114257812, - "learning_rate": 4.965460512454687e-06, - "loss": 0.8029, - "step": 99 - }, - { - "epoch": 0.5405405405405406, - "grad_norm": 2.3096024990081787, - "learning_rate": 4.964753698017071e-06, - "loss": 0.842, - "step": 100 - }, - { - "epoch": 0.5459459459459459, - "grad_norm": 2.875657081604004, - "learning_rate": 4.964039775869271e-06, - "loss": 0.6339, - "step": 101 - }, - { - "epoch": 0.5513513513513514, - "grad_norm": 2.505406141281128, - "learning_rate": 4.963318748070056e-06, - "loss": 0.7743, - "step": 102 - }, - { - "epoch": 0.5567567567567567, - "grad_norm": 3.552562713623047, - "learning_rate": 4.9625906166986815e-06, - "loss": 0.926, - "step": 103 - }, - { - "epoch": 0.5621621621621622, - "grad_norm": 2.717942476272583, - "learning_rate": 4.961855383854889e-06, - "loss": 0.7037, - "step": 104 - }, - { - "epoch": 0.5675675675675675, - "grad_norm": 2.5049386024475098, - "learning_rate": 4.961113051658901e-06, - "loss": 0.561, - "step": 105 - }, - { - "epoch": 0.572972972972973, - "grad_norm": 2.3112900257110596, - "learning_rate": 4.96036362225141e-06, - "loss": 0.7316, - "step": 106 - }, - { - "epoch": 0.5783783783783784, - "grad_norm": 2.470257520675659, - "learning_rate": 4.959607097793575e-06, - "loss": 0.6426, - "step": 107 - }, - { - "epoch": 0.5837837837837838, - "grad_norm": 3.8040788173675537, - "learning_rate": 4.9588434804670176e-06, - "loss": 1.0044, - "step": 108 - }, - { - "epoch": 0.5891891891891892, - "grad_norm": 3.143547296524048, - "learning_rate": 4.958072772473812e-06, - "loss": 0.9219, - "step": 109 - }, - { - "epoch": 0.5945945945945946, - "grad_norm": 3.5052590370178223, - "learning_rate": 4.9572949760364795e-06, - "loss": 0.6056, - "step": 110 - }, - { - "epoch": 0.6, - "grad_norm": 3.064009428024292, - "learning_rate": 4.9565100933979835e-06, - "loss": 0.6346, - "step": 111 - }, - { - "epoch": 0.6054054054054054, - "grad_norm": 2.694610595703125, - "learning_rate": 4.9557181268217225e-06, - "loss": 0.9856, - "step": 112 - }, - { - "epoch": 0.6108108108108108, - "grad_norm": 2.5885775089263916, - "learning_rate": 4.954919078591521e-06, - "loss": 0.8669, - "step": 113 - }, - { - "epoch": 0.6162162162162163, - "grad_norm": 2.593609571456909, - "learning_rate": 4.954112951011628e-06, - "loss": 0.7201, - "step": 114 - }, - { - "epoch": 0.6216216216216216, - "grad_norm": 3.3045759201049805, - "learning_rate": 4.9532997464067065e-06, - "loss": 0.9095, - "step": 115 - }, - { - "epoch": 0.6270270270270271, - "grad_norm": 2.8144869804382324, - "learning_rate": 4.952479467121828e-06, - "loss": 1.0213, - "step": 116 - }, - { - "epoch": 0.6324324324324324, - "grad_norm": 2.5460312366485596, - "learning_rate": 4.951652115522463e-06, - "loss": 1.1154, - "step": 117 - }, - { - "epoch": 0.6378378378378379, - "grad_norm": 2.795137405395508, - "learning_rate": 4.950817693994481e-06, - "loss": 0.691, - "step": 118 - }, - { - "epoch": 0.6432432432432432, - "grad_norm": 2.4979195594787598, - "learning_rate": 4.949976204944135e-06, - "loss": 0.7224, - "step": 119 - }, - { - "epoch": 0.6486486486486487, - "grad_norm": 3.3131983280181885, - "learning_rate": 4.949127650798063e-06, - "loss": 0.9256, - "step": 120 - }, - { - "epoch": 0.654054054054054, - "grad_norm": 2.9060285091400146, - "learning_rate": 4.948272034003275e-06, - "loss": 0.6892, - "step": 121 - }, - { - "epoch": 0.6594594594594595, - "grad_norm": 3.695594549179077, - "learning_rate": 4.947409357027148e-06, - "loss": 0.5878, - "step": 122 - }, - { - "epoch": 0.6648648648648648, - "grad_norm": 3.1250460147857666, - "learning_rate": 4.9465396223574165e-06, - "loss": 0.9904, - "step": 123 - }, - { - "epoch": 0.6702702702702703, - "grad_norm": 4.024891376495361, - "learning_rate": 4.945662832502172e-06, - "loss": 1.1592, - "step": 124 - }, - { - "epoch": 0.6756756756756757, - "grad_norm": 2.6886494159698486, - "learning_rate": 4.944778989989847e-06, - "loss": 1.0041, - "step": 125 - }, - { - "epoch": 0.6810810810810811, - "grad_norm": 2.366912841796875, - "learning_rate": 4.943888097369216e-06, - "loss": 0.7045, - "step": 126 - }, - { - "epoch": 0.6864864864864865, - "grad_norm": 2.394932270050049, - "learning_rate": 4.942990157209381e-06, - "loss": 0.6685, - "step": 127 - }, - { - "epoch": 0.6918918918918919, - "grad_norm": 2.61933970451355, - "learning_rate": 4.9420851720997674e-06, - "loss": 0.8812, - "step": 128 - }, - { - "epoch": 0.6972972972972973, - "grad_norm": 2.7395646572113037, - "learning_rate": 4.94117314465012e-06, - "loss": 1.3014, - "step": 129 - }, - { - "epoch": 0.7027027027027027, - "grad_norm": 3.065484046936035, - "learning_rate": 4.940254077490487e-06, - "loss": 0.6978, - "step": 130 - }, - { - "epoch": 0.7081081081081081, - "grad_norm": 2.895038366317749, - "learning_rate": 4.939327973271222e-06, - "loss": 0.6249, - "step": 131 - }, - { - "epoch": 0.7135135135135136, - "grad_norm": 3.1773312091827393, - "learning_rate": 4.9383948346629665e-06, - "loss": 0.6423, - "step": 132 - }, - { - "epoch": 0.7189189189189189, - "grad_norm": 2.2378008365631104, - "learning_rate": 4.937454664356652e-06, - "loss": 0.7193, - "step": 133 - }, - { - "epoch": 0.7243243243243244, - "grad_norm": 2.5673701763153076, - "learning_rate": 4.9365074650634855e-06, - "loss": 0.7065, - "step": 134 - }, - { - "epoch": 0.7297297297297297, - "grad_norm": 2.7348387241363525, - "learning_rate": 4.9355532395149445e-06, - "loss": 1.0046, - "step": 135 - }, - { - "epoch": 0.7351351351351352, - "grad_norm": 2.391741991043091, - "learning_rate": 4.9345919904627655e-06, - "loss": 0.6771, - "step": 136 - }, - { - "epoch": 0.7405405405405405, - "grad_norm": 2.2096705436706543, - "learning_rate": 4.933623720678944e-06, - "loss": 0.6589, - "step": 137 - }, - { - "epoch": 0.745945945945946, - "grad_norm": 3.0840072631835938, - "learning_rate": 4.932648432955718e-06, - "loss": 0.8755, - "step": 138 - }, - { - "epoch": 0.7513513513513513, - "grad_norm": 2.4970428943634033, - "learning_rate": 4.931666130105564e-06, - "loss": 0.6685, - "step": 139 - }, - { - "epoch": 0.7567567567567568, - "grad_norm": 4.315455436706543, - "learning_rate": 4.930676814961189e-06, - "loss": 0.8101, - "step": 140 - }, - { - "epoch": 0.7621621621621621, - "grad_norm": 5.388065814971924, - "learning_rate": 4.92968049037552e-06, - "loss": 0.8193, - "step": 141 - }, - { - "epoch": 0.7675675675675676, - "grad_norm": 2.6107139587402344, - "learning_rate": 4.9286771592217005e-06, - "loss": 0.7852, - "step": 142 - }, - { - "epoch": 0.772972972972973, - "grad_norm": 3.936556577682495, - "learning_rate": 4.927666824393076e-06, - "loss": 1.0388, - "step": 143 - }, - { - "epoch": 0.7783783783783784, - "grad_norm": 2.74424409866333, - "learning_rate": 4.926649488803191e-06, - "loss": 0.8266, - "step": 144 - }, - { - "epoch": 0.7837837837837838, - "grad_norm": 2.8998451232910156, - "learning_rate": 4.925625155385776e-06, - "loss": 0.4895, - "step": 145 - }, - { - "epoch": 0.7891891891891892, - "grad_norm": 3.0631520748138428, - "learning_rate": 4.924593827094743e-06, - "loss": 0.8759, - "step": 146 - }, - { - "epoch": 0.7945945945945946, - "grad_norm": 3.233267307281494, - "learning_rate": 4.923555506904176e-06, - "loss": 0.701, - "step": 147 - }, - { - "epoch": 0.8, - "grad_norm": 2.87701416015625, - "learning_rate": 4.922510197808321e-06, - "loss": 1.1327, - "step": 148 - }, - { - "epoch": 0.8054054054054054, - "grad_norm": 3.650576114654541, - "learning_rate": 4.921457902821578e-06, - "loss": 0.7587, - "step": 149 - }, - { - "epoch": 0.8108108108108109, - "grad_norm": 3.232112407684326, - "learning_rate": 4.920398624978493e-06, - "loss": 1.2158, - "step": 150 - }, - { - "epoch": 0.8162162162162162, - "grad_norm": 2.468384027481079, - "learning_rate": 4.919332367333748e-06, - "loss": 0.6852, - "step": 151 - }, - { - "epoch": 0.8216216216216217, - "grad_norm": 2.5947415828704834, - "learning_rate": 4.918259132962154e-06, - "loss": 0.6611, - "step": 152 - }, - { - "epoch": 0.827027027027027, - "grad_norm": 3.0171427726745605, - "learning_rate": 4.917178924958638e-06, - "loss": 0.7327, - "step": 153 - }, - { - "epoch": 0.8324324324324325, - "grad_norm": 3.293184518814087, - "learning_rate": 4.916091746438243e-06, - "loss": 0.8528, - "step": 154 - }, - { - "epoch": 0.8378378378378378, - "grad_norm": 4.0570969581604, - "learning_rate": 4.9149976005361085e-06, - "loss": 0.9141, - "step": 155 - }, - { - "epoch": 0.8432432432432433, - "grad_norm": 2.8782784938812256, - "learning_rate": 4.913896490407467e-06, - "loss": 1.1132, - "step": 156 - }, - { - "epoch": 0.8486486486486486, - "grad_norm": 2.5671517848968506, - "learning_rate": 4.912788419227635e-06, - "loss": 0.7587, - "step": 157 - }, - { - "epoch": 0.8540540540540541, - "grad_norm": 2.9445390701293945, - "learning_rate": 4.911673390192002e-06, - "loss": 0.9227, - "step": 158 - }, - { - "epoch": 0.8594594594594595, - "grad_norm": 2.472595453262329, - "learning_rate": 4.910551406516023e-06, - "loss": 0.8154, - "step": 159 - }, - { - "epoch": 0.8648648648648649, - "grad_norm": 2.5233397483825684, - "learning_rate": 4.909422471435207e-06, - "loss": 0.9897, - "step": 160 - }, - { - "epoch": 0.8702702702702703, - "grad_norm": 3.3919546604156494, - "learning_rate": 4.90828658820511e-06, - "loss": 0.6162, - "step": 161 - }, - { - "epoch": 0.8756756756756757, - "grad_norm": 3.060908555984497, - "learning_rate": 4.907143760101325e-06, - "loss": 0.5734, - "step": 162 - }, - { - "epoch": 0.8810810810810811, - "grad_norm": 3.4584782123565674, - "learning_rate": 4.905993990419472e-06, - "loss": 0.8328, - "step": 163 - }, - { - "epoch": 0.8864864864864865, - "grad_norm": 2.936570644378662, - "learning_rate": 4.904837282475187e-06, - "loss": 0.6787, - "step": 164 - }, - { - "epoch": 0.8918918918918919, - "grad_norm": 2.564837694168091, - "learning_rate": 4.9036736396041165e-06, - "loss": 0.9658, - "step": 165 - }, - { - "epoch": 0.8972972972972973, - "grad_norm": 3.2509360313415527, - "learning_rate": 4.902503065161905e-06, - "loss": 0.7899, - "step": 166 - }, - { - "epoch": 0.9027027027027027, - "grad_norm": 2.9730329513549805, - "learning_rate": 4.901325562524185e-06, - "loss": 0.9476, - "step": 167 - }, - { - "epoch": 0.9081081081081082, - "grad_norm": 3.044980049133301, - "learning_rate": 4.900141135086569e-06, - "loss": 0.7589, - "step": 168 - }, - { - "epoch": 0.9135135135135135, - "grad_norm": 3.030585527420044, - "learning_rate": 4.898949786264638e-06, - "loss": 0.6724, - "step": 169 - }, - { - "epoch": 0.918918918918919, - "grad_norm": 2.249122142791748, - "learning_rate": 4.897751519493933e-06, - "loss": 0.6968, - "step": 170 - }, - { - "epoch": 0.9243243243243243, - "grad_norm": 2.9816982746124268, - "learning_rate": 4.896546338229945e-06, - "loss": 0.7984, - "step": 171 - }, - { - "epoch": 0.9297297297297298, - "grad_norm": 2.415736675262451, - "learning_rate": 4.8953342459481034e-06, - "loss": 0.6109, - "step": 172 - }, - { - "epoch": 0.9351351351351351, - "grad_norm": 2.740518808364868, - "learning_rate": 4.894115246143768e-06, - "loss": 0.8126, - "step": 173 - }, - { - "epoch": 0.9405405405405406, - "grad_norm": 2.7610201835632324, - "learning_rate": 4.892889342332218e-06, - "loss": 0.6862, - "step": 174 - }, - { - "epoch": 0.9459459459459459, - "grad_norm": 3.057025194168091, - "learning_rate": 4.891656538048642e-06, - "loss": 0.9895, - "step": 175 - }, - { - "epoch": 0.9513513513513514, - "grad_norm": 2.569751262664795, - "learning_rate": 4.890416836848128e-06, - "loss": 0.8481, - "step": 176 - }, - { - "epoch": 0.9567567567567568, - "grad_norm": 2.4443397521972656, - "learning_rate": 4.889170242305652e-06, - "loss": 0.6478, - "step": 177 - }, - { - "epoch": 0.9621621621621622, - "grad_norm": 2.5009846687316895, - "learning_rate": 4.887916758016069e-06, - "loss": 0.9714, - "step": 178 - }, - { - "epoch": 0.9675675675675676, - "grad_norm": 3.101975202560425, - "learning_rate": 4.886656387594104e-06, - "loss": 1.1264, - "step": 179 - }, - { - "epoch": 0.972972972972973, - "grad_norm": 2.6144704818725586, - "learning_rate": 4.885389134674338e-06, - "loss": 0.7664, - "step": 180 - }, - { - "epoch": 0.9783783783783784, - "grad_norm": 2.5834381580352783, - "learning_rate": 4.884115002911197e-06, - "loss": 0.6131, - "step": 181 - }, - { - "epoch": 0.9837837837837838, - "grad_norm": 2.5378055572509766, - "learning_rate": 4.88283399597895e-06, - "loss": 0.8733, - "step": 182 - }, - { - "epoch": 0.9891891891891892, - "grad_norm": 2.4095377922058105, - "learning_rate": 4.881546117571686e-06, - "loss": 0.643, - "step": 183 - }, - { - "epoch": 0.9945945945945946, - "grad_norm": 2.9554507732391357, - "learning_rate": 4.8802513714033135e-06, - "loss": 0.7287, - "step": 184 - }, - { - "epoch": 1.0, - "grad_norm": 2.8279213905334473, - "learning_rate": 4.878949761207545e-06, - "loss": 0.9927, - "step": 185 - }, - { - "epoch": 1.0054054054054054, - "grad_norm": 2.9361412525177, - "learning_rate": 4.8776412907378845e-06, - "loss": 0.66, - "step": 186 - }, - { - "epoch": 1.0108108108108107, - "grad_norm": 3.392244338989258, - "learning_rate": 4.876325963767623e-06, - "loss": 0.594, - "step": 187 - }, - { - "epoch": 1.0162162162162163, - "grad_norm": 2.6276044845581055, - "learning_rate": 4.875003784089822e-06, - "loss": 0.5825, - "step": 188 - }, - { - "epoch": 1.0216216216216216, - "grad_norm": 2.2875545024871826, - "learning_rate": 4.873674755517305e-06, - "loss": 0.6594, - "step": 189 - }, - { - "epoch": 1.027027027027027, - "grad_norm": 2.8086795806884766, - "learning_rate": 4.872338881882645e-06, - "loss": 0.7536, - "step": 190 - }, - { - "epoch": 1.0324324324324325, - "grad_norm": 2.3685200214385986, - "learning_rate": 4.870996167038154e-06, - "loss": 0.4849, - "step": 191 - }, - { - "epoch": 1.037837837837838, - "grad_norm": 3.0264766216278076, - "learning_rate": 4.869646614855877e-06, - "loss": 0.3771, - "step": 192 - }, - { - "epoch": 1.0432432432432432, - "grad_norm": 4.335122108459473, - "learning_rate": 4.868290229227567e-06, - "loss": 0.8545, - "step": 193 - }, - { - "epoch": 1.0486486486486486, - "grad_norm": 3.442172050476074, - "learning_rate": 4.866927014064692e-06, - "loss": 0.3698, - "step": 194 - }, - { - "epoch": 1.054054054054054, - "grad_norm": 3.326539993286133, - "learning_rate": 4.86555697329841e-06, - "loss": 0.8468, - "step": 195 - }, - { - "epoch": 1.0594594594594595, - "grad_norm": 3.0372447967529297, - "learning_rate": 4.864180110879562e-06, - "loss": 0.8232, - "step": 196 - }, - { - "epoch": 1.0648648648648649, - "grad_norm": 2.955343008041382, - "learning_rate": 4.862796430778663e-06, - "loss": 0.4097, - "step": 197 - }, - { - "epoch": 1.0702702702702702, - "grad_norm": 2.4095399379730225, - "learning_rate": 4.861405936985889e-06, - "loss": 0.6746, - "step": 198 - }, - { - "epoch": 1.0756756756756758, - "grad_norm": 2.763500452041626, - "learning_rate": 4.860008633511059e-06, - "loss": 0.6605, - "step": 199 - }, - { - "epoch": 1.0810810810810811, - "grad_norm": 2.6751155853271484, - "learning_rate": 4.8586045243836384e-06, - "loss": 0.471, - "step": 200 - }, - { - "epoch": 1.0864864864864865, - "grad_norm": 3.3507862091064453, - "learning_rate": 4.857193613652711e-06, - "loss": 0.7665, - "step": 201 - }, - { - "epoch": 1.0918918918918918, - "grad_norm": 3.3064827919006348, - "learning_rate": 4.8557759053869775e-06, - "loss": 0.6436, - "step": 202 - }, - { - "epoch": 1.0972972972972972, - "grad_norm": 2.571828603744507, - "learning_rate": 4.854351403674741e-06, - "loss": 0.4642, - "step": 203 - }, - { - "epoch": 1.1027027027027028, - "grad_norm": 2.883220911026001, - "learning_rate": 4.852920112623895e-06, - "loss": 0.5737, - "step": 204 - }, - { - "epoch": 1.1081081081081081, - "grad_norm": 3.026144027709961, - "learning_rate": 4.851482036361912e-06, - "loss": 0.7302, - "step": 205 - }, - { - "epoch": 1.1135135135135135, - "grad_norm": 2.6689612865448, - "learning_rate": 4.850037179035829e-06, - "loss": 0.5229, - "step": 206 - }, - { - "epoch": 1.118918918918919, - "grad_norm": 2.4019956588745117, - "learning_rate": 4.8485855448122425e-06, - "loss": 0.5529, - "step": 207 - }, - { - "epoch": 1.1243243243243244, - "grad_norm": 2.3546230792999268, - "learning_rate": 4.847127137877286e-06, - "loss": 0.3635, - "step": 208 - }, - { - "epoch": 1.1297297297297297, - "grad_norm": 2.999096393585205, - "learning_rate": 4.8456619624366285e-06, - "loss": 0.8149, - "step": 209 - }, - { - "epoch": 1.135135135135135, - "grad_norm": 10.072900772094727, - "learning_rate": 4.844190022715456e-06, - "loss": 0.8333, - "step": 210 - }, - { - "epoch": 1.1405405405405404, - "grad_norm": 2.222123384475708, - "learning_rate": 4.84271132295846e-06, - "loss": 0.3717, - "step": 211 - }, - { - "epoch": 1.145945945945946, - "grad_norm": 2.8751113414764404, - "learning_rate": 4.841225867429826e-06, - "loss": 0.5994, - "step": 212 - }, - { - "epoch": 1.1513513513513514, - "grad_norm": 2.9580111503601074, - "learning_rate": 4.839733660413224e-06, - "loss": 0.8382, - "step": 213 - }, - { - "epoch": 1.1567567567567567, - "grad_norm": 4.628892421722412, - "learning_rate": 4.838234706211792e-06, - "loss": 0.818, - "step": 214 - }, - { - "epoch": 1.1621621621621623, - "grad_norm": 2.5103509426116943, - "learning_rate": 4.836729009148124e-06, - "loss": 0.4267, - "step": 215 - }, - { - "epoch": 1.1675675675675676, - "grad_norm": 2.6093738079071045, - "learning_rate": 4.835216573564261e-06, - "loss": 0.3472, - "step": 216 - }, - { - "epoch": 1.172972972972973, - "grad_norm": 3.0792338848114014, - "learning_rate": 4.833697403821672e-06, - "loss": 0.6323, - "step": 217 - }, - { - "epoch": 1.1783783783783783, - "grad_norm": 2.845163345336914, - "learning_rate": 4.8321715043012516e-06, - "loss": 0.6831, - "step": 218 - }, - { - "epoch": 1.1837837837837837, - "grad_norm": 3.0433948040008545, - "learning_rate": 4.830638879403296e-06, - "loss": 0.3682, - "step": 219 - }, - { - "epoch": 1.1891891891891893, - "grad_norm": 2.6533594131469727, - "learning_rate": 4.8290995335475e-06, - "loss": 0.4154, - "step": 220 - }, - { - "epoch": 1.1945945945945946, - "grad_norm": 2.9271352291107178, - "learning_rate": 4.827553471172935e-06, - "loss": 0.3991, - "step": 221 - }, - { - "epoch": 1.2, - "grad_norm": 2.9243528842926025, - "learning_rate": 4.826000696738045e-06, - "loss": 0.4538, - "step": 222 - }, - { - "epoch": 1.2054054054054055, - "grad_norm": 2.537332534790039, - "learning_rate": 4.824441214720629e-06, - "loss": 0.7692, - "step": 223 - }, - { - "epoch": 1.2108108108108109, - "grad_norm": 3.9193246364593506, - "learning_rate": 4.8228750296178275e-06, - "loss": 0.6038, - "step": 224 - }, - { - "epoch": 1.2162162162162162, - "grad_norm": 2.6646728515625, - "learning_rate": 4.821302145946113e-06, - "loss": 0.4147, - "step": 225 - }, - { - "epoch": 1.2216216216216216, - "grad_norm": 2.6519482135772705, - "learning_rate": 4.819722568241274e-06, - "loss": 0.5398, - "step": 226 - }, - { - "epoch": 1.227027027027027, - "grad_norm": 2.2018048763275146, - "learning_rate": 4.818136301058401e-06, - "loss": 0.3864, - "step": 227 - }, - { - "epoch": 1.2324324324324325, - "grad_norm": 2.5660712718963623, - "learning_rate": 4.816543348971879e-06, - "loss": 0.5712, - "step": 228 - }, - { - "epoch": 1.2378378378378379, - "grad_norm": 3.237663745880127, - "learning_rate": 4.814943716575368e-06, - "loss": 0.662, - "step": 229 - }, - { - "epoch": 1.2432432432432432, - "grad_norm": 2.5570430755615234, - "learning_rate": 4.813337408481793e-06, - "loss": 0.8661, - "step": 230 - }, - { - "epoch": 1.2486486486486488, - "grad_norm": 2.9231269359588623, - "learning_rate": 4.811724429323329e-06, - "loss": 0.9218, - "step": 231 - }, - { - "epoch": 1.2540540540540541, - "grad_norm": 3.637084722518921, - "learning_rate": 4.810104783751389e-06, - "loss": 0.5597, - "step": 232 - }, - { - "epoch": 1.2594594594594595, - "grad_norm": 3.0218842029571533, - "learning_rate": 4.8084784764366125e-06, - "loss": 0.4786, - "step": 233 - }, - { - "epoch": 1.2648648648648648, - "grad_norm": 2.770214080810547, - "learning_rate": 4.806845512068846e-06, - "loss": 0.5219, - "step": 234 - }, - { - "epoch": 1.2702702702702702, - "grad_norm": 3.093053102493286, - "learning_rate": 4.805205895357137e-06, - "loss": 0.643, - "step": 235 - }, - { - "epoch": 1.2756756756756757, - "grad_norm": 2.6373348236083984, - "learning_rate": 4.803559631029713e-06, - "loss": 0.5858, - "step": 236 - }, - { - "epoch": 1.281081081081081, - "grad_norm": 2.452030897140503, - "learning_rate": 4.801906723833973e-06, - "loss": 0.4185, - "step": 237 - }, - { - "epoch": 1.2864864864864864, - "grad_norm": 2.72564697265625, - "learning_rate": 4.8002471785364734e-06, - "loss": 0.4917, - "step": 238 - }, - { - "epoch": 1.291891891891892, - "grad_norm": 3.0389158725738525, - "learning_rate": 4.798580999922913e-06, - "loss": 0.645, - "step": 239 - }, - { - "epoch": 1.2972972972972974, - "grad_norm": 3.7002289295196533, - "learning_rate": 4.796908192798117e-06, - "loss": 0.5378, - "step": 240 - }, - { - "epoch": 1.3027027027027027, - "grad_norm": 2.1876111030578613, - "learning_rate": 4.7952287619860276e-06, - "loss": 0.5197, - "step": 241 - }, - { - "epoch": 1.308108108108108, - "grad_norm": 3.903337240219116, - "learning_rate": 4.793542712329689e-06, - "loss": 1.0226, - "step": 242 - }, - { - "epoch": 1.3135135135135134, - "grad_norm": 2.3623552322387695, - "learning_rate": 4.791850048691228e-06, - "loss": 0.5502, - "step": 243 - }, - { - "epoch": 1.318918918918919, - "grad_norm": 3.0669031143188477, - "learning_rate": 4.79015077595185e-06, - "loss": 0.6976, - "step": 244 - }, - { - "epoch": 1.3243243243243243, - "grad_norm": 3.1480472087860107, - "learning_rate": 4.788444899011816e-06, - "loss": 0.4795, - "step": 245 - }, - { - "epoch": 1.3297297297297297, - "grad_norm": 3.7051920890808105, - "learning_rate": 4.786732422790432e-06, - "loss": 0.6526, - "step": 246 - }, - { - "epoch": 1.3351351351351353, - "grad_norm": 3.4358389377593994, - "learning_rate": 4.785013352226036e-06, - "loss": 0.5551, - "step": 247 - }, - { - "epoch": 1.3405405405405406, - "grad_norm": 2.3789355754852295, - "learning_rate": 4.7832876922759805e-06, - "loss": 0.3151, - "step": 248 - }, - { - "epoch": 1.345945945945946, - "grad_norm": 2.4843716621398926, - "learning_rate": 4.781555447916622e-06, - "loss": 0.6713, - "step": 249 - }, - { - "epoch": 1.3513513513513513, - "grad_norm": 3.0176303386688232, - "learning_rate": 4.779816624143302e-06, - "loss": 0.437, - "step": 250 - }, - { - "epoch": 1.3567567567567567, - "grad_norm": 2.868350028991699, - "learning_rate": 4.77807122597034e-06, - "loss": 0.7632, - "step": 251 - }, - { - "epoch": 1.3621621621621622, - "grad_norm": 2.4629738330841064, - "learning_rate": 4.776319258431009e-06, - "loss": 0.4894, - "step": 252 - }, - { - "epoch": 1.3675675675675676, - "grad_norm": 2.798297882080078, - "learning_rate": 4.77456072657753e-06, - "loss": 0.4456, - "step": 253 - }, - { - "epoch": 1.372972972972973, - "grad_norm": 3.2977547645568848, - "learning_rate": 4.772795635481053e-06, - "loss": 0.5381, - "step": 254 - }, - { - "epoch": 1.3783783783783785, - "grad_norm": 4.1061906814575195, - "learning_rate": 4.77102399023164e-06, - "loss": 1.0302, - "step": 255 - }, - { - "epoch": 1.3837837837837839, - "grad_norm": 3.943284511566162, - "learning_rate": 4.769245795938261e-06, - "loss": 0.4875, - "step": 256 - }, - { - "epoch": 1.3891891891891892, - "grad_norm": 2.6420533657073975, - "learning_rate": 4.767461057728763e-06, - "loss": 0.4923, - "step": 257 - }, - { - "epoch": 1.3945945945945946, - "grad_norm": 3.3152263164520264, - "learning_rate": 4.76566978074987e-06, - "loss": 0.6699, - "step": 258 - }, - { - "epoch": 1.4, - "grad_norm": 2.6928882598876953, - "learning_rate": 4.7638719701671586e-06, - "loss": 0.6117, - "step": 259 - }, - { - "epoch": 1.4054054054054055, - "grad_norm": 2.706597328186035, - "learning_rate": 4.762067631165049e-06, - "loss": 0.8534, - "step": 260 - }, - { - "epoch": 1.4108108108108108, - "grad_norm": 2.9912848472595215, - "learning_rate": 4.760256768946787e-06, - "loss": 0.5057, - "step": 261 - }, - { - "epoch": 1.4162162162162162, - "grad_norm": 2.7098443508148193, - "learning_rate": 4.758439388734429e-06, - "loss": 0.7286, - "step": 262 - }, - { - "epoch": 1.4216216216216218, - "grad_norm": 3.1288092136383057, - "learning_rate": 4.7566154957688276e-06, - "loss": 0.9827, - "step": 263 - }, - { - "epoch": 1.427027027027027, - "grad_norm": 3.0505919456481934, - "learning_rate": 4.754785095309617e-06, - "loss": 0.7042, - "step": 264 - }, - { - "epoch": 1.4324324324324325, - "grad_norm": 2.6800339221954346, - "learning_rate": 4.752948192635199e-06, - "loss": 0.5179, - "step": 265 - }, - { - "epoch": 1.4378378378378378, - "grad_norm": 2.2246861457824707, - "learning_rate": 4.751104793042722e-06, - "loss": 0.8527, - "step": 266 - }, - { - "epoch": 1.4432432432432432, - "grad_norm": 2.4242751598358154, - "learning_rate": 4.7492549018480725e-06, - "loss": 0.5627, - "step": 267 - }, - { - "epoch": 1.4486486486486487, - "grad_norm": 2.763244152069092, - "learning_rate": 4.747398524385858e-06, - "loss": 0.8981, - "step": 268 - }, - { - "epoch": 1.454054054054054, - "grad_norm": 2.856595993041992, - "learning_rate": 4.745535666009389e-06, - "loss": 0.5455, - "step": 269 - }, - { - "epoch": 1.4594594594594594, - "grad_norm": 2.4168624877929688, - "learning_rate": 4.743666332090664e-06, - "loss": 0.4348, - "step": 270 - }, - { - "epoch": 1.464864864864865, - "grad_norm": 2.5408060550689697, - "learning_rate": 4.74179052802036e-06, - "loss": 0.5524, - "step": 271 - }, - { - "epoch": 1.4702702702702704, - "grad_norm": 2.6216673851013184, - "learning_rate": 4.739908259207807e-06, - "loss": 0.7469, - "step": 272 - }, - { - "epoch": 1.4756756756756757, - "grad_norm": 5.397300720214844, - "learning_rate": 4.738019531080981e-06, - "loss": 0.7216, - "step": 273 - }, - { - "epoch": 1.481081081081081, - "grad_norm": 3.3481080532073975, - "learning_rate": 4.7361243490864825e-06, - "loss": 0.7527, - "step": 274 - }, - { - "epoch": 1.4864864864864864, - "grad_norm": 2.7943873405456543, - "learning_rate": 4.734222718689527e-06, - "loss": 0.7437, - "step": 275 - }, - { - "epoch": 1.491891891891892, - "grad_norm": 2.206890344619751, - "learning_rate": 4.732314645373922e-06, - "loss": 0.5187, - "step": 276 - }, - { - "epoch": 1.4972972972972973, - "grad_norm": 2.76442813873291, - "learning_rate": 4.730400134642055e-06, - "loss": 0.7186, - "step": 277 - }, - { - "epoch": 1.5027027027027027, - "grad_norm": 3.4754087924957275, - "learning_rate": 4.728479192014879e-06, - "loss": 0.9655, - "step": 278 - }, - { - "epoch": 1.5081081081081082, - "grad_norm": 2.923779249191284, - "learning_rate": 4.726551823031895e-06, - "loss": 0.6251, - "step": 279 - }, - { - "epoch": 1.5135135135135136, - "grad_norm": 3.1142773628234863, - "learning_rate": 4.7246180332511335e-06, - "loss": 0.4805, - "step": 280 - }, - { - "epoch": 1.518918918918919, - "grad_norm": 2.3477070331573486, - "learning_rate": 4.722677828249142e-06, - "loss": 1.0939, - "step": 281 - }, - { - "epoch": 1.5243243243243243, - "grad_norm": 2.8418569564819336, - "learning_rate": 4.720731213620972e-06, - "loss": 0.9485, - "step": 282 - }, - { - "epoch": 1.5297297297297296, - "grad_norm": 2.462710380554199, - "learning_rate": 4.718778194980152e-06, - "loss": 0.5805, - "step": 283 - }, - { - "epoch": 1.535135135135135, - "grad_norm": 3.2379209995269775, - "learning_rate": 4.7168187779586805e-06, - "loss": 0.77, - "step": 284 - }, - { - "epoch": 1.5405405405405406, - "grad_norm": 3.0701661109924316, - "learning_rate": 4.71485296820701e-06, - "loss": 0.5932, - "step": 285 - }, - { - "epoch": 1.545945945945946, - "grad_norm": 4.099547386169434, - "learning_rate": 4.7128807713940245e-06, - "loss": 0.6296, - "step": 286 - }, - { - "epoch": 1.5513513513513515, - "grad_norm": 2.5529167652130127, - "learning_rate": 4.710902193207028e-06, - "loss": 0.6201, - "step": 287 - }, - { - "epoch": 1.5567567567567568, - "grad_norm": 2.794926881790161, - "learning_rate": 4.708917239351727e-06, - "loss": 0.5682, - "step": 288 - }, - { - "epoch": 1.5621621621621622, - "grad_norm": 3.2522501945495605, - "learning_rate": 4.706925915552214e-06, - "loss": 0.8877, - "step": 289 - }, - { - "epoch": 1.5675675675675675, - "grad_norm": 2.811847448348999, - "learning_rate": 4.704928227550949e-06, - "loss": 0.6521, - "step": 290 - }, - { - "epoch": 1.572972972972973, - "grad_norm": 2.7060673236846924, - "learning_rate": 4.702924181108745e-06, - "loss": 0.4929, - "step": 291 - }, - { - "epoch": 1.5783783783783782, - "grad_norm": 2.5009031295776367, - "learning_rate": 4.700913782004755e-06, - "loss": 0.4515, - "step": 292 - }, - { - "epoch": 1.5837837837837838, - "grad_norm": 2.6722700595855713, - "learning_rate": 4.698897036036446e-06, - "loss": 0.5477, - "step": 293 - }, - { - "epoch": 1.5891891891891892, - "grad_norm": 3.3333957195281982, - "learning_rate": 4.696873949019591e-06, - "loss": 0.9589, - "step": 294 - }, - { - "epoch": 1.5945945945945947, - "grad_norm": 2.4862897396087646, - "learning_rate": 4.694844526788248e-06, - "loss": 0.4425, - "step": 295 - }, - { - "epoch": 1.6, - "grad_norm": 2.78708553314209, - "learning_rate": 4.692808775194745e-06, - "loss": 0.4899, - "step": 296 - }, - { - "epoch": 1.6054054054054054, - "grad_norm": 2.9121289253234863, - "learning_rate": 4.690766700109659e-06, - "loss": 0.4884, - "step": 297 - }, - { - "epoch": 1.6108108108108108, - "grad_norm": 4.692054271697998, - "learning_rate": 4.688718307421807e-06, - "loss": 0.8977, - "step": 298 - }, - { - "epoch": 1.6162162162162161, - "grad_norm": 3.1290926933288574, - "learning_rate": 4.686663603038222e-06, - "loss": 0.6833, - "step": 299 - }, - { - "epoch": 1.6216216216216215, - "grad_norm": 3.5091123580932617, - "learning_rate": 4.6846025928841365e-06, - "loss": 0.9141, - "step": 300 - }, - { - "epoch": 1.627027027027027, - "grad_norm": 2.5466184616088867, - "learning_rate": 4.6825352829029705e-06, - "loss": 0.5121, - "step": 301 - }, - { - "epoch": 1.6324324324324324, - "grad_norm": 2.7833092212677, - "learning_rate": 4.68046167905631e-06, - "loss": 0.5399, - "step": 302 - }, - { - "epoch": 1.637837837837838, - "grad_norm": 3.05135440826416, - "learning_rate": 4.678381787323889e-06, - "loss": 0.7921, - "step": 303 - }, - { - "epoch": 1.6432432432432433, - "grad_norm": 2.2391726970672607, - "learning_rate": 4.676295613703577e-06, - "loss": 0.7178, - "step": 304 - }, - { - "epoch": 1.6486486486486487, - "grad_norm": 2.3654022216796875, - "learning_rate": 4.674203164211357e-06, - "loss": 0.7162, - "step": 305 - }, - { - "epoch": 1.654054054054054, - "grad_norm": 2.436009645462036, - "learning_rate": 4.67210444488131e-06, - "loss": 0.6539, - "step": 306 - }, - { - "epoch": 1.6594594594594594, - "grad_norm": 2.6034209728240967, - "learning_rate": 4.669999461765599e-06, - "loss": 0.7214, - "step": 307 - }, - { - "epoch": 1.6648648648648647, - "grad_norm": 2.804229497909546, - "learning_rate": 4.6678882209344474e-06, - "loss": 0.7451, - "step": 308 - }, - { - "epoch": 1.6702702702702703, - "grad_norm": 2.6239655017852783, - "learning_rate": 4.665770728476127e-06, - "loss": 0.6464, - "step": 309 - }, - { - "epoch": 1.6756756756756757, - "grad_norm": 2.9320099353790283, - "learning_rate": 4.663646990496939e-06, - "loss": 0.6669, - "step": 310 - }, - { - "epoch": 1.6810810810810812, - "grad_norm": 3.09713077545166, - "learning_rate": 4.661517013121189e-06, - "loss": 0.8972, - "step": 311 - }, - { - "epoch": 1.6864864864864866, - "grad_norm": 3.6576132774353027, - "learning_rate": 4.659380802491181e-06, - "loss": 0.6286, - "step": 312 - }, - { - "epoch": 1.691891891891892, - "grad_norm": 2.9320433139801025, - "learning_rate": 4.6572383647671915e-06, - "loss": 0.3631, - "step": 313 - }, - { - "epoch": 1.6972972972972973, - "grad_norm": 3.399357557296753, - "learning_rate": 4.655089706127457e-06, - "loss": 0.5682, - "step": 314 - }, - { - "epoch": 1.7027027027027026, - "grad_norm": 2.7667412757873535, - "learning_rate": 4.652934832768148e-06, - "loss": 0.5457, - "step": 315 - }, - { - "epoch": 1.708108108108108, - "grad_norm": 2.3023321628570557, - "learning_rate": 4.650773750903363e-06, - "loss": 0.6601, - "step": 316 - }, - { - "epoch": 1.7135135135135136, - "grad_norm": 2.6584670543670654, - "learning_rate": 4.6486064667651005e-06, - "loss": 0.5882, - "step": 317 - }, - { - "epoch": 1.718918918918919, - "grad_norm": 5.528168678283691, - "learning_rate": 4.646432986603245e-06, - "loss": 0.7628, - "step": 318 - }, - { - "epoch": 1.7243243243243245, - "grad_norm": 3.054884195327759, - "learning_rate": 4.644253316685552e-06, - "loss": 0.6877, - "step": 319 - }, - { - "epoch": 1.7297297297297298, - "grad_norm": 3.2672388553619385, - "learning_rate": 4.6420674632976205e-06, - "loss": 0.7026, - "step": 320 - }, - { - "epoch": 1.7351351351351352, - "grad_norm": 3.109384536743164, - "learning_rate": 4.639875432742886e-06, - "loss": 0.5236, - "step": 321 - }, - { - "epoch": 1.7405405405405405, - "grad_norm": 3.3593883514404297, - "learning_rate": 4.6376772313425975e-06, - "loss": 0.6463, - "step": 322 - }, - { - "epoch": 1.7459459459459459, - "grad_norm": 2.6352698802948, - "learning_rate": 4.635472865435795e-06, - "loss": 0.6903, - "step": 323 - }, - { - "epoch": 1.7513513513513512, - "grad_norm": 2.751690149307251, - "learning_rate": 4.6332623413792995e-06, - "loss": 0.7342, - "step": 324 - }, - { - "epoch": 1.7567567567567568, - "grad_norm": 2.670915126800537, - "learning_rate": 4.6310456655476874e-06, - "loss": 0.4302, - "step": 325 - }, - { - "epoch": 1.7621621621621621, - "grad_norm": 2.7648138999938965, - "learning_rate": 4.6288228443332786e-06, - "loss": 0.5108, - "step": 326 - }, - { - "epoch": 1.7675675675675677, - "grad_norm": 2.7451536655426025, - "learning_rate": 4.626593884146111e-06, - "loss": 0.7646, - "step": 327 - }, - { - "epoch": 1.772972972972973, - "grad_norm": 2.4656403064727783, - "learning_rate": 4.624358791413928e-06, - "loss": 0.5529, - "step": 328 - }, - { - "epoch": 1.7783783783783784, - "grad_norm": 2.5987517833709717, - "learning_rate": 4.622117572582159e-06, - "loss": 0.609, - "step": 329 - }, - { - "epoch": 1.7837837837837838, - "grad_norm": 3.3843371868133545, - "learning_rate": 4.619870234113894e-06, - "loss": 0.9146, - "step": 330 - }, - { - "epoch": 1.7891891891891891, - "grad_norm": 2.3542068004608154, - "learning_rate": 4.617616782489878e-06, - "loss": 0.6887, - "step": 331 - }, - { - "epoch": 1.7945945945945945, - "grad_norm": 2.2049715518951416, - "learning_rate": 4.615357224208477e-06, - "loss": 0.505, - "step": 332 - }, - { - "epoch": 1.8, - "grad_norm": 2.453920364379883, - "learning_rate": 4.613091565785674e-06, - "loss": 0.8384, - "step": 333 - }, - { - "epoch": 1.8054054054054054, - "grad_norm": 2.5751583576202393, - "learning_rate": 4.610819813755038e-06, - "loss": 0.5512, - "step": 334 - }, - { - "epoch": 1.810810810810811, - "grad_norm": 2.524075984954834, - "learning_rate": 4.608541974667714e-06, - "loss": 0.4877, - "step": 335 - }, - { - "epoch": 1.8162162162162163, - "grad_norm": 2.2856955528259277, - "learning_rate": 4.606258055092397e-06, - "loss": 0.5583, - "step": 336 - }, - { - "epoch": 1.8216216216216217, - "grad_norm": 2.2773683071136475, - "learning_rate": 4.603968061615321e-06, - "loss": 0.5421, - "step": 337 - }, - { - "epoch": 1.827027027027027, - "grad_norm": 4.085512161254883, - "learning_rate": 4.601672000840231e-06, - "loss": 0.942, - "step": 338 - }, - { - "epoch": 1.8324324324324324, - "grad_norm": 2.3710968494415283, - "learning_rate": 4.5993698793883715e-06, - "loss": 0.3773, - "step": 339 - }, - { - "epoch": 1.8378378378378377, - "grad_norm": 2.745534658432007, - "learning_rate": 4.597061703898462e-06, - "loss": 0.9694, - "step": 340 - }, - { - "epoch": 1.8432432432432433, - "grad_norm": 2.463207244873047, - "learning_rate": 4.594747481026685e-06, - "loss": 0.4667, - "step": 341 - }, - { - "epoch": 1.8486486486486486, - "grad_norm": 2.7216601371765137, - "learning_rate": 4.592427217446656e-06, - "loss": 0.4267, - "step": 342 - }, - { - "epoch": 1.8540540540540542, - "grad_norm": 2.545664072036743, - "learning_rate": 4.590100919849413e-06, - "loss": 0.9245, - "step": 343 - }, - { - "epoch": 1.8594594594594596, - "grad_norm": 3.692840337753296, - "learning_rate": 4.587768594943396e-06, - "loss": 0.7502, - "step": 344 - }, - { - "epoch": 1.864864864864865, - "grad_norm": 2.993229627609253, - "learning_rate": 4.585430249454426e-06, - "loss": 0.4689, - "step": 345 - }, - { - "epoch": 1.8702702702702703, - "grad_norm": 2.162867546081543, - "learning_rate": 4.583085890125682e-06, - "loss": 0.6188, - "step": 346 - }, - { - "epoch": 1.8756756756756756, - "grad_norm": 2.2169792652130127, - "learning_rate": 4.5807355237176896e-06, - "loss": 0.6352, - "step": 347 - }, - { - "epoch": 1.881081081081081, - "grad_norm": 3.978985548019409, - "learning_rate": 4.578379157008296e-06, - "loss": 0.464, - "step": 348 - }, - { - "epoch": 1.8864864864864865, - "grad_norm": 2.236682653427124, - "learning_rate": 4.57601679679265e-06, - "loss": 0.5943, - "step": 349 - }, - { - "epoch": 1.8918918918918919, - "grad_norm": 2.528754472732544, - "learning_rate": 4.573648449883188e-06, - "loss": 0.6949, - "step": 350 - }, - { - "epoch": 1.8972972972972975, - "grad_norm": 2.7673721313476562, - "learning_rate": 4.571274123109606e-06, - "loss": 0.4333, - "step": 351 - }, - { - "epoch": 1.9027027027027028, - "grad_norm": 2.698012351989746, - "learning_rate": 4.568893823318847e-06, - "loss": 0.6796, - "step": 352 - }, - { - "epoch": 1.9081081081081082, - "grad_norm": 2.9640560150146484, - "learning_rate": 4.566507557375077e-06, - "loss": 0.6139, - "step": 353 - }, - { - "epoch": 1.9135135135135135, - "grad_norm": 2.417628526687622, - "learning_rate": 4.5641153321596684e-06, - "loss": 0.4515, - "step": 354 - }, - { - "epoch": 1.9189189189189189, - "grad_norm": 2.676739454269409, - "learning_rate": 4.56171715457118e-06, - "loss": 0.8426, - "step": 355 - }, - { - "epoch": 1.9243243243243242, - "grad_norm": 2.8428189754486084, - "learning_rate": 4.559313031525331e-06, - "loss": 0.5806, - "step": 356 - }, - { - "epoch": 1.9297297297297298, - "grad_norm": 2.6817944049835205, - "learning_rate": 4.55690296995499e-06, - "loss": 0.5927, - "step": 357 - }, - { - "epoch": 1.9351351351351351, - "grad_norm": 3.5939931869506836, - "learning_rate": 4.554486976810149e-06, - "loss": 0.9986, - "step": 358 - }, - { - "epoch": 1.9405405405405407, - "grad_norm": 2.86688494682312, - "learning_rate": 4.552065059057906e-06, - "loss": 0.6813, - "step": 359 - }, - { - "epoch": 1.945945945945946, - "grad_norm": 2.9295246601104736, - "learning_rate": 4.549637223682441e-06, - "loss": 1.0832, - "step": 360 - }, - { - "epoch": 1.9513513513513514, - "grad_norm": 2.6939451694488525, - "learning_rate": 4.547203477685005e-06, - "loss": 0.7377, - "step": 361 - }, - { - "epoch": 1.9567567567567568, - "grad_norm": 2.226055145263672, - "learning_rate": 4.544763828083888e-06, - "loss": 0.5412, - "step": 362 - }, - { - "epoch": 1.962162162162162, - "grad_norm": 2.490187406539917, - "learning_rate": 4.542318281914405e-06, - "loss": 0.6955, - "step": 363 - }, - { - "epoch": 1.9675675675675675, - "grad_norm": 2.9241302013397217, - "learning_rate": 4.53986684622888e-06, - "loss": 0.6774, - "step": 364 - }, - { - "epoch": 1.972972972972973, - "grad_norm": 2.988084554672241, - "learning_rate": 4.537409528096615e-06, - "loss": 0.5832, - "step": 365 - }, - { - "epoch": 1.9783783783783784, - "grad_norm": 2.9380626678466797, - "learning_rate": 4.534946334603879e-06, - "loss": 0.606, - "step": 366 - }, - { - "epoch": 1.983783783783784, - "grad_norm": 2.667588710784912, - "learning_rate": 4.532477272853882e-06, - "loss": 0.4991, - "step": 367 - }, - { - "epoch": 1.9891891891891893, - "grad_norm": 2.9711899757385254, - "learning_rate": 4.530002349966759e-06, - "loss": 0.4442, - "step": 368 - }, - { - "epoch": 1.9945945945945946, - "grad_norm": 3.443957805633545, - "learning_rate": 4.5275215730795445e-06, - "loss": 0.6566, - "step": 369 - }, - { - "epoch": 2.0, - "grad_norm": 3.590317487716675, - "learning_rate": 4.525034949346156e-06, - "loss": 0.5687, - "step": 370 - } - ], - "logging_steps": 1, - "max_steps": 1850, - "num_input_tokens_seen": 0, - "num_train_epochs": 10, - "save_steps": 206, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 9.981871016797798e+16, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/chat_template.jinja b/metallama3_8b/limo_filtered_incorrect/checkpoint-555/chat_template.jinja deleted file mode 100644 index 39bd0c9f7fe30aea14eda194fee17703da4a4dbf..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/chat_template.jinja +++ /dev/null @@ -1,5 +0,0 @@ -{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> - -'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> - -' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/config.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-555/config.json deleted file mode 100644 index ec5612543540085e09eed37e81b17ae51d1a6973..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 128000, - "eos_token_id": 128009, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 8192, - "mlp_bias": false, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 500000.0, - "tie_word_embeddings": false, - "torch_dtype": "float32", - "transformers_version": "4.55.0", - "use_cache": false, - "vocab_size": 128256 -} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/generation_config.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-555/generation_config.json deleted file mode 100644 index f53ccb516e57388491adda6b9950bcfa872e93ae..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/generation_config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 128000, - "eos_token_id": 128009, - "transformers_version": "4.55.0", - "use_cache": false -} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/model-00001-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-555/model-00001-of-00007.safetensors deleted file mode 100644 index 28e5dfe6783a7caaeab8a9dac4bf120cf51726ab..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/model-00001-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5d895285aa7705cb9f416eb525b305bb7d99c8669cf0b6143a1685476d9be51f -size 4886466168 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/model-00002-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-555/model-00002-of-00007.safetensors deleted file mode 100644 index 75ea4a2aec429df68544cc6755ff8db3ecf98102..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/model-00002-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8cbeaba098155300635ce2da33a10501ab354bd71a1cc47a6aa3b5084e7538bb -size 4832007448 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/model-00003-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-555/model-00003-of-00007.safetensors deleted file mode 100644 index 56e4bba0551948d895e52443ad81e2117c321384..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/model-00003-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:231b10d1607be9cc9cfceda89dfd402f79487e6b1e6a3ed66eb335cbae0c3cbf -size 4999813112 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/model-00004-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-555/model-00004-of-00007.safetensors deleted file mode 100644 index 050ba82beefe782052db45186c92468070e42bc2..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/model-00004-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ca5dae03166ef6957dcf4764b46c495616526c552986bd3ac7b1dc0aef30b156 -size 4999813128 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/model-00005-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-555/model-00005-of-00007.safetensors deleted file mode 100644 index a01d20d2511718c6748a828c145b3276e38daa1e..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/model-00005-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:326b8236011aa89d25dcce253841a2451888113bd24f9b3e66b9dcdeb0b7198c -size 4832007496 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/model-00006-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-555/model-00006-of-00007.safetensors deleted file mode 100644 index 7581b97c00eeed021a864e054d0b6b01259e28ab..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/model-00006-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ba38ef24ebf07e15911b60fe88f64d7e756d70132c6f37080ae7e457a48301cd -size 4999813120 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/model-00007-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-555/model-00007-of-00007.safetensors deleted file mode 100644 index f389dae84d6bfbad299b1a63a52d07b817bc281b..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/model-00007-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a513a44459b557ebf8c09ffb531ffd886ed3657b74644b5674f4e36f524e0aa6 -size 2571158184 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/model.safetensors.index.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-555/model.safetensors.index.json deleted file mode 100644 index 30d31d54f352f0c71ad48745af612a088822fa48..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/model.safetensors.index.json +++ /dev/null @@ -1,299 +0,0 @@ -{ - "metadata": { - "total_parameters": 2007565312, - "total_size": 32121044992 - }, - "weight_map": { - "lm_head.weight": "model-00007-of-00007.safetensors", - "model.embed_tokens.weight": "model-00001-of-00007.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.norm.weight": "model-00007-of-00007.safetensors" - } -} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/rng_state_0.pth b/metallama3_8b/limo_filtered_incorrect/checkpoint-555/rng_state_0.pth deleted file mode 100644 index 5a7c482c30381cd512ccc35fe322d8a34fbf5207..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:308f94f9a5c24e1bad5c393d56ae7af7782600f4e791d9c6ac35b22fff2105b6 -size 15024 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/rng_state_1.pth b/metallama3_8b/limo_filtered_incorrect/checkpoint-555/rng_state_1.pth deleted file mode 100644 index 7b862c21b28bbd89ce6b4fb681d41be05f175599..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b056f3c23cb32dc77a2ec9e7651e0b64e4440e21f0fdf969b86bfc56a1cbdf06 -size 15024 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/rng_state_2.pth b/metallama3_8b/limo_filtered_incorrect/checkpoint-555/rng_state_2.pth deleted file mode 100644 index d86ce886844e0298f058d67065e5eeb27ffe7e48..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f3f8a05714bc528f4885a2816181652f2303b3e8150f89b56aaee6bec56aa520 -size 15024 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/rng_state_3.pth b/metallama3_8b/limo_filtered_incorrect/checkpoint-555/rng_state_3.pth deleted file mode 100644 index 10733f5da657367adf3f67760028644c0839660f..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4f755bd3c330281961e5c03af9d10ce8c1e1678619d384f6f1fd5fd7dce2ff50 -size 15024 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/scheduler.pt b/metallama3_8b/limo_filtered_incorrect/checkpoint-555/scheduler.pt deleted file mode 100644 index b5ef5d5bc69688d1ea8e6d5ad023f05d565e28ec..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dc4feab59fd9564b63b496a3407935222b59dba4dce2c00001693dbc56989f23 -size 1064 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/special_tokens_map.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-555/special_tokens_map.json deleted file mode 100644 index 14daf4588e61b4e4983af0fccaba4d5500c0977c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/special_tokens_map.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "additional_special_tokens": [ - { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } - ], - "bos_token": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": "<|eot_id|>" -} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/tokenizer.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-555/tokenizer.json deleted file mode 100644 index 172311123ab62378f1f6d90f3068a676b7d939ed..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c1dcab308e7cf5970ea38815e0a62887d705c5b436f869ca27a5dcdd40c36a6 -size 17210148 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/tokenizer_config.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-555/tokenizer_config.json deleted file mode 100644 index 6739fcd129e717b71b64001dcb25a03c143d66f5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/tokenizer_config.json +++ /dev/null @@ -1,2076 +0,0 @@ -{ - "added_tokens_decoder": { - "128000": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128001": { - "content": "<|end_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128002": { - "content": "<|reserved_special_token_0|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128003": { - "content": "<|reserved_special_token_1|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128004": { - "content": "<|reserved_special_token_2|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128005": { - "content": "<|reserved_special_token_3|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128006": { - "content": "<|start_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128007": { - "content": "<|end_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128008": { - "content": "<|reserved_special_token_4|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128009": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128010": { - "content": "<|reserved_special_token_5|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128011": { - "content": "<|reserved_special_token_6|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128012": { - "content": "<|reserved_special_token_7|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128013": { - "content": "<|reserved_special_token_8|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128014": { - "content": "<|reserved_special_token_9|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128015": { - "content": "<|reserved_special_token_10|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128016": { - "content": "<|reserved_special_token_11|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128017": { - "content": "<|reserved_special_token_12|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128018": { - "content": "<|reserved_special_token_13|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128019": { - "content": "<|reserved_special_token_14|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128020": { - "content": "<|reserved_special_token_15|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128021": { - "content": "<|reserved_special_token_16|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128022": { - "content": "<|reserved_special_token_17|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128023": { - "content": "<|reserved_special_token_18|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128024": { - "content": "<|reserved_special_token_19|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128025": { - "content": "<|reserved_special_token_20|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128026": { - "content": "<|reserved_special_token_21|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128027": { - "content": "<|reserved_special_token_22|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128028": { - "content": "<|reserved_special_token_23|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128029": { - "content": "<|reserved_special_token_24|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128030": { - "content": "<|reserved_special_token_25|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128031": { - "content": "<|reserved_special_token_26|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128032": { - "content": "<|reserved_special_token_27|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128033": { - "content": "<|reserved_special_token_28|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128034": { - "content": "<|reserved_special_token_29|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128035": { - "content": "<|reserved_special_token_30|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128036": { - "content": "<|reserved_special_token_31|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128037": { - "content": "<|reserved_special_token_32|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128038": { - "content": "<|reserved_special_token_33|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128039": { - "content": "<|reserved_special_token_34|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128040": { - "content": "<|reserved_special_token_35|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128041": { - "content": "<|reserved_special_token_36|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128042": { - "content": "<|reserved_special_token_37|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128043": { - "content": "<|reserved_special_token_38|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128044": { - "content": "<|reserved_special_token_39|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128045": { - "content": "<|reserved_special_token_40|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128046": { - "content": "<|reserved_special_token_41|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128047": { - "content": "<|reserved_special_token_42|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128048": { - "content": "<|reserved_special_token_43|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128049": { - "content": "<|reserved_special_token_44|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128050": { - "content": "<|reserved_special_token_45|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128051": { - "content": "<|reserved_special_token_46|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128052": { - "content": "<|reserved_special_token_47|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128053": { - "content": "<|reserved_special_token_48|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128054": { - "content": "<|reserved_special_token_49|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128055": { - "content": "<|reserved_special_token_50|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128056": { - "content": "<|reserved_special_token_51|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128057": { - "content": "<|reserved_special_token_52|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128058": { - "content": "<|reserved_special_token_53|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128059": { - "content": "<|reserved_special_token_54|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128060": { - "content": "<|reserved_special_token_55|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128061": { - "content": "<|reserved_special_token_56|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128062": { - "content": "<|reserved_special_token_57|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128063": { - "content": "<|reserved_special_token_58|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128064": { - "content": "<|reserved_special_token_59|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128065": { - "content": "<|reserved_special_token_60|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128066": { - "content": "<|reserved_special_token_61|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128067": { - "content": "<|reserved_special_token_62|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128068": { - "content": "<|reserved_special_token_63|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128069": { - "content": "<|reserved_special_token_64|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128070": { - "content": "<|reserved_special_token_65|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128071": { - "content": "<|reserved_special_token_66|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128072": { - "content": "<|reserved_special_token_67|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128073": { - "content": "<|reserved_special_token_68|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128074": { - "content": "<|reserved_special_token_69|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128075": { - "content": "<|reserved_special_token_70|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128076": { - "content": "<|reserved_special_token_71|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128077": { - "content": "<|reserved_special_token_72|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128078": { - "content": "<|reserved_special_token_73|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128079": { - "content": "<|reserved_special_token_74|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128080": { - "content": "<|reserved_special_token_75|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128081": { - "content": "<|reserved_special_token_76|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128082": { - "content": "<|reserved_special_token_77|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128083": { - "content": "<|reserved_special_token_78|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128084": { - "content": "<|reserved_special_token_79|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128085": { - "content": "<|reserved_special_token_80|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128086": { - "content": "<|reserved_special_token_81|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128087": { - "content": "<|reserved_special_token_82|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128088": { - "content": "<|reserved_special_token_83|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128089": { - "content": "<|reserved_special_token_84|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128090": { - "content": "<|reserved_special_token_85|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128091": { - "content": "<|reserved_special_token_86|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128092": { - "content": "<|reserved_special_token_87|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128093": { - "content": "<|reserved_special_token_88|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128094": { - "content": "<|reserved_special_token_89|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128095": { - "content": "<|reserved_special_token_90|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128096": { - "content": "<|reserved_special_token_91|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128097": { - "content": "<|reserved_special_token_92|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128098": { - "content": "<|reserved_special_token_93|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128099": { - "content": "<|reserved_special_token_94|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128100": { - "content": "<|reserved_special_token_95|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128101": { - "content": "<|reserved_special_token_96|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128102": { - "content": "<|reserved_special_token_97|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128103": { - "content": "<|reserved_special_token_98|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128104": { - "content": "<|reserved_special_token_99|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128105": { - "content": "<|reserved_special_token_100|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128106": { - "content": "<|reserved_special_token_101|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128107": { - "content": "<|reserved_special_token_102|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128108": { - "content": "<|reserved_special_token_103|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128109": { - "content": "<|reserved_special_token_104|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128110": { - "content": "<|reserved_special_token_105|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128111": { - "content": "<|reserved_special_token_106|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128112": { - "content": "<|reserved_special_token_107|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128113": { - "content": "<|reserved_special_token_108|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128114": { - "content": "<|reserved_special_token_109|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128115": { - "content": "<|reserved_special_token_110|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128116": { - "content": "<|reserved_special_token_111|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128117": { - "content": "<|reserved_special_token_112|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128118": { - "content": "<|reserved_special_token_113|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128119": { - "content": "<|reserved_special_token_114|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128120": { - "content": "<|reserved_special_token_115|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128121": { - "content": "<|reserved_special_token_116|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128122": { - "content": "<|reserved_special_token_117|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128123": { - "content": "<|reserved_special_token_118|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128124": { - "content": "<|reserved_special_token_119|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128125": { - "content": "<|reserved_special_token_120|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128126": { - "content": "<|reserved_special_token_121|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128127": { - "content": "<|reserved_special_token_122|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128128": { - "content": "<|reserved_special_token_123|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128129": { - "content": "<|reserved_special_token_124|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128130": { - "content": "<|reserved_special_token_125|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128131": { - "content": "<|reserved_special_token_126|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128132": { - "content": "<|reserved_special_token_127|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128133": { - "content": "<|reserved_special_token_128|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128134": { - "content": "<|reserved_special_token_129|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128135": { - "content": "<|reserved_special_token_130|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128136": { - "content": "<|reserved_special_token_131|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128137": { - "content": "<|reserved_special_token_132|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128138": { - "content": "<|reserved_special_token_133|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128139": { - "content": "<|reserved_special_token_134|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128140": { - "content": "<|reserved_special_token_135|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128141": { - "content": "<|reserved_special_token_136|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128142": { - "content": "<|reserved_special_token_137|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128143": { - "content": "<|reserved_special_token_138|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128144": { - "content": "<|reserved_special_token_139|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128145": { - "content": "<|reserved_special_token_140|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128146": { - "content": "<|reserved_special_token_141|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128147": { - "content": "<|reserved_special_token_142|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128148": { - "content": "<|reserved_special_token_143|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128149": { - "content": "<|reserved_special_token_144|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128150": { - "content": "<|reserved_special_token_145|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128151": { - "content": "<|reserved_special_token_146|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128152": { - "content": "<|reserved_special_token_147|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128153": { - "content": "<|reserved_special_token_148|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128154": { - "content": "<|reserved_special_token_149|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128155": { - "content": "<|reserved_special_token_150|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128156": { - "content": "<|reserved_special_token_151|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128157": { - "content": "<|reserved_special_token_152|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128158": { - "content": "<|reserved_special_token_153|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128159": { - "content": "<|reserved_special_token_154|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128160": { - "content": "<|reserved_special_token_155|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128161": { - "content": "<|reserved_special_token_156|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128162": { - "content": "<|reserved_special_token_157|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128163": { - "content": "<|reserved_special_token_158|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128164": { - "content": "<|reserved_special_token_159|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128165": { - "content": "<|reserved_special_token_160|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128166": { - "content": "<|reserved_special_token_161|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128167": { - "content": "<|reserved_special_token_162|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128168": { - "content": "<|reserved_special_token_163|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128169": { - "content": "<|reserved_special_token_164|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128170": { - "content": "<|reserved_special_token_165|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128171": { - "content": "<|reserved_special_token_166|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128172": { - "content": "<|reserved_special_token_167|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128173": { - "content": "<|reserved_special_token_168|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128174": { - "content": "<|reserved_special_token_169|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128175": { - "content": "<|reserved_special_token_170|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128176": { - "content": "<|reserved_special_token_171|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128177": { - "content": "<|reserved_special_token_172|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128178": { - "content": "<|reserved_special_token_173|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128179": { - "content": "<|reserved_special_token_174|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128180": { - "content": "<|reserved_special_token_175|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128181": { - "content": "<|reserved_special_token_176|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128182": { - "content": "<|reserved_special_token_177|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128183": { - "content": "<|reserved_special_token_178|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128184": { - "content": "<|reserved_special_token_179|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128185": { - "content": "<|reserved_special_token_180|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128186": { - "content": "<|reserved_special_token_181|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128187": { - "content": "<|reserved_special_token_182|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128188": { - "content": "<|reserved_special_token_183|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128189": { - "content": "<|reserved_special_token_184|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128190": { - "content": "<|reserved_special_token_185|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128191": { - "content": "<|reserved_special_token_186|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128192": { - "content": "<|reserved_special_token_187|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128193": { - "content": "<|reserved_special_token_188|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128194": { - "content": "<|reserved_special_token_189|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128195": { - "content": "<|reserved_special_token_190|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128196": { - "content": "<|reserved_special_token_191|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128197": { - "content": "<|reserved_special_token_192|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128198": { - "content": "<|reserved_special_token_193|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128199": { - "content": "<|reserved_special_token_194|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128200": { - "content": "<|reserved_special_token_195|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128201": { - "content": "<|reserved_special_token_196|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128202": { - "content": "<|reserved_special_token_197|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128203": { - "content": "<|reserved_special_token_198|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128204": { - "content": "<|reserved_special_token_199|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128205": { - "content": "<|reserved_special_token_200|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128206": { - "content": "<|reserved_special_token_201|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128207": { - "content": "<|reserved_special_token_202|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128208": { - "content": "<|reserved_special_token_203|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128209": { - "content": "<|reserved_special_token_204|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128210": { - "content": "<|reserved_special_token_205|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128211": { - "content": "<|reserved_special_token_206|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128212": { - "content": "<|reserved_special_token_207|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128213": { - "content": "<|reserved_special_token_208|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128214": { - "content": "<|reserved_special_token_209|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128215": { - "content": "<|reserved_special_token_210|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128216": { - "content": "<|reserved_special_token_211|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128217": { - "content": "<|reserved_special_token_212|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128218": { - "content": "<|reserved_special_token_213|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128219": { - "content": "<|reserved_special_token_214|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128220": { - "content": "<|reserved_special_token_215|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128221": { - "content": "<|reserved_special_token_216|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128222": { - "content": "<|reserved_special_token_217|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128223": { - "content": "<|reserved_special_token_218|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128224": { - "content": "<|reserved_special_token_219|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128225": { - "content": "<|reserved_special_token_220|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128226": { - "content": "<|reserved_special_token_221|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128227": { - "content": "<|reserved_special_token_222|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128228": { - "content": "<|reserved_special_token_223|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128229": { - "content": "<|reserved_special_token_224|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128230": { - "content": "<|reserved_special_token_225|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128231": { - "content": "<|reserved_special_token_226|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128232": { - "content": "<|reserved_special_token_227|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128233": { - "content": "<|reserved_special_token_228|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128234": { - "content": "<|reserved_special_token_229|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128235": { - "content": "<|reserved_special_token_230|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128236": { - "content": "<|reserved_special_token_231|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128237": { - "content": "<|reserved_special_token_232|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128238": { - "content": "<|reserved_special_token_233|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128239": { - "content": "<|reserved_special_token_234|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128240": { - "content": "<|reserved_special_token_235|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128241": { - "content": "<|reserved_special_token_236|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128242": { - "content": "<|reserved_special_token_237|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128243": { - "content": "<|reserved_special_token_238|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128244": { - "content": "<|reserved_special_token_239|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128245": { - "content": "<|reserved_special_token_240|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128246": { - "content": "<|reserved_special_token_241|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128247": { - "content": "<|reserved_special_token_242|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128248": { - "content": "<|reserved_special_token_243|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128249": { - "content": "<|reserved_special_token_244|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128250": { - "content": "<|reserved_special_token_245|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128251": { - "content": "<|reserved_special_token_246|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128252": { - "content": "<|reserved_special_token_247|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128253": { - "content": "<|reserved_special_token_248|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128254": { - "content": "<|reserved_special_token_249|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128255": { - "content": "<|reserved_special_token_250|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128256": { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - } - }, - "additional_special_tokens": [ - "<|eom_id|>" - ], - "bos_token": "<|begin_of_text|>", - "clean_up_tokenization_spaces": true, - "eos_token": "<|eot_id|>", - "extra_special_tokens": {}, - "model_input_names": [ - "input_ids", - "attention_mask" - ], - "model_max_length": 1000000000000000019884624838656, - "pad_token": "<|eot_id|>", - "padding_side": "right", - "split_special_tokens": false, - "tokenizer_class": "PreTrainedTokenizerFast" -} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/trainer_state.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-555/trainer_state.json deleted file mode 100644 index 042bc248c0f7824b92dfcdf8ad456d3477a0a314..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-555/trainer_state.json +++ /dev/null @@ -1,3919 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 3.0, - "eval_steps": 500, - "global_step": 555, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.005405405405405406, - "grad_norm": 72.60939025878906, - "learning_rate": 5e-06, - "loss": 2.9165, - "step": 1 - }, - { - "epoch": 0.010810810810810811, - "grad_norm": 29.01830291748047, - "learning_rate": 4.999996395324314e-06, - "loss": 1.9314, - "step": 2 - }, - { - "epoch": 0.016216216216216217, - "grad_norm": 21.44908332824707, - "learning_rate": 4.99998558130765e-06, - "loss": 1.5709, - "step": 3 - }, - { - "epoch": 0.021621621621621623, - "grad_norm": 4.490907669067383, - "learning_rate": 4.999967557981192e-06, - "loss": 0.8099, - "step": 4 - }, - { - "epoch": 0.02702702702702703, - "grad_norm": 4.000796794891357, - "learning_rate": 4.999942325396917e-06, - "loss": 0.9021, - "step": 5 - }, - { - "epoch": 0.032432432432432434, - "grad_norm": 18.513282775878906, - "learning_rate": 4.999909883627588e-06, - "loss": 1.7972, - "step": 6 - }, - { - "epoch": 0.03783783783783784, - "grad_norm": 3.5735981464385986, - "learning_rate": 4.999870232766757e-06, - "loss": 1.4306, - "step": 7 - }, - { - "epoch": 0.043243243243243246, - "grad_norm": 3.1145193576812744, - "learning_rate": 4.9998233729287696e-06, - "loss": 1.051, - "step": 8 - }, - { - "epoch": 0.04864864864864865, - "grad_norm": 3.856376886367798, - "learning_rate": 4.999769304248755e-06, - "loss": 0.8089, - "step": 9 - }, - { - "epoch": 0.05405405405405406, - "grad_norm": 4.05589485168457, - "learning_rate": 4.9997080268826344e-06, - "loss": 1.0999, - "step": 10 - }, - { - "epoch": 0.05945945945945946, - "grad_norm": 13.784229278564453, - "learning_rate": 4.9996395410071165e-06, - "loss": 1.2831, - "step": 11 - }, - { - "epoch": 0.06486486486486487, - "grad_norm": 6.079237937927246, - "learning_rate": 4.999563846819696e-06, - "loss": 1.2874, - "step": 12 - }, - { - "epoch": 0.07027027027027027, - "grad_norm": 4.5971245765686035, - "learning_rate": 4.999480944538655e-06, - "loss": 0.96, - "step": 13 - }, - { - "epoch": 0.07567567567567568, - "grad_norm": 4.916017532348633, - "learning_rate": 4.999390834403063e-06, - "loss": 0.9869, - "step": 14 - }, - { - "epoch": 0.08108108108108109, - "grad_norm": 3.2311055660247803, - "learning_rate": 4.999293516672773e-06, - "loss": 0.9293, - "step": 15 - }, - { - "epoch": 0.08648648648648649, - "grad_norm": 3.3040921688079834, - "learning_rate": 4.9991889916284255e-06, - "loss": 0.8914, - "step": 16 - }, - { - "epoch": 0.0918918918918919, - "grad_norm": 3.794267416000366, - "learning_rate": 4.999077259571442e-06, - "loss": 1.0176, - "step": 17 - }, - { - "epoch": 0.0972972972972973, - "grad_norm": 4.788509845733643, - "learning_rate": 4.998958320824031e-06, - "loss": 1.0259, - "step": 18 - }, - { - "epoch": 0.10270270270270271, - "grad_norm": 10.027527809143066, - "learning_rate": 4.998832175729179e-06, - "loss": 1.3356, - "step": 19 - }, - { - "epoch": 0.10810810810810811, - "grad_norm": 4.612483978271484, - "learning_rate": 4.998698824650656e-06, - "loss": 1.4486, - "step": 20 - }, - { - "epoch": 0.11351351351351352, - "grad_norm": 3.8676936626434326, - "learning_rate": 4.998558267973014e-06, - "loss": 0.8372, - "step": 21 - }, - { - "epoch": 0.11891891891891893, - "grad_norm": 2.9611001014709473, - "learning_rate": 4.998410506101579e-06, - "loss": 0.7931, - "step": 22 - }, - { - "epoch": 0.12432432432432433, - "grad_norm": 5.508745193481445, - "learning_rate": 4.9982555394624595e-06, - "loss": 1.3022, - "step": 23 - }, - { - "epoch": 0.12972972972972974, - "grad_norm": 3.434845209121704, - "learning_rate": 4.998093368502539e-06, - "loss": 0.9739, - "step": 24 - }, - { - "epoch": 0.13513513513513514, - "grad_norm": 4.736802101135254, - "learning_rate": 4.9979239936894765e-06, - "loss": 1.1154, - "step": 25 - }, - { - "epoch": 0.14054054054054055, - "grad_norm": 3.69411039352417, - "learning_rate": 4.997747415511705e-06, - "loss": 0.7543, - "step": 26 - }, - { - "epoch": 0.14594594594594595, - "grad_norm": 2.8646645545959473, - "learning_rate": 4.997563634478428e-06, - "loss": 0.7278, - "step": 27 - }, - { - "epoch": 0.15135135135135136, - "grad_norm": 6.56904935836792, - "learning_rate": 4.997372651119626e-06, - "loss": 0.8167, - "step": 28 - }, - { - "epoch": 0.15675675675675677, - "grad_norm": 2.955914258956909, - "learning_rate": 4.997174465986044e-06, - "loss": 0.8031, - "step": 29 - }, - { - "epoch": 0.16216216216216217, - "grad_norm": 2.5714259147644043, - "learning_rate": 4.996969079649196e-06, - "loss": 0.689, - "step": 30 - }, - { - "epoch": 0.16756756756756758, - "grad_norm": 3.5165364742279053, - "learning_rate": 4.996756492701362e-06, - "loss": 0.8059, - "step": 31 - }, - { - "epoch": 0.17297297297297298, - "grad_norm": 3.2861921787261963, - "learning_rate": 4.996536705755591e-06, - "loss": 0.9658, - "step": 32 - }, - { - "epoch": 0.1783783783783784, - "grad_norm": 2.962470531463623, - "learning_rate": 4.996309719445687e-06, - "loss": 0.8349, - "step": 33 - }, - { - "epoch": 0.1837837837837838, - "grad_norm": 2.7694804668426514, - "learning_rate": 4.996075534426223e-06, - "loss": 0.8287, - "step": 34 - }, - { - "epoch": 0.1891891891891892, - "grad_norm": 3.405071258544922, - "learning_rate": 4.995834151372526e-06, - "loss": 1.1211, - "step": 35 - }, - { - "epoch": 0.1945945945945946, - "grad_norm": 2.8680710792541504, - "learning_rate": 4.995585570980685e-06, - "loss": 1.0841, - "step": 36 - }, - { - "epoch": 0.2, - "grad_norm": 3.341021776199341, - "learning_rate": 4.995329793967537e-06, - "loss": 0.6182, - "step": 37 - }, - { - "epoch": 0.20540540540540542, - "grad_norm": 3.0639379024505615, - "learning_rate": 4.9950668210706795e-06, - "loss": 0.7647, - "step": 38 - }, - { - "epoch": 0.21081081081081082, - "grad_norm": 3.225759983062744, - "learning_rate": 4.994796653048457e-06, - "loss": 0.8691, - "step": 39 - }, - { - "epoch": 0.21621621621621623, - "grad_norm": 4.56926155090332, - "learning_rate": 4.994519290679965e-06, - "loss": 1.0404, - "step": 40 - }, - { - "epoch": 0.22162162162162163, - "grad_norm": 4.871571063995361, - "learning_rate": 4.994234734765043e-06, - "loss": 1.1877, - "step": 41 - }, - { - "epoch": 0.22702702702702704, - "grad_norm": 3.672215700149536, - "learning_rate": 4.993942986124278e-06, - "loss": 0.959, - "step": 42 - }, - { - "epoch": 0.23243243243243245, - "grad_norm": 3.184683322906494, - "learning_rate": 4.9936440455989975e-06, - "loss": 0.9249, - "step": 43 - }, - { - "epoch": 0.23783783783783785, - "grad_norm": 2.7092034816741943, - "learning_rate": 4.993337914051266e-06, - "loss": 0.6899, - "step": 44 - }, - { - "epoch": 0.24324324324324326, - "grad_norm": 3.153764486312866, - "learning_rate": 4.99302459236389e-06, - "loss": 0.9075, - "step": 45 - }, - { - "epoch": 0.24864864864864866, - "grad_norm": 3.3629748821258545, - "learning_rate": 4.992704081440407e-06, - "loss": 0.785, - "step": 46 - }, - { - "epoch": 0.25405405405405407, - "grad_norm": 4.478365898132324, - "learning_rate": 4.992376382205088e-06, - "loss": 1.008, - "step": 47 - }, - { - "epoch": 0.2594594594594595, - "grad_norm": 3.4001641273498535, - "learning_rate": 4.992041495602932e-06, - "loss": 0.7751, - "step": 48 - }, - { - "epoch": 0.2648648648648649, - "grad_norm": 2.522662878036499, - "learning_rate": 4.991699422599664e-06, - "loss": 0.9022, - "step": 49 - }, - { - "epoch": 0.2702702702702703, - "grad_norm": 2.764458179473877, - "learning_rate": 4.991350164181735e-06, - "loss": 0.8801, - "step": 50 - }, - { - "epoch": 0.2756756756756757, - "grad_norm": 2.814859628677368, - "learning_rate": 4.990993721356317e-06, - "loss": 0.7045, - "step": 51 - }, - { - "epoch": 0.2810810810810811, - "grad_norm": 2.441311836242676, - "learning_rate": 4.990630095151296e-06, - "loss": 0.7312, - "step": 52 - }, - { - "epoch": 0.2864864864864865, - "grad_norm": 2.4443013668060303, - "learning_rate": 4.9902592866152765e-06, - "loss": 0.9609, - "step": 53 - }, - { - "epoch": 0.2918918918918919, - "grad_norm": 2.2934701442718506, - "learning_rate": 4.989881296817575e-06, - "loss": 0.5753, - "step": 54 - }, - { - "epoch": 0.2972972972972973, - "grad_norm": 2.6286847591400146, - "learning_rate": 4.989496126848215e-06, - "loss": 0.5118, - "step": 55 - }, - { - "epoch": 0.3027027027027027, - "grad_norm": 3.6817069053649902, - "learning_rate": 4.989103777817928e-06, - "loss": 1.1261, - "step": 56 - }, - { - "epoch": 0.3081081081081081, - "grad_norm": 3.011197566986084, - "learning_rate": 4.988704250858145e-06, - "loss": 0.7823, - "step": 57 - }, - { - "epoch": 0.31351351351351353, - "grad_norm": 2.5490806102752686, - "learning_rate": 4.988297547121e-06, - "loss": 0.6019, - "step": 58 - }, - { - "epoch": 0.31891891891891894, - "grad_norm": 3.0803146362304688, - "learning_rate": 4.98788366777932e-06, - "loss": 0.825, - "step": 59 - }, - { - "epoch": 0.32432432432432434, - "grad_norm": 3.015730619430542, - "learning_rate": 4.987462614026625e-06, - "loss": 0.7667, - "step": 60 - }, - { - "epoch": 0.32972972972972975, - "grad_norm": 2.5371594429016113, - "learning_rate": 4.987034387077126e-06, - "loss": 0.8051, - "step": 61 - }, - { - "epoch": 0.33513513513513515, - "grad_norm": 2.6414010524749756, - "learning_rate": 4.986598988165718e-06, - "loss": 0.6895, - "step": 62 - }, - { - "epoch": 0.34054054054054056, - "grad_norm": 3.065131187438965, - "learning_rate": 4.9861564185479785e-06, - "loss": 0.9268, - "step": 63 - }, - { - "epoch": 0.34594594594594597, - "grad_norm": 2.5708694458007812, - "learning_rate": 4.985706679500163e-06, - "loss": 0.9854, - "step": 64 - }, - { - "epoch": 0.35135135135135137, - "grad_norm": 2.768915891647339, - "learning_rate": 4.9852497723192025e-06, - "loss": 0.8083, - "step": 65 - }, - { - "epoch": 0.3567567567567568, - "grad_norm": 2.567901849746704, - "learning_rate": 4.9847856983227e-06, - "loss": 0.9098, - "step": 66 - }, - { - "epoch": 0.3621621621621622, - "grad_norm": 2.5766549110412598, - "learning_rate": 4.984314458848923e-06, - "loss": 0.8881, - "step": 67 - }, - { - "epoch": 0.3675675675675676, - "grad_norm": 2.9778389930725098, - "learning_rate": 4.983836055256804e-06, - "loss": 0.9877, - "step": 68 - }, - { - "epoch": 0.372972972972973, - "grad_norm": 2.7225165367126465, - "learning_rate": 4.983350488925935e-06, - "loss": 0.8282, - "step": 69 - }, - { - "epoch": 0.3783783783783784, - "grad_norm": 2.702287197113037, - "learning_rate": 4.982857761256564e-06, - "loss": 1.1756, - "step": 70 - }, - { - "epoch": 0.3837837837837838, - "grad_norm": 2.9815568923950195, - "learning_rate": 4.982357873669589e-06, - "loss": 0.8114, - "step": 71 - }, - { - "epoch": 0.3891891891891892, - "grad_norm": 3.27150297164917, - "learning_rate": 4.981850827606556e-06, - "loss": 0.6763, - "step": 72 - }, - { - "epoch": 0.3945945945945946, - "grad_norm": 2.568423271179199, - "learning_rate": 4.981336624529655e-06, - "loss": 0.9372, - "step": 73 - }, - { - "epoch": 0.4, - "grad_norm": 2.621175527572632, - "learning_rate": 4.980815265921714e-06, - "loss": 1.0155, - "step": 74 - }, - { - "epoch": 0.40540540540540543, - "grad_norm": 2.62827205657959, - "learning_rate": 4.980286753286196e-06, - "loss": 0.949, - "step": 75 - }, - { - "epoch": 0.41081081081081083, - "grad_norm": 2.9462146759033203, - "learning_rate": 4.979751088147192e-06, - "loss": 1.0134, - "step": 76 - }, - { - "epoch": 0.41621621621621624, - "grad_norm": 2.814852714538574, - "learning_rate": 4.979208272049425e-06, - "loss": 0.9722, - "step": 77 - }, - { - "epoch": 0.42162162162162165, - "grad_norm": 4.177679538726807, - "learning_rate": 4.978658306558235e-06, - "loss": 1.2259, - "step": 78 - }, - { - "epoch": 0.42702702702702705, - "grad_norm": 2.813084125518799, - "learning_rate": 4.978101193259578e-06, - "loss": 0.834, - "step": 79 - }, - { - "epoch": 0.43243243243243246, - "grad_norm": 2.71824049949646, - "learning_rate": 4.977536933760025e-06, - "loss": 0.6151, - "step": 80 - }, - { - "epoch": 0.43783783783783786, - "grad_norm": 4.992153167724609, - "learning_rate": 4.976965529686755e-06, - "loss": 1.0475, - "step": 81 - }, - { - "epoch": 0.44324324324324327, - "grad_norm": 2.4810822010040283, - "learning_rate": 4.976386982687548e-06, - "loss": 0.8324, - "step": 82 - }, - { - "epoch": 0.4486486486486487, - "grad_norm": 4.509149074554443, - "learning_rate": 4.9758012944307845e-06, - "loss": 0.997, - "step": 83 - }, - { - "epoch": 0.4540540540540541, - "grad_norm": 3.114325761795044, - "learning_rate": 4.975208466605436e-06, - "loss": 1.2024, - "step": 84 - }, - { - "epoch": 0.4594594594594595, - "grad_norm": 3.297091007232666, - "learning_rate": 4.974608500921064e-06, - "loss": 0.9146, - "step": 85 - }, - { - "epoch": 0.4648648648648649, - "grad_norm": 2.824475049972534, - "learning_rate": 4.974001399107816e-06, - "loss": 0.7181, - "step": 86 - }, - { - "epoch": 0.4702702702702703, - "grad_norm": 20.262290954589844, - "learning_rate": 4.973387162916415e-06, - "loss": 0.8599, - "step": 87 - }, - { - "epoch": 0.4756756756756757, - "grad_norm": 4.015744686126709, - "learning_rate": 4.972765794118158e-06, - "loss": 0.6081, - "step": 88 - }, - { - "epoch": 0.4810810810810811, - "grad_norm": 2.8033058643341064, - "learning_rate": 4.9721372945049114e-06, - "loss": 0.8764, - "step": 89 - }, - { - "epoch": 0.4864864864864865, - "grad_norm": 5.271846294403076, - "learning_rate": 4.971501665889107e-06, - "loss": 0.8622, - "step": 90 - }, - { - "epoch": 0.4918918918918919, - "grad_norm": 2.557264804840088, - "learning_rate": 4.9708589101037306e-06, - "loss": 0.5523, - "step": 91 - }, - { - "epoch": 0.4972972972972973, - "grad_norm": 4.342173099517822, - "learning_rate": 4.970209029002325e-06, - "loss": 0.8922, - "step": 92 - }, - { - "epoch": 0.5027027027027027, - "grad_norm": 2.950364351272583, - "learning_rate": 4.969552024458977e-06, - "loss": 0.9455, - "step": 93 - }, - { - "epoch": 0.5081081081081081, - "grad_norm": 2.6453042030334473, - "learning_rate": 4.968887898368318e-06, - "loss": 0.8342, - "step": 94 - }, - { - "epoch": 0.5135135135135135, - "grad_norm": 3.486766815185547, - "learning_rate": 4.968216652645515e-06, - "loss": 0.8476, - "step": 95 - }, - { - "epoch": 0.518918918918919, - "grad_norm": 2.884152889251709, - "learning_rate": 4.967538289226268e-06, - "loss": 0.8879, - "step": 96 - }, - { - "epoch": 0.5243243243243243, - "grad_norm": 2.4130594730377197, - "learning_rate": 4.966852810066798e-06, - "loss": 0.7114, - "step": 97 - }, - { - "epoch": 0.5297297297297298, - "grad_norm": 3.182410955429077, - "learning_rate": 4.9661602171438524e-06, - "loss": 0.6757, - "step": 98 - }, - { - "epoch": 0.5351351351351351, - "grad_norm": 2.5027542114257812, - "learning_rate": 4.965460512454687e-06, - "loss": 0.8029, - "step": 99 - }, - { - "epoch": 0.5405405405405406, - "grad_norm": 2.3096024990081787, - "learning_rate": 4.964753698017071e-06, - "loss": 0.842, - "step": 100 - }, - { - "epoch": 0.5459459459459459, - "grad_norm": 2.875657081604004, - "learning_rate": 4.964039775869271e-06, - "loss": 0.6339, - "step": 101 - }, - { - "epoch": 0.5513513513513514, - "grad_norm": 2.505406141281128, - "learning_rate": 4.963318748070056e-06, - "loss": 0.7743, - "step": 102 - }, - { - "epoch": 0.5567567567567567, - "grad_norm": 3.552562713623047, - "learning_rate": 4.9625906166986815e-06, - "loss": 0.926, - "step": 103 - }, - { - "epoch": 0.5621621621621622, - "grad_norm": 2.717942476272583, - "learning_rate": 4.961855383854889e-06, - "loss": 0.7037, - "step": 104 - }, - { - "epoch": 0.5675675675675675, - "grad_norm": 2.5049386024475098, - "learning_rate": 4.961113051658901e-06, - "loss": 0.561, - "step": 105 - }, - { - "epoch": 0.572972972972973, - "grad_norm": 2.3112900257110596, - "learning_rate": 4.96036362225141e-06, - "loss": 0.7316, - "step": 106 - }, - { - "epoch": 0.5783783783783784, - "grad_norm": 2.470257520675659, - "learning_rate": 4.959607097793575e-06, - "loss": 0.6426, - "step": 107 - }, - { - "epoch": 0.5837837837837838, - "grad_norm": 3.8040788173675537, - "learning_rate": 4.9588434804670176e-06, - "loss": 1.0044, - "step": 108 - }, - { - "epoch": 0.5891891891891892, - "grad_norm": 3.143547296524048, - "learning_rate": 4.958072772473812e-06, - "loss": 0.9219, - "step": 109 - }, - { - "epoch": 0.5945945945945946, - "grad_norm": 3.5052590370178223, - "learning_rate": 4.9572949760364795e-06, - "loss": 0.6056, - "step": 110 - }, - { - "epoch": 0.6, - "grad_norm": 3.064009428024292, - "learning_rate": 4.9565100933979835e-06, - "loss": 0.6346, - "step": 111 - }, - { - "epoch": 0.6054054054054054, - "grad_norm": 2.694610595703125, - "learning_rate": 4.9557181268217225e-06, - "loss": 0.9856, - "step": 112 - }, - { - "epoch": 0.6108108108108108, - "grad_norm": 2.5885775089263916, - "learning_rate": 4.954919078591521e-06, - "loss": 0.8669, - "step": 113 - }, - { - "epoch": 0.6162162162162163, - "grad_norm": 2.593609571456909, - "learning_rate": 4.954112951011628e-06, - "loss": 0.7201, - "step": 114 - }, - { - "epoch": 0.6216216216216216, - "grad_norm": 3.3045759201049805, - "learning_rate": 4.9532997464067065e-06, - "loss": 0.9095, - "step": 115 - }, - { - "epoch": 0.6270270270270271, - "grad_norm": 2.8144869804382324, - "learning_rate": 4.952479467121828e-06, - "loss": 1.0213, - "step": 116 - }, - { - "epoch": 0.6324324324324324, - "grad_norm": 2.5460312366485596, - "learning_rate": 4.951652115522463e-06, - "loss": 1.1154, - "step": 117 - }, - { - "epoch": 0.6378378378378379, - "grad_norm": 2.795137405395508, - "learning_rate": 4.950817693994481e-06, - "loss": 0.691, - "step": 118 - }, - { - "epoch": 0.6432432432432432, - "grad_norm": 2.4979195594787598, - "learning_rate": 4.949976204944135e-06, - "loss": 0.7224, - "step": 119 - }, - { - "epoch": 0.6486486486486487, - "grad_norm": 3.3131983280181885, - "learning_rate": 4.949127650798063e-06, - "loss": 0.9256, - "step": 120 - }, - { - "epoch": 0.654054054054054, - "grad_norm": 2.9060285091400146, - "learning_rate": 4.948272034003275e-06, - "loss": 0.6892, - "step": 121 - }, - { - "epoch": 0.6594594594594595, - "grad_norm": 3.695594549179077, - "learning_rate": 4.947409357027148e-06, - "loss": 0.5878, - "step": 122 - }, - { - "epoch": 0.6648648648648648, - "grad_norm": 3.1250460147857666, - "learning_rate": 4.9465396223574165e-06, - "loss": 0.9904, - "step": 123 - }, - { - "epoch": 0.6702702702702703, - "grad_norm": 4.024891376495361, - "learning_rate": 4.945662832502172e-06, - "loss": 1.1592, - "step": 124 - }, - { - "epoch": 0.6756756756756757, - "grad_norm": 2.6886494159698486, - "learning_rate": 4.944778989989847e-06, - "loss": 1.0041, - "step": 125 - }, - { - "epoch": 0.6810810810810811, - "grad_norm": 2.366912841796875, - "learning_rate": 4.943888097369216e-06, - "loss": 0.7045, - "step": 126 - }, - { - "epoch": 0.6864864864864865, - "grad_norm": 2.394932270050049, - "learning_rate": 4.942990157209381e-06, - "loss": 0.6685, - "step": 127 - }, - { - "epoch": 0.6918918918918919, - "grad_norm": 2.61933970451355, - "learning_rate": 4.9420851720997674e-06, - "loss": 0.8812, - "step": 128 - }, - { - "epoch": 0.6972972972972973, - "grad_norm": 2.7395646572113037, - "learning_rate": 4.94117314465012e-06, - "loss": 1.3014, - "step": 129 - }, - { - "epoch": 0.7027027027027027, - "grad_norm": 3.065484046936035, - "learning_rate": 4.940254077490487e-06, - "loss": 0.6978, - "step": 130 - }, - { - "epoch": 0.7081081081081081, - "grad_norm": 2.895038366317749, - "learning_rate": 4.939327973271222e-06, - "loss": 0.6249, - "step": 131 - }, - { - "epoch": 0.7135135135135136, - "grad_norm": 3.1773312091827393, - "learning_rate": 4.9383948346629665e-06, - "loss": 0.6423, - "step": 132 - }, - { - "epoch": 0.7189189189189189, - "grad_norm": 2.2378008365631104, - "learning_rate": 4.937454664356652e-06, - "loss": 0.7193, - "step": 133 - }, - { - "epoch": 0.7243243243243244, - "grad_norm": 2.5673701763153076, - "learning_rate": 4.9365074650634855e-06, - "loss": 0.7065, - "step": 134 - }, - { - "epoch": 0.7297297297297297, - "grad_norm": 2.7348387241363525, - "learning_rate": 4.9355532395149445e-06, - "loss": 1.0046, - "step": 135 - }, - { - "epoch": 0.7351351351351352, - "grad_norm": 2.391741991043091, - "learning_rate": 4.9345919904627655e-06, - "loss": 0.6771, - "step": 136 - }, - { - "epoch": 0.7405405405405405, - "grad_norm": 2.2096705436706543, - "learning_rate": 4.933623720678944e-06, - "loss": 0.6589, - "step": 137 - }, - { - "epoch": 0.745945945945946, - "grad_norm": 3.0840072631835938, - "learning_rate": 4.932648432955718e-06, - "loss": 0.8755, - "step": 138 - }, - { - "epoch": 0.7513513513513513, - "grad_norm": 2.4970428943634033, - "learning_rate": 4.931666130105564e-06, - "loss": 0.6685, - "step": 139 - }, - { - "epoch": 0.7567567567567568, - "grad_norm": 4.315455436706543, - "learning_rate": 4.930676814961189e-06, - "loss": 0.8101, - "step": 140 - }, - { - "epoch": 0.7621621621621621, - "grad_norm": 5.388065814971924, - "learning_rate": 4.92968049037552e-06, - "loss": 0.8193, - "step": 141 - }, - { - "epoch": 0.7675675675675676, - "grad_norm": 2.6107139587402344, - "learning_rate": 4.9286771592217005e-06, - "loss": 0.7852, - "step": 142 - }, - { - "epoch": 0.772972972972973, - "grad_norm": 3.936556577682495, - "learning_rate": 4.927666824393076e-06, - "loss": 1.0388, - "step": 143 - }, - { - "epoch": 0.7783783783783784, - "grad_norm": 2.74424409866333, - "learning_rate": 4.926649488803191e-06, - "loss": 0.8266, - "step": 144 - }, - { - "epoch": 0.7837837837837838, - "grad_norm": 2.8998451232910156, - "learning_rate": 4.925625155385776e-06, - "loss": 0.4895, - "step": 145 - }, - { - "epoch": 0.7891891891891892, - "grad_norm": 3.0631520748138428, - "learning_rate": 4.924593827094743e-06, - "loss": 0.8759, - "step": 146 - }, - { - "epoch": 0.7945945945945946, - "grad_norm": 3.233267307281494, - "learning_rate": 4.923555506904176e-06, - "loss": 0.701, - "step": 147 - }, - { - "epoch": 0.8, - "grad_norm": 2.87701416015625, - "learning_rate": 4.922510197808321e-06, - "loss": 1.1327, - "step": 148 - }, - { - "epoch": 0.8054054054054054, - "grad_norm": 3.650576114654541, - "learning_rate": 4.921457902821578e-06, - "loss": 0.7587, - "step": 149 - }, - { - "epoch": 0.8108108108108109, - "grad_norm": 3.232112407684326, - "learning_rate": 4.920398624978493e-06, - "loss": 1.2158, - "step": 150 - }, - { - "epoch": 0.8162162162162162, - "grad_norm": 2.468384027481079, - "learning_rate": 4.919332367333748e-06, - "loss": 0.6852, - "step": 151 - }, - { - "epoch": 0.8216216216216217, - "grad_norm": 2.5947415828704834, - "learning_rate": 4.918259132962154e-06, - "loss": 0.6611, - "step": 152 - }, - { - "epoch": 0.827027027027027, - "grad_norm": 3.0171427726745605, - "learning_rate": 4.917178924958638e-06, - "loss": 0.7327, - "step": 153 - }, - { - "epoch": 0.8324324324324325, - "grad_norm": 3.293184518814087, - "learning_rate": 4.916091746438243e-06, - "loss": 0.8528, - "step": 154 - }, - { - "epoch": 0.8378378378378378, - "grad_norm": 4.0570969581604, - "learning_rate": 4.9149976005361085e-06, - "loss": 0.9141, - "step": 155 - }, - { - "epoch": 0.8432432432432433, - "grad_norm": 2.8782784938812256, - "learning_rate": 4.913896490407467e-06, - "loss": 1.1132, - "step": 156 - }, - { - "epoch": 0.8486486486486486, - "grad_norm": 2.5671517848968506, - "learning_rate": 4.912788419227635e-06, - "loss": 0.7587, - "step": 157 - }, - { - "epoch": 0.8540540540540541, - "grad_norm": 2.9445390701293945, - "learning_rate": 4.911673390192002e-06, - "loss": 0.9227, - "step": 158 - }, - { - "epoch": 0.8594594594594595, - "grad_norm": 2.472595453262329, - "learning_rate": 4.910551406516023e-06, - "loss": 0.8154, - "step": 159 - }, - { - "epoch": 0.8648648648648649, - "grad_norm": 2.5233397483825684, - "learning_rate": 4.909422471435207e-06, - "loss": 0.9897, - "step": 160 - }, - { - "epoch": 0.8702702702702703, - "grad_norm": 3.3919546604156494, - "learning_rate": 4.90828658820511e-06, - "loss": 0.6162, - "step": 161 - }, - { - "epoch": 0.8756756756756757, - "grad_norm": 3.060908555984497, - "learning_rate": 4.907143760101325e-06, - "loss": 0.5734, - "step": 162 - }, - { - "epoch": 0.8810810810810811, - "grad_norm": 3.4584782123565674, - "learning_rate": 4.905993990419472e-06, - "loss": 0.8328, - "step": 163 - }, - { - "epoch": 0.8864864864864865, - "grad_norm": 2.936570644378662, - "learning_rate": 4.904837282475187e-06, - "loss": 0.6787, - "step": 164 - }, - { - "epoch": 0.8918918918918919, - "grad_norm": 2.564837694168091, - "learning_rate": 4.9036736396041165e-06, - "loss": 0.9658, - "step": 165 - }, - { - "epoch": 0.8972972972972973, - "grad_norm": 3.2509360313415527, - "learning_rate": 4.902503065161905e-06, - "loss": 0.7899, - "step": 166 - }, - { - "epoch": 0.9027027027027027, - "grad_norm": 2.9730329513549805, - "learning_rate": 4.901325562524185e-06, - "loss": 0.9476, - "step": 167 - }, - { - "epoch": 0.9081081081081082, - "grad_norm": 3.044980049133301, - "learning_rate": 4.900141135086569e-06, - "loss": 0.7589, - "step": 168 - }, - { - "epoch": 0.9135135135135135, - "grad_norm": 3.030585527420044, - "learning_rate": 4.898949786264638e-06, - "loss": 0.6724, - "step": 169 - }, - { - "epoch": 0.918918918918919, - "grad_norm": 2.249122142791748, - "learning_rate": 4.897751519493933e-06, - "loss": 0.6968, - "step": 170 - }, - { - "epoch": 0.9243243243243243, - "grad_norm": 2.9816982746124268, - "learning_rate": 4.896546338229945e-06, - "loss": 0.7984, - "step": 171 - }, - { - "epoch": 0.9297297297297298, - "grad_norm": 2.415736675262451, - "learning_rate": 4.8953342459481034e-06, - "loss": 0.6109, - "step": 172 - }, - { - "epoch": 0.9351351351351351, - "grad_norm": 2.740518808364868, - "learning_rate": 4.894115246143768e-06, - "loss": 0.8126, - "step": 173 - }, - { - "epoch": 0.9405405405405406, - "grad_norm": 2.7610201835632324, - "learning_rate": 4.892889342332218e-06, - "loss": 0.6862, - "step": 174 - }, - { - "epoch": 0.9459459459459459, - "grad_norm": 3.057025194168091, - "learning_rate": 4.891656538048642e-06, - "loss": 0.9895, - "step": 175 - }, - { - "epoch": 0.9513513513513514, - "grad_norm": 2.569751262664795, - "learning_rate": 4.890416836848128e-06, - "loss": 0.8481, - "step": 176 - }, - { - "epoch": 0.9567567567567568, - "grad_norm": 2.4443397521972656, - "learning_rate": 4.889170242305652e-06, - "loss": 0.6478, - "step": 177 - }, - { - "epoch": 0.9621621621621622, - "grad_norm": 2.5009846687316895, - "learning_rate": 4.887916758016069e-06, - "loss": 0.9714, - "step": 178 - }, - { - "epoch": 0.9675675675675676, - "grad_norm": 3.101975202560425, - "learning_rate": 4.886656387594104e-06, - "loss": 1.1264, - "step": 179 - }, - { - "epoch": 0.972972972972973, - "grad_norm": 2.6144704818725586, - "learning_rate": 4.885389134674338e-06, - "loss": 0.7664, - "step": 180 - }, - { - "epoch": 0.9783783783783784, - "grad_norm": 2.5834381580352783, - "learning_rate": 4.884115002911197e-06, - "loss": 0.6131, - "step": 181 - }, - { - "epoch": 0.9837837837837838, - "grad_norm": 2.5378055572509766, - "learning_rate": 4.88283399597895e-06, - "loss": 0.8733, - "step": 182 - }, - { - "epoch": 0.9891891891891892, - "grad_norm": 2.4095377922058105, - "learning_rate": 4.881546117571686e-06, - "loss": 0.643, - "step": 183 - }, - { - "epoch": 0.9945945945945946, - "grad_norm": 2.9554507732391357, - "learning_rate": 4.8802513714033135e-06, - "loss": 0.7287, - "step": 184 - }, - { - "epoch": 1.0, - "grad_norm": 2.8279213905334473, - "learning_rate": 4.878949761207545e-06, - "loss": 0.9927, - "step": 185 - }, - { - "epoch": 1.0054054054054054, - "grad_norm": 2.9361412525177, - "learning_rate": 4.8776412907378845e-06, - "loss": 0.66, - "step": 186 - }, - { - "epoch": 1.0108108108108107, - "grad_norm": 3.392244338989258, - "learning_rate": 4.876325963767623e-06, - "loss": 0.594, - "step": 187 - }, - { - "epoch": 1.0162162162162163, - "grad_norm": 2.6276044845581055, - "learning_rate": 4.875003784089822e-06, - "loss": 0.5825, - "step": 188 - }, - { - "epoch": 1.0216216216216216, - "grad_norm": 2.2875545024871826, - "learning_rate": 4.873674755517305e-06, - "loss": 0.6594, - "step": 189 - }, - { - "epoch": 1.027027027027027, - "grad_norm": 2.8086795806884766, - "learning_rate": 4.872338881882645e-06, - "loss": 0.7536, - "step": 190 - }, - { - "epoch": 1.0324324324324325, - "grad_norm": 2.3685200214385986, - "learning_rate": 4.870996167038154e-06, - "loss": 0.4849, - "step": 191 - }, - { - "epoch": 1.037837837837838, - "grad_norm": 3.0264766216278076, - "learning_rate": 4.869646614855877e-06, - "loss": 0.3771, - "step": 192 - }, - { - "epoch": 1.0432432432432432, - "grad_norm": 4.335122108459473, - "learning_rate": 4.868290229227567e-06, - "loss": 0.8545, - "step": 193 - }, - { - "epoch": 1.0486486486486486, - "grad_norm": 3.442172050476074, - "learning_rate": 4.866927014064692e-06, - "loss": 0.3698, - "step": 194 - }, - { - "epoch": 1.054054054054054, - "grad_norm": 3.326539993286133, - "learning_rate": 4.86555697329841e-06, - "loss": 0.8468, - "step": 195 - }, - { - "epoch": 1.0594594594594595, - "grad_norm": 3.0372447967529297, - "learning_rate": 4.864180110879562e-06, - "loss": 0.8232, - "step": 196 - }, - { - "epoch": 1.0648648648648649, - "grad_norm": 2.955343008041382, - "learning_rate": 4.862796430778663e-06, - "loss": 0.4097, - "step": 197 - }, - { - "epoch": 1.0702702702702702, - "grad_norm": 2.4095399379730225, - "learning_rate": 4.861405936985889e-06, - "loss": 0.6746, - "step": 198 - }, - { - "epoch": 1.0756756756756758, - "grad_norm": 2.763500452041626, - "learning_rate": 4.860008633511059e-06, - "loss": 0.6605, - "step": 199 - }, - { - "epoch": 1.0810810810810811, - "grad_norm": 2.6751155853271484, - "learning_rate": 4.8586045243836384e-06, - "loss": 0.471, - "step": 200 - }, - { - "epoch": 1.0864864864864865, - "grad_norm": 3.3507862091064453, - "learning_rate": 4.857193613652711e-06, - "loss": 0.7665, - "step": 201 - }, - { - "epoch": 1.0918918918918918, - "grad_norm": 3.3064827919006348, - "learning_rate": 4.8557759053869775e-06, - "loss": 0.6436, - "step": 202 - }, - { - "epoch": 1.0972972972972972, - "grad_norm": 2.571828603744507, - "learning_rate": 4.854351403674741e-06, - "loss": 0.4642, - "step": 203 - }, - { - "epoch": 1.1027027027027028, - "grad_norm": 2.883220911026001, - "learning_rate": 4.852920112623895e-06, - "loss": 0.5737, - "step": 204 - }, - { - "epoch": 1.1081081081081081, - "grad_norm": 3.026144027709961, - "learning_rate": 4.851482036361912e-06, - "loss": 0.7302, - "step": 205 - }, - { - "epoch": 1.1135135135135135, - "grad_norm": 2.6689612865448, - "learning_rate": 4.850037179035829e-06, - "loss": 0.5229, - "step": 206 - }, - { - "epoch": 1.118918918918919, - "grad_norm": 2.4019956588745117, - "learning_rate": 4.8485855448122425e-06, - "loss": 0.5529, - "step": 207 - }, - { - "epoch": 1.1243243243243244, - "grad_norm": 2.3546230792999268, - "learning_rate": 4.847127137877286e-06, - "loss": 0.3635, - "step": 208 - }, - { - "epoch": 1.1297297297297297, - "grad_norm": 2.999096393585205, - "learning_rate": 4.8456619624366285e-06, - "loss": 0.8149, - "step": 209 - }, - { - "epoch": 1.135135135135135, - "grad_norm": 10.072900772094727, - "learning_rate": 4.844190022715456e-06, - "loss": 0.8333, - "step": 210 - }, - { - "epoch": 1.1405405405405404, - "grad_norm": 2.222123384475708, - "learning_rate": 4.84271132295846e-06, - "loss": 0.3717, - "step": 211 - }, - { - "epoch": 1.145945945945946, - "grad_norm": 2.8751113414764404, - "learning_rate": 4.841225867429826e-06, - "loss": 0.5994, - "step": 212 - }, - { - "epoch": 1.1513513513513514, - "grad_norm": 2.9580111503601074, - "learning_rate": 4.839733660413224e-06, - "loss": 0.8382, - "step": 213 - }, - { - "epoch": 1.1567567567567567, - "grad_norm": 4.628892421722412, - "learning_rate": 4.838234706211792e-06, - "loss": 0.818, - "step": 214 - }, - { - "epoch": 1.1621621621621623, - "grad_norm": 2.5103509426116943, - "learning_rate": 4.836729009148124e-06, - "loss": 0.4267, - "step": 215 - }, - { - "epoch": 1.1675675675675676, - "grad_norm": 2.6093738079071045, - "learning_rate": 4.835216573564261e-06, - "loss": 0.3472, - "step": 216 - }, - { - "epoch": 1.172972972972973, - "grad_norm": 3.0792338848114014, - "learning_rate": 4.833697403821672e-06, - "loss": 0.6323, - "step": 217 - }, - { - "epoch": 1.1783783783783783, - "grad_norm": 2.845163345336914, - "learning_rate": 4.8321715043012516e-06, - "loss": 0.6831, - "step": 218 - }, - { - "epoch": 1.1837837837837837, - "grad_norm": 3.0433948040008545, - "learning_rate": 4.830638879403296e-06, - "loss": 0.3682, - "step": 219 - }, - { - "epoch": 1.1891891891891893, - "grad_norm": 2.6533594131469727, - "learning_rate": 4.8290995335475e-06, - "loss": 0.4154, - "step": 220 - }, - { - "epoch": 1.1945945945945946, - "grad_norm": 2.9271352291107178, - "learning_rate": 4.827553471172935e-06, - "loss": 0.3991, - "step": 221 - }, - { - "epoch": 1.2, - "grad_norm": 2.9243528842926025, - "learning_rate": 4.826000696738045e-06, - "loss": 0.4538, - "step": 222 - }, - { - "epoch": 1.2054054054054055, - "grad_norm": 2.537332534790039, - "learning_rate": 4.824441214720629e-06, - "loss": 0.7692, - "step": 223 - }, - { - "epoch": 1.2108108108108109, - "grad_norm": 3.9193246364593506, - "learning_rate": 4.8228750296178275e-06, - "loss": 0.6038, - "step": 224 - }, - { - "epoch": 1.2162162162162162, - "grad_norm": 2.6646728515625, - "learning_rate": 4.821302145946113e-06, - "loss": 0.4147, - "step": 225 - }, - { - "epoch": 1.2216216216216216, - "grad_norm": 2.6519482135772705, - "learning_rate": 4.819722568241274e-06, - "loss": 0.5398, - "step": 226 - }, - { - "epoch": 1.227027027027027, - "grad_norm": 2.2018048763275146, - "learning_rate": 4.818136301058401e-06, - "loss": 0.3864, - "step": 227 - }, - { - "epoch": 1.2324324324324325, - "grad_norm": 2.5660712718963623, - "learning_rate": 4.816543348971879e-06, - "loss": 0.5712, - "step": 228 - }, - { - "epoch": 1.2378378378378379, - "grad_norm": 3.237663745880127, - "learning_rate": 4.814943716575368e-06, - "loss": 0.662, - "step": 229 - }, - { - "epoch": 1.2432432432432432, - "grad_norm": 2.5570430755615234, - "learning_rate": 4.813337408481793e-06, - "loss": 0.8661, - "step": 230 - }, - { - "epoch": 1.2486486486486488, - "grad_norm": 2.9231269359588623, - "learning_rate": 4.811724429323329e-06, - "loss": 0.9218, - "step": 231 - }, - { - "epoch": 1.2540540540540541, - "grad_norm": 3.637084722518921, - "learning_rate": 4.810104783751389e-06, - "loss": 0.5597, - "step": 232 - }, - { - "epoch": 1.2594594594594595, - "grad_norm": 3.0218842029571533, - "learning_rate": 4.8084784764366125e-06, - "loss": 0.4786, - "step": 233 - }, - { - "epoch": 1.2648648648648648, - "grad_norm": 2.770214080810547, - "learning_rate": 4.806845512068846e-06, - "loss": 0.5219, - "step": 234 - }, - { - "epoch": 1.2702702702702702, - "grad_norm": 3.093053102493286, - "learning_rate": 4.805205895357137e-06, - "loss": 0.643, - "step": 235 - }, - { - "epoch": 1.2756756756756757, - "grad_norm": 2.6373348236083984, - "learning_rate": 4.803559631029713e-06, - "loss": 0.5858, - "step": 236 - }, - { - "epoch": 1.281081081081081, - "grad_norm": 2.452030897140503, - "learning_rate": 4.801906723833973e-06, - "loss": 0.4185, - "step": 237 - }, - { - "epoch": 1.2864864864864864, - "grad_norm": 2.72564697265625, - "learning_rate": 4.8002471785364734e-06, - "loss": 0.4917, - "step": 238 - }, - { - "epoch": 1.291891891891892, - "grad_norm": 3.0389158725738525, - "learning_rate": 4.798580999922913e-06, - "loss": 0.645, - "step": 239 - }, - { - "epoch": 1.2972972972972974, - "grad_norm": 3.7002289295196533, - "learning_rate": 4.796908192798117e-06, - "loss": 0.5378, - "step": 240 - }, - { - "epoch": 1.3027027027027027, - "grad_norm": 2.1876111030578613, - "learning_rate": 4.7952287619860276e-06, - "loss": 0.5197, - "step": 241 - }, - { - "epoch": 1.308108108108108, - "grad_norm": 3.903337240219116, - "learning_rate": 4.793542712329689e-06, - "loss": 1.0226, - "step": 242 - }, - { - "epoch": 1.3135135135135134, - "grad_norm": 2.3623552322387695, - "learning_rate": 4.791850048691228e-06, - "loss": 0.5502, - "step": 243 - }, - { - "epoch": 1.318918918918919, - "grad_norm": 3.0669031143188477, - "learning_rate": 4.79015077595185e-06, - "loss": 0.6976, - "step": 244 - }, - { - "epoch": 1.3243243243243243, - "grad_norm": 3.1480472087860107, - "learning_rate": 4.788444899011816e-06, - "loss": 0.4795, - "step": 245 - }, - { - "epoch": 1.3297297297297297, - "grad_norm": 3.7051920890808105, - "learning_rate": 4.786732422790432e-06, - "loss": 0.6526, - "step": 246 - }, - { - "epoch": 1.3351351351351353, - "grad_norm": 3.4358389377593994, - "learning_rate": 4.785013352226036e-06, - "loss": 0.5551, - "step": 247 - }, - { - "epoch": 1.3405405405405406, - "grad_norm": 2.3789355754852295, - "learning_rate": 4.7832876922759805e-06, - "loss": 0.3151, - "step": 248 - }, - { - "epoch": 1.345945945945946, - "grad_norm": 2.4843716621398926, - "learning_rate": 4.781555447916622e-06, - "loss": 0.6713, - "step": 249 - }, - { - "epoch": 1.3513513513513513, - "grad_norm": 3.0176303386688232, - "learning_rate": 4.779816624143302e-06, - "loss": 0.437, - "step": 250 - }, - { - "epoch": 1.3567567567567567, - "grad_norm": 2.868350028991699, - "learning_rate": 4.77807122597034e-06, - "loss": 0.7632, - "step": 251 - }, - { - "epoch": 1.3621621621621622, - "grad_norm": 2.4629738330841064, - "learning_rate": 4.776319258431009e-06, - "loss": 0.4894, - "step": 252 - }, - { - "epoch": 1.3675675675675676, - "grad_norm": 2.798297882080078, - "learning_rate": 4.77456072657753e-06, - "loss": 0.4456, - "step": 253 - }, - { - "epoch": 1.372972972972973, - "grad_norm": 3.2977547645568848, - "learning_rate": 4.772795635481053e-06, - "loss": 0.5381, - "step": 254 - }, - { - "epoch": 1.3783783783783785, - "grad_norm": 4.1061906814575195, - "learning_rate": 4.77102399023164e-06, - "loss": 1.0302, - "step": 255 - }, - { - "epoch": 1.3837837837837839, - "grad_norm": 3.943284511566162, - "learning_rate": 4.769245795938261e-06, - "loss": 0.4875, - "step": 256 - }, - { - "epoch": 1.3891891891891892, - "grad_norm": 2.6420533657073975, - "learning_rate": 4.767461057728763e-06, - "loss": 0.4923, - "step": 257 - }, - { - "epoch": 1.3945945945945946, - "grad_norm": 3.3152263164520264, - "learning_rate": 4.76566978074987e-06, - "loss": 0.6699, - "step": 258 - }, - { - "epoch": 1.4, - "grad_norm": 2.6928882598876953, - "learning_rate": 4.7638719701671586e-06, - "loss": 0.6117, - "step": 259 - }, - { - "epoch": 1.4054054054054055, - "grad_norm": 2.706597328186035, - "learning_rate": 4.762067631165049e-06, - "loss": 0.8534, - "step": 260 - }, - { - "epoch": 1.4108108108108108, - "grad_norm": 2.9912848472595215, - "learning_rate": 4.760256768946787e-06, - "loss": 0.5057, - "step": 261 - }, - { - "epoch": 1.4162162162162162, - "grad_norm": 2.7098443508148193, - "learning_rate": 4.758439388734429e-06, - "loss": 0.7286, - "step": 262 - }, - { - "epoch": 1.4216216216216218, - "grad_norm": 3.1288092136383057, - "learning_rate": 4.7566154957688276e-06, - "loss": 0.9827, - "step": 263 - }, - { - "epoch": 1.427027027027027, - "grad_norm": 3.0505919456481934, - "learning_rate": 4.754785095309617e-06, - "loss": 0.7042, - "step": 264 - }, - { - "epoch": 1.4324324324324325, - "grad_norm": 2.6800339221954346, - "learning_rate": 4.752948192635199e-06, - "loss": 0.5179, - "step": 265 - }, - { - "epoch": 1.4378378378378378, - "grad_norm": 2.2246861457824707, - "learning_rate": 4.751104793042722e-06, - "loss": 0.8527, - "step": 266 - }, - { - "epoch": 1.4432432432432432, - "grad_norm": 2.4242751598358154, - "learning_rate": 4.7492549018480725e-06, - "loss": 0.5627, - "step": 267 - }, - { - "epoch": 1.4486486486486487, - "grad_norm": 2.763244152069092, - "learning_rate": 4.747398524385858e-06, - "loss": 0.8981, - "step": 268 - }, - { - "epoch": 1.454054054054054, - "grad_norm": 2.856595993041992, - "learning_rate": 4.745535666009389e-06, - "loss": 0.5455, - "step": 269 - }, - { - "epoch": 1.4594594594594594, - "grad_norm": 2.4168624877929688, - "learning_rate": 4.743666332090664e-06, - "loss": 0.4348, - "step": 270 - }, - { - "epoch": 1.464864864864865, - "grad_norm": 2.5408060550689697, - "learning_rate": 4.74179052802036e-06, - "loss": 0.5524, - "step": 271 - }, - { - "epoch": 1.4702702702702704, - "grad_norm": 2.6216673851013184, - "learning_rate": 4.739908259207807e-06, - "loss": 0.7469, - "step": 272 - }, - { - "epoch": 1.4756756756756757, - "grad_norm": 5.397300720214844, - "learning_rate": 4.738019531080981e-06, - "loss": 0.7216, - "step": 273 - }, - { - "epoch": 1.481081081081081, - "grad_norm": 3.3481080532073975, - "learning_rate": 4.7361243490864825e-06, - "loss": 0.7527, - "step": 274 - }, - { - "epoch": 1.4864864864864864, - "grad_norm": 2.7943873405456543, - "learning_rate": 4.734222718689527e-06, - "loss": 0.7437, - "step": 275 - }, - { - "epoch": 1.491891891891892, - "grad_norm": 2.206890344619751, - "learning_rate": 4.732314645373922e-06, - "loss": 0.5187, - "step": 276 - }, - { - "epoch": 1.4972972972972973, - "grad_norm": 2.76442813873291, - "learning_rate": 4.730400134642055e-06, - "loss": 0.7186, - "step": 277 - }, - { - "epoch": 1.5027027027027027, - "grad_norm": 3.4754087924957275, - "learning_rate": 4.728479192014879e-06, - "loss": 0.9655, - "step": 278 - }, - { - "epoch": 1.5081081081081082, - "grad_norm": 2.923779249191284, - "learning_rate": 4.726551823031895e-06, - "loss": 0.6251, - "step": 279 - }, - { - "epoch": 1.5135135135135136, - "grad_norm": 3.1142773628234863, - "learning_rate": 4.7246180332511335e-06, - "loss": 0.4805, - "step": 280 - }, - { - "epoch": 1.518918918918919, - "grad_norm": 2.3477070331573486, - "learning_rate": 4.722677828249142e-06, - "loss": 1.0939, - "step": 281 - }, - { - "epoch": 1.5243243243243243, - "grad_norm": 2.8418569564819336, - "learning_rate": 4.720731213620972e-06, - "loss": 0.9485, - "step": 282 - }, - { - "epoch": 1.5297297297297296, - "grad_norm": 2.462710380554199, - "learning_rate": 4.718778194980152e-06, - "loss": 0.5805, - "step": 283 - }, - { - "epoch": 1.535135135135135, - "grad_norm": 3.2379209995269775, - "learning_rate": 4.7168187779586805e-06, - "loss": 0.77, - "step": 284 - }, - { - "epoch": 1.5405405405405406, - "grad_norm": 3.0701661109924316, - "learning_rate": 4.71485296820701e-06, - "loss": 0.5932, - "step": 285 - }, - { - "epoch": 1.545945945945946, - "grad_norm": 4.099547386169434, - "learning_rate": 4.7128807713940245e-06, - "loss": 0.6296, - "step": 286 - }, - { - "epoch": 1.5513513513513515, - "grad_norm": 2.5529167652130127, - "learning_rate": 4.710902193207028e-06, - "loss": 0.6201, - "step": 287 - }, - { - "epoch": 1.5567567567567568, - "grad_norm": 2.794926881790161, - "learning_rate": 4.708917239351727e-06, - "loss": 0.5682, - "step": 288 - }, - { - "epoch": 1.5621621621621622, - "grad_norm": 3.2522501945495605, - "learning_rate": 4.706925915552214e-06, - "loss": 0.8877, - "step": 289 - }, - { - "epoch": 1.5675675675675675, - "grad_norm": 2.811847448348999, - "learning_rate": 4.704928227550949e-06, - "loss": 0.6521, - "step": 290 - }, - { - "epoch": 1.572972972972973, - "grad_norm": 2.7060673236846924, - "learning_rate": 4.702924181108745e-06, - "loss": 0.4929, - "step": 291 - }, - { - "epoch": 1.5783783783783782, - "grad_norm": 2.5009031295776367, - "learning_rate": 4.700913782004755e-06, - "loss": 0.4515, - "step": 292 - }, - { - "epoch": 1.5837837837837838, - "grad_norm": 2.6722700595855713, - "learning_rate": 4.698897036036446e-06, - "loss": 0.5477, - "step": 293 - }, - { - "epoch": 1.5891891891891892, - "grad_norm": 3.3333957195281982, - "learning_rate": 4.696873949019591e-06, - "loss": 0.9589, - "step": 294 - }, - { - "epoch": 1.5945945945945947, - "grad_norm": 2.4862897396087646, - "learning_rate": 4.694844526788248e-06, - "loss": 0.4425, - "step": 295 - }, - { - "epoch": 1.6, - "grad_norm": 2.78708553314209, - "learning_rate": 4.692808775194745e-06, - "loss": 0.4899, - "step": 296 - }, - { - "epoch": 1.6054054054054054, - "grad_norm": 2.9121289253234863, - "learning_rate": 4.690766700109659e-06, - "loss": 0.4884, - "step": 297 - }, - { - "epoch": 1.6108108108108108, - "grad_norm": 4.692054271697998, - "learning_rate": 4.688718307421807e-06, - "loss": 0.8977, - "step": 298 - }, - { - "epoch": 1.6162162162162161, - "grad_norm": 3.1290926933288574, - "learning_rate": 4.686663603038222e-06, - "loss": 0.6833, - "step": 299 - }, - { - "epoch": 1.6216216216216215, - "grad_norm": 3.5091123580932617, - "learning_rate": 4.6846025928841365e-06, - "loss": 0.9141, - "step": 300 - }, - { - "epoch": 1.627027027027027, - "grad_norm": 2.5466184616088867, - "learning_rate": 4.6825352829029705e-06, - "loss": 0.5121, - "step": 301 - }, - { - "epoch": 1.6324324324324324, - "grad_norm": 2.7833092212677, - "learning_rate": 4.68046167905631e-06, - "loss": 0.5399, - "step": 302 - }, - { - "epoch": 1.637837837837838, - "grad_norm": 3.05135440826416, - "learning_rate": 4.678381787323889e-06, - "loss": 0.7921, - "step": 303 - }, - { - "epoch": 1.6432432432432433, - "grad_norm": 2.2391726970672607, - "learning_rate": 4.676295613703577e-06, - "loss": 0.7178, - "step": 304 - }, - { - "epoch": 1.6486486486486487, - "grad_norm": 2.3654022216796875, - "learning_rate": 4.674203164211357e-06, - "loss": 0.7162, - "step": 305 - }, - { - "epoch": 1.654054054054054, - "grad_norm": 2.436009645462036, - "learning_rate": 4.67210444488131e-06, - "loss": 0.6539, - "step": 306 - }, - { - "epoch": 1.6594594594594594, - "grad_norm": 2.6034209728240967, - "learning_rate": 4.669999461765599e-06, - "loss": 0.7214, - "step": 307 - }, - { - "epoch": 1.6648648648648647, - "grad_norm": 2.804229497909546, - "learning_rate": 4.6678882209344474e-06, - "loss": 0.7451, - "step": 308 - }, - { - "epoch": 1.6702702702702703, - "grad_norm": 2.6239655017852783, - "learning_rate": 4.665770728476127e-06, - "loss": 0.6464, - "step": 309 - }, - { - "epoch": 1.6756756756756757, - "grad_norm": 2.9320099353790283, - "learning_rate": 4.663646990496939e-06, - "loss": 0.6669, - "step": 310 - }, - { - "epoch": 1.6810810810810812, - "grad_norm": 3.09713077545166, - "learning_rate": 4.661517013121189e-06, - "loss": 0.8972, - "step": 311 - }, - { - "epoch": 1.6864864864864866, - "grad_norm": 3.6576132774353027, - "learning_rate": 4.659380802491181e-06, - "loss": 0.6286, - "step": 312 - }, - { - "epoch": 1.691891891891892, - "grad_norm": 2.9320433139801025, - "learning_rate": 4.6572383647671915e-06, - "loss": 0.3631, - "step": 313 - }, - { - "epoch": 1.6972972972972973, - "grad_norm": 3.399357557296753, - "learning_rate": 4.655089706127457e-06, - "loss": 0.5682, - "step": 314 - }, - { - "epoch": 1.7027027027027026, - "grad_norm": 2.7667412757873535, - "learning_rate": 4.652934832768148e-06, - "loss": 0.5457, - "step": 315 - }, - { - "epoch": 1.708108108108108, - "grad_norm": 2.3023321628570557, - "learning_rate": 4.650773750903363e-06, - "loss": 0.6601, - "step": 316 - }, - { - "epoch": 1.7135135135135136, - "grad_norm": 2.6584670543670654, - "learning_rate": 4.6486064667651005e-06, - "loss": 0.5882, - "step": 317 - }, - { - "epoch": 1.718918918918919, - "grad_norm": 5.528168678283691, - "learning_rate": 4.646432986603245e-06, - "loss": 0.7628, - "step": 318 - }, - { - "epoch": 1.7243243243243245, - "grad_norm": 3.054884195327759, - "learning_rate": 4.644253316685552e-06, - "loss": 0.6877, - "step": 319 - }, - { - "epoch": 1.7297297297297298, - "grad_norm": 3.2672388553619385, - "learning_rate": 4.6420674632976205e-06, - "loss": 0.7026, - "step": 320 - }, - { - "epoch": 1.7351351351351352, - "grad_norm": 3.109384536743164, - "learning_rate": 4.639875432742886e-06, - "loss": 0.5236, - "step": 321 - }, - { - "epoch": 1.7405405405405405, - "grad_norm": 3.3593883514404297, - "learning_rate": 4.6376772313425975e-06, - "loss": 0.6463, - "step": 322 - }, - { - "epoch": 1.7459459459459459, - "grad_norm": 2.6352698802948, - "learning_rate": 4.635472865435795e-06, - "loss": 0.6903, - "step": 323 - }, - { - "epoch": 1.7513513513513512, - "grad_norm": 2.751690149307251, - "learning_rate": 4.6332623413792995e-06, - "loss": 0.7342, - "step": 324 - }, - { - "epoch": 1.7567567567567568, - "grad_norm": 2.670915126800537, - "learning_rate": 4.6310456655476874e-06, - "loss": 0.4302, - "step": 325 - }, - { - "epoch": 1.7621621621621621, - "grad_norm": 2.7648138999938965, - "learning_rate": 4.6288228443332786e-06, - "loss": 0.5108, - "step": 326 - }, - { - "epoch": 1.7675675675675677, - "grad_norm": 2.7451536655426025, - "learning_rate": 4.626593884146111e-06, - "loss": 0.7646, - "step": 327 - }, - { - "epoch": 1.772972972972973, - "grad_norm": 2.4656403064727783, - "learning_rate": 4.624358791413928e-06, - "loss": 0.5529, - "step": 328 - }, - { - "epoch": 1.7783783783783784, - "grad_norm": 2.5987517833709717, - "learning_rate": 4.622117572582159e-06, - "loss": 0.609, - "step": 329 - }, - { - "epoch": 1.7837837837837838, - "grad_norm": 3.3843371868133545, - "learning_rate": 4.619870234113894e-06, - "loss": 0.9146, - "step": 330 - }, - { - "epoch": 1.7891891891891891, - "grad_norm": 2.3542068004608154, - "learning_rate": 4.617616782489878e-06, - "loss": 0.6887, - "step": 331 - }, - { - "epoch": 1.7945945945945945, - "grad_norm": 2.2049715518951416, - "learning_rate": 4.615357224208477e-06, - "loss": 0.505, - "step": 332 - }, - { - "epoch": 1.8, - "grad_norm": 2.453920364379883, - "learning_rate": 4.613091565785674e-06, - "loss": 0.8384, - "step": 333 - }, - { - "epoch": 1.8054054054054054, - "grad_norm": 2.5751583576202393, - "learning_rate": 4.610819813755038e-06, - "loss": 0.5512, - "step": 334 - }, - { - "epoch": 1.810810810810811, - "grad_norm": 2.524075984954834, - "learning_rate": 4.608541974667714e-06, - "loss": 0.4877, - "step": 335 - }, - { - "epoch": 1.8162162162162163, - "grad_norm": 2.2856955528259277, - "learning_rate": 4.606258055092397e-06, - "loss": 0.5583, - "step": 336 - }, - { - "epoch": 1.8216216216216217, - "grad_norm": 2.2773683071136475, - "learning_rate": 4.603968061615321e-06, - "loss": 0.5421, - "step": 337 - }, - { - "epoch": 1.827027027027027, - "grad_norm": 4.085512161254883, - "learning_rate": 4.601672000840231e-06, - "loss": 0.942, - "step": 338 - }, - { - "epoch": 1.8324324324324324, - "grad_norm": 2.3710968494415283, - "learning_rate": 4.5993698793883715e-06, - "loss": 0.3773, - "step": 339 - }, - { - "epoch": 1.8378378378378377, - "grad_norm": 2.745534658432007, - "learning_rate": 4.597061703898462e-06, - "loss": 0.9694, - "step": 340 - }, - { - "epoch": 1.8432432432432433, - "grad_norm": 2.463207244873047, - "learning_rate": 4.594747481026685e-06, - "loss": 0.4667, - "step": 341 - }, - { - "epoch": 1.8486486486486486, - "grad_norm": 2.7216601371765137, - "learning_rate": 4.592427217446656e-06, - "loss": 0.4267, - "step": 342 - }, - { - "epoch": 1.8540540540540542, - "grad_norm": 2.545664072036743, - "learning_rate": 4.590100919849413e-06, - "loss": 0.9245, - "step": 343 - }, - { - "epoch": 1.8594594594594596, - "grad_norm": 3.692840337753296, - "learning_rate": 4.587768594943396e-06, - "loss": 0.7502, - "step": 344 - }, - { - "epoch": 1.864864864864865, - "grad_norm": 2.993229627609253, - "learning_rate": 4.585430249454426e-06, - "loss": 0.4689, - "step": 345 - }, - { - "epoch": 1.8702702702702703, - "grad_norm": 2.162867546081543, - "learning_rate": 4.583085890125682e-06, - "loss": 0.6188, - "step": 346 - }, - { - "epoch": 1.8756756756756756, - "grad_norm": 2.2169792652130127, - "learning_rate": 4.5807355237176896e-06, - "loss": 0.6352, - "step": 347 - }, - { - "epoch": 1.881081081081081, - "grad_norm": 3.978985548019409, - "learning_rate": 4.578379157008296e-06, - "loss": 0.464, - "step": 348 - }, - { - "epoch": 1.8864864864864865, - "grad_norm": 2.236682653427124, - "learning_rate": 4.57601679679265e-06, - "loss": 0.5943, - "step": 349 - }, - { - "epoch": 1.8918918918918919, - "grad_norm": 2.528754472732544, - "learning_rate": 4.573648449883188e-06, - "loss": 0.6949, - "step": 350 - }, - { - "epoch": 1.8972972972972975, - "grad_norm": 2.7673721313476562, - "learning_rate": 4.571274123109606e-06, - "loss": 0.4333, - "step": 351 - }, - { - "epoch": 1.9027027027027028, - "grad_norm": 2.698012351989746, - "learning_rate": 4.568893823318847e-06, - "loss": 0.6796, - "step": 352 - }, - { - "epoch": 1.9081081081081082, - "grad_norm": 2.9640560150146484, - "learning_rate": 4.566507557375077e-06, - "loss": 0.6139, - "step": 353 - }, - { - "epoch": 1.9135135135135135, - "grad_norm": 2.417628526687622, - "learning_rate": 4.5641153321596684e-06, - "loss": 0.4515, - "step": 354 - }, - { - "epoch": 1.9189189189189189, - "grad_norm": 2.676739454269409, - "learning_rate": 4.56171715457118e-06, - "loss": 0.8426, - "step": 355 - }, - { - "epoch": 1.9243243243243242, - "grad_norm": 2.8428189754486084, - "learning_rate": 4.559313031525331e-06, - "loss": 0.5806, - "step": 356 - }, - { - "epoch": 1.9297297297297298, - "grad_norm": 2.6817944049835205, - "learning_rate": 4.55690296995499e-06, - "loss": 0.5927, - "step": 357 - }, - { - "epoch": 1.9351351351351351, - "grad_norm": 3.5939931869506836, - "learning_rate": 4.554486976810149e-06, - "loss": 0.9986, - "step": 358 - }, - { - "epoch": 1.9405405405405407, - "grad_norm": 2.86688494682312, - "learning_rate": 4.552065059057906e-06, - "loss": 0.6813, - "step": 359 - }, - { - "epoch": 1.945945945945946, - "grad_norm": 2.9295246601104736, - "learning_rate": 4.549637223682441e-06, - "loss": 1.0832, - "step": 360 - }, - { - "epoch": 1.9513513513513514, - "grad_norm": 2.6939451694488525, - "learning_rate": 4.547203477685005e-06, - "loss": 0.7377, - "step": 361 - }, - { - "epoch": 1.9567567567567568, - "grad_norm": 2.226055145263672, - "learning_rate": 4.544763828083888e-06, - "loss": 0.5412, - "step": 362 - }, - { - "epoch": 1.962162162162162, - "grad_norm": 2.490187406539917, - "learning_rate": 4.542318281914405e-06, - "loss": 0.6955, - "step": 363 - }, - { - "epoch": 1.9675675675675675, - "grad_norm": 2.9241302013397217, - "learning_rate": 4.53986684622888e-06, - "loss": 0.6774, - "step": 364 - }, - { - "epoch": 1.972972972972973, - "grad_norm": 2.988084554672241, - "learning_rate": 4.537409528096615e-06, - "loss": 0.5832, - "step": 365 - }, - { - "epoch": 1.9783783783783784, - "grad_norm": 2.9380626678466797, - "learning_rate": 4.534946334603879e-06, - "loss": 0.606, - "step": 366 - }, - { - "epoch": 1.983783783783784, - "grad_norm": 2.667588710784912, - "learning_rate": 4.532477272853882e-06, - "loss": 0.4991, - "step": 367 - }, - { - "epoch": 1.9891891891891893, - "grad_norm": 2.9711899757385254, - "learning_rate": 4.530002349966759e-06, - "loss": 0.4442, - "step": 368 - }, - { - "epoch": 1.9945945945945946, - "grad_norm": 3.443957805633545, - "learning_rate": 4.5275215730795445e-06, - "loss": 0.6566, - "step": 369 - }, - { - "epoch": 2.0, - "grad_norm": 3.590317487716675, - "learning_rate": 4.525034949346156e-06, - "loss": 0.5687, - "step": 370 - }, - { - "epoch": 2.0054054054054054, - "grad_norm": 3.678600549697876, - "learning_rate": 4.522542485937369e-06, - "loss": 0.4458, - "step": 371 - }, - { - "epoch": 2.0108108108108107, - "grad_norm": 3.803563356399536, - "learning_rate": 4.5200441900408045e-06, - "loss": 0.4418, - "step": 372 - }, - { - "epoch": 2.016216216216216, - "grad_norm": 2.9187233448028564, - "learning_rate": 4.517540068860898e-06, - "loss": 0.7057, - "step": 373 - }, - { - "epoch": 2.0216216216216214, - "grad_norm": 2.693603515625, - "learning_rate": 4.515030129618884e-06, - "loss": 0.4491, - "step": 374 - }, - { - "epoch": 2.027027027027027, - "grad_norm": 2.3883047103881836, - "learning_rate": 4.512514379552779e-06, - "loss": 0.3571, - "step": 375 - }, - { - "epoch": 2.0324324324324325, - "grad_norm": 4.558557033538818, - "learning_rate": 4.509992825917352e-06, - "loss": 0.5056, - "step": 376 - }, - { - "epoch": 2.037837837837838, - "grad_norm": 3.9574761390686035, - "learning_rate": 4.507465475984109e-06, - "loss": 0.6834, - "step": 377 - }, - { - "epoch": 2.0432432432432432, - "grad_norm": 5.34630012512207, - "learning_rate": 4.504932337041272e-06, - "loss": 0.6726, - "step": 378 - }, - { - "epoch": 2.0486486486486486, - "grad_norm": 3.198740243911743, - "learning_rate": 4.502393416393757e-06, - "loss": 0.4032, - "step": 379 - }, - { - "epoch": 2.054054054054054, - "grad_norm": 3.347480297088623, - "learning_rate": 4.4998487213631515e-06, - "loss": 0.5442, - "step": 380 - }, - { - "epoch": 2.0594594594594593, - "grad_norm": 3.940531015396118, - "learning_rate": 4.497298259287696e-06, - "loss": 0.6181, - "step": 381 - }, - { - "epoch": 2.064864864864865, - "grad_norm": 3.0910496711730957, - "learning_rate": 4.494742037522261e-06, - "loss": 0.3829, - "step": 382 - }, - { - "epoch": 2.0702702702702704, - "grad_norm": 4.060451984405518, - "learning_rate": 4.4921800634383295e-06, - "loss": 0.4953, - "step": 383 - }, - { - "epoch": 2.075675675675676, - "grad_norm": 3.1667511463165283, - "learning_rate": 4.4896123444239655e-06, - "loss": 0.3254, - "step": 384 - }, - { - "epoch": 2.081081081081081, - "grad_norm": 3.0239670276641846, - "learning_rate": 4.487038887883809e-06, - "loss": 0.555, - "step": 385 - }, - { - "epoch": 2.0864864864864865, - "grad_norm": 2.8815383911132812, - "learning_rate": 4.484459701239038e-06, - "loss": 0.665, - "step": 386 - }, - { - "epoch": 2.091891891891892, - "grad_norm": 3.615537166595459, - "learning_rate": 4.481874791927358e-06, - "loss": 0.2652, - "step": 387 - }, - { - "epoch": 2.097297297297297, - "grad_norm": 3.407407283782959, - "learning_rate": 4.479284167402977e-06, - "loss": 0.3811, - "step": 388 - }, - { - "epoch": 2.1027027027027025, - "grad_norm": 2.6651623249053955, - "learning_rate": 4.476687835136585e-06, - "loss": 0.2463, - "step": 389 - }, - { - "epoch": 2.108108108108108, - "grad_norm": 3.5145862102508545, - "learning_rate": 4.47408580261533e-06, - "loss": 0.5507, - "step": 390 - }, - { - "epoch": 2.1135135135135137, - "grad_norm": 3.0952725410461426, - "learning_rate": 4.471478077342798e-06, - "loss": 0.288, - "step": 391 - }, - { - "epoch": 2.118918918918919, - "grad_norm": 2.634775400161743, - "learning_rate": 4.468864666838994e-06, - "loss": 0.5169, - "step": 392 - }, - { - "epoch": 2.1243243243243244, - "grad_norm": 3.7388594150543213, - "learning_rate": 4.4662455786403125e-06, - "loss": 0.3327, - "step": 393 - }, - { - "epoch": 2.1297297297297297, - "grad_norm": 3.8197360038757324, - "learning_rate": 4.463620820299528e-06, - "loss": 0.3877, - "step": 394 - }, - { - "epoch": 2.135135135135135, - "grad_norm": 3.0073485374450684, - "learning_rate": 4.4609903993857606e-06, - "loss": 0.5425, - "step": 395 - }, - { - "epoch": 2.1405405405405404, - "grad_norm": 2.6923868656158447, - "learning_rate": 4.458354323484462e-06, - "loss": 0.5257, - "step": 396 - }, - { - "epoch": 2.145945945945946, - "grad_norm": 3.2151331901550293, - "learning_rate": 4.45571260019739e-06, - "loss": 0.3914, - "step": 397 - }, - { - "epoch": 2.1513513513513516, - "grad_norm": 3.4031248092651367, - "learning_rate": 4.453065237142592e-06, - "loss": 0.3455, - "step": 398 - }, - { - "epoch": 2.156756756756757, - "grad_norm": 3.012275457382202, - "learning_rate": 4.4504122419543745e-06, - "loss": 0.4652, - "step": 399 - }, - { - "epoch": 2.1621621621621623, - "grad_norm": 3.3084208965301514, - "learning_rate": 4.4477536222832865e-06, - "loss": 0.6343, - "step": 400 - }, - { - "epoch": 2.1675675675675676, - "grad_norm": 3.115206241607666, - "learning_rate": 4.445089385796099e-06, - "loss": 0.6975, - "step": 401 - }, - { - "epoch": 2.172972972972973, - "grad_norm": 2.893930435180664, - "learning_rate": 4.442419540175778e-06, - "loss": 0.5779, - "step": 402 - }, - { - "epoch": 2.1783783783783783, - "grad_norm": 3.0549168586730957, - "learning_rate": 4.439744093121465e-06, - "loss": 0.4541, - "step": 403 - }, - { - "epoch": 2.1837837837837837, - "grad_norm": 3.1189024448394775, - "learning_rate": 4.437063052348457e-06, - "loss": 0.4078, - "step": 404 - }, - { - "epoch": 2.189189189189189, - "grad_norm": 6.644659042358398, - "learning_rate": 4.434376425588179e-06, - "loss": 0.6759, - "step": 405 - }, - { - "epoch": 2.1945945945945944, - "grad_norm": 2.807554006576538, - "learning_rate": 4.431684220588163e-06, - "loss": 0.2938, - "step": 406 - }, - { - "epoch": 2.2, - "grad_norm": 3.6900999546051025, - "learning_rate": 4.428986445112034e-06, - "loss": 0.676, - "step": 407 - }, - { - "epoch": 2.2054054054054055, - "grad_norm": 2.0721664428710938, - "learning_rate": 4.426283106939474e-06, - "loss": 0.1859, - "step": 408 - }, - { - "epoch": 2.210810810810811, - "grad_norm": 2.953388214111328, - "learning_rate": 4.423574213866209e-06, - "loss": 0.2955, - "step": 409 - }, - { - "epoch": 2.2162162162162162, - "grad_norm": 3.049050807952881, - "learning_rate": 4.420859773703985e-06, - "loss": 0.2262, - "step": 410 - }, - { - "epoch": 2.2216216216216216, - "grad_norm": 3.319796323776245, - "learning_rate": 4.418139794280542e-06, - "loss": 0.2273, - "step": 411 - }, - { - "epoch": 2.227027027027027, - "grad_norm": 2.4133522510528564, - "learning_rate": 4.415414283439595e-06, - "loss": 0.3282, - "step": 412 - }, - { - "epoch": 2.2324324324324323, - "grad_norm": 2.9842193126678467, - "learning_rate": 4.4126832490408116e-06, - "loss": 0.3651, - "step": 413 - }, - { - "epoch": 2.237837837837838, - "grad_norm": 2.759531259536743, - "learning_rate": 4.409946698959784e-06, - "loss": 0.4052, - "step": 414 - }, - { - "epoch": 2.2432432432432434, - "grad_norm": 3.045485019683838, - "learning_rate": 4.4072046410880145e-06, - "loss": 0.4638, - "step": 415 - }, - { - "epoch": 2.2486486486486488, - "grad_norm": 3.0058295726776123, - "learning_rate": 4.404457083332887e-06, - "loss": 0.517, - "step": 416 - }, - { - "epoch": 2.254054054054054, - "grad_norm": 3.025688409805298, - "learning_rate": 4.401704033617643e-06, - "loss": 0.6902, - "step": 417 - }, - { - "epoch": 2.2594594594594595, - "grad_norm": 3.3047802448272705, - "learning_rate": 4.398945499881366e-06, - "loss": 0.3552, - "step": 418 - }, - { - "epoch": 2.264864864864865, - "grad_norm": 3.0683655738830566, - "learning_rate": 4.396181490078949e-06, - "loss": 0.286, - "step": 419 - }, - { - "epoch": 2.27027027027027, - "grad_norm": 3.627681016921997, - "learning_rate": 4.393412012181082e-06, - "loss": 0.4036, - "step": 420 - }, - { - "epoch": 2.2756756756756755, - "grad_norm": 4.552238941192627, - "learning_rate": 4.390637074174219e-06, - "loss": 0.8037, - "step": 421 - }, - { - "epoch": 2.281081081081081, - "grad_norm": 2.8688855171203613, - "learning_rate": 4.387856684060561e-06, - "loss": 0.2553, - "step": 422 - }, - { - "epoch": 2.2864864864864867, - "grad_norm": 4.21850061416626, - "learning_rate": 4.385070849858033e-06, - "loss": 0.6222, - "step": 423 - }, - { - "epoch": 2.291891891891892, - "grad_norm": 3.038433790206909, - "learning_rate": 4.382279579600257e-06, - "loss": 0.5326, - "step": 424 - }, - { - "epoch": 2.2972972972972974, - "grad_norm": 3.297300338745117, - "learning_rate": 4.379482881336532e-06, - "loss": 0.5515, - "step": 425 - }, - { - "epoch": 2.3027027027027027, - "grad_norm": 7.162952423095703, - "learning_rate": 4.376680763131811e-06, - "loss": 0.6948, - "step": 426 - }, - { - "epoch": 2.308108108108108, - "grad_norm": 3.2403595447540283, - "learning_rate": 4.373873233066676e-06, - "loss": 0.2947, - "step": 427 - }, - { - "epoch": 2.3135135135135134, - "grad_norm": 3.2969906330108643, - "learning_rate": 4.371060299237315e-06, - "loss": 0.2261, - "step": 428 - }, - { - "epoch": 2.3189189189189188, - "grad_norm": 2.669058322906494, - "learning_rate": 4.368241969755499e-06, - "loss": 0.5398, - "step": 429 - }, - { - "epoch": 2.3243243243243246, - "grad_norm": 2.7643518447875977, - "learning_rate": 4.36541825274856e-06, - "loss": 0.3301, - "step": 430 - }, - { - "epoch": 2.32972972972973, - "grad_norm": 3.6037657260894775, - "learning_rate": 4.3625891563593635e-06, - "loss": 0.6064, - "step": 431 - }, - { - "epoch": 2.3351351351351353, - "grad_norm": 2.8805618286132812, - "learning_rate": 4.35975468874629e-06, - "loss": 0.3897, - "step": 432 - }, - { - "epoch": 2.3405405405405406, - "grad_norm": 2.642402172088623, - "learning_rate": 4.356914858083211e-06, - "loss": 0.271, - "step": 433 - }, - { - "epoch": 2.345945945945946, - "grad_norm": 2.916337490081787, - "learning_rate": 4.354069672559458e-06, - "loss": 0.3681, - "step": 434 - }, - { - "epoch": 2.3513513513513513, - "grad_norm": 3.3312325477600098, - "learning_rate": 4.35121914037981e-06, - "loss": 0.298, - "step": 435 - }, - { - "epoch": 2.3567567567567567, - "grad_norm": 2.980583906173706, - "learning_rate": 4.348363269764462e-06, - "loss": 0.3618, - "step": 436 - }, - { - "epoch": 2.362162162162162, - "grad_norm": 3.5010197162628174, - "learning_rate": 4.345502068949003e-06, - "loss": 0.8972, - "step": 437 - }, - { - "epoch": 2.3675675675675674, - "grad_norm": 2.7187814712524414, - "learning_rate": 4.342635546184394e-06, - "loss": 0.3939, - "step": 438 - }, - { - "epoch": 2.372972972972973, - "grad_norm": 2.8368170261383057, - "learning_rate": 4.339763709736944e-06, - "loss": 0.5462, - "step": 439 - }, - { - "epoch": 2.3783783783783785, - "grad_norm": 2.6989636421203613, - "learning_rate": 4.336886567888283e-06, - "loss": 0.5932, - "step": 440 - }, - { - "epoch": 2.383783783783784, - "grad_norm": 3.2514829635620117, - "learning_rate": 4.334004128935342e-06, - "loss": 0.4622, - "step": 441 - }, - { - "epoch": 2.389189189189189, - "grad_norm": 5.242766857147217, - "learning_rate": 4.331116401190327e-06, - "loss": 0.5997, - "step": 442 - }, - { - "epoch": 2.3945945945945946, - "grad_norm": 3.492724657058716, - "learning_rate": 4.328223392980696e-06, - "loss": 0.3072, - "step": 443 - }, - { - "epoch": 2.4, - "grad_norm": 4.074132442474365, - "learning_rate": 4.325325112649134e-06, - "loss": 0.5338, - "step": 444 - }, - { - "epoch": 2.4054054054054053, - "grad_norm": 2.7208468914031982, - "learning_rate": 4.322421568553529e-06, - "loss": 0.3266, - "step": 445 - }, - { - "epoch": 2.410810810810811, - "grad_norm": 2.929180383682251, - "learning_rate": 4.3195127690669494e-06, - "loss": 0.4064, - "step": 446 - }, - { - "epoch": 2.4162162162162164, - "grad_norm": 2.848353624343872, - "learning_rate": 4.3165987225776186e-06, - "loss": 0.3856, - "step": 447 - }, - { - "epoch": 2.4216216216216218, - "grad_norm": 3.946488618850708, - "learning_rate": 4.313679437488889e-06, - "loss": 0.4261, - "step": 448 - }, - { - "epoch": 2.427027027027027, - "grad_norm": 5.781888961791992, - "learning_rate": 4.310754922219223e-06, - "loss": 0.4943, - "step": 449 - }, - { - "epoch": 2.4324324324324325, - "grad_norm": 2.8406941890716553, - "learning_rate": 4.307825185202164e-06, - "loss": 0.2874, - "step": 450 - }, - { - "epoch": 2.437837837837838, - "grad_norm": 3.2017335891723633, - "learning_rate": 4.3048902348863116e-06, - "loss": 0.4218, - "step": 451 - }, - { - "epoch": 2.443243243243243, - "grad_norm": 3.8355906009674072, - "learning_rate": 4.301950079735303e-06, - "loss": 0.4204, - "step": 452 - }, - { - "epoch": 2.4486486486486485, - "grad_norm": 4.783357620239258, - "learning_rate": 4.299004728227782e-06, - "loss": 0.5593, - "step": 453 - }, - { - "epoch": 2.454054054054054, - "grad_norm": 3.014080762863159, - "learning_rate": 4.2960541888573774e-06, - "loss": 0.4187, - "step": 454 - }, - { - "epoch": 2.4594594594594597, - "grad_norm": 3.5906598567962646, - "learning_rate": 4.29309847013268e-06, - "loss": 0.4193, - "step": 455 - }, - { - "epoch": 2.464864864864865, - "grad_norm": 3.9043331146240234, - "learning_rate": 4.290137580577216e-06, - "loss": 0.7035, - "step": 456 - }, - { - "epoch": 2.4702702702702704, - "grad_norm": 3.139753580093384, - "learning_rate": 4.287171528729423e-06, - "loss": 0.5877, - "step": 457 - }, - { - "epoch": 2.4756756756756757, - "grad_norm": 2.9091074466705322, - "learning_rate": 4.284200323142623e-06, - "loss": 0.5309, - "step": 458 - }, - { - "epoch": 2.481081081081081, - "grad_norm": 3.1253795623779297, - "learning_rate": 4.281223972385004e-06, - "loss": 0.448, - "step": 459 - }, - { - "epoch": 2.4864864864864864, - "grad_norm": 2.65510892868042, - "learning_rate": 4.27824248503959e-06, - "loss": 0.4453, - "step": 460 - }, - { - "epoch": 2.4918918918918918, - "grad_norm": 3.2135510444641113, - "learning_rate": 4.275255869704214e-06, - "loss": 0.5582, - "step": 461 - }, - { - "epoch": 2.4972972972972975, - "grad_norm": 2.452545404434204, - "learning_rate": 4.272264134991503e-06, - "loss": 0.423, - "step": 462 - }, - { - "epoch": 2.5027027027027025, - "grad_norm": 2.6370208263397217, - "learning_rate": 4.269267289528843e-06, - "loss": 0.271, - "step": 463 - }, - { - "epoch": 2.5081081081081082, - "grad_norm": 3.31266450881958, - "learning_rate": 4.266265341958356e-06, - "loss": 0.6459, - "step": 464 - }, - { - "epoch": 2.5135135135135136, - "grad_norm": 3.2743148803710938, - "learning_rate": 4.263258300936882e-06, - "loss": 0.2959, - "step": 465 - }, - { - "epoch": 2.518918918918919, - "grad_norm": 2.883549690246582, - "learning_rate": 4.260246175135948e-06, - "loss": 0.3418, - "step": 466 - }, - { - "epoch": 2.5243243243243243, - "grad_norm": 2.7019498348236084, - "learning_rate": 4.257228973241742e-06, - "loss": 0.3459, - "step": 467 - }, - { - "epoch": 2.5297297297297296, - "grad_norm": 3.8166959285736084, - "learning_rate": 4.254206703955092e-06, - "loss": 0.4769, - "step": 468 - }, - { - "epoch": 2.535135135135135, - "grad_norm": 3.264763593673706, - "learning_rate": 4.251179375991438e-06, - "loss": 0.6487, - "step": 469 - }, - { - "epoch": 2.5405405405405403, - "grad_norm": 2.7936933040618896, - "learning_rate": 4.248146998080808e-06, - "loss": 0.5547, - "step": 470 - }, - { - "epoch": 2.545945945945946, - "grad_norm": 3.21852707862854, - "learning_rate": 4.2451095789677945e-06, - "loss": 0.2965, - "step": 471 - }, - { - "epoch": 2.5513513513513515, - "grad_norm": 3.4528985023498535, - "learning_rate": 4.242067127411525e-06, - "loss": 0.3831, - "step": 472 - }, - { - "epoch": 2.556756756756757, - "grad_norm": 4.317023754119873, - "learning_rate": 4.239019652185642e-06, - "loss": 0.1756, - "step": 473 - }, - { - "epoch": 2.562162162162162, - "grad_norm": 3.677452325820923, - "learning_rate": 4.2359671620782725e-06, - "loss": 0.5136, - "step": 474 - }, - { - "epoch": 2.5675675675675675, - "grad_norm": 3.7563393115997314, - "learning_rate": 4.232909665892005e-06, - "loss": 0.6554, - "step": 475 - }, - { - "epoch": 2.572972972972973, - "grad_norm": 3.5125508308410645, - "learning_rate": 4.229847172443866e-06, - "loss": 0.3804, - "step": 476 - }, - { - "epoch": 2.5783783783783782, - "grad_norm": 2.8835806846618652, - "learning_rate": 4.2267796905652926e-06, - "loss": 0.3338, - "step": 477 - }, - { - "epoch": 2.583783783783784, - "grad_norm": 3.2136261463165283, - "learning_rate": 4.223707229102105e-06, - "loss": 0.6163, - "step": 478 - }, - { - "epoch": 2.589189189189189, - "grad_norm": 3.467475175857544, - "learning_rate": 4.220629796914487e-06, - "loss": 0.3005, - "step": 479 - }, - { - "epoch": 2.5945945945945947, - "grad_norm": 3.597490072250366, - "learning_rate": 4.217547402876954e-06, - "loss": 0.56, - "step": 480 - }, - { - "epoch": 2.6, - "grad_norm": 3.2377140522003174, - "learning_rate": 4.214460055878329e-06, - "loss": 0.4512, - "step": 481 - }, - { - "epoch": 2.6054054054054054, - "grad_norm": 2.577746868133545, - "learning_rate": 4.211367764821722e-06, - "loss": 0.3074, - "step": 482 - }, - { - "epoch": 2.610810810810811, - "grad_norm": 3.6584155559539795, - "learning_rate": 4.208270538624497e-06, - "loss": 0.6752, - "step": 483 - }, - { - "epoch": 2.616216216216216, - "grad_norm": 2.602778434753418, - "learning_rate": 4.205168386218251e-06, - "loss": 0.2347, - "step": 484 - }, - { - "epoch": 2.6216216216216215, - "grad_norm": 3.587503433227539, - "learning_rate": 4.2020613165487865e-06, - "loss": 0.5189, - "step": 485 - }, - { - "epoch": 2.627027027027027, - "grad_norm": 3.9341986179351807, - "learning_rate": 4.198949338576086e-06, - "loss": 0.7739, - "step": 486 - }, - { - "epoch": 2.6324324324324326, - "grad_norm": 2.9211957454681396, - "learning_rate": 4.1958324612742875e-06, - "loss": 0.3495, - "step": 487 - }, - { - "epoch": 2.637837837837838, - "grad_norm": 3.29193115234375, - "learning_rate": 4.1927106936316564e-06, - "loss": 0.2257, - "step": 488 - }, - { - "epoch": 2.6432432432432433, - "grad_norm": 3.3687057495117188, - "learning_rate": 4.189584044650559e-06, - "loss": 0.6708, - "step": 489 - }, - { - "epoch": 2.6486486486486487, - "grad_norm": 3.096428155899048, - "learning_rate": 4.186452523347441e-06, - "loss": 0.3126, - "step": 490 - }, - { - "epoch": 2.654054054054054, - "grad_norm": 3.0865559577941895, - "learning_rate": 4.183316138752799e-06, - "loss": 0.4219, - "step": 491 - }, - { - "epoch": 2.6594594594594594, - "grad_norm": 3.389827013015747, - "learning_rate": 4.180174899911149e-06, - "loss": 0.3937, - "step": 492 - }, - { - "epoch": 2.6648648648648647, - "grad_norm": 3.044360637664795, - "learning_rate": 4.177028815881012e-06, - "loss": 0.4098, - "step": 493 - }, - { - "epoch": 2.6702702702702705, - "grad_norm": 2.813094139099121, - "learning_rate": 4.173877895734875e-06, - "loss": 0.3597, - "step": 494 - }, - { - "epoch": 2.6756756756756754, - "grad_norm": 2.4037158489227295, - "learning_rate": 4.1707221485591764e-06, - "loss": 0.3284, - "step": 495 - }, - { - "epoch": 2.6810810810810812, - "grad_norm": 3.049436092376709, - "learning_rate": 4.167561583454272e-06, - "loss": 0.257, - "step": 496 - }, - { - "epoch": 2.6864864864864866, - "grad_norm": 3.458923816680908, - "learning_rate": 4.164396209534411e-06, - "loss": 0.1819, - "step": 497 - }, - { - "epoch": 2.691891891891892, - "grad_norm": 3.3084232807159424, - "learning_rate": 4.161226035927711e-06, - "loss": 0.7109, - "step": 498 - }, - { - "epoch": 2.6972972972972973, - "grad_norm": 3.034550189971924, - "learning_rate": 4.15805107177613e-06, - "loss": 0.6297, - "step": 499 - }, - { - "epoch": 2.7027027027027026, - "grad_norm": 3.5786449909210205, - "learning_rate": 4.15487132623544e-06, - "loss": 0.5195, - "step": 500 - }, - { - "epoch": 2.708108108108108, - "grad_norm": 3.4477646350860596, - "learning_rate": 4.151686808475204e-06, - "loss": 0.2528, - "step": 501 - }, - { - "epoch": 2.7135135135135133, - "grad_norm": 3.0256869792938232, - "learning_rate": 4.148497527678744e-06, - "loss": 0.5013, - "step": 502 - }, - { - "epoch": 2.718918918918919, - "grad_norm": 2.875121593475342, - "learning_rate": 4.145303493043118e-06, - "loss": 0.4109, - "step": 503 - }, - { - "epoch": 2.7243243243243245, - "grad_norm": 2.7204222679138184, - "learning_rate": 4.1421047137790935e-06, - "loss": 0.3197, - "step": 504 - }, - { - "epoch": 2.72972972972973, - "grad_norm": 3.350482702255249, - "learning_rate": 4.13890119911112e-06, - "loss": 0.6369, - "step": 505 - }, - { - "epoch": 2.735135135135135, - "grad_norm": 3.096774101257324, - "learning_rate": 4.135692958277303e-06, - "loss": 0.4581, - "step": 506 - }, - { - "epoch": 2.7405405405405405, - "grad_norm": 2.8896536827087402, - "learning_rate": 4.132480000529375e-06, - "loss": 0.6217, - "step": 507 - }, - { - "epoch": 2.745945945945946, - "grad_norm": 2.643932580947876, - "learning_rate": 4.129262335132676e-06, - "loss": 0.4951, - "step": 508 - }, - { - "epoch": 2.7513513513513512, - "grad_norm": 2.6077864170074463, - "learning_rate": 4.126039971366114e-06, - "loss": 0.2185, - "step": 509 - }, - { - "epoch": 2.756756756756757, - "grad_norm": 2.531507968902588, - "learning_rate": 4.122812918522154e-06, - "loss": 0.5428, - "step": 510 - }, - { - "epoch": 2.762162162162162, - "grad_norm": 4.125836372375488, - "learning_rate": 4.119581185906776e-06, - "loss": 0.5466, - "step": 511 - }, - { - "epoch": 2.7675675675675677, - "grad_norm": 2.9921016693115234, - "learning_rate": 4.1163447828394595e-06, - "loss": 0.3803, - "step": 512 - }, - { - "epoch": 2.772972972972973, - "grad_norm": 2.9517931938171387, - "learning_rate": 4.113103718653152e-06, - "loss": 0.2722, - "step": 513 - }, - { - "epoch": 2.7783783783783784, - "grad_norm": 2.8333382606506348, - "learning_rate": 4.10985800269424e-06, - "loss": 0.333, - "step": 514 - }, - { - "epoch": 2.7837837837837838, - "grad_norm": 2.94168758392334, - "learning_rate": 4.106607644322529e-06, - "loss": 0.2186, - "step": 515 - }, - { - "epoch": 2.789189189189189, - "grad_norm": 3.2743892669677734, - "learning_rate": 4.103352652911207e-06, - "loss": 0.6365, - "step": 516 - }, - { - "epoch": 2.7945945945945945, - "grad_norm": 4.692770004272461, - "learning_rate": 4.100093037846825e-06, - "loss": 0.7261, - "step": 517 - }, - { - "epoch": 2.8, - "grad_norm": 3.2157247066497803, - "learning_rate": 4.0968288085292675e-06, - "loss": 0.2767, - "step": 518 - }, - { - "epoch": 2.8054054054054056, - "grad_norm": 3.196887731552124, - "learning_rate": 4.093559974371725e-06, - "loss": 0.4743, - "step": 519 - }, - { - "epoch": 2.810810810810811, - "grad_norm": 2.406752586364746, - "learning_rate": 4.090286544800667e-06, - "loss": 0.3789, - "step": 520 - }, - { - "epoch": 2.8162162162162163, - "grad_norm": 3.1769447326660156, - "learning_rate": 4.087008529255815e-06, - "loss": 0.6252, - "step": 521 - }, - { - "epoch": 2.8216216216216217, - "grad_norm": 3.068370819091797, - "learning_rate": 4.083725937190115e-06, - "loss": 0.3467, - "step": 522 - }, - { - "epoch": 2.827027027027027, - "grad_norm": 3.2665855884552, - "learning_rate": 4.0804387780697114e-06, - "loss": 0.3857, - "step": 523 - }, - { - "epoch": 2.8324324324324324, - "grad_norm": 3.368759870529175, - "learning_rate": 4.077147061373918e-06, - "loss": 0.4679, - "step": 524 - }, - { - "epoch": 2.8378378378378377, - "grad_norm": 3.989163875579834, - "learning_rate": 4.073850796595192e-06, - "loss": 0.2439, - "step": 525 - }, - { - "epoch": 2.8432432432432435, - "grad_norm": 3.6244685649871826, - "learning_rate": 4.070549993239106e-06, - "loss": 0.435, - "step": 526 - }, - { - "epoch": 2.8486486486486484, - "grad_norm": 3.585151195526123, - "learning_rate": 4.06724466082432e-06, - "loss": 0.5022, - "step": 527 - }, - { - "epoch": 2.854054054054054, - "grad_norm": 3.2420976161956787, - "learning_rate": 4.063934808882555e-06, - "loss": 0.4282, - "step": 528 - }, - { - "epoch": 2.8594594594594596, - "grad_norm": 3.1674294471740723, - "learning_rate": 4.0606204469585656e-06, - "loss": 0.3436, - "step": 529 - }, - { - "epoch": 2.864864864864865, - "grad_norm": 2.6856706142425537, - "learning_rate": 4.057301584610112e-06, - "loss": 0.3889, - "step": 530 - }, - { - "epoch": 2.8702702702702703, - "grad_norm": 3.0438942909240723, - "learning_rate": 4.053978231407931e-06, - "loss": 0.4828, - "step": 531 - }, - { - "epoch": 2.8756756756756756, - "grad_norm": 3.3561246395111084, - "learning_rate": 4.0506503969357115e-06, - "loss": 0.5814, - "step": 532 - }, - { - "epoch": 2.881081081081081, - "grad_norm": 2.5318350791931152, - "learning_rate": 4.047318090790065e-06, - "loss": 0.4768, - "step": 533 - }, - { - "epoch": 2.8864864864864863, - "grad_norm": 2.587224006652832, - "learning_rate": 4.043981322580498e-06, - "loss": 0.4262, - "step": 534 - }, - { - "epoch": 2.891891891891892, - "grad_norm": 2.73926043510437, - "learning_rate": 4.040640101929384e-06, - "loss": 0.421, - "step": 535 - }, - { - "epoch": 2.8972972972972975, - "grad_norm": 3.53908371925354, - "learning_rate": 4.037294438471936e-06, - "loss": 0.4019, - "step": 536 - }, - { - "epoch": 2.902702702702703, - "grad_norm": 3.0980448722839355, - "learning_rate": 4.033944341856181e-06, - "loss": 0.4322, - "step": 537 - }, - { - "epoch": 2.908108108108108, - "grad_norm": 2.9265666007995605, - "learning_rate": 4.030589821742926e-06, - "loss": 0.3841, - "step": 538 - }, - { - "epoch": 2.9135135135135135, - "grad_norm": 3.4082043170928955, - "learning_rate": 4.0272308878057385e-06, - "loss": 0.7083, - "step": 539 - }, - { - "epoch": 2.918918918918919, - "grad_norm": 3.297515630722046, - "learning_rate": 4.023867549730912e-06, - "loss": 0.5688, - "step": 540 - }, - { - "epoch": 2.924324324324324, - "grad_norm": 3.0538225173950195, - "learning_rate": 4.020499817217441e-06, - "loss": 0.5979, - "step": 541 - }, - { - "epoch": 2.92972972972973, - "grad_norm": 3.1792757511138916, - "learning_rate": 4.017127699976992e-06, - "loss": 0.5034, - "step": 542 - }, - { - "epoch": 2.935135135135135, - "grad_norm": 3.1574482917785645, - "learning_rate": 4.013751207733877e-06, - "loss": 0.6656, - "step": 543 - }, - { - "epoch": 2.9405405405405407, - "grad_norm": 2.523123264312744, - "learning_rate": 4.010370350225023e-06, - "loss": 0.2789, - "step": 544 - }, - { - "epoch": 2.945945945945946, - "grad_norm": 3.1950793266296387, - "learning_rate": 4.006985137199945e-06, - "loss": 0.2163, - "step": 545 - }, - { - "epoch": 2.9513513513513514, - "grad_norm": 3.2089648246765137, - "learning_rate": 4.00359557842072e-06, - "loss": 0.4179, - "step": 546 - }, - { - "epoch": 2.9567567567567568, - "grad_norm": 3.852578639984131, - "learning_rate": 4.000201683661958e-06, - "loss": 0.4683, - "step": 547 - }, - { - "epoch": 2.962162162162162, - "grad_norm": 2.7612597942352295, - "learning_rate": 3.996803462710766e-06, - "loss": 0.3506, - "step": 548 - }, - { - "epoch": 2.9675675675675675, - "grad_norm": 4.811823844909668, - "learning_rate": 3.993400925366736e-06, - "loss": 0.6582, - "step": 549 - }, - { - "epoch": 2.972972972972973, - "grad_norm": 3.0135858058929443, - "learning_rate": 3.989994081441902e-06, - "loss": 0.504, - "step": 550 - }, - { - "epoch": 2.9783783783783786, - "grad_norm": 2.710277795791626, - "learning_rate": 3.986582940760717e-06, - "loss": 0.7362, - "step": 551 - }, - { - "epoch": 2.983783783783784, - "grad_norm": 3.175443649291992, - "learning_rate": 3.983167513160025e-06, - "loss": 0.4116, - "step": 552 - }, - { - "epoch": 2.9891891891891893, - "grad_norm": 3.101109743118286, - "learning_rate": 3.979747808489036e-06, - "loss": 0.2188, - "step": 553 - }, - { - "epoch": 2.9945945945945946, - "grad_norm": 3.2320079803466797, - "learning_rate": 3.976323836609289e-06, - "loss": 0.7558, - "step": 554 - }, - { - "epoch": 3.0, - "grad_norm": 3.6071934700012207, - "learning_rate": 3.9728956073946305e-06, - "loss": 0.6491, - "step": 555 - } - ], - "logging_steps": 1, - "max_steps": 1850, - "num_input_tokens_seen": 0, - "num_train_epochs": 10, - "save_steps": 206, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 1.4968059638002483e+17, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/chat_template.jinja b/metallama3_8b/limo_filtered_incorrect/checkpoint-740/chat_template.jinja deleted file mode 100644 index 39bd0c9f7fe30aea14eda194fee17703da4a4dbf..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/chat_template.jinja +++ /dev/null @@ -1,5 +0,0 @@ -{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> - -'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> - -' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/config.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-740/config.json deleted file mode 100644 index ec5612543540085e09eed37e81b17ae51d1a6973..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 128000, - "eos_token_id": 128009, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 8192, - "mlp_bias": false, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 500000.0, - "tie_word_embeddings": false, - "torch_dtype": "float32", - "transformers_version": "4.55.0", - "use_cache": false, - "vocab_size": 128256 -} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/generation_config.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-740/generation_config.json deleted file mode 100644 index f53ccb516e57388491adda6b9950bcfa872e93ae..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/generation_config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 128000, - "eos_token_id": 128009, - "transformers_version": "4.55.0", - "use_cache": false -} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/model-00001-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-740/model-00001-of-00007.safetensors deleted file mode 100644 index bc7236b5079d81fdf53edba1f7a60b7f3aa8081f..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/model-00001-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cd78b3f23d5011bbe0e4876faed3d92b1d625240eb07a9aef8cfabb622ac8f42 -size 4886466168 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/model-00002-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-740/model-00002-of-00007.safetensors deleted file mode 100644 index f180c0ac4f66c0940669eceab8760b771efa526e..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/model-00002-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1f8056ffcdbf64f5480dfc7d7edbe470c0f878ed9e5350d029ad44b9d468f075 -size 4832007448 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/model-00003-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-740/model-00003-of-00007.safetensors deleted file mode 100644 index a34d83d60556dcf0596ff8d2c1eafc471d1f3b07..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/model-00003-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f6835e3e21e0726f747fa2e75ca76587590c430163fd4b95fac89b8a8ff376b6 -size 4999813112 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/model-00004-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-740/model-00004-of-00007.safetensors deleted file mode 100644 index 3e4c37d5a3a66d6f9c65abcc4bd1188036cb448e..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/model-00004-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e5290573230bdd89ccf8de08d77096d7502b6ca60098ca754e93c966de3a199a -size 4999813128 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/model-00005-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-740/model-00005-of-00007.safetensors deleted file mode 100644 index fd39ad1e7def76e1887abec939f15458ab572771..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/model-00005-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bc787a4023938bd3e1fc913107d4f007044be2cd25a9047fd13a172ea45983ab -size 4832007496 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/model-00006-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-740/model-00006-of-00007.safetensors deleted file mode 100644 index a0f1937d8803a97f750c537061544e22205d51a8..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/model-00006-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7c8944716d35b704b3b4ddcfbb78067a4d12c823f643a182ebc3e0a22a4ac569 -size 4999813120 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/model-00007-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-740/model-00007-of-00007.safetensors deleted file mode 100644 index 42f1eb755cb80ce2ed32457b02592d70db785482..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/model-00007-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:50c6166931dd037983b218fe777e21598235da83d6dcd7cec8372d069d4bb49f -size 2571158184 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/model.safetensors.index.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-740/model.safetensors.index.json deleted file mode 100644 index 30d31d54f352f0c71ad48745af612a088822fa48..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/model.safetensors.index.json +++ /dev/null @@ -1,299 +0,0 @@ -{ - "metadata": { - "total_parameters": 2007565312, - "total_size": 32121044992 - }, - "weight_map": { - "lm_head.weight": "model-00007-of-00007.safetensors", - "model.embed_tokens.weight": "model-00001-of-00007.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.norm.weight": "model-00007-of-00007.safetensors" - } -} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/rng_state_0.pth b/metallama3_8b/limo_filtered_incorrect/checkpoint-740/rng_state_0.pth deleted file mode 100644 index f8799407442db08820f995bcf1b9158f696af19f..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:70cc56408014c410353d4dd58ae9b03f4be043f5f800324f66fd8e20e99b840e -size 15024 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/rng_state_1.pth b/metallama3_8b/limo_filtered_incorrect/checkpoint-740/rng_state_1.pth deleted file mode 100644 index aa0c3c6aeaabc038c714a3fcc9b78d186a4cab59..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:49d1438e98cc9c53a6852464635ce62e9788e61eb3646b73e33813f487c4b6ae -size 15024 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/rng_state_2.pth b/metallama3_8b/limo_filtered_incorrect/checkpoint-740/rng_state_2.pth deleted file mode 100644 index 0f39416636e7990907141a415603582d33812fc9..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4388add9cec90932f8ff0100d27a0574d98e1bad52ff89d44e31967d2b4fbfde -size 15024 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/rng_state_3.pth b/metallama3_8b/limo_filtered_incorrect/checkpoint-740/rng_state_3.pth deleted file mode 100644 index d3775bcd497f8ad74ece6675e0bbda89fb7ee6f4..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a705d6dfaae4f2c1b4b2be6b25a6eb521ffae6fcba21cc1531e97b60037ed079 -size 15024 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/scheduler.pt b/metallama3_8b/limo_filtered_incorrect/checkpoint-740/scheduler.pt deleted file mode 100644 index efde8563fb4bb6a77bb0072fab2ba9d97ce1ec17..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c6963e46dae8819e38df5a4767dcb4d2766d3d338a2121dba18529c9608e364 -size 1064 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/special_tokens_map.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-740/special_tokens_map.json deleted file mode 100644 index 14daf4588e61b4e4983af0fccaba4d5500c0977c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/special_tokens_map.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "additional_special_tokens": [ - { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } - ], - "bos_token": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": "<|eot_id|>" -} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/tokenizer.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-740/tokenizer.json deleted file mode 100644 index 172311123ab62378f1f6d90f3068a676b7d939ed..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c1dcab308e7cf5970ea38815e0a62887d705c5b436f869ca27a5dcdd40c36a6 -size 17210148 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/tokenizer_config.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-740/tokenizer_config.json deleted file mode 100644 index 6739fcd129e717b71b64001dcb25a03c143d66f5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/tokenizer_config.json +++ /dev/null @@ -1,2076 +0,0 @@ -{ - "added_tokens_decoder": { - "128000": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128001": { - "content": "<|end_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128002": { - "content": "<|reserved_special_token_0|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128003": { - "content": "<|reserved_special_token_1|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128004": { - "content": "<|reserved_special_token_2|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128005": { - "content": "<|reserved_special_token_3|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128006": { - "content": "<|start_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128007": { - "content": "<|end_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128008": { - "content": "<|reserved_special_token_4|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128009": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128010": { - "content": "<|reserved_special_token_5|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128011": { - "content": "<|reserved_special_token_6|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128012": { - "content": "<|reserved_special_token_7|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128013": { - "content": "<|reserved_special_token_8|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128014": { - "content": "<|reserved_special_token_9|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128015": { - "content": "<|reserved_special_token_10|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128016": { - "content": "<|reserved_special_token_11|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128017": { - "content": "<|reserved_special_token_12|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128018": { - "content": "<|reserved_special_token_13|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128019": { - "content": "<|reserved_special_token_14|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128020": { - "content": "<|reserved_special_token_15|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128021": { - "content": "<|reserved_special_token_16|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128022": { - "content": "<|reserved_special_token_17|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128023": { - "content": "<|reserved_special_token_18|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128024": { - "content": "<|reserved_special_token_19|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128025": { - "content": "<|reserved_special_token_20|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128026": { - "content": "<|reserved_special_token_21|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128027": { - "content": "<|reserved_special_token_22|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128028": { - "content": "<|reserved_special_token_23|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128029": { - "content": "<|reserved_special_token_24|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128030": { - "content": "<|reserved_special_token_25|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128031": { - "content": "<|reserved_special_token_26|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128032": { - "content": "<|reserved_special_token_27|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128033": { - "content": "<|reserved_special_token_28|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128034": { - "content": "<|reserved_special_token_29|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128035": { - "content": "<|reserved_special_token_30|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128036": { - "content": "<|reserved_special_token_31|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128037": { - "content": "<|reserved_special_token_32|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128038": { - "content": "<|reserved_special_token_33|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128039": { - "content": "<|reserved_special_token_34|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128040": { - "content": "<|reserved_special_token_35|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128041": { - "content": "<|reserved_special_token_36|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128042": { - "content": "<|reserved_special_token_37|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128043": { - "content": "<|reserved_special_token_38|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128044": { - "content": "<|reserved_special_token_39|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128045": { - "content": "<|reserved_special_token_40|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128046": { - "content": "<|reserved_special_token_41|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128047": { - "content": "<|reserved_special_token_42|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128048": { - "content": "<|reserved_special_token_43|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128049": { - "content": "<|reserved_special_token_44|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128050": { - "content": "<|reserved_special_token_45|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128051": { - "content": "<|reserved_special_token_46|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128052": { - "content": "<|reserved_special_token_47|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128053": { - "content": "<|reserved_special_token_48|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128054": { - "content": "<|reserved_special_token_49|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128055": { - "content": "<|reserved_special_token_50|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128056": { - "content": "<|reserved_special_token_51|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128057": { - "content": "<|reserved_special_token_52|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128058": { - "content": "<|reserved_special_token_53|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128059": { - "content": "<|reserved_special_token_54|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128060": { - "content": "<|reserved_special_token_55|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128061": { - "content": "<|reserved_special_token_56|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128062": { - "content": "<|reserved_special_token_57|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128063": { - "content": "<|reserved_special_token_58|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128064": { - "content": "<|reserved_special_token_59|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128065": { - "content": "<|reserved_special_token_60|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128066": { - "content": "<|reserved_special_token_61|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128067": { - "content": "<|reserved_special_token_62|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128068": { - "content": "<|reserved_special_token_63|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128069": { - "content": "<|reserved_special_token_64|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128070": { - "content": "<|reserved_special_token_65|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128071": { - "content": "<|reserved_special_token_66|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128072": { - "content": "<|reserved_special_token_67|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128073": { - "content": "<|reserved_special_token_68|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128074": { - "content": "<|reserved_special_token_69|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128075": { - "content": "<|reserved_special_token_70|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128076": { - "content": "<|reserved_special_token_71|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128077": { - "content": "<|reserved_special_token_72|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128078": { - "content": "<|reserved_special_token_73|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128079": { - "content": "<|reserved_special_token_74|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128080": { - "content": "<|reserved_special_token_75|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128081": { - "content": "<|reserved_special_token_76|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128082": { - "content": "<|reserved_special_token_77|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128083": { - "content": "<|reserved_special_token_78|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128084": { - "content": "<|reserved_special_token_79|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128085": { - "content": "<|reserved_special_token_80|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128086": { - "content": "<|reserved_special_token_81|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128087": { - "content": "<|reserved_special_token_82|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128088": { - "content": "<|reserved_special_token_83|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128089": { - "content": "<|reserved_special_token_84|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128090": { - "content": "<|reserved_special_token_85|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128091": { - "content": "<|reserved_special_token_86|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128092": { - "content": "<|reserved_special_token_87|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128093": { - "content": "<|reserved_special_token_88|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128094": { - "content": "<|reserved_special_token_89|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128095": { - "content": "<|reserved_special_token_90|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128096": { - "content": "<|reserved_special_token_91|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128097": { - "content": "<|reserved_special_token_92|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128098": { - "content": "<|reserved_special_token_93|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128099": { - "content": "<|reserved_special_token_94|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128100": { - "content": "<|reserved_special_token_95|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128101": { - "content": "<|reserved_special_token_96|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128102": { - "content": "<|reserved_special_token_97|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128103": { - "content": "<|reserved_special_token_98|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128104": { - "content": "<|reserved_special_token_99|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128105": { - "content": "<|reserved_special_token_100|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128106": { - "content": "<|reserved_special_token_101|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128107": { - "content": "<|reserved_special_token_102|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128108": { - "content": "<|reserved_special_token_103|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128109": { - "content": "<|reserved_special_token_104|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128110": { - "content": "<|reserved_special_token_105|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128111": { - "content": "<|reserved_special_token_106|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128112": { - "content": "<|reserved_special_token_107|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128113": { - "content": "<|reserved_special_token_108|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128114": { - "content": "<|reserved_special_token_109|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128115": { - "content": "<|reserved_special_token_110|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128116": { - "content": "<|reserved_special_token_111|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128117": { - "content": "<|reserved_special_token_112|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128118": { - "content": "<|reserved_special_token_113|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128119": { - "content": "<|reserved_special_token_114|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128120": { - "content": "<|reserved_special_token_115|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128121": { - "content": "<|reserved_special_token_116|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128122": { - "content": "<|reserved_special_token_117|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128123": { - "content": "<|reserved_special_token_118|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128124": { - "content": "<|reserved_special_token_119|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128125": { - "content": "<|reserved_special_token_120|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128126": { - "content": "<|reserved_special_token_121|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128127": { - "content": "<|reserved_special_token_122|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128128": { - "content": "<|reserved_special_token_123|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128129": { - "content": "<|reserved_special_token_124|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128130": { - "content": "<|reserved_special_token_125|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128131": { - "content": "<|reserved_special_token_126|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128132": { - "content": "<|reserved_special_token_127|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128133": { - "content": "<|reserved_special_token_128|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128134": { - "content": "<|reserved_special_token_129|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128135": { - "content": "<|reserved_special_token_130|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128136": { - "content": "<|reserved_special_token_131|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128137": { - "content": "<|reserved_special_token_132|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128138": { - "content": "<|reserved_special_token_133|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128139": { - "content": "<|reserved_special_token_134|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128140": { - "content": "<|reserved_special_token_135|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128141": { - "content": "<|reserved_special_token_136|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128142": { - "content": "<|reserved_special_token_137|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128143": { - "content": "<|reserved_special_token_138|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128144": { - "content": "<|reserved_special_token_139|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128145": { - "content": "<|reserved_special_token_140|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128146": { - "content": "<|reserved_special_token_141|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128147": { - "content": "<|reserved_special_token_142|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128148": { - "content": "<|reserved_special_token_143|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128149": { - "content": "<|reserved_special_token_144|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128150": { - "content": "<|reserved_special_token_145|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128151": { - "content": "<|reserved_special_token_146|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128152": { - "content": "<|reserved_special_token_147|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128153": { - "content": "<|reserved_special_token_148|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128154": { - "content": "<|reserved_special_token_149|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128155": { - "content": "<|reserved_special_token_150|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128156": { - "content": "<|reserved_special_token_151|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128157": { - "content": "<|reserved_special_token_152|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128158": { - "content": "<|reserved_special_token_153|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128159": { - "content": "<|reserved_special_token_154|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128160": { - "content": "<|reserved_special_token_155|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128161": { - "content": "<|reserved_special_token_156|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128162": { - "content": "<|reserved_special_token_157|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128163": { - "content": "<|reserved_special_token_158|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128164": { - "content": "<|reserved_special_token_159|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128165": { - "content": "<|reserved_special_token_160|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128166": { - "content": "<|reserved_special_token_161|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128167": { - "content": "<|reserved_special_token_162|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128168": { - "content": "<|reserved_special_token_163|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128169": { - "content": "<|reserved_special_token_164|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128170": { - "content": "<|reserved_special_token_165|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128171": { - "content": "<|reserved_special_token_166|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128172": { - "content": "<|reserved_special_token_167|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128173": { - "content": "<|reserved_special_token_168|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128174": { - "content": "<|reserved_special_token_169|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128175": { - "content": "<|reserved_special_token_170|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128176": { - "content": "<|reserved_special_token_171|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128177": { - "content": "<|reserved_special_token_172|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128178": { - "content": "<|reserved_special_token_173|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128179": { - "content": "<|reserved_special_token_174|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128180": { - "content": "<|reserved_special_token_175|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128181": { - "content": "<|reserved_special_token_176|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128182": { - "content": "<|reserved_special_token_177|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128183": { - "content": "<|reserved_special_token_178|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128184": { - "content": "<|reserved_special_token_179|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128185": { - "content": "<|reserved_special_token_180|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128186": { - "content": "<|reserved_special_token_181|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128187": { - "content": "<|reserved_special_token_182|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128188": { - "content": "<|reserved_special_token_183|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128189": { - "content": "<|reserved_special_token_184|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128190": { - "content": "<|reserved_special_token_185|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128191": { - "content": "<|reserved_special_token_186|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128192": { - "content": "<|reserved_special_token_187|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128193": { - "content": "<|reserved_special_token_188|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128194": { - "content": "<|reserved_special_token_189|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128195": { - "content": "<|reserved_special_token_190|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128196": { - "content": "<|reserved_special_token_191|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128197": { - "content": "<|reserved_special_token_192|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128198": { - "content": "<|reserved_special_token_193|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128199": { - "content": "<|reserved_special_token_194|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128200": { - "content": "<|reserved_special_token_195|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128201": { - "content": "<|reserved_special_token_196|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128202": { - "content": "<|reserved_special_token_197|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128203": { - "content": "<|reserved_special_token_198|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128204": { - "content": "<|reserved_special_token_199|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128205": { - "content": "<|reserved_special_token_200|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128206": { - "content": "<|reserved_special_token_201|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128207": { - "content": "<|reserved_special_token_202|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128208": { - "content": "<|reserved_special_token_203|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128209": { - "content": "<|reserved_special_token_204|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128210": { - "content": "<|reserved_special_token_205|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128211": { - "content": "<|reserved_special_token_206|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128212": { - "content": "<|reserved_special_token_207|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128213": { - "content": "<|reserved_special_token_208|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128214": { - "content": "<|reserved_special_token_209|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128215": { - "content": "<|reserved_special_token_210|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128216": { - "content": "<|reserved_special_token_211|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128217": { - "content": "<|reserved_special_token_212|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128218": { - "content": "<|reserved_special_token_213|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128219": { - "content": "<|reserved_special_token_214|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128220": { - "content": "<|reserved_special_token_215|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128221": { - "content": "<|reserved_special_token_216|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128222": { - "content": "<|reserved_special_token_217|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128223": { - "content": "<|reserved_special_token_218|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128224": { - "content": "<|reserved_special_token_219|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128225": { - "content": "<|reserved_special_token_220|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128226": { - "content": "<|reserved_special_token_221|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128227": { - "content": "<|reserved_special_token_222|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128228": { - "content": "<|reserved_special_token_223|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128229": { - "content": "<|reserved_special_token_224|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128230": { - "content": "<|reserved_special_token_225|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128231": { - "content": "<|reserved_special_token_226|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128232": { - "content": "<|reserved_special_token_227|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128233": { - "content": "<|reserved_special_token_228|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128234": { - "content": "<|reserved_special_token_229|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128235": { - "content": "<|reserved_special_token_230|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128236": { - "content": "<|reserved_special_token_231|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128237": { - "content": "<|reserved_special_token_232|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128238": { - "content": "<|reserved_special_token_233|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128239": { - "content": "<|reserved_special_token_234|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128240": { - "content": "<|reserved_special_token_235|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128241": { - "content": "<|reserved_special_token_236|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128242": { - "content": "<|reserved_special_token_237|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128243": { - "content": "<|reserved_special_token_238|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128244": { - "content": "<|reserved_special_token_239|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128245": { - "content": "<|reserved_special_token_240|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128246": { - "content": "<|reserved_special_token_241|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128247": { - "content": "<|reserved_special_token_242|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128248": { - "content": "<|reserved_special_token_243|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128249": { - "content": "<|reserved_special_token_244|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128250": { - "content": "<|reserved_special_token_245|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128251": { - "content": "<|reserved_special_token_246|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128252": { - "content": "<|reserved_special_token_247|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128253": { - "content": "<|reserved_special_token_248|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128254": { - "content": "<|reserved_special_token_249|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128255": { - "content": "<|reserved_special_token_250|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128256": { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - } - }, - "additional_special_tokens": [ - "<|eom_id|>" - ], - "bos_token": "<|begin_of_text|>", - "clean_up_tokenization_spaces": true, - "eos_token": "<|eot_id|>", - "extra_special_tokens": {}, - "model_input_names": [ - "input_ids", - "attention_mask" - ], - "model_max_length": 1000000000000000019884624838656, - "pad_token": "<|eot_id|>", - "padding_side": "right", - "split_special_tokens": false, - "tokenizer_class": "PreTrainedTokenizerFast" -} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/trainer_state.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-740/trainer_state.json deleted file mode 100644 index 6b4218da5adb2605ce95b0220be4741081fd64ed..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-740/trainer_state.json +++ /dev/null @@ -1,5214 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 4.0, - "eval_steps": 500, - "global_step": 740, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.005405405405405406, - "grad_norm": 72.60939025878906, - "learning_rate": 5e-06, - "loss": 2.9165, - "step": 1 - }, - { - "epoch": 0.010810810810810811, - "grad_norm": 29.01830291748047, - "learning_rate": 4.999996395324314e-06, - "loss": 1.9314, - "step": 2 - }, - { - "epoch": 0.016216216216216217, - "grad_norm": 21.44908332824707, - "learning_rate": 4.99998558130765e-06, - "loss": 1.5709, - "step": 3 - }, - { - "epoch": 0.021621621621621623, - "grad_norm": 4.490907669067383, - "learning_rate": 4.999967557981192e-06, - "loss": 0.8099, - "step": 4 - }, - { - "epoch": 0.02702702702702703, - "grad_norm": 4.000796794891357, - "learning_rate": 4.999942325396917e-06, - "loss": 0.9021, - "step": 5 - }, - { - "epoch": 0.032432432432432434, - "grad_norm": 18.513282775878906, - "learning_rate": 4.999909883627588e-06, - "loss": 1.7972, - "step": 6 - }, - { - "epoch": 0.03783783783783784, - "grad_norm": 3.5735981464385986, - "learning_rate": 4.999870232766757e-06, - "loss": 1.4306, - "step": 7 - }, - { - "epoch": 0.043243243243243246, - "grad_norm": 3.1145193576812744, - "learning_rate": 4.9998233729287696e-06, - "loss": 1.051, - "step": 8 - }, - { - "epoch": 0.04864864864864865, - "grad_norm": 3.856376886367798, - "learning_rate": 4.999769304248755e-06, - "loss": 0.8089, - "step": 9 - }, - { - "epoch": 0.05405405405405406, - "grad_norm": 4.05589485168457, - "learning_rate": 4.9997080268826344e-06, - "loss": 1.0999, - "step": 10 - }, - { - "epoch": 0.05945945945945946, - "grad_norm": 13.784229278564453, - "learning_rate": 4.9996395410071165e-06, - "loss": 1.2831, - "step": 11 - }, - { - "epoch": 0.06486486486486487, - "grad_norm": 6.079237937927246, - "learning_rate": 4.999563846819696e-06, - "loss": 1.2874, - "step": 12 - }, - { - "epoch": 0.07027027027027027, - "grad_norm": 4.5971245765686035, - "learning_rate": 4.999480944538655e-06, - "loss": 0.96, - "step": 13 - }, - { - "epoch": 0.07567567567567568, - "grad_norm": 4.916017532348633, - "learning_rate": 4.999390834403063e-06, - "loss": 0.9869, - "step": 14 - }, - { - "epoch": 0.08108108108108109, - "grad_norm": 3.2311055660247803, - "learning_rate": 4.999293516672773e-06, - "loss": 0.9293, - "step": 15 - }, - { - "epoch": 0.08648648648648649, - "grad_norm": 3.3040921688079834, - "learning_rate": 4.9991889916284255e-06, - "loss": 0.8914, - "step": 16 - }, - { - "epoch": 0.0918918918918919, - "grad_norm": 3.794267416000366, - "learning_rate": 4.999077259571442e-06, - "loss": 1.0176, - "step": 17 - }, - { - "epoch": 0.0972972972972973, - "grad_norm": 4.788509845733643, - "learning_rate": 4.998958320824031e-06, - "loss": 1.0259, - "step": 18 - }, - { - "epoch": 0.10270270270270271, - "grad_norm": 10.027527809143066, - "learning_rate": 4.998832175729179e-06, - "loss": 1.3356, - "step": 19 - }, - { - "epoch": 0.10810810810810811, - "grad_norm": 4.612483978271484, - "learning_rate": 4.998698824650656e-06, - "loss": 1.4486, - "step": 20 - }, - { - "epoch": 0.11351351351351352, - "grad_norm": 3.8676936626434326, - "learning_rate": 4.998558267973014e-06, - "loss": 0.8372, - "step": 21 - }, - { - "epoch": 0.11891891891891893, - "grad_norm": 2.9611001014709473, - "learning_rate": 4.998410506101579e-06, - "loss": 0.7931, - "step": 22 - }, - { - "epoch": 0.12432432432432433, - "grad_norm": 5.508745193481445, - "learning_rate": 4.9982555394624595e-06, - "loss": 1.3022, - "step": 23 - }, - { - "epoch": 0.12972972972972974, - "grad_norm": 3.434845209121704, - "learning_rate": 4.998093368502539e-06, - "loss": 0.9739, - "step": 24 - }, - { - "epoch": 0.13513513513513514, - "grad_norm": 4.736802101135254, - "learning_rate": 4.9979239936894765e-06, - "loss": 1.1154, - "step": 25 - }, - { - "epoch": 0.14054054054054055, - "grad_norm": 3.69411039352417, - "learning_rate": 4.997747415511705e-06, - "loss": 0.7543, - "step": 26 - }, - { - "epoch": 0.14594594594594595, - "grad_norm": 2.8646645545959473, - "learning_rate": 4.997563634478428e-06, - "loss": 0.7278, - "step": 27 - }, - { - "epoch": 0.15135135135135136, - "grad_norm": 6.56904935836792, - "learning_rate": 4.997372651119626e-06, - "loss": 0.8167, - "step": 28 - }, - { - "epoch": 0.15675675675675677, - "grad_norm": 2.955914258956909, - "learning_rate": 4.997174465986044e-06, - "loss": 0.8031, - "step": 29 - }, - { - "epoch": 0.16216216216216217, - "grad_norm": 2.5714259147644043, - "learning_rate": 4.996969079649196e-06, - "loss": 0.689, - "step": 30 - }, - { - "epoch": 0.16756756756756758, - "grad_norm": 3.5165364742279053, - "learning_rate": 4.996756492701362e-06, - "loss": 0.8059, - "step": 31 - }, - { - "epoch": 0.17297297297297298, - "grad_norm": 3.2861921787261963, - "learning_rate": 4.996536705755591e-06, - "loss": 0.9658, - "step": 32 - }, - { - "epoch": 0.1783783783783784, - "grad_norm": 2.962470531463623, - "learning_rate": 4.996309719445687e-06, - "loss": 0.8349, - "step": 33 - }, - { - "epoch": 0.1837837837837838, - "grad_norm": 2.7694804668426514, - "learning_rate": 4.996075534426223e-06, - "loss": 0.8287, - "step": 34 - }, - { - "epoch": 0.1891891891891892, - "grad_norm": 3.405071258544922, - "learning_rate": 4.995834151372526e-06, - "loss": 1.1211, - "step": 35 - }, - { - "epoch": 0.1945945945945946, - "grad_norm": 2.8680710792541504, - "learning_rate": 4.995585570980685e-06, - "loss": 1.0841, - "step": 36 - }, - { - "epoch": 0.2, - "grad_norm": 3.341021776199341, - "learning_rate": 4.995329793967537e-06, - "loss": 0.6182, - "step": 37 - }, - { - "epoch": 0.20540540540540542, - "grad_norm": 3.0639379024505615, - "learning_rate": 4.9950668210706795e-06, - "loss": 0.7647, - "step": 38 - }, - { - "epoch": 0.21081081081081082, - "grad_norm": 3.225759983062744, - "learning_rate": 4.994796653048457e-06, - "loss": 0.8691, - "step": 39 - }, - { - "epoch": 0.21621621621621623, - "grad_norm": 4.56926155090332, - "learning_rate": 4.994519290679965e-06, - "loss": 1.0404, - "step": 40 - }, - { - "epoch": 0.22162162162162163, - "grad_norm": 4.871571063995361, - "learning_rate": 4.994234734765043e-06, - "loss": 1.1877, - "step": 41 - }, - { - "epoch": 0.22702702702702704, - "grad_norm": 3.672215700149536, - "learning_rate": 4.993942986124278e-06, - "loss": 0.959, - "step": 42 - }, - { - "epoch": 0.23243243243243245, - "grad_norm": 3.184683322906494, - "learning_rate": 4.9936440455989975e-06, - "loss": 0.9249, - "step": 43 - }, - { - "epoch": 0.23783783783783785, - "grad_norm": 2.7092034816741943, - "learning_rate": 4.993337914051266e-06, - "loss": 0.6899, - "step": 44 - }, - { - "epoch": 0.24324324324324326, - "grad_norm": 3.153764486312866, - "learning_rate": 4.99302459236389e-06, - "loss": 0.9075, - "step": 45 - }, - { - "epoch": 0.24864864864864866, - "grad_norm": 3.3629748821258545, - "learning_rate": 4.992704081440407e-06, - "loss": 0.785, - "step": 46 - }, - { - "epoch": 0.25405405405405407, - "grad_norm": 4.478365898132324, - "learning_rate": 4.992376382205088e-06, - "loss": 1.008, - "step": 47 - }, - { - "epoch": 0.2594594594594595, - "grad_norm": 3.4001641273498535, - "learning_rate": 4.992041495602932e-06, - "loss": 0.7751, - "step": 48 - }, - { - "epoch": 0.2648648648648649, - "grad_norm": 2.522662878036499, - "learning_rate": 4.991699422599664e-06, - "loss": 0.9022, - "step": 49 - }, - { - "epoch": 0.2702702702702703, - "grad_norm": 2.764458179473877, - "learning_rate": 4.991350164181735e-06, - "loss": 0.8801, - "step": 50 - }, - { - "epoch": 0.2756756756756757, - "grad_norm": 2.814859628677368, - "learning_rate": 4.990993721356317e-06, - "loss": 0.7045, - "step": 51 - }, - { - "epoch": 0.2810810810810811, - "grad_norm": 2.441311836242676, - "learning_rate": 4.990630095151296e-06, - "loss": 0.7312, - "step": 52 - }, - { - "epoch": 0.2864864864864865, - "grad_norm": 2.4443013668060303, - "learning_rate": 4.9902592866152765e-06, - "loss": 0.9609, - "step": 53 - }, - { - "epoch": 0.2918918918918919, - "grad_norm": 2.2934701442718506, - "learning_rate": 4.989881296817575e-06, - "loss": 0.5753, - "step": 54 - }, - { - "epoch": 0.2972972972972973, - "grad_norm": 2.6286847591400146, - "learning_rate": 4.989496126848215e-06, - "loss": 0.5118, - "step": 55 - }, - { - "epoch": 0.3027027027027027, - "grad_norm": 3.6817069053649902, - "learning_rate": 4.989103777817928e-06, - "loss": 1.1261, - "step": 56 - }, - { - "epoch": 0.3081081081081081, - "grad_norm": 3.011197566986084, - "learning_rate": 4.988704250858145e-06, - "loss": 0.7823, - "step": 57 - }, - { - "epoch": 0.31351351351351353, - "grad_norm": 2.5490806102752686, - "learning_rate": 4.988297547121e-06, - "loss": 0.6019, - "step": 58 - }, - { - "epoch": 0.31891891891891894, - "grad_norm": 3.0803146362304688, - "learning_rate": 4.98788366777932e-06, - "loss": 0.825, - "step": 59 - }, - { - "epoch": 0.32432432432432434, - "grad_norm": 3.015730619430542, - "learning_rate": 4.987462614026625e-06, - "loss": 0.7667, - "step": 60 - }, - { - "epoch": 0.32972972972972975, - "grad_norm": 2.5371594429016113, - "learning_rate": 4.987034387077126e-06, - "loss": 0.8051, - "step": 61 - }, - { - "epoch": 0.33513513513513515, - "grad_norm": 2.6414010524749756, - "learning_rate": 4.986598988165718e-06, - "loss": 0.6895, - "step": 62 - }, - { - "epoch": 0.34054054054054056, - "grad_norm": 3.065131187438965, - "learning_rate": 4.9861564185479785e-06, - "loss": 0.9268, - "step": 63 - }, - { - "epoch": 0.34594594594594597, - "grad_norm": 2.5708694458007812, - "learning_rate": 4.985706679500163e-06, - "loss": 0.9854, - "step": 64 - }, - { - "epoch": 0.35135135135135137, - "grad_norm": 2.768915891647339, - "learning_rate": 4.9852497723192025e-06, - "loss": 0.8083, - "step": 65 - }, - { - "epoch": 0.3567567567567568, - "grad_norm": 2.567901849746704, - "learning_rate": 4.9847856983227e-06, - "loss": 0.9098, - "step": 66 - }, - { - "epoch": 0.3621621621621622, - "grad_norm": 2.5766549110412598, - "learning_rate": 4.984314458848923e-06, - "loss": 0.8881, - "step": 67 - }, - { - "epoch": 0.3675675675675676, - "grad_norm": 2.9778389930725098, - "learning_rate": 4.983836055256804e-06, - "loss": 0.9877, - "step": 68 - }, - { - "epoch": 0.372972972972973, - "grad_norm": 2.7225165367126465, - "learning_rate": 4.983350488925935e-06, - "loss": 0.8282, - "step": 69 - }, - { - "epoch": 0.3783783783783784, - "grad_norm": 2.702287197113037, - "learning_rate": 4.982857761256564e-06, - "loss": 1.1756, - "step": 70 - }, - { - "epoch": 0.3837837837837838, - "grad_norm": 2.9815568923950195, - "learning_rate": 4.982357873669589e-06, - "loss": 0.8114, - "step": 71 - }, - { - "epoch": 0.3891891891891892, - "grad_norm": 3.27150297164917, - "learning_rate": 4.981850827606556e-06, - "loss": 0.6763, - "step": 72 - }, - { - "epoch": 0.3945945945945946, - "grad_norm": 2.568423271179199, - "learning_rate": 4.981336624529655e-06, - "loss": 0.9372, - "step": 73 - }, - { - "epoch": 0.4, - "grad_norm": 2.621175527572632, - "learning_rate": 4.980815265921714e-06, - "loss": 1.0155, - "step": 74 - }, - { - "epoch": 0.40540540540540543, - "grad_norm": 2.62827205657959, - "learning_rate": 4.980286753286196e-06, - "loss": 0.949, - "step": 75 - }, - { - "epoch": 0.41081081081081083, - "grad_norm": 2.9462146759033203, - "learning_rate": 4.979751088147192e-06, - "loss": 1.0134, - "step": 76 - }, - { - "epoch": 0.41621621621621624, - "grad_norm": 2.814852714538574, - "learning_rate": 4.979208272049425e-06, - "loss": 0.9722, - "step": 77 - }, - { - "epoch": 0.42162162162162165, - "grad_norm": 4.177679538726807, - "learning_rate": 4.978658306558235e-06, - "loss": 1.2259, - "step": 78 - }, - { - "epoch": 0.42702702702702705, - "grad_norm": 2.813084125518799, - "learning_rate": 4.978101193259578e-06, - "loss": 0.834, - "step": 79 - }, - { - "epoch": 0.43243243243243246, - "grad_norm": 2.71824049949646, - "learning_rate": 4.977536933760025e-06, - "loss": 0.6151, - "step": 80 - }, - { - "epoch": 0.43783783783783786, - "grad_norm": 4.992153167724609, - "learning_rate": 4.976965529686755e-06, - "loss": 1.0475, - "step": 81 - }, - { - "epoch": 0.44324324324324327, - "grad_norm": 2.4810822010040283, - "learning_rate": 4.976386982687548e-06, - "loss": 0.8324, - "step": 82 - }, - { - "epoch": 0.4486486486486487, - "grad_norm": 4.509149074554443, - "learning_rate": 4.9758012944307845e-06, - "loss": 0.997, - "step": 83 - }, - { - "epoch": 0.4540540540540541, - "grad_norm": 3.114325761795044, - "learning_rate": 4.975208466605436e-06, - "loss": 1.2024, - "step": 84 - }, - { - "epoch": 0.4594594594594595, - "grad_norm": 3.297091007232666, - "learning_rate": 4.974608500921064e-06, - "loss": 0.9146, - "step": 85 - }, - { - "epoch": 0.4648648648648649, - "grad_norm": 2.824475049972534, - "learning_rate": 4.974001399107816e-06, - "loss": 0.7181, - "step": 86 - }, - { - "epoch": 0.4702702702702703, - "grad_norm": 20.262290954589844, - "learning_rate": 4.973387162916415e-06, - "loss": 0.8599, - "step": 87 - }, - { - "epoch": 0.4756756756756757, - "grad_norm": 4.015744686126709, - "learning_rate": 4.972765794118158e-06, - "loss": 0.6081, - "step": 88 - }, - { - "epoch": 0.4810810810810811, - "grad_norm": 2.8033058643341064, - "learning_rate": 4.9721372945049114e-06, - "loss": 0.8764, - "step": 89 - }, - { - "epoch": 0.4864864864864865, - "grad_norm": 5.271846294403076, - "learning_rate": 4.971501665889107e-06, - "loss": 0.8622, - "step": 90 - }, - { - "epoch": 0.4918918918918919, - "grad_norm": 2.557264804840088, - "learning_rate": 4.9708589101037306e-06, - "loss": 0.5523, - "step": 91 - }, - { - "epoch": 0.4972972972972973, - "grad_norm": 4.342173099517822, - "learning_rate": 4.970209029002325e-06, - "loss": 0.8922, - "step": 92 - }, - { - "epoch": 0.5027027027027027, - "grad_norm": 2.950364351272583, - "learning_rate": 4.969552024458977e-06, - "loss": 0.9455, - "step": 93 - }, - { - "epoch": 0.5081081081081081, - "grad_norm": 2.6453042030334473, - "learning_rate": 4.968887898368318e-06, - "loss": 0.8342, - "step": 94 - }, - { - "epoch": 0.5135135135135135, - "grad_norm": 3.486766815185547, - "learning_rate": 4.968216652645515e-06, - "loss": 0.8476, - "step": 95 - }, - { - "epoch": 0.518918918918919, - "grad_norm": 2.884152889251709, - "learning_rate": 4.967538289226268e-06, - "loss": 0.8879, - "step": 96 - }, - { - "epoch": 0.5243243243243243, - "grad_norm": 2.4130594730377197, - "learning_rate": 4.966852810066798e-06, - "loss": 0.7114, - "step": 97 - }, - { - "epoch": 0.5297297297297298, - "grad_norm": 3.182410955429077, - "learning_rate": 4.9661602171438524e-06, - "loss": 0.6757, - "step": 98 - }, - { - "epoch": 0.5351351351351351, - "grad_norm": 2.5027542114257812, - "learning_rate": 4.965460512454687e-06, - "loss": 0.8029, - "step": 99 - }, - { - "epoch": 0.5405405405405406, - "grad_norm": 2.3096024990081787, - "learning_rate": 4.964753698017071e-06, - "loss": 0.842, - "step": 100 - }, - { - "epoch": 0.5459459459459459, - "grad_norm": 2.875657081604004, - "learning_rate": 4.964039775869271e-06, - "loss": 0.6339, - "step": 101 - }, - { - "epoch": 0.5513513513513514, - "grad_norm": 2.505406141281128, - "learning_rate": 4.963318748070056e-06, - "loss": 0.7743, - "step": 102 - }, - { - "epoch": 0.5567567567567567, - "grad_norm": 3.552562713623047, - "learning_rate": 4.9625906166986815e-06, - "loss": 0.926, - "step": 103 - }, - { - "epoch": 0.5621621621621622, - "grad_norm": 2.717942476272583, - "learning_rate": 4.961855383854889e-06, - "loss": 0.7037, - "step": 104 - }, - { - "epoch": 0.5675675675675675, - "grad_norm": 2.5049386024475098, - "learning_rate": 4.961113051658901e-06, - "loss": 0.561, - "step": 105 - }, - { - "epoch": 0.572972972972973, - "grad_norm": 2.3112900257110596, - "learning_rate": 4.96036362225141e-06, - "loss": 0.7316, - "step": 106 - }, - { - "epoch": 0.5783783783783784, - "grad_norm": 2.470257520675659, - "learning_rate": 4.959607097793575e-06, - "loss": 0.6426, - "step": 107 - }, - { - "epoch": 0.5837837837837838, - "grad_norm": 3.8040788173675537, - "learning_rate": 4.9588434804670176e-06, - "loss": 1.0044, - "step": 108 - }, - { - "epoch": 0.5891891891891892, - "grad_norm": 3.143547296524048, - "learning_rate": 4.958072772473812e-06, - "loss": 0.9219, - "step": 109 - }, - { - "epoch": 0.5945945945945946, - "grad_norm": 3.5052590370178223, - "learning_rate": 4.9572949760364795e-06, - "loss": 0.6056, - "step": 110 - }, - { - "epoch": 0.6, - "grad_norm": 3.064009428024292, - "learning_rate": 4.9565100933979835e-06, - "loss": 0.6346, - "step": 111 - }, - { - "epoch": 0.6054054054054054, - "grad_norm": 2.694610595703125, - "learning_rate": 4.9557181268217225e-06, - "loss": 0.9856, - "step": 112 - }, - { - "epoch": 0.6108108108108108, - "grad_norm": 2.5885775089263916, - "learning_rate": 4.954919078591521e-06, - "loss": 0.8669, - "step": 113 - }, - { - "epoch": 0.6162162162162163, - "grad_norm": 2.593609571456909, - "learning_rate": 4.954112951011628e-06, - "loss": 0.7201, - "step": 114 - }, - { - "epoch": 0.6216216216216216, - "grad_norm": 3.3045759201049805, - "learning_rate": 4.9532997464067065e-06, - "loss": 0.9095, - "step": 115 - }, - { - "epoch": 0.6270270270270271, - "grad_norm": 2.8144869804382324, - "learning_rate": 4.952479467121828e-06, - "loss": 1.0213, - "step": 116 - }, - { - "epoch": 0.6324324324324324, - "grad_norm": 2.5460312366485596, - "learning_rate": 4.951652115522463e-06, - "loss": 1.1154, - "step": 117 - }, - { - "epoch": 0.6378378378378379, - "grad_norm": 2.795137405395508, - "learning_rate": 4.950817693994481e-06, - "loss": 0.691, - "step": 118 - }, - { - "epoch": 0.6432432432432432, - "grad_norm": 2.4979195594787598, - "learning_rate": 4.949976204944135e-06, - "loss": 0.7224, - "step": 119 - }, - { - "epoch": 0.6486486486486487, - "grad_norm": 3.3131983280181885, - "learning_rate": 4.949127650798063e-06, - "loss": 0.9256, - "step": 120 - }, - { - "epoch": 0.654054054054054, - "grad_norm": 2.9060285091400146, - "learning_rate": 4.948272034003275e-06, - "loss": 0.6892, - "step": 121 - }, - { - "epoch": 0.6594594594594595, - "grad_norm": 3.695594549179077, - "learning_rate": 4.947409357027148e-06, - "loss": 0.5878, - "step": 122 - }, - { - "epoch": 0.6648648648648648, - "grad_norm": 3.1250460147857666, - "learning_rate": 4.9465396223574165e-06, - "loss": 0.9904, - "step": 123 - }, - { - "epoch": 0.6702702702702703, - "grad_norm": 4.024891376495361, - "learning_rate": 4.945662832502172e-06, - "loss": 1.1592, - "step": 124 - }, - { - "epoch": 0.6756756756756757, - "grad_norm": 2.6886494159698486, - "learning_rate": 4.944778989989847e-06, - "loss": 1.0041, - "step": 125 - }, - { - "epoch": 0.6810810810810811, - "grad_norm": 2.366912841796875, - "learning_rate": 4.943888097369216e-06, - "loss": 0.7045, - "step": 126 - }, - { - "epoch": 0.6864864864864865, - "grad_norm": 2.394932270050049, - "learning_rate": 4.942990157209381e-06, - "loss": 0.6685, - "step": 127 - }, - { - "epoch": 0.6918918918918919, - "grad_norm": 2.61933970451355, - "learning_rate": 4.9420851720997674e-06, - "loss": 0.8812, - "step": 128 - }, - { - "epoch": 0.6972972972972973, - "grad_norm": 2.7395646572113037, - "learning_rate": 4.94117314465012e-06, - "loss": 1.3014, - "step": 129 - }, - { - "epoch": 0.7027027027027027, - "grad_norm": 3.065484046936035, - "learning_rate": 4.940254077490487e-06, - "loss": 0.6978, - "step": 130 - }, - { - "epoch": 0.7081081081081081, - "grad_norm": 2.895038366317749, - "learning_rate": 4.939327973271222e-06, - "loss": 0.6249, - "step": 131 - }, - { - "epoch": 0.7135135135135136, - "grad_norm": 3.1773312091827393, - "learning_rate": 4.9383948346629665e-06, - "loss": 0.6423, - "step": 132 - }, - { - "epoch": 0.7189189189189189, - "grad_norm": 2.2378008365631104, - "learning_rate": 4.937454664356652e-06, - "loss": 0.7193, - "step": 133 - }, - { - "epoch": 0.7243243243243244, - "grad_norm": 2.5673701763153076, - "learning_rate": 4.9365074650634855e-06, - "loss": 0.7065, - "step": 134 - }, - { - "epoch": 0.7297297297297297, - "grad_norm": 2.7348387241363525, - "learning_rate": 4.9355532395149445e-06, - "loss": 1.0046, - "step": 135 - }, - { - "epoch": 0.7351351351351352, - "grad_norm": 2.391741991043091, - "learning_rate": 4.9345919904627655e-06, - "loss": 0.6771, - "step": 136 - }, - { - "epoch": 0.7405405405405405, - "grad_norm": 2.2096705436706543, - "learning_rate": 4.933623720678944e-06, - "loss": 0.6589, - "step": 137 - }, - { - "epoch": 0.745945945945946, - "grad_norm": 3.0840072631835938, - "learning_rate": 4.932648432955718e-06, - "loss": 0.8755, - "step": 138 - }, - { - "epoch": 0.7513513513513513, - "grad_norm": 2.4970428943634033, - "learning_rate": 4.931666130105564e-06, - "loss": 0.6685, - "step": 139 - }, - { - "epoch": 0.7567567567567568, - "grad_norm": 4.315455436706543, - "learning_rate": 4.930676814961189e-06, - "loss": 0.8101, - "step": 140 - }, - { - "epoch": 0.7621621621621621, - "grad_norm": 5.388065814971924, - "learning_rate": 4.92968049037552e-06, - "loss": 0.8193, - "step": 141 - }, - { - "epoch": 0.7675675675675676, - "grad_norm": 2.6107139587402344, - "learning_rate": 4.9286771592217005e-06, - "loss": 0.7852, - "step": 142 - }, - { - "epoch": 0.772972972972973, - "grad_norm": 3.936556577682495, - "learning_rate": 4.927666824393076e-06, - "loss": 1.0388, - "step": 143 - }, - { - "epoch": 0.7783783783783784, - "grad_norm": 2.74424409866333, - "learning_rate": 4.926649488803191e-06, - "loss": 0.8266, - "step": 144 - }, - { - "epoch": 0.7837837837837838, - "grad_norm": 2.8998451232910156, - "learning_rate": 4.925625155385776e-06, - "loss": 0.4895, - "step": 145 - }, - { - "epoch": 0.7891891891891892, - "grad_norm": 3.0631520748138428, - "learning_rate": 4.924593827094743e-06, - "loss": 0.8759, - "step": 146 - }, - { - "epoch": 0.7945945945945946, - "grad_norm": 3.233267307281494, - "learning_rate": 4.923555506904176e-06, - "loss": 0.701, - "step": 147 - }, - { - "epoch": 0.8, - "grad_norm": 2.87701416015625, - "learning_rate": 4.922510197808321e-06, - "loss": 1.1327, - "step": 148 - }, - { - "epoch": 0.8054054054054054, - "grad_norm": 3.650576114654541, - "learning_rate": 4.921457902821578e-06, - "loss": 0.7587, - "step": 149 - }, - { - "epoch": 0.8108108108108109, - "grad_norm": 3.232112407684326, - "learning_rate": 4.920398624978493e-06, - "loss": 1.2158, - "step": 150 - }, - { - "epoch": 0.8162162162162162, - "grad_norm": 2.468384027481079, - "learning_rate": 4.919332367333748e-06, - "loss": 0.6852, - "step": 151 - }, - { - "epoch": 0.8216216216216217, - "grad_norm": 2.5947415828704834, - "learning_rate": 4.918259132962154e-06, - "loss": 0.6611, - "step": 152 - }, - { - "epoch": 0.827027027027027, - "grad_norm": 3.0171427726745605, - "learning_rate": 4.917178924958638e-06, - "loss": 0.7327, - "step": 153 - }, - { - "epoch": 0.8324324324324325, - "grad_norm": 3.293184518814087, - "learning_rate": 4.916091746438243e-06, - "loss": 0.8528, - "step": 154 - }, - { - "epoch": 0.8378378378378378, - "grad_norm": 4.0570969581604, - "learning_rate": 4.9149976005361085e-06, - "loss": 0.9141, - "step": 155 - }, - { - "epoch": 0.8432432432432433, - "grad_norm": 2.8782784938812256, - "learning_rate": 4.913896490407467e-06, - "loss": 1.1132, - "step": 156 - }, - { - "epoch": 0.8486486486486486, - "grad_norm": 2.5671517848968506, - "learning_rate": 4.912788419227635e-06, - "loss": 0.7587, - "step": 157 - }, - { - "epoch": 0.8540540540540541, - "grad_norm": 2.9445390701293945, - "learning_rate": 4.911673390192002e-06, - "loss": 0.9227, - "step": 158 - }, - { - "epoch": 0.8594594594594595, - "grad_norm": 2.472595453262329, - "learning_rate": 4.910551406516023e-06, - "loss": 0.8154, - "step": 159 - }, - { - "epoch": 0.8648648648648649, - "grad_norm": 2.5233397483825684, - "learning_rate": 4.909422471435207e-06, - "loss": 0.9897, - "step": 160 - }, - { - "epoch": 0.8702702702702703, - "grad_norm": 3.3919546604156494, - "learning_rate": 4.90828658820511e-06, - "loss": 0.6162, - "step": 161 - }, - { - "epoch": 0.8756756756756757, - "grad_norm": 3.060908555984497, - "learning_rate": 4.907143760101325e-06, - "loss": 0.5734, - "step": 162 - }, - { - "epoch": 0.8810810810810811, - "grad_norm": 3.4584782123565674, - "learning_rate": 4.905993990419472e-06, - "loss": 0.8328, - "step": 163 - }, - { - "epoch": 0.8864864864864865, - "grad_norm": 2.936570644378662, - "learning_rate": 4.904837282475187e-06, - "loss": 0.6787, - "step": 164 - }, - { - "epoch": 0.8918918918918919, - "grad_norm": 2.564837694168091, - "learning_rate": 4.9036736396041165e-06, - "loss": 0.9658, - "step": 165 - }, - { - "epoch": 0.8972972972972973, - "grad_norm": 3.2509360313415527, - "learning_rate": 4.902503065161905e-06, - "loss": 0.7899, - "step": 166 - }, - { - "epoch": 0.9027027027027027, - "grad_norm": 2.9730329513549805, - "learning_rate": 4.901325562524185e-06, - "loss": 0.9476, - "step": 167 - }, - { - "epoch": 0.9081081081081082, - "grad_norm": 3.044980049133301, - "learning_rate": 4.900141135086569e-06, - "loss": 0.7589, - "step": 168 - }, - { - "epoch": 0.9135135135135135, - "grad_norm": 3.030585527420044, - "learning_rate": 4.898949786264638e-06, - "loss": 0.6724, - "step": 169 - }, - { - "epoch": 0.918918918918919, - "grad_norm": 2.249122142791748, - "learning_rate": 4.897751519493933e-06, - "loss": 0.6968, - "step": 170 - }, - { - "epoch": 0.9243243243243243, - "grad_norm": 2.9816982746124268, - "learning_rate": 4.896546338229945e-06, - "loss": 0.7984, - "step": 171 - }, - { - "epoch": 0.9297297297297298, - "grad_norm": 2.415736675262451, - "learning_rate": 4.8953342459481034e-06, - "loss": 0.6109, - "step": 172 - }, - { - "epoch": 0.9351351351351351, - "grad_norm": 2.740518808364868, - "learning_rate": 4.894115246143768e-06, - "loss": 0.8126, - "step": 173 - }, - { - "epoch": 0.9405405405405406, - "grad_norm": 2.7610201835632324, - "learning_rate": 4.892889342332218e-06, - "loss": 0.6862, - "step": 174 - }, - { - "epoch": 0.9459459459459459, - "grad_norm": 3.057025194168091, - "learning_rate": 4.891656538048642e-06, - "loss": 0.9895, - "step": 175 - }, - { - "epoch": 0.9513513513513514, - "grad_norm": 2.569751262664795, - "learning_rate": 4.890416836848128e-06, - "loss": 0.8481, - "step": 176 - }, - { - "epoch": 0.9567567567567568, - "grad_norm": 2.4443397521972656, - "learning_rate": 4.889170242305652e-06, - "loss": 0.6478, - "step": 177 - }, - { - "epoch": 0.9621621621621622, - "grad_norm": 2.5009846687316895, - "learning_rate": 4.887916758016069e-06, - "loss": 0.9714, - "step": 178 - }, - { - "epoch": 0.9675675675675676, - "grad_norm": 3.101975202560425, - "learning_rate": 4.886656387594104e-06, - "loss": 1.1264, - "step": 179 - }, - { - "epoch": 0.972972972972973, - "grad_norm": 2.6144704818725586, - "learning_rate": 4.885389134674338e-06, - "loss": 0.7664, - "step": 180 - }, - { - "epoch": 0.9783783783783784, - "grad_norm": 2.5834381580352783, - "learning_rate": 4.884115002911197e-06, - "loss": 0.6131, - "step": 181 - }, - { - "epoch": 0.9837837837837838, - "grad_norm": 2.5378055572509766, - "learning_rate": 4.88283399597895e-06, - "loss": 0.8733, - "step": 182 - }, - { - "epoch": 0.9891891891891892, - "grad_norm": 2.4095377922058105, - "learning_rate": 4.881546117571686e-06, - "loss": 0.643, - "step": 183 - }, - { - "epoch": 0.9945945945945946, - "grad_norm": 2.9554507732391357, - "learning_rate": 4.8802513714033135e-06, - "loss": 0.7287, - "step": 184 - }, - { - "epoch": 1.0, - "grad_norm": 2.8279213905334473, - "learning_rate": 4.878949761207545e-06, - "loss": 0.9927, - "step": 185 - }, - { - "epoch": 1.0054054054054054, - "grad_norm": 2.9361412525177, - "learning_rate": 4.8776412907378845e-06, - "loss": 0.66, - "step": 186 - }, - { - "epoch": 1.0108108108108107, - "grad_norm": 3.392244338989258, - "learning_rate": 4.876325963767623e-06, - "loss": 0.594, - "step": 187 - }, - { - "epoch": 1.0162162162162163, - "grad_norm": 2.6276044845581055, - "learning_rate": 4.875003784089822e-06, - "loss": 0.5825, - "step": 188 - }, - { - "epoch": 1.0216216216216216, - "grad_norm": 2.2875545024871826, - "learning_rate": 4.873674755517305e-06, - "loss": 0.6594, - "step": 189 - }, - { - "epoch": 1.027027027027027, - "grad_norm": 2.8086795806884766, - "learning_rate": 4.872338881882645e-06, - "loss": 0.7536, - "step": 190 - }, - { - "epoch": 1.0324324324324325, - "grad_norm": 2.3685200214385986, - "learning_rate": 4.870996167038154e-06, - "loss": 0.4849, - "step": 191 - }, - { - "epoch": 1.037837837837838, - "grad_norm": 3.0264766216278076, - "learning_rate": 4.869646614855877e-06, - "loss": 0.3771, - "step": 192 - }, - { - "epoch": 1.0432432432432432, - "grad_norm": 4.335122108459473, - "learning_rate": 4.868290229227567e-06, - "loss": 0.8545, - "step": 193 - }, - { - "epoch": 1.0486486486486486, - "grad_norm": 3.442172050476074, - "learning_rate": 4.866927014064692e-06, - "loss": 0.3698, - "step": 194 - }, - { - "epoch": 1.054054054054054, - "grad_norm": 3.326539993286133, - "learning_rate": 4.86555697329841e-06, - "loss": 0.8468, - "step": 195 - }, - { - "epoch": 1.0594594594594595, - "grad_norm": 3.0372447967529297, - "learning_rate": 4.864180110879562e-06, - "loss": 0.8232, - "step": 196 - }, - { - "epoch": 1.0648648648648649, - "grad_norm": 2.955343008041382, - "learning_rate": 4.862796430778663e-06, - "loss": 0.4097, - "step": 197 - }, - { - "epoch": 1.0702702702702702, - "grad_norm": 2.4095399379730225, - "learning_rate": 4.861405936985889e-06, - "loss": 0.6746, - "step": 198 - }, - { - "epoch": 1.0756756756756758, - "grad_norm": 2.763500452041626, - "learning_rate": 4.860008633511059e-06, - "loss": 0.6605, - "step": 199 - }, - { - "epoch": 1.0810810810810811, - "grad_norm": 2.6751155853271484, - "learning_rate": 4.8586045243836384e-06, - "loss": 0.471, - "step": 200 - }, - { - "epoch": 1.0864864864864865, - "grad_norm": 3.3507862091064453, - "learning_rate": 4.857193613652711e-06, - "loss": 0.7665, - "step": 201 - }, - { - "epoch": 1.0918918918918918, - "grad_norm": 3.3064827919006348, - "learning_rate": 4.8557759053869775e-06, - "loss": 0.6436, - "step": 202 - }, - { - "epoch": 1.0972972972972972, - "grad_norm": 2.571828603744507, - "learning_rate": 4.854351403674741e-06, - "loss": 0.4642, - "step": 203 - }, - { - "epoch": 1.1027027027027028, - "grad_norm": 2.883220911026001, - "learning_rate": 4.852920112623895e-06, - "loss": 0.5737, - "step": 204 - }, - { - "epoch": 1.1081081081081081, - "grad_norm": 3.026144027709961, - "learning_rate": 4.851482036361912e-06, - "loss": 0.7302, - "step": 205 - }, - { - "epoch": 1.1135135135135135, - "grad_norm": 2.6689612865448, - "learning_rate": 4.850037179035829e-06, - "loss": 0.5229, - "step": 206 - }, - { - "epoch": 1.118918918918919, - "grad_norm": 2.4019956588745117, - "learning_rate": 4.8485855448122425e-06, - "loss": 0.5529, - "step": 207 - }, - { - "epoch": 1.1243243243243244, - "grad_norm": 2.3546230792999268, - "learning_rate": 4.847127137877286e-06, - "loss": 0.3635, - "step": 208 - }, - { - "epoch": 1.1297297297297297, - "grad_norm": 2.999096393585205, - "learning_rate": 4.8456619624366285e-06, - "loss": 0.8149, - "step": 209 - }, - { - "epoch": 1.135135135135135, - "grad_norm": 10.072900772094727, - "learning_rate": 4.844190022715456e-06, - "loss": 0.8333, - "step": 210 - }, - { - "epoch": 1.1405405405405404, - "grad_norm": 2.222123384475708, - "learning_rate": 4.84271132295846e-06, - "loss": 0.3717, - "step": 211 - }, - { - "epoch": 1.145945945945946, - "grad_norm": 2.8751113414764404, - "learning_rate": 4.841225867429826e-06, - "loss": 0.5994, - "step": 212 - }, - { - "epoch": 1.1513513513513514, - "grad_norm": 2.9580111503601074, - "learning_rate": 4.839733660413224e-06, - "loss": 0.8382, - "step": 213 - }, - { - "epoch": 1.1567567567567567, - "grad_norm": 4.628892421722412, - "learning_rate": 4.838234706211792e-06, - "loss": 0.818, - "step": 214 - }, - { - "epoch": 1.1621621621621623, - "grad_norm": 2.5103509426116943, - "learning_rate": 4.836729009148124e-06, - "loss": 0.4267, - "step": 215 - }, - { - "epoch": 1.1675675675675676, - "grad_norm": 2.6093738079071045, - "learning_rate": 4.835216573564261e-06, - "loss": 0.3472, - "step": 216 - }, - { - "epoch": 1.172972972972973, - "grad_norm": 3.0792338848114014, - "learning_rate": 4.833697403821672e-06, - "loss": 0.6323, - "step": 217 - }, - { - "epoch": 1.1783783783783783, - "grad_norm": 2.845163345336914, - "learning_rate": 4.8321715043012516e-06, - "loss": 0.6831, - "step": 218 - }, - { - "epoch": 1.1837837837837837, - "grad_norm": 3.0433948040008545, - "learning_rate": 4.830638879403296e-06, - "loss": 0.3682, - "step": 219 - }, - { - "epoch": 1.1891891891891893, - "grad_norm": 2.6533594131469727, - "learning_rate": 4.8290995335475e-06, - "loss": 0.4154, - "step": 220 - }, - { - "epoch": 1.1945945945945946, - "grad_norm": 2.9271352291107178, - "learning_rate": 4.827553471172935e-06, - "loss": 0.3991, - "step": 221 - }, - { - "epoch": 1.2, - "grad_norm": 2.9243528842926025, - "learning_rate": 4.826000696738045e-06, - "loss": 0.4538, - "step": 222 - }, - { - "epoch": 1.2054054054054055, - "grad_norm": 2.537332534790039, - "learning_rate": 4.824441214720629e-06, - "loss": 0.7692, - "step": 223 - }, - { - "epoch": 1.2108108108108109, - "grad_norm": 3.9193246364593506, - "learning_rate": 4.8228750296178275e-06, - "loss": 0.6038, - "step": 224 - }, - { - "epoch": 1.2162162162162162, - "grad_norm": 2.6646728515625, - "learning_rate": 4.821302145946113e-06, - "loss": 0.4147, - "step": 225 - }, - { - "epoch": 1.2216216216216216, - "grad_norm": 2.6519482135772705, - "learning_rate": 4.819722568241274e-06, - "loss": 0.5398, - "step": 226 - }, - { - "epoch": 1.227027027027027, - "grad_norm": 2.2018048763275146, - "learning_rate": 4.818136301058401e-06, - "loss": 0.3864, - "step": 227 - }, - { - "epoch": 1.2324324324324325, - "grad_norm": 2.5660712718963623, - "learning_rate": 4.816543348971879e-06, - "loss": 0.5712, - "step": 228 - }, - { - "epoch": 1.2378378378378379, - "grad_norm": 3.237663745880127, - "learning_rate": 4.814943716575368e-06, - "loss": 0.662, - "step": 229 - }, - { - "epoch": 1.2432432432432432, - "grad_norm": 2.5570430755615234, - "learning_rate": 4.813337408481793e-06, - "loss": 0.8661, - "step": 230 - }, - { - "epoch": 1.2486486486486488, - "grad_norm": 2.9231269359588623, - "learning_rate": 4.811724429323329e-06, - "loss": 0.9218, - "step": 231 - }, - { - "epoch": 1.2540540540540541, - "grad_norm": 3.637084722518921, - "learning_rate": 4.810104783751389e-06, - "loss": 0.5597, - "step": 232 - }, - { - "epoch": 1.2594594594594595, - "grad_norm": 3.0218842029571533, - "learning_rate": 4.8084784764366125e-06, - "loss": 0.4786, - "step": 233 - }, - { - "epoch": 1.2648648648648648, - "grad_norm": 2.770214080810547, - "learning_rate": 4.806845512068846e-06, - "loss": 0.5219, - "step": 234 - }, - { - "epoch": 1.2702702702702702, - "grad_norm": 3.093053102493286, - "learning_rate": 4.805205895357137e-06, - "loss": 0.643, - "step": 235 - }, - { - "epoch": 1.2756756756756757, - "grad_norm": 2.6373348236083984, - "learning_rate": 4.803559631029713e-06, - "loss": 0.5858, - "step": 236 - }, - { - "epoch": 1.281081081081081, - "grad_norm": 2.452030897140503, - "learning_rate": 4.801906723833973e-06, - "loss": 0.4185, - "step": 237 - }, - { - "epoch": 1.2864864864864864, - "grad_norm": 2.72564697265625, - "learning_rate": 4.8002471785364734e-06, - "loss": 0.4917, - "step": 238 - }, - { - "epoch": 1.291891891891892, - "grad_norm": 3.0389158725738525, - "learning_rate": 4.798580999922913e-06, - "loss": 0.645, - "step": 239 - }, - { - "epoch": 1.2972972972972974, - "grad_norm": 3.7002289295196533, - "learning_rate": 4.796908192798117e-06, - "loss": 0.5378, - "step": 240 - }, - { - "epoch": 1.3027027027027027, - "grad_norm": 2.1876111030578613, - "learning_rate": 4.7952287619860276e-06, - "loss": 0.5197, - "step": 241 - }, - { - "epoch": 1.308108108108108, - "grad_norm": 3.903337240219116, - "learning_rate": 4.793542712329689e-06, - "loss": 1.0226, - "step": 242 - }, - { - "epoch": 1.3135135135135134, - "grad_norm": 2.3623552322387695, - "learning_rate": 4.791850048691228e-06, - "loss": 0.5502, - "step": 243 - }, - { - "epoch": 1.318918918918919, - "grad_norm": 3.0669031143188477, - "learning_rate": 4.79015077595185e-06, - "loss": 0.6976, - "step": 244 - }, - { - "epoch": 1.3243243243243243, - "grad_norm": 3.1480472087860107, - "learning_rate": 4.788444899011816e-06, - "loss": 0.4795, - "step": 245 - }, - { - "epoch": 1.3297297297297297, - "grad_norm": 3.7051920890808105, - "learning_rate": 4.786732422790432e-06, - "loss": 0.6526, - "step": 246 - }, - { - "epoch": 1.3351351351351353, - "grad_norm": 3.4358389377593994, - "learning_rate": 4.785013352226036e-06, - "loss": 0.5551, - "step": 247 - }, - { - "epoch": 1.3405405405405406, - "grad_norm": 2.3789355754852295, - "learning_rate": 4.7832876922759805e-06, - "loss": 0.3151, - "step": 248 - }, - { - "epoch": 1.345945945945946, - "grad_norm": 2.4843716621398926, - "learning_rate": 4.781555447916622e-06, - "loss": 0.6713, - "step": 249 - }, - { - "epoch": 1.3513513513513513, - "grad_norm": 3.0176303386688232, - "learning_rate": 4.779816624143302e-06, - "loss": 0.437, - "step": 250 - }, - { - "epoch": 1.3567567567567567, - "grad_norm": 2.868350028991699, - "learning_rate": 4.77807122597034e-06, - "loss": 0.7632, - "step": 251 - }, - { - "epoch": 1.3621621621621622, - "grad_norm": 2.4629738330841064, - "learning_rate": 4.776319258431009e-06, - "loss": 0.4894, - "step": 252 - }, - { - "epoch": 1.3675675675675676, - "grad_norm": 2.798297882080078, - "learning_rate": 4.77456072657753e-06, - "loss": 0.4456, - "step": 253 - }, - { - "epoch": 1.372972972972973, - "grad_norm": 3.2977547645568848, - "learning_rate": 4.772795635481053e-06, - "loss": 0.5381, - "step": 254 - }, - { - "epoch": 1.3783783783783785, - "grad_norm": 4.1061906814575195, - "learning_rate": 4.77102399023164e-06, - "loss": 1.0302, - "step": 255 - }, - { - "epoch": 1.3837837837837839, - "grad_norm": 3.943284511566162, - "learning_rate": 4.769245795938261e-06, - "loss": 0.4875, - "step": 256 - }, - { - "epoch": 1.3891891891891892, - "grad_norm": 2.6420533657073975, - "learning_rate": 4.767461057728763e-06, - "loss": 0.4923, - "step": 257 - }, - { - "epoch": 1.3945945945945946, - "grad_norm": 3.3152263164520264, - "learning_rate": 4.76566978074987e-06, - "loss": 0.6699, - "step": 258 - }, - { - "epoch": 1.4, - "grad_norm": 2.6928882598876953, - "learning_rate": 4.7638719701671586e-06, - "loss": 0.6117, - "step": 259 - }, - { - "epoch": 1.4054054054054055, - "grad_norm": 2.706597328186035, - "learning_rate": 4.762067631165049e-06, - "loss": 0.8534, - "step": 260 - }, - { - "epoch": 1.4108108108108108, - "grad_norm": 2.9912848472595215, - "learning_rate": 4.760256768946787e-06, - "loss": 0.5057, - "step": 261 - }, - { - "epoch": 1.4162162162162162, - "grad_norm": 2.7098443508148193, - "learning_rate": 4.758439388734429e-06, - "loss": 0.7286, - "step": 262 - }, - { - "epoch": 1.4216216216216218, - "grad_norm": 3.1288092136383057, - "learning_rate": 4.7566154957688276e-06, - "loss": 0.9827, - "step": 263 - }, - { - "epoch": 1.427027027027027, - "grad_norm": 3.0505919456481934, - "learning_rate": 4.754785095309617e-06, - "loss": 0.7042, - "step": 264 - }, - { - "epoch": 1.4324324324324325, - "grad_norm": 2.6800339221954346, - "learning_rate": 4.752948192635199e-06, - "loss": 0.5179, - "step": 265 - }, - { - "epoch": 1.4378378378378378, - "grad_norm": 2.2246861457824707, - "learning_rate": 4.751104793042722e-06, - "loss": 0.8527, - "step": 266 - }, - { - "epoch": 1.4432432432432432, - "grad_norm": 2.4242751598358154, - "learning_rate": 4.7492549018480725e-06, - "loss": 0.5627, - "step": 267 - }, - { - "epoch": 1.4486486486486487, - "grad_norm": 2.763244152069092, - "learning_rate": 4.747398524385858e-06, - "loss": 0.8981, - "step": 268 - }, - { - "epoch": 1.454054054054054, - "grad_norm": 2.856595993041992, - "learning_rate": 4.745535666009389e-06, - "loss": 0.5455, - "step": 269 - }, - { - "epoch": 1.4594594594594594, - "grad_norm": 2.4168624877929688, - "learning_rate": 4.743666332090664e-06, - "loss": 0.4348, - "step": 270 - }, - { - "epoch": 1.464864864864865, - "grad_norm": 2.5408060550689697, - "learning_rate": 4.74179052802036e-06, - "loss": 0.5524, - "step": 271 - }, - { - "epoch": 1.4702702702702704, - "grad_norm": 2.6216673851013184, - "learning_rate": 4.739908259207807e-06, - "loss": 0.7469, - "step": 272 - }, - { - "epoch": 1.4756756756756757, - "grad_norm": 5.397300720214844, - "learning_rate": 4.738019531080981e-06, - "loss": 0.7216, - "step": 273 - }, - { - "epoch": 1.481081081081081, - "grad_norm": 3.3481080532073975, - "learning_rate": 4.7361243490864825e-06, - "loss": 0.7527, - "step": 274 - }, - { - "epoch": 1.4864864864864864, - "grad_norm": 2.7943873405456543, - "learning_rate": 4.734222718689527e-06, - "loss": 0.7437, - "step": 275 - }, - { - "epoch": 1.491891891891892, - "grad_norm": 2.206890344619751, - "learning_rate": 4.732314645373922e-06, - "loss": 0.5187, - "step": 276 - }, - { - "epoch": 1.4972972972972973, - "grad_norm": 2.76442813873291, - "learning_rate": 4.730400134642055e-06, - "loss": 0.7186, - "step": 277 - }, - { - "epoch": 1.5027027027027027, - "grad_norm": 3.4754087924957275, - "learning_rate": 4.728479192014879e-06, - "loss": 0.9655, - "step": 278 - }, - { - "epoch": 1.5081081081081082, - "grad_norm": 2.923779249191284, - "learning_rate": 4.726551823031895e-06, - "loss": 0.6251, - "step": 279 - }, - { - "epoch": 1.5135135135135136, - "grad_norm": 3.1142773628234863, - "learning_rate": 4.7246180332511335e-06, - "loss": 0.4805, - "step": 280 - }, - { - "epoch": 1.518918918918919, - "grad_norm": 2.3477070331573486, - "learning_rate": 4.722677828249142e-06, - "loss": 1.0939, - "step": 281 - }, - { - "epoch": 1.5243243243243243, - "grad_norm": 2.8418569564819336, - "learning_rate": 4.720731213620972e-06, - "loss": 0.9485, - "step": 282 - }, - { - "epoch": 1.5297297297297296, - "grad_norm": 2.462710380554199, - "learning_rate": 4.718778194980152e-06, - "loss": 0.5805, - "step": 283 - }, - { - "epoch": 1.535135135135135, - "grad_norm": 3.2379209995269775, - "learning_rate": 4.7168187779586805e-06, - "loss": 0.77, - "step": 284 - }, - { - "epoch": 1.5405405405405406, - "grad_norm": 3.0701661109924316, - "learning_rate": 4.71485296820701e-06, - "loss": 0.5932, - "step": 285 - }, - { - "epoch": 1.545945945945946, - "grad_norm": 4.099547386169434, - "learning_rate": 4.7128807713940245e-06, - "loss": 0.6296, - "step": 286 - }, - { - "epoch": 1.5513513513513515, - "grad_norm": 2.5529167652130127, - "learning_rate": 4.710902193207028e-06, - "loss": 0.6201, - "step": 287 - }, - { - "epoch": 1.5567567567567568, - "grad_norm": 2.794926881790161, - "learning_rate": 4.708917239351727e-06, - "loss": 0.5682, - "step": 288 - }, - { - "epoch": 1.5621621621621622, - "grad_norm": 3.2522501945495605, - "learning_rate": 4.706925915552214e-06, - "loss": 0.8877, - "step": 289 - }, - { - "epoch": 1.5675675675675675, - "grad_norm": 2.811847448348999, - "learning_rate": 4.704928227550949e-06, - "loss": 0.6521, - "step": 290 - }, - { - "epoch": 1.572972972972973, - "grad_norm": 2.7060673236846924, - "learning_rate": 4.702924181108745e-06, - "loss": 0.4929, - "step": 291 - }, - { - "epoch": 1.5783783783783782, - "grad_norm": 2.5009031295776367, - "learning_rate": 4.700913782004755e-06, - "loss": 0.4515, - "step": 292 - }, - { - "epoch": 1.5837837837837838, - "grad_norm": 2.6722700595855713, - "learning_rate": 4.698897036036446e-06, - "loss": 0.5477, - "step": 293 - }, - { - "epoch": 1.5891891891891892, - "grad_norm": 3.3333957195281982, - "learning_rate": 4.696873949019591e-06, - "loss": 0.9589, - "step": 294 - }, - { - "epoch": 1.5945945945945947, - "grad_norm": 2.4862897396087646, - "learning_rate": 4.694844526788248e-06, - "loss": 0.4425, - "step": 295 - }, - { - "epoch": 1.6, - "grad_norm": 2.78708553314209, - "learning_rate": 4.692808775194745e-06, - "loss": 0.4899, - "step": 296 - }, - { - "epoch": 1.6054054054054054, - "grad_norm": 2.9121289253234863, - "learning_rate": 4.690766700109659e-06, - "loss": 0.4884, - "step": 297 - }, - { - "epoch": 1.6108108108108108, - "grad_norm": 4.692054271697998, - "learning_rate": 4.688718307421807e-06, - "loss": 0.8977, - "step": 298 - }, - { - "epoch": 1.6162162162162161, - "grad_norm": 3.1290926933288574, - "learning_rate": 4.686663603038222e-06, - "loss": 0.6833, - "step": 299 - }, - { - "epoch": 1.6216216216216215, - "grad_norm": 3.5091123580932617, - "learning_rate": 4.6846025928841365e-06, - "loss": 0.9141, - "step": 300 - }, - { - "epoch": 1.627027027027027, - "grad_norm": 2.5466184616088867, - "learning_rate": 4.6825352829029705e-06, - "loss": 0.5121, - "step": 301 - }, - { - "epoch": 1.6324324324324324, - "grad_norm": 2.7833092212677, - "learning_rate": 4.68046167905631e-06, - "loss": 0.5399, - "step": 302 - }, - { - "epoch": 1.637837837837838, - "grad_norm": 3.05135440826416, - "learning_rate": 4.678381787323889e-06, - "loss": 0.7921, - "step": 303 - }, - { - "epoch": 1.6432432432432433, - "grad_norm": 2.2391726970672607, - "learning_rate": 4.676295613703577e-06, - "loss": 0.7178, - "step": 304 - }, - { - "epoch": 1.6486486486486487, - "grad_norm": 2.3654022216796875, - "learning_rate": 4.674203164211357e-06, - "loss": 0.7162, - "step": 305 - }, - { - "epoch": 1.654054054054054, - "grad_norm": 2.436009645462036, - "learning_rate": 4.67210444488131e-06, - "loss": 0.6539, - "step": 306 - }, - { - "epoch": 1.6594594594594594, - "grad_norm": 2.6034209728240967, - "learning_rate": 4.669999461765599e-06, - "loss": 0.7214, - "step": 307 - }, - { - "epoch": 1.6648648648648647, - "grad_norm": 2.804229497909546, - "learning_rate": 4.6678882209344474e-06, - "loss": 0.7451, - "step": 308 - }, - { - "epoch": 1.6702702702702703, - "grad_norm": 2.6239655017852783, - "learning_rate": 4.665770728476127e-06, - "loss": 0.6464, - "step": 309 - }, - { - "epoch": 1.6756756756756757, - "grad_norm": 2.9320099353790283, - "learning_rate": 4.663646990496939e-06, - "loss": 0.6669, - "step": 310 - }, - { - "epoch": 1.6810810810810812, - "grad_norm": 3.09713077545166, - "learning_rate": 4.661517013121189e-06, - "loss": 0.8972, - "step": 311 - }, - { - "epoch": 1.6864864864864866, - "grad_norm": 3.6576132774353027, - "learning_rate": 4.659380802491181e-06, - "loss": 0.6286, - "step": 312 - }, - { - "epoch": 1.691891891891892, - "grad_norm": 2.9320433139801025, - "learning_rate": 4.6572383647671915e-06, - "loss": 0.3631, - "step": 313 - }, - { - "epoch": 1.6972972972972973, - "grad_norm": 3.399357557296753, - "learning_rate": 4.655089706127457e-06, - "loss": 0.5682, - "step": 314 - }, - { - "epoch": 1.7027027027027026, - "grad_norm": 2.7667412757873535, - "learning_rate": 4.652934832768148e-06, - "loss": 0.5457, - "step": 315 - }, - { - "epoch": 1.708108108108108, - "grad_norm": 2.3023321628570557, - "learning_rate": 4.650773750903363e-06, - "loss": 0.6601, - "step": 316 - }, - { - "epoch": 1.7135135135135136, - "grad_norm": 2.6584670543670654, - "learning_rate": 4.6486064667651005e-06, - "loss": 0.5882, - "step": 317 - }, - { - "epoch": 1.718918918918919, - "grad_norm": 5.528168678283691, - "learning_rate": 4.646432986603245e-06, - "loss": 0.7628, - "step": 318 - }, - { - "epoch": 1.7243243243243245, - "grad_norm": 3.054884195327759, - "learning_rate": 4.644253316685552e-06, - "loss": 0.6877, - "step": 319 - }, - { - "epoch": 1.7297297297297298, - "grad_norm": 3.2672388553619385, - "learning_rate": 4.6420674632976205e-06, - "loss": 0.7026, - "step": 320 - }, - { - "epoch": 1.7351351351351352, - "grad_norm": 3.109384536743164, - "learning_rate": 4.639875432742886e-06, - "loss": 0.5236, - "step": 321 - }, - { - "epoch": 1.7405405405405405, - "grad_norm": 3.3593883514404297, - "learning_rate": 4.6376772313425975e-06, - "loss": 0.6463, - "step": 322 - }, - { - "epoch": 1.7459459459459459, - "grad_norm": 2.6352698802948, - "learning_rate": 4.635472865435795e-06, - "loss": 0.6903, - "step": 323 - }, - { - "epoch": 1.7513513513513512, - "grad_norm": 2.751690149307251, - "learning_rate": 4.6332623413792995e-06, - "loss": 0.7342, - "step": 324 - }, - { - "epoch": 1.7567567567567568, - "grad_norm": 2.670915126800537, - "learning_rate": 4.6310456655476874e-06, - "loss": 0.4302, - "step": 325 - }, - { - "epoch": 1.7621621621621621, - "grad_norm": 2.7648138999938965, - "learning_rate": 4.6288228443332786e-06, - "loss": 0.5108, - "step": 326 - }, - { - "epoch": 1.7675675675675677, - "grad_norm": 2.7451536655426025, - "learning_rate": 4.626593884146111e-06, - "loss": 0.7646, - "step": 327 - }, - { - "epoch": 1.772972972972973, - "grad_norm": 2.4656403064727783, - "learning_rate": 4.624358791413928e-06, - "loss": 0.5529, - "step": 328 - }, - { - "epoch": 1.7783783783783784, - "grad_norm": 2.5987517833709717, - "learning_rate": 4.622117572582159e-06, - "loss": 0.609, - "step": 329 - }, - { - "epoch": 1.7837837837837838, - "grad_norm": 3.3843371868133545, - "learning_rate": 4.619870234113894e-06, - "loss": 0.9146, - "step": 330 - }, - { - "epoch": 1.7891891891891891, - "grad_norm": 2.3542068004608154, - "learning_rate": 4.617616782489878e-06, - "loss": 0.6887, - "step": 331 - }, - { - "epoch": 1.7945945945945945, - "grad_norm": 2.2049715518951416, - "learning_rate": 4.615357224208477e-06, - "loss": 0.505, - "step": 332 - }, - { - "epoch": 1.8, - "grad_norm": 2.453920364379883, - "learning_rate": 4.613091565785674e-06, - "loss": 0.8384, - "step": 333 - }, - { - "epoch": 1.8054054054054054, - "grad_norm": 2.5751583576202393, - "learning_rate": 4.610819813755038e-06, - "loss": 0.5512, - "step": 334 - }, - { - "epoch": 1.810810810810811, - "grad_norm": 2.524075984954834, - "learning_rate": 4.608541974667714e-06, - "loss": 0.4877, - "step": 335 - }, - { - "epoch": 1.8162162162162163, - "grad_norm": 2.2856955528259277, - "learning_rate": 4.606258055092397e-06, - "loss": 0.5583, - "step": 336 - }, - { - "epoch": 1.8216216216216217, - "grad_norm": 2.2773683071136475, - "learning_rate": 4.603968061615321e-06, - "loss": 0.5421, - "step": 337 - }, - { - "epoch": 1.827027027027027, - "grad_norm": 4.085512161254883, - "learning_rate": 4.601672000840231e-06, - "loss": 0.942, - "step": 338 - }, - { - "epoch": 1.8324324324324324, - "grad_norm": 2.3710968494415283, - "learning_rate": 4.5993698793883715e-06, - "loss": 0.3773, - "step": 339 - }, - { - "epoch": 1.8378378378378377, - "grad_norm": 2.745534658432007, - "learning_rate": 4.597061703898462e-06, - "loss": 0.9694, - "step": 340 - }, - { - "epoch": 1.8432432432432433, - "grad_norm": 2.463207244873047, - "learning_rate": 4.594747481026685e-06, - "loss": 0.4667, - "step": 341 - }, - { - "epoch": 1.8486486486486486, - "grad_norm": 2.7216601371765137, - "learning_rate": 4.592427217446656e-06, - "loss": 0.4267, - "step": 342 - }, - { - "epoch": 1.8540540540540542, - "grad_norm": 2.545664072036743, - "learning_rate": 4.590100919849413e-06, - "loss": 0.9245, - "step": 343 - }, - { - "epoch": 1.8594594594594596, - "grad_norm": 3.692840337753296, - "learning_rate": 4.587768594943396e-06, - "loss": 0.7502, - "step": 344 - }, - { - "epoch": 1.864864864864865, - "grad_norm": 2.993229627609253, - "learning_rate": 4.585430249454426e-06, - "loss": 0.4689, - "step": 345 - }, - { - "epoch": 1.8702702702702703, - "grad_norm": 2.162867546081543, - "learning_rate": 4.583085890125682e-06, - "loss": 0.6188, - "step": 346 - }, - { - "epoch": 1.8756756756756756, - "grad_norm": 2.2169792652130127, - "learning_rate": 4.5807355237176896e-06, - "loss": 0.6352, - "step": 347 - }, - { - "epoch": 1.881081081081081, - "grad_norm": 3.978985548019409, - "learning_rate": 4.578379157008296e-06, - "loss": 0.464, - "step": 348 - }, - { - "epoch": 1.8864864864864865, - "grad_norm": 2.236682653427124, - "learning_rate": 4.57601679679265e-06, - "loss": 0.5943, - "step": 349 - }, - { - "epoch": 1.8918918918918919, - "grad_norm": 2.528754472732544, - "learning_rate": 4.573648449883188e-06, - "loss": 0.6949, - "step": 350 - }, - { - "epoch": 1.8972972972972975, - "grad_norm": 2.7673721313476562, - "learning_rate": 4.571274123109606e-06, - "loss": 0.4333, - "step": 351 - }, - { - "epoch": 1.9027027027027028, - "grad_norm": 2.698012351989746, - "learning_rate": 4.568893823318847e-06, - "loss": 0.6796, - "step": 352 - }, - { - "epoch": 1.9081081081081082, - "grad_norm": 2.9640560150146484, - "learning_rate": 4.566507557375077e-06, - "loss": 0.6139, - "step": 353 - }, - { - "epoch": 1.9135135135135135, - "grad_norm": 2.417628526687622, - "learning_rate": 4.5641153321596684e-06, - "loss": 0.4515, - "step": 354 - }, - { - "epoch": 1.9189189189189189, - "grad_norm": 2.676739454269409, - "learning_rate": 4.56171715457118e-06, - "loss": 0.8426, - "step": 355 - }, - { - "epoch": 1.9243243243243242, - "grad_norm": 2.8428189754486084, - "learning_rate": 4.559313031525331e-06, - "loss": 0.5806, - "step": 356 - }, - { - "epoch": 1.9297297297297298, - "grad_norm": 2.6817944049835205, - "learning_rate": 4.55690296995499e-06, - "loss": 0.5927, - "step": 357 - }, - { - "epoch": 1.9351351351351351, - "grad_norm": 3.5939931869506836, - "learning_rate": 4.554486976810149e-06, - "loss": 0.9986, - "step": 358 - }, - { - "epoch": 1.9405405405405407, - "grad_norm": 2.86688494682312, - "learning_rate": 4.552065059057906e-06, - "loss": 0.6813, - "step": 359 - }, - { - "epoch": 1.945945945945946, - "grad_norm": 2.9295246601104736, - "learning_rate": 4.549637223682441e-06, - "loss": 1.0832, - "step": 360 - }, - { - "epoch": 1.9513513513513514, - "grad_norm": 2.6939451694488525, - "learning_rate": 4.547203477685005e-06, - "loss": 0.7377, - "step": 361 - }, - { - "epoch": 1.9567567567567568, - "grad_norm": 2.226055145263672, - "learning_rate": 4.544763828083888e-06, - "loss": 0.5412, - "step": 362 - }, - { - "epoch": 1.962162162162162, - "grad_norm": 2.490187406539917, - "learning_rate": 4.542318281914405e-06, - "loss": 0.6955, - "step": 363 - }, - { - "epoch": 1.9675675675675675, - "grad_norm": 2.9241302013397217, - "learning_rate": 4.53986684622888e-06, - "loss": 0.6774, - "step": 364 - }, - { - "epoch": 1.972972972972973, - "grad_norm": 2.988084554672241, - "learning_rate": 4.537409528096615e-06, - "loss": 0.5832, - "step": 365 - }, - { - "epoch": 1.9783783783783784, - "grad_norm": 2.9380626678466797, - "learning_rate": 4.534946334603879e-06, - "loss": 0.606, - "step": 366 - }, - { - "epoch": 1.983783783783784, - "grad_norm": 2.667588710784912, - "learning_rate": 4.532477272853882e-06, - "loss": 0.4991, - "step": 367 - }, - { - "epoch": 1.9891891891891893, - "grad_norm": 2.9711899757385254, - "learning_rate": 4.530002349966759e-06, - "loss": 0.4442, - "step": 368 - }, - { - "epoch": 1.9945945945945946, - "grad_norm": 3.443957805633545, - "learning_rate": 4.5275215730795445e-06, - "loss": 0.6566, - "step": 369 - }, - { - "epoch": 2.0, - "grad_norm": 3.590317487716675, - "learning_rate": 4.525034949346156e-06, - "loss": 0.5687, - "step": 370 - }, - { - "epoch": 2.0054054054054054, - "grad_norm": 3.678600549697876, - "learning_rate": 4.522542485937369e-06, - "loss": 0.4458, - "step": 371 - }, - { - "epoch": 2.0108108108108107, - "grad_norm": 3.803563356399536, - "learning_rate": 4.5200441900408045e-06, - "loss": 0.4418, - "step": 372 - }, - { - "epoch": 2.016216216216216, - "grad_norm": 2.9187233448028564, - "learning_rate": 4.517540068860898e-06, - "loss": 0.7057, - "step": 373 - }, - { - "epoch": 2.0216216216216214, - "grad_norm": 2.693603515625, - "learning_rate": 4.515030129618884e-06, - "loss": 0.4491, - "step": 374 - }, - { - "epoch": 2.027027027027027, - "grad_norm": 2.3883047103881836, - "learning_rate": 4.512514379552779e-06, - "loss": 0.3571, - "step": 375 - }, - { - "epoch": 2.0324324324324325, - "grad_norm": 4.558557033538818, - "learning_rate": 4.509992825917352e-06, - "loss": 0.5056, - "step": 376 - }, - { - "epoch": 2.037837837837838, - "grad_norm": 3.9574761390686035, - "learning_rate": 4.507465475984109e-06, - "loss": 0.6834, - "step": 377 - }, - { - "epoch": 2.0432432432432432, - "grad_norm": 5.34630012512207, - "learning_rate": 4.504932337041272e-06, - "loss": 0.6726, - "step": 378 - }, - { - "epoch": 2.0486486486486486, - "grad_norm": 3.198740243911743, - "learning_rate": 4.502393416393757e-06, - "loss": 0.4032, - "step": 379 - }, - { - "epoch": 2.054054054054054, - "grad_norm": 3.347480297088623, - "learning_rate": 4.4998487213631515e-06, - "loss": 0.5442, - "step": 380 - }, - { - "epoch": 2.0594594594594593, - "grad_norm": 3.940531015396118, - "learning_rate": 4.497298259287696e-06, - "loss": 0.6181, - "step": 381 - }, - { - "epoch": 2.064864864864865, - "grad_norm": 3.0910496711730957, - "learning_rate": 4.494742037522261e-06, - "loss": 0.3829, - "step": 382 - }, - { - "epoch": 2.0702702702702704, - "grad_norm": 4.060451984405518, - "learning_rate": 4.4921800634383295e-06, - "loss": 0.4953, - "step": 383 - }, - { - "epoch": 2.075675675675676, - "grad_norm": 3.1667511463165283, - "learning_rate": 4.4896123444239655e-06, - "loss": 0.3254, - "step": 384 - }, - { - "epoch": 2.081081081081081, - "grad_norm": 3.0239670276641846, - "learning_rate": 4.487038887883809e-06, - "loss": 0.555, - "step": 385 - }, - { - "epoch": 2.0864864864864865, - "grad_norm": 2.8815383911132812, - "learning_rate": 4.484459701239038e-06, - "loss": 0.665, - "step": 386 - }, - { - "epoch": 2.091891891891892, - "grad_norm": 3.615537166595459, - "learning_rate": 4.481874791927358e-06, - "loss": 0.2652, - "step": 387 - }, - { - "epoch": 2.097297297297297, - "grad_norm": 3.407407283782959, - "learning_rate": 4.479284167402977e-06, - "loss": 0.3811, - "step": 388 - }, - { - "epoch": 2.1027027027027025, - "grad_norm": 2.6651623249053955, - "learning_rate": 4.476687835136585e-06, - "loss": 0.2463, - "step": 389 - }, - { - "epoch": 2.108108108108108, - "grad_norm": 3.5145862102508545, - "learning_rate": 4.47408580261533e-06, - "loss": 0.5507, - "step": 390 - }, - { - "epoch": 2.1135135135135137, - "grad_norm": 3.0952725410461426, - "learning_rate": 4.471478077342798e-06, - "loss": 0.288, - "step": 391 - }, - { - "epoch": 2.118918918918919, - "grad_norm": 2.634775400161743, - "learning_rate": 4.468864666838994e-06, - "loss": 0.5169, - "step": 392 - }, - { - "epoch": 2.1243243243243244, - "grad_norm": 3.7388594150543213, - "learning_rate": 4.4662455786403125e-06, - "loss": 0.3327, - "step": 393 - }, - { - "epoch": 2.1297297297297297, - "grad_norm": 3.8197360038757324, - "learning_rate": 4.463620820299528e-06, - "loss": 0.3877, - "step": 394 - }, - { - "epoch": 2.135135135135135, - "grad_norm": 3.0073485374450684, - "learning_rate": 4.4609903993857606e-06, - "loss": 0.5425, - "step": 395 - }, - { - "epoch": 2.1405405405405404, - "grad_norm": 2.6923868656158447, - "learning_rate": 4.458354323484462e-06, - "loss": 0.5257, - "step": 396 - }, - { - "epoch": 2.145945945945946, - "grad_norm": 3.2151331901550293, - "learning_rate": 4.45571260019739e-06, - "loss": 0.3914, - "step": 397 - }, - { - "epoch": 2.1513513513513516, - "grad_norm": 3.4031248092651367, - "learning_rate": 4.453065237142592e-06, - "loss": 0.3455, - "step": 398 - }, - { - "epoch": 2.156756756756757, - "grad_norm": 3.012275457382202, - "learning_rate": 4.4504122419543745e-06, - "loss": 0.4652, - "step": 399 - }, - { - "epoch": 2.1621621621621623, - "grad_norm": 3.3084208965301514, - "learning_rate": 4.4477536222832865e-06, - "loss": 0.6343, - "step": 400 - }, - { - "epoch": 2.1675675675675676, - "grad_norm": 3.115206241607666, - "learning_rate": 4.445089385796099e-06, - "loss": 0.6975, - "step": 401 - }, - { - "epoch": 2.172972972972973, - "grad_norm": 2.893930435180664, - "learning_rate": 4.442419540175778e-06, - "loss": 0.5779, - "step": 402 - }, - { - "epoch": 2.1783783783783783, - "grad_norm": 3.0549168586730957, - "learning_rate": 4.439744093121465e-06, - "loss": 0.4541, - "step": 403 - }, - { - "epoch": 2.1837837837837837, - "grad_norm": 3.1189024448394775, - "learning_rate": 4.437063052348457e-06, - "loss": 0.4078, - "step": 404 - }, - { - "epoch": 2.189189189189189, - "grad_norm": 6.644659042358398, - "learning_rate": 4.434376425588179e-06, - "loss": 0.6759, - "step": 405 - }, - { - "epoch": 2.1945945945945944, - "grad_norm": 2.807554006576538, - "learning_rate": 4.431684220588163e-06, - "loss": 0.2938, - "step": 406 - }, - { - "epoch": 2.2, - "grad_norm": 3.6900999546051025, - "learning_rate": 4.428986445112034e-06, - "loss": 0.676, - "step": 407 - }, - { - "epoch": 2.2054054054054055, - "grad_norm": 2.0721664428710938, - "learning_rate": 4.426283106939474e-06, - "loss": 0.1859, - "step": 408 - }, - { - "epoch": 2.210810810810811, - "grad_norm": 2.953388214111328, - "learning_rate": 4.423574213866209e-06, - "loss": 0.2955, - "step": 409 - }, - { - "epoch": 2.2162162162162162, - "grad_norm": 3.049050807952881, - "learning_rate": 4.420859773703985e-06, - "loss": 0.2262, - "step": 410 - }, - { - "epoch": 2.2216216216216216, - "grad_norm": 3.319796323776245, - "learning_rate": 4.418139794280542e-06, - "loss": 0.2273, - "step": 411 - }, - { - "epoch": 2.227027027027027, - "grad_norm": 2.4133522510528564, - "learning_rate": 4.415414283439595e-06, - "loss": 0.3282, - "step": 412 - }, - { - "epoch": 2.2324324324324323, - "grad_norm": 2.9842193126678467, - "learning_rate": 4.4126832490408116e-06, - "loss": 0.3651, - "step": 413 - }, - { - "epoch": 2.237837837837838, - "grad_norm": 2.759531259536743, - "learning_rate": 4.409946698959784e-06, - "loss": 0.4052, - "step": 414 - }, - { - "epoch": 2.2432432432432434, - "grad_norm": 3.045485019683838, - "learning_rate": 4.4072046410880145e-06, - "loss": 0.4638, - "step": 415 - }, - { - "epoch": 2.2486486486486488, - "grad_norm": 3.0058295726776123, - "learning_rate": 4.404457083332887e-06, - "loss": 0.517, - "step": 416 - }, - { - "epoch": 2.254054054054054, - "grad_norm": 3.025688409805298, - "learning_rate": 4.401704033617643e-06, - "loss": 0.6902, - "step": 417 - }, - { - "epoch": 2.2594594594594595, - "grad_norm": 3.3047802448272705, - "learning_rate": 4.398945499881366e-06, - "loss": 0.3552, - "step": 418 - }, - { - "epoch": 2.264864864864865, - "grad_norm": 3.0683655738830566, - "learning_rate": 4.396181490078949e-06, - "loss": 0.286, - "step": 419 - }, - { - "epoch": 2.27027027027027, - "grad_norm": 3.627681016921997, - "learning_rate": 4.393412012181082e-06, - "loss": 0.4036, - "step": 420 - }, - { - "epoch": 2.2756756756756755, - "grad_norm": 4.552238941192627, - "learning_rate": 4.390637074174219e-06, - "loss": 0.8037, - "step": 421 - }, - { - "epoch": 2.281081081081081, - "grad_norm": 2.8688855171203613, - "learning_rate": 4.387856684060561e-06, - "loss": 0.2553, - "step": 422 - }, - { - "epoch": 2.2864864864864867, - "grad_norm": 4.21850061416626, - "learning_rate": 4.385070849858033e-06, - "loss": 0.6222, - "step": 423 - }, - { - "epoch": 2.291891891891892, - "grad_norm": 3.038433790206909, - "learning_rate": 4.382279579600257e-06, - "loss": 0.5326, - "step": 424 - }, - { - "epoch": 2.2972972972972974, - "grad_norm": 3.297300338745117, - "learning_rate": 4.379482881336532e-06, - "loss": 0.5515, - "step": 425 - }, - { - "epoch": 2.3027027027027027, - "grad_norm": 7.162952423095703, - "learning_rate": 4.376680763131811e-06, - "loss": 0.6948, - "step": 426 - }, - { - "epoch": 2.308108108108108, - "grad_norm": 3.2403595447540283, - "learning_rate": 4.373873233066676e-06, - "loss": 0.2947, - "step": 427 - }, - { - "epoch": 2.3135135135135134, - "grad_norm": 3.2969906330108643, - "learning_rate": 4.371060299237315e-06, - "loss": 0.2261, - "step": 428 - }, - { - "epoch": 2.3189189189189188, - "grad_norm": 2.669058322906494, - "learning_rate": 4.368241969755499e-06, - "loss": 0.5398, - "step": 429 - }, - { - "epoch": 2.3243243243243246, - "grad_norm": 2.7643518447875977, - "learning_rate": 4.36541825274856e-06, - "loss": 0.3301, - "step": 430 - }, - { - "epoch": 2.32972972972973, - "grad_norm": 3.6037657260894775, - "learning_rate": 4.3625891563593635e-06, - "loss": 0.6064, - "step": 431 - }, - { - "epoch": 2.3351351351351353, - "grad_norm": 2.8805618286132812, - "learning_rate": 4.35975468874629e-06, - "loss": 0.3897, - "step": 432 - }, - { - "epoch": 2.3405405405405406, - "grad_norm": 2.642402172088623, - "learning_rate": 4.356914858083211e-06, - "loss": 0.271, - "step": 433 - }, - { - "epoch": 2.345945945945946, - "grad_norm": 2.916337490081787, - "learning_rate": 4.354069672559458e-06, - "loss": 0.3681, - "step": 434 - }, - { - "epoch": 2.3513513513513513, - "grad_norm": 3.3312325477600098, - "learning_rate": 4.35121914037981e-06, - "loss": 0.298, - "step": 435 - }, - { - "epoch": 2.3567567567567567, - "grad_norm": 2.980583906173706, - "learning_rate": 4.348363269764462e-06, - "loss": 0.3618, - "step": 436 - }, - { - "epoch": 2.362162162162162, - "grad_norm": 3.5010197162628174, - "learning_rate": 4.345502068949003e-06, - "loss": 0.8972, - "step": 437 - }, - { - "epoch": 2.3675675675675674, - "grad_norm": 2.7187814712524414, - "learning_rate": 4.342635546184394e-06, - "loss": 0.3939, - "step": 438 - }, - { - "epoch": 2.372972972972973, - "grad_norm": 2.8368170261383057, - "learning_rate": 4.339763709736944e-06, - "loss": 0.5462, - "step": 439 - }, - { - "epoch": 2.3783783783783785, - "grad_norm": 2.6989636421203613, - "learning_rate": 4.336886567888283e-06, - "loss": 0.5932, - "step": 440 - }, - { - "epoch": 2.383783783783784, - "grad_norm": 3.2514829635620117, - "learning_rate": 4.334004128935342e-06, - "loss": 0.4622, - "step": 441 - }, - { - "epoch": 2.389189189189189, - "grad_norm": 5.242766857147217, - "learning_rate": 4.331116401190327e-06, - "loss": 0.5997, - "step": 442 - }, - { - "epoch": 2.3945945945945946, - "grad_norm": 3.492724657058716, - "learning_rate": 4.328223392980696e-06, - "loss": 0.3072, - "step": 443 - }, - { - "epoch": 2.4, - "grad_norm": 4.074132442474365, - "learning_rate": 4.325325112649134e-06, - "loss": 0.5338, - "step": 444 - }, - { - "epoch": 2.4054054054054053, - "grad_norm": 2.7208468914031982, - "learning_rate": 4.322421568553529e-06, - "loss": 0.3266, - "step": 445 - }, - { - "epoch": 2.410810810810811, - "grad_norm": 2.929180383682251, - "learning_rate": 4.3195127690669494e-06, - "loss": 0.4064, - "step": 446 - }, - { - "epoch": 2.4162162162162164, - "grad_norm": 2.848353624343872, - "learning_rate": 4.3165987225776186e-06, - "loss": 0.3856, - "step": 447 - }, - { - "epoch": 2.4216216216216218, - "grad_norm": 3.946488618850708, - "learning_rate": 4.313679437488889e-06, - "loss": 0.4261, - "step": 448 - }, - { - "epoch": 2.427027027027027, - "grad_norm": 5.781888961791992, - "learning_rate": 4.310754922219223e-06, - "loss": 0.4943, - "step": 449 - }, - { - "epoch": 2.4324324324324325, - "grad_norm": 2.8406941890716553, - "learning_rate": 4.307825185202164e-06, - "loss": 0.2874, - "step": 450 - }, - { - "epoch": 2.437837837837838, - "grad_norm": 3.2017335891723633, - "learning_rate": 4.3048902348863116e-06, - "loss": 0.4218, - "step": 451 - }, - { - "epoch": 2.443243243243243, - "grad_norm": 3.8355906009674072, - "learning_rate": 4.301950079735303e-06, - "loss": 0.4204, - "step": 452 - }, - { - "epoch": 2.4486486486486485, - "grad_norm": 4.783357620239258, - "learning_rate": 4.299004728227782e-06, - "loss": 0.5593, - "step": 453 - }, - { - "epoch": 2.454054054054054, - "grad_norm": 3.014080762863159, - "learning_rate": 4.2960541888573774e-06, - "loss": 0.4187, - "step": 454 - }, - { - "epoch": 2.4594594594594597, - "grad_norm": 3.5906598567962646, - "learning_rate": 4.29309847013268e-06, - "loss": 0.4193, - "step": 455 - }, - { - "epoch": 2.464864864864865, - "grad_norm": 3.9043331146240234, - "learning_rate": 4.290137580577216e-06, - "loss": 0.7035, - "step": 456 - }, - { - "epoch": 2.4702702702702704, - "grad_norm": 3.139753580093384, - "learning_rate": 4.287171528729423e-06, - "loss": 0.5877, - "step": 457 - }, - { - "epoch": 2.4756756756756757, - "grad_norm": 2.9091074466705322, - "learning_rate": 4.284200323142623e-06, - "loss": 0.5309, - "step": 458 - }, - { - "epoch": 2.481081081081081, - "grad_norm": 3.1253795623779297, - "learning_rate": 4.281223972385004e-06, - "loss": 0.448, - "step": 459 - }, - { - "epoch": 2.4864864864864864, - "grad_norm": 2.65510892868042, - "learning_rate": 4.27824248503959e-06, - "loss": 0.4453, - "step": 460 - }, - { - "epoch": 2.4918918918918918, - "grad_norm": 3.2135510444641113, - "learning_rate": 4.275255869704214e-06, - "loss": 0.5582, - "step": 461 - }, - { - "epoch": 2.4972972972972975, - "grad_norm": 2.452545404434204, - "learning_rate": 4.272264134991503e-06, - "loss": 0.423, - "step": 462 - }, - { - "epoch": 2.5027027027027025, - "grad_norm": 2.6370208263397217, - "learning_rate": 4.269267289528843e-06, - "loss": 0.271, - "step": 463 - }, - { - "epoch": 2.5081081081081082, - "grad_norm": 3.31266450881958, - "learning_rate": 4.266265341958356e-06, - "loss": 0.6459, - "step": 464 - }, - { - "epoch": 2.5135135135135136, - "grad_norm": 3.2743148803710938, - "learning_rate": 4.263258300936882e-06, - "loss": 0.2959, - "step": 465 - }, - { - "epoch": 2.518918918918919, - "grad_norm": 2.883549690246582, - "learning_rate": 4.260246175135948e-06, - "loss": 0.3418, - "step": 466 - }, - { - "epoch": 2.5243243243243243, - "grad_norm": 2.7019498348236084, - "learning_rate": 4.257228973241742e-06, - "loss": 0.3459, - "step": 467 - }, - { - "epoch": 2.5297297297297296, - "grad_norm": 3.8166959285736084, - "learning_rate": 4.254206703955092e-06, - "loss": 0.4769, - "step": 468 - }, - { - "epoch": 2.535135135135135, - "grad_norm": 3.264763593673706, - "learning_rate": 4.251179375991438e-06, - "loss": 0.6487, - "step": 469 - }, - { - "epoch": 2.5405405405405403, - "grad_norm": 2.7936933040618896, - "learning_rate": 4.248146998080808e-06, - "loss": 0.5547, - "step": 470 - }, - { - "epoch": 2.545945945945946, - "grad_norm": 3.21852707862854, - "learning_rate": 4.2451095789677945e-06, - "loss": 0.2965, - "step": 471 - }, - { - "epoch": 2.5513513513513515, - "grad_norm": 3.4528985023498535, - "learning_rate": 4.242067127411525e-06, - "loss": 0.3831, - "step": 472 - }, - { - "epoch": 2.556756756756757, - "grad_norm": 4.317023754119873, - "learning_rate": 4.239019652185642e-06, - "loss": 0.1756, - "step": 473 - }, - { - "epoch": 2.562162162162162, - "grad_norm": 3.677452325820923, - "learning_rate": 4.2359671620782725e-06, - "loss": 0.5136, - "step": 474 - }, - { - "epoch": 2.5675675675675675, - "grad_norm": 3.7563393115997314, - "learning_rate": 4.232909665892005e-06, - "loss": 0.6554, - "step": 475 - }, - { - "epoch": 2.572972972972973, - "grad_norm": 3.5125508308410645, - "learning_rate": 4.229847172443866e-06, - "loss": 0.3804, - "step": 476 - }, - { - "epoch": 2.5783783783783782, - "grad_norm": 2.8835806846618652, - "learning_rate": 4.2267796905652926e-06, - "loss": 0.3338, - "step": 477 - }, - { - "epoch": 2.583783783783784, - "grad_norm": 3.2136261463165283, - "learning_rate": 4.223707229102105e-06, - "loss": 0.6163, - "step": 478 - }, - { - "epoch": 2.589189189189189, - "grad_norm": 3.467475175857544, - "learning_rate": 4.220629796914487e-06, - "loss": 0.3005, - "step": 479 - }, - { - "epoch": 2.5945945945945947, - "grad_norm": 3.597490072250366, - "learning_rate": 4.217547402876954e-06, - "loss": 0.56, - "step": 480 - }, - { - "epoch": 2.6, - "grad_norm": 3.2377140522003174, - "learning_rate": 4.214460055878329e-06, - "loss": 0.4512, - "step": 481 - }, - { - "epoch": 2.6054054054054054, - "grad_norm": 2.577746868133545, - "learning_rate": 4.211367764821722e-06, - "loss": 0.3074, - "step": 482 - }, - { - "epoch": 2.610810810810811, - "grad_norm": 3.6584155559539795, - "learning_rate": 4.208270538624497e-06, - "loss": 0.6752, - "step": 483 - }, - { - "epoch": 2.616216216216216, - "grad_norm": 2.602778434753418, - "learning_rate": 4.205168386218251e-06, - "loss": 0.2347, - "step": 484 - }, - { - "epoch": 2.6216216216216215, - "grad_norm": 3.587503433227539, - "learning_rate": 4.2020613165487865e-06, - "loss": 0.5189, - "step": 485 - }, - { - "epoch": 2.627027027027027, - "grad_norm": 3.9341986179351807, - "learning_rate": 4.198949338576086e-06, - "loss": 0.7739, - "step": 486 - }, - { - "epoch": 2.6324324324324326, - "grad_norm": 2.9211957454681396, - "learning_rate": 4.1958324612742875e-06, - "loss": 0.3495, - "step": 487 - }, - { - "epoch": 2.637837837837838, - "grad_norm": 3.29193115234375, - "learning_rate": 4.1927106936316564e-06, - "loss": 0.2257, - "step": 488 - }, - { - "epoch": 2.6432432432432433, - "grad_norm": 3.3687057495117188, - "learning_rate": 4.189584044650559e-06, - "loss": 0.6708, - "step": 489 - }, - { - "epoch": 2.6486486486486487, - "grad_norm": 3.096428155899048, - "learning_rate": 4.186452523347441e-06, - "loss": 0.3126, - "step": 490 - }, - { - "epoch": 2.654054054054054, - "grad_norm": 3.0865559577941895, - "learning_rate": 4.183316138752799e-06, - "loss": 0.4219, - "step": 491 - }, - { - "epoch": 2.6594594594594594, - "grad_norm": 3.389827013015747, - "learning_rate": 4.180174899911149e-06, - "loss": 0.3937, - "step": 492 - }, - { - "epoch": 2.6648648648648647, - "grad_norm": 3.044360637664795, - "learning_rate": 4.177028815881012e-06, - "loss": 0.4098, - "step": 493 - }, - { - "epoch": 2.6702702702702705, - "grad_norm": 2.813094139099121, - "learning_rate": 4.173877895734875e-06, - "loss": 0.3597, - "step": 494 - }, - { - "epoch": 2.6756756756756754, - "grad_norm": 2.4037158489227295, - "learning_rate": 4.1707221485591764e-06, - "loss": 0.3284, - "step": 495 - }, - { - "epoch": 2.6810810810810812, - "grad_norm": 3.049436092376709, - "learning_rate": 4.167561583454272e-06, - "loss": 0.257, - "step": 496 - }, - { - "epoch": 2.6864864864864866, - "grad_norm": 3.458923816680908, - "learning_rate": 4.164396209534411e-06, - "loss": 0.1819, - "step": 497 - }, - { - "epoch": 2.691891891891892, - "grad_norm": 3.3084232807159424, - "learning_rate": 4.161226035927711e-06, - "loss": 0.7109, - "step": 498 - }, - { - "epoch": 2.6972972972972973, - "grad_norm": 3.034550189971924, - "learning_rate": 4.15805107177613e-06, - "loss": 0.6297, - "step": 499 - }, - { - "epoch": 2.7027027027027026, - "grad_norm": 3.5786449909210205, - "learning_rate": 4.15487132623544e-06, - "loss": 0.5195, - "step": 500 - }, - { - "epoch": 2.708108108108108, - "grad_norm": 3.4477646350860596, - "learning_rate": 4.151686808475204e-06, - "loss": 0.2528, - "step": 501 - }, - { - "epoch": 2.7135135135135133, - "grad_norm": 3.0256869792938232, - "learning_rate": 4.148497527678744e-06, - "loss": 0.5013, - "step": 502 - }, - { - "epoch": 2.718918918918919, - "grad_norm": 2.875121593475342, - "learning_rate": 4.145303493043118e-06, - "loss": 0.4109, - "step": 503 - }, - { - "epoch": 2.7243243243243245, - "grad_norm": 2.7204222679138184, - "learning_rate": 4.1421047137790935e-06, - "loss": 0.3197, - "step": 504 - }, - { - "epoch": 2.72972972972973, - "grad_norm": 3.350482702255249, - "learning_rate": 4.13890119911112e-06, - "loss": 0.6369, - "step": 505 - }, - { - "epoch": 2.735135135135135, - "grad_norm": 3.096774101257324, - "learning_rate": 4.135692958277303e-06, - "loss": 0.4581, - "step": 506 - }, - { - "epoch": 2.7405405405405405, - "grad_norm": 2.8896536827087402, - "learning_rate": 4.132480000529375e-06, - "loss": 0.6217, - "step": 507 - }, - { - "epoch": 2.745945945945946, - "grad_norm": 2.643932580947876, - "learning_rate": 4.129262335132676e-06, - "loss": 0.4951, - "step": 508 - }, - { - "epoch": 2.7513513513513512, - "grad_norm": 2.6077864170074463, - "learning_rate": 4.126039971366114e-06, - "loss": 0.2185, - "step": 509 - }, - { - "epoch": 2.756756756756757, - "grad_norm": 2.531507968902588, - "learning_rate": 4.122812918522154e-06, - "loss": 0.5428, - "step": 510 - }, - { - "epoch": 2.762162162162162, - "grad_norm": 4.125836372375488, - "learning_rate": 4.119581185906776e-06, - "loss": 0.5466, - "step": 511 - }, - { - "epoch": 2.7675675675675677, - "grad_norm": 2.9921016693115234, - "learning_rate": 4.1163447828394595e-06, - "loss": 0.3803, - "step": 512 - }, - { - "epoch": 2.772972972972973, - "grad_norm": 2.9517931938171387, - "learning_rate": 4.113103718653152e-06, - "loss": 0.2722, - "step": 513 - }, - { - "epoch": 2.7783783783783784, - "grad_norm": 2.8333382606506348, - "learning_rate": 4.10985800269424e-06, - "loss": 0.333, - "step": 514 - }, - { - "epoch": 2.7837837837837838, - "grad_norm": 2.94168758392334, - "learning_rate": 4.106607644322529e-06, - "loss": 0.2186, - "step": 515 - }, - { - "epoch": 2.789189189189189, - "grad_norm": 3.2743892669677734, - "learning_rate": 4.103352652911207e-06, - "loss": 0.6365, - "step": 516 - }, - { - "epoch": 2.7945945945945945, - "grad_norm": 4.692770004272461, - "learning_rate": 4.100093037846825e-06, - "loss": 0.7261, - "step": 517 - }, - { - "epoch": 2.8, - "grad_norm": 3.2157247066497803, - "learning_rate": 4.0968288085292675e-06, - "loss": 0.2767, - "step": 518 - }, - { - "epoch": 2.8054054054054056, - "grad_norm": 3.196887731552124, - "learning_rate": 4.093559974371725e-06, - "loss": 0.4743, - "step": 519 - }, - { - "epoch": 2.810810810810811, - "grad_norm": 2.406752586364746, - "learning_rate": 4.090286544800667e-06, - "loss": 0.3789, - "step": 520 - }, - { - "epoch": 2.8162162162162163, - "grad_norm": 3.1769447326660156, - "learning_rate": 4.087008529255815e-06, - "loss": 0.6252, - "step": 521 - }, - { - "epoch": 2.8216216216216217, - "grad_norm": 3.068370819091797, - "learning_rate": 4.083725937190115e-06, - "loss": 0.3467, - "step": 522 - }, - { - "epoch": 2.827027027027027, - "grad_norm": 3.2665855884552, - "learning_rate": 4.0804387780697114e-06, - "loss": 0.3857, - "step": 523 - }, - { - "epoch": 2.8324324324324324, - "grad_norm": 3.368759870529175, - "learning_rate": 4.077147061373918e-06, - "loss": 0.4679, - "step": 524 - }, - { - "epoch": 2.8378378378378377, - "grad_norm": 3.989163875579834, - "learning_rate": 4.073850796595192e-06, - "loss": 0.2439, - "step": 525 - }, - { - "epoch": 2.8432432432432435, - "grad_norm": 3.6244685649871826, - "learning_rate": 4.070549993239106e-06, - "loss": 0.435, - "step": 526 - }, - { - "epoch": 2.8486486486486484, - "grad_norm": 3.585151195526123, - "learning_rate": 4.06724466082432e-06, - "loss": 0.5022, - "step": 527 - }, - { - "epoch": 2.854054054054054, - "grad_norm": 3.2420976161956787, - "learning_rate": 4.063934808882555e-06, - "loss": 0.4282, - "step": 528 - }, - { - "epoch": 2.8594594594594596, - "grad_norm": 3.1674294471740723, - "learning_rate": 4.0606204469585656e-06, - "loss": 0.3436, - "step": 529 - }, - { - "epoch": 2.864864864864865, - "grad_norm": 2.6856706142425537, - "learning_rate": 4.057301584610112e-06, - "loss": 0.3889, - "step": 530 - }, - { - "epoch": 2.8702702702702703, - "grad_norm": 3.0438942909240723, - "learning_rate": 4.053978231407931e-06, - "loss": 0.4828, - "step": 531 - }, - { - "epoch": 2.8756756756756756, - "grad_norm": 3.3561246395111084, - "learning_rate": 4.0506503969357115e-06, - "loss": 0.5814, - "step": 532 - }, - { - "epoch": 2.881081081081081, - "grad_norm": 2.5318350791931152, - "learning_rate": 4.047318090790065e-06, - "loss": 0.4768, - "step": 533 - }, - { - "epoch": 2.8864864864864863, - "grad_norm": 2.587224006652832, - "learning_rate": 4.043981322580498e-06, - "loss": 0.4262, - "step": 534 - }, - { - "epoch": 2.891891891891892, - "grad_norm": 2.73926043510437, - "learning_rate": 4.040640101929384e-06, - "loss": 0.421, - "step": 535 - }, - { - "epoch": 2.8972972972972975, - "grad_norm": 3.53908371925354, - "learning_rate": 4.037294438471936e-06, - "loss": 0.4019, - "step": 536 - }, - { - "epoch": 2.902702702702703, - "grad_norm": 3.0980448722839355, - "learning_rate": 4.033944341856181e-06, - "loss": 0.4322, - "step": 537 - }, - { - "epoch": 2.908108108108108, - "grad_norm": 2.9265666007995605, - "learning_rate": 4.030589821742926e-06, - "loss": 0.3841, - "step": 538 - }, - { - "epoch": 2.9135135135135135, - "grad_norm": 3.4082043170928955, - "learning_rate": 4.0272308878057385e-06, - "loss": 0.7083, - "step": 539 - }, - { - "epoch": 2.918918918918919, - "grad_norm": 3.297515630722046, - "learning_rate": 4.023867549730912e-06, - "loss": 0.5688, - "step": 540 - }, - { - "epoch": 2.924324324324324, - "grad_norm": 3.0538225173950195, - "learning_rate": 4.020499817217441e-06, - "loss": 0.5979, - "step": 541 - }, - { - "epoch": 2.92972972972973, - "grad_norm": 3.1792757511138916, - "learning_rate": 4.017127699976992e-06, - "loss": 0.5034, - "step": 542 - }, - { - "epoch": 2.935135135135135, - "grad_norm": 3.1574482917785645, - "learning_rate": 4.013751207733877e-06, - "loss": 0.6656, - "step": 543 - }, - { - "epoch": 2.9405405405405407, - "grad_norm": 2.523123264312744, - "learning_rate": 4.010370350225023e-06, - "loss": 0.2789, - "step": 544 - }, - { - "epoch": 2.945945945945946, - "grad_norm": 3.1950793266296387, - "learning_rate": 4.006985137199945e-06, - "loss": 0.2163, - "step": 545 - }, - { - "epoch": 2.9513513513513514, - "grad_norm": 3.2089648246765137, - "learning_rate": 4.00359557842072e-06, - "loss": 0.4179, - "step": 546 - }, - { - "epoch": 2.9567567567567568, - "grad_norm": 3.852578639984131, - "learning_rate": 4.000201683661958e-06, - "loss": 0.4683, - "step": 547 - }, - { - "epoch": 2.962162162162162, - "grad_norm": 2.7612597942352295, - "learning_rate": 3.996803462710766e-06, - "loss": 0.3506, - "step": 548 - }, - { - "epoch": 2.9675675675675675, - "grad_norm": 4.811823844909668, - "learning_rate": 3.993400925366736e-06, - "loss": 0.6582, - "step": 549 - }, - { - "epoch": 2.972972972972973, - "grad_norm": 3.0135858058929443, - "learning_rate": 3.989994081441902e-06, - "loss": 0.504, - "step": 550 - }, - { - "epoch": 2.9783783783783786, - "grad_norm": 2.710277795791626, - "learning_rate": 3.986582940760717e-06, - "loss": 0.7362, - "step": 551 - }, - { - "epoch": 2.983783783783784, - "grad_norm": 3.175443649291992, - "learning_rate": 3.983167513160025e-06, - "loss": 0.4116, - "step": 552 - }, - { - "epoch": 2.9891891891891893, - "grad_norm": 3.101109743118286, - "learning_rate": 3.979747808489036e-06, - "loss": 0.2188, - "step": 553 - }, - { - "epoch": 2.9945945945945946, - "grad_norm": 3.2320079803466797, - "learning_rate": 3.976323836609289e-06, - "loss": 0.7558, - "step": 554 - }, - { - "epoch": 3.0, - "grad_norm": 3.6071934700012207, - "learning_rate": 3.9728956073946305e-06, - "loss": 0.6491, - "step": 555 - }, - { - "epoch": 3.0054054054054054, - "grad_norm": 3.1119353771209717, - "learning_rate": 3.969463130731183e-06, - "loss": 0.1625, - "step": 556 - }, - { - "epoch": 3.0108108108108107, - "grad_norm": 3.0440328121185303, - "learning_rate": 3.966026416517321e-06, - "loss": 0.311, - "step": 557 - }, - { - "epoch": 3.016216216216216, - "grad_norm": 4.069122791290283, - "learning_rate": 3.962585474663636e-06, - "loss": 0.5299, - "step": 558 - }, - { - "epoch": 3.0216216216216214, - "grad_norm": 2.878645896911621, - "learning_rate": 3.959140315092911e-06, - "loss": 0.2718, - "step": 559 - }, - { - "epoch": 3.027027027027027, - "grad_norm": 3.526695966720581, - "learning_rate": 3.955690947740092e-06, - "loss": 0.2954, - "step": 560 - }, - { - "epoch": 3.0324324324324325, - "grad_norm": 3.25087308883667, - "learning_rate": 3.95223738255226e-06, - "loss": 0.2388, - "step": 561 - }, - { - "epoch": 3.037837837837838, - "grad_norm": 3.5467700958251953, - "learning_rate": 3.9487796294886015e-06, - "loss": 0.2014, - "step": 562 - }, - { - "epoch": 3.0432432432432432, - "grad_norm": 4.397517681121826, - "learning_rate": 3.945317698520379e-06, - "loss": 0.2102, - "step": 563 - }, - { - "epoch": 3.0486486486486486, - "grad_norm": 3.7297182083129883, - "learning_rate": 3.941851599630903e-06, - "loss": 0.499, - "step": 564 - }, - { - "epoch": 3.054054054054054, - "grad_norm": 4.417158603668213, - "learning_rate": 3.938381342815503e-06, - "loss": 0.3392, - "step": 565 - }, - { - "epoch": 3.0594594594594593, - "grad_norm": 4.6037421226501465, - "learning_rate": 3.934906938081499e-06, - "loss": 0.1942, - "step": 566 - }, - { - "epoch": 3.064864864864865, - "grad_norm": 3.5600531101226807, - "learning_rate": 3.931428395448174e-06, - "loss": 0.1753, - "step": 567 - }, - { - "epoch": 3.0702702702702704, - "grad_norm": 2.868013381958008, - "learning_rate": 3.927945724946743e-06, - "loss": 0.2959, - "step": 568 - }, - { - "epoch": 3.075675675675676, - "grad_norm": 3.5543227195739746, - "learning_rate": 3.924458936620322e-06, - "loss": 0.4625, - "step": 569 - }, - { - "epoch": 3.081081081081081, - "grad_norm": 8.972922325134277, - "learning_rate": 3.920968040523904e-06, - "loss": 0.2571, - "step": 570 - }, - { - "epoch": 3.0864864864864865, - "grad_norm": 3.037388324737549, - "learning_rate": 3.917473046724329e-06, - "loss": 0.1438, - "step": 571 - }, - { - "epoch": 3.091891891891892, - "grad_norm": 3.3261702060699463, - "learning_rate": 3.9139739653002525e-06, - "loss": 0.3572, - "step": 572 - }, - { - "epoch": 3.097297297297297, - "grad_norm": 2.425293207168579, - "learning_rate": 3.910470806342117e-06, - "loss": 0.165, - "step": 573 - }, - { - "epoch": 3.1027027027027025, - "grad_norm": 3.5718603134155273, - "learning_rate": 3.9069635799521245e-06, - "loss": 0.3209, - "step": 574 - }, - { - "epoch": 3.108108108108108, - "grad_norm": 3.8211171627044678, - "learning_rate": 3.903452296244204e-06, - "loss": 0.1976, - "step": 575 - }, - { - "epoch": 3.1135135135135137, - "grad_norm": 5.944535255432129, - "learning_rate": 3.899936965343989e-06, - "loss": 0.6074, - "step": 576 - }, - { - "epoch": 3.118918918918919, - "grad_norm": 6.603860378265381, - "learning_rate": 3.89641759738878e-06, - "loss": 0.4051, - "step": 577 - }, - { - "epoch": 3.1243243243243244, - "grad_norm": 6.712981700897217, - "learning_rate": 3.892894202527523e-06, - "loss": 0.3787, - "step": 578 - }, - { - "epoch": 3.1297297297297297, - "grad_norm": 3.267186403274536, - "learning_rate": 3.8893667909207735e-06, - "loss": 0.0927, - "step": 579 - }, - { - "epoch": 3.135135135135135, - "grad_norm": 4.476837158203125, - "learning_rate": 3.88583537274067e-06, - "loss": 0.4706, - "step": 580 - }, - { - "epoch": 3.1405405405405404, - "grad_norm": 4.272335052490234, - "learning_rate": 3.8822999581709085e-06, - "loss": 0.3949, - "step": 581 - }, - { - "epoch": 3.145945945945946, - "grad_norm": 3.6685309410095215, - "learning_rate": 3.878760557406708e-06, - "loss": 0.1971, - "step": 582 - }, - { - "epoch": 3.1513513513513516, - "grad_norm": 3.9899449348449707, - "learning_rate": 3.875217180654779e-06, - "loss": 0.5156, - "step": 583 - }, - { - "epoch": 3.156756756756757, - "grad_norm": 3.866804361343384, - "learning_rate": 3.871669838133303e-06, - "loss": 0.3552, - "step": 584 - }, - { - "epoch": 3.1621621621621623, - "grad_norm": 3.565648317337036, - "learning_rate": 3.868118540071894e-06, - "loss": 0.4369, - "step": 585 - }, - { - "epoch": 3.1675675675675676, - "grad_norm": 3.5073986053466797, - "learning_rate": 3.8645632967115755e-06, - "loss": 0.3694, - "step": 586 - }, - { - "epoch": 3.172972972972973, - "grad_norm": 3.7636868953704834, - "learning_rate": 3.861004118304746e-06, - "loss": 0.3404, - "step": 587 - }, - { - "epoch": 3.1783783783783783, - "grad_norm": 2.940094232559204, - "learning_rate": 3.857441015115154e-06, - "loss": 0.3086, - "step": 588 - }, - { - "epoch": 3.1837837837837837, - "grad_norm": 3.727414608001709, - "learning_rate": 3.8538739974178635e-06, - "loss": 0.253, - "step": 589 - }, - { - "epoch": 3.189189189189189, - "grad_norm": 3.5140156745910645, - "learning_rate": 3.850303075499227e-06, - "loss": 0.2436, - "step": 590 - }, - { - "epoch": 3.1945945945945944, - "grad_norm": 3.545952558517456, - "learning_rate": 3.84672825965686e-06, - "loss": 0.328, - "step": 591 - }, - { - "epoch": 3.2, - "grad_norm": 3.534240484237671, - "learning_rate": 3.843149560199601e-06, - "loss": 0.2687, - "step": 592 - }, - { - "epoch": 3.2054054054054055, - "grad_norm": 2.8464927673339844, - "learning_rate": 3.839566987447492e-06, - "loss": 0.1417, - "step": 593 - }, - { - "epoch": 3.210810810810811, - "grad_norm": 4.138559818267822, - "learning_rate": 3.835980551731743e-06, - "loss": 0.2106, - "step": 594 - }, - { - "epoch": 3.2162162162162162, - "grad_norm": 2.917670249938965, - "learning_rate": 3.8323902633947045e-06, - "loss": 0.3154, - "step": 595 - }, - { - "epoch": 3.2216216216216216, - "grad_norm": 3.029660224914551, - "learning_rate": 3.828796132789835e-06, - "loss": 0.1218, - "step": 596 - }, - { - "epoch": 3.227027027027027, - "grad_norm": 3.2845771312713623, - "learning_rate": 3.825198170281677e-06, - "loss": 0.1336, - "step": 597 - }, - { - "epoch": 3.2324324324324323, - "grad_norm": 3.1375670433044434, - "learning_rate": 3.821596386245819e-06, - "loss": 0.2518, - "step": 598 - }, - { - "epoch": 3.237837837837838, - "grad_norm": 3.0021941661834717, - "learning_rate": 3.817990791068874e-06, - "loss": 0.2762, - "step": 599 - }, - { - "epoch": 3.2432432432432434, - "grad_norm": 4.141000747680664, - "learning_rate": 3.81438139514844e-06, - "loss": 0.2722, - "step": 600 - }, - { - "epoch": 3.2486486486486488, - "grad_norm": 3.9065279960632324, - "learning_rate": 3.8107682088930797e-06, - "loss": 0.3542, - "step": 601 - }, - { - "epoch": 3.254054054054054, - "grad_norm": 3.718417167663574, - "learning_rate": 3.807151242722286e-06, - "loss": 0.344, - "step": 602 - }, - { - "epoch": 3.2594594594594595, - "grad_norm": 4.013717174530029, - "learning_rate": 3.8035305070664484e-06, - "loss": 0.1625, - "step": 603 - }, - { - "epoch": 3.264864864864865, - "grad_norm": 3.348888397216797, - "learning_rate": 3.7999060123668318e-06, - "loss": 0.2925, - "step": 604 - }, - { - "epoch": 3.27027027027027, - "grad_norm": 3.496079206466675, - "learning_rate": 3.7962777690755364e-06, - "loss": 0.1523, - "step": 605 - }, - { - "epoch": 3.2756756756756755, - "grad_norm": 3.07607102394104, - "learning_rate": 3.792645787655476e-06, - "loss": 0.1674, - "step": 606 - }, - { - "epoch": 3.281081081081081, - "grad_norm": 3.4036154747009277, - "learning_rate": 3.7890100785803425e-06, - "loss": 0.2856, - "step": 607 - }, - { - "epoch": 3.2864864864864867, - "grad_norm": 6.092559337615967, - "learning_rate": 3.785370652334577e-06, - "loss": 0.1094, - "step": 608 - }, - { - "epoch": 3.291891891891892, - "grad_norm": 3.9322001934051514, - "learning_rate": 3.7817275194133403e-06, - "loss": 0.2611, - "step": 609 - }, - { - "epoch": 3.2972972972972974, - "grad_norm": 3.189563274383545, - "learning_rate": 3.778080690322483e-06, - "loss": 0.1315, - "step": 610 - }, - { - "epoch": 3.3027027027027027, - "grad_norm": 4.304934024810791, - "learning_rate": 3.774430175578514e-06, - "loss": 0.1686, - "step": 611 - }, - { - "epoch": 3.308108108108108, - "grad_norm": 2.9030067920684814, - "learning_rate": 3.7707759857085706e-06, - "loss": 0.4642, - "step": 612 - }, - { - "epoch": 3.3135135135135134, - "grad_norm": 3.7485930919647217, - "learning_rate": 3.7671181312503886e-06, - "loss": 0.1987, - "step": 613 - }, - { - "epoch": 3.3189189189189188, - "grad_norm": 3.4700896739959717, - "learning_rate": 3.763456622752271e-06, - "loss": 0.3307, - "step": 614 - }, - { - "epoch": 3.3243243243243246, - "grad_norm": 3.0079376697540283, - "learning_rate": 3.7597914707730583e-06, - "loss": 0.1731, - "step": 615 - }, - { - "epoch": 3.32972972972973, - "grad_norm": 3.155235767364502, - "learning_rate": 3.7561226858820984e-06, - "loss": 0.2003, - "step": 616 - }, - { - "epoch": 3.3351351351351353, - "grad_norm": 3.847895622253418, - "learning_rate": 3.7524502786592143e-06, - "loss": 0.4014, - "step": 617 - }, - { - "epoch": 3.3405405405405406, - "grad_norm": 2.7505502700805664, - "learning_rate": 3.7487742596946753e-06, - "loss": 0.205, - "step": 618 - }, - { - "epoch": 3.345945945945946, - "grad_norm": 3.654529571533203, - "learning_rate": 3.7450946395891674e-06, - "loss": 0.2932, - "step": 619 - }, - { - "epoch": 3.3513513513513513, - "grad_norm": 2.9763967990875244, - "learning_rate": 3.7414114289537593e-06, - "loss": 0.2748, - "step": 620 - }, - { - "epoch": 3.3567567567567567, - "grad_norm": 3.889683961868286, - "learning_rate": 3.7377246384098763e-06, - "loss": 0.3665, - "step": 621 - }, - { - "epoch": 3.362162162162162, - "grad_norm": 4.193166732788086, - "learning_rate": 3.7340342785892645e-06, - "loss": 0.3453, - "step": 622 - }, - { - "epoch": 3.3675675675675674, - "grad_norm": 3.4371488094329834, - "learning_rate": 3.7303403601339646e-06, - "loss": 0.473, - "step": 623 - }, - { - "epoch": 3.372972972972973, - "grad_norm": 3.6939027309417725, - "learning_rate": 3.726642893696279e-06, - "loss": 0.3017, - "step": 624 - }, - { - "epoch": 3.3783783783783785, - "grad_norm": 4.904304504394531, - "learning_rate": 3.7229418899387414e-06, - "loss": 0.4841, - "step": 625 - }, - { - "epoch": 3.383783783783784, - "grad_norm": 3.6373438835144043, - "learning_rate": 3.719237359534087e-06, - "loss": 0.3879, - "step": 626 - }, - { - "epoch": 3.389189189189189, - "grad_norm": 3.403676986694336, - "learning_rate": 3.71552931316522e-06, - "loss": 0.3876, - "step": 627 - }, - { - "epoch": 3.3945945945945946, - "grad_norm": 3.2292237281799316, - "learning_rate": 3.7118177615251834e-06, - "loss": 0.4491, - "step": 628 - }, - { - "epoch": 3.4, - "grad_norm": 3.317850351333618, - "learning_rate": 3.70810271531713e-06, - "loss": 0.3763, - "step": 629 - }, - { - "epoch": 3.4054054054054053, - "grad_norm": 3.664735794067383, - "learning_rate": 3.7043841852542884e-06, - "loss": 0.4171, - "step": 630 - }, - { - "epoch": 3.410810810810811, - "grad_norm": 3.781569242477417, - "learning_rate": 3.700662182059936e-06, - "loss": 0.2445, - "step": 631 - }, - { - "epoch": 3.4162162162162164, - "grad_norm": 2.878260850906372, - "learning_rate": 3.696936716467363e-06, - "loss": 0.1347, - "step": 632 - }, - { - "epoch": 3.4216216216216218, - "grad_norm": 2.8670761585235596, - "learning_rate": 3.693207799219846e-06, - "loss": 0.2822, - "step": 633 - }, - { - "epoch": 3.427027027027027, - "grad_norm": 3.9338245391845703, - "learning_rate": 3.689475441070615e-06, - "loss": 0.3425, - "step": 634 - }, - { - "epoch": 3.4324324324324325, - "grad_norm": 3.3172149658203125, - "learning_rate": 3.685739652782822e-06, - "loss": 0.3315, - "step": 635 - }, - { - "epoch": 3.437837837837838, - "grad_norm": 3.9986648559570312, - "learning_rate": 3.682000445129512e-06, - "loss": 0.1841, - "step": 636 - }, - { - "epoch": 3.443243243243243, - "grad_norm": 3.4503986835479736, - "learning_rate": 3.6782578288935896e-06, - "loss": 0.3151, - "step": 637 - }, - { - "epoch": 3.4486486486486485, - "grad_norm": 3.8826167583465576, - "learning_rate": 3.6745118148677882e-06, - "loss": 0.1272, - "step": 638 - }, - { - "epoch": 3.454054054054054, - "grad_norm": 3.0585904121398926, - "learning_rate": 3.6707624138546414e-06, - "loss": 0.2436, - "step": 639 - }, - { - "epoch": 3.4594594594594597, - "grad_norm": 3.8409557342529297, - "learning_rate": 3.6670096366664477e-06, - "loss": 0.6321, - "step": 640 - }, - { - "epoch": 3.464864864864865, - "grad_norm": 3.7260093688964844, - "learning_rate": 3.663253494125244e-06, - "loss": 0.1262, - "step": 641 - }, - { - "epoch": 3.4702702702702704, - "grad_norm": 3.195587396621704, - "learning_rate": 3.6594939970627706e-06, - "loss": 0.2669, - "step": 642 - }, - { - "epoch": 3.4756756756756757, - "grad_norm": 2.565070629119873, - "learning_rate": 3.655731156320441e-06, - "loss": 0.1228, - "step": 643 - }, - { - "epoch": 3.481081081081081, - "grad_norm": 3.745422124862671, - "learning_rate": 3.651964982749312e-06, - "loss": 0.1759, - "step": 644 - }, - { - "epoch": 3.4864864864864864, - "grad_norm": 4.96168327331543, - "learning_rate": 3.648195487210051e-06, - "loss": 0.5677, - "step": 645 - }, - { - "epoch": 3.4918918918918918, - "grad_norm": 3.514446496963501, - "learning_rate": 3.644422680572906e-06, - "loss": 0.1874, - "step": 646 - }, - { - "epoch": 3.4972972972972975, - "grad_norm": 3.1427719593048096, - "learning_rate": 3.640646573717671e-06, - "loss": 0.3225, - "step": 647 - }, - { - "epoch": 3.5027027027027025, - "grad_norm": 3.32208514213562, - "learning_rate": 3.63686717753366e-06, - "loss": 0.102, - "step": 648 - }, - { - "epoch": 3.5081081081081082, - "grad_norm": 3.409299373626709, - "learning_rate": 3.6330845029196697e-06, - "loss": 0.1585, - "step": 649 - }, - { - "epoch": 3.5135135135135136, - "grad_norm": 2.827052116394043, - "learning_rate": 3.629298560783952e-06, - "loss": 0.3046, - "step": 650 - }, - { - "epoch": 3.518918918918919, - "grad_norm": 3.541518211364746, - "learning_rate": 3.6255093620441835e-06, - "loss": 0.2037, - "step": 651 - }, - { - "epoch": 3.5243243243243243, - "grad_norm": 3.067040205001831, - "learning_rate": 3.6217169176274293e-06, - "loss": 0.1784, - "step": 652 - }, - { - "epoch": 3.5297297297297296, - "grad_norm": 4.001040935516357, - "learning_rate": 3.6179212384701146e-06, - "loss": 0.1974, - "step": 653 - }, - { - "epoch": 3.535135135135135, - "grad_norm": 4.03037691116333, - "learning_rate": 3.6141223355179946e-06, - "loss": 0.2161, - "step": 654 - }, - { - "epoch": 3.5405405405405403, - "grad_norm": 3.303591728210449, - "learning_rate": 3.610320219726118e-06, - "loss": 0.1487, - "step": 655 - }, - { - "epoch": 3.545945945945946, - "grad_norm": 4.183008193969727, - "learning_rate": 3.606514902058802e-06, - "loss": 0.2231, - "step": 656 - }, - { - "epoch": 3.5513513513513515, - "grad_norm": 4.2100300788879395, - "learning_rate": 3.602706393489594e-06, - "loss": 0.5068, - "step": 657 - }, - { - "epoch": 3.556756756756757, - "grad_norm": 4.521003246307373, - "learning_rate": 3.598894705001246e-06, - "loss": 0.4621, - "step": 658 - }, - { - "epoch": 3.562162162162162, - "grad_norm": 3.452348470687866, - "learning_rate": 3.5950798475856783e-06, - "loss": 0.285, - "step": 659 - }, - { - "epoch": 3.5675675675675675, - "grad_norm": 3.468987464904785, - "learning_rate": 3.5912618322439487e-06, - "loss": 0.4277, - "step": 660 - }, - { - "epoch": 3.572972972972973, - "grad_norm": 3.431551933288574, - "learning_rate": 3.587440669986224e-06, - "loss": 0.1993, - "step": 661 - }, - { - "epoch": 3.5783783783783782, - "grad_norm": 3.017648220062256, - "learning_rate": 3.5836163718317453e-06, - "loss": 0.272, - "step": 662 - }, - { - "epoch": 3.583783783783784, - "grad_norm": 3.837244987487793, - "learning_rate": 3.5797889488087946e-06, - "loss": 0.6019, - "step": 663 - }, - { - "epoch": 3.589189189189189, - "grad_norm": 3.221762180328369, - "learning_rate": 3.575958411954668e-06, - "loss": 0.3603, - "step": 664 - }, - { - "epoch": 3.5945945945945947, - "grad_norm": 4.279484272003174, - "learning_rate": 3.5721247723156393e-06, - "loss": 0.4656, - "step": 665 - }, - { - "epoch": 3.6, - "grad_norm": 3.723459243774414, - "learning_rate": 3.5682880409469316e-06, - "loss": 0.2466, - "step": 666 - }, - { - "epoch": 3.6054054054054054, - "grad_norm": 2.7260632514953613, - "learning_rate": 3.564448228912682e-06, - "loss": 0.1848, - "step": 667 - }, - { - "epoch": 3.610810810810811, - "grad_norm": 3.6656649112701416, - "learning_rate": 3.5606053472859124e-06, - "loss": 0.4968, - "step": 668 - }, - { - "epoch": 3.616216216216216, - "grad_norm": 4.570294380187988, - "learning_rate": 3.556759407148496e-06, - "loss": 0.316, - "step": 669 - }, - { - "epoch": 3.6216216216216215, - "grad_norm": 3.174433946609497, - "learning_rate": 3.5529104195911258e-06, - "loss": 0.2232, - "step": 670 - }, - { - "epoch": 3.627027027027027, - "grad_norm": 4.481954574584961, - "learning_rate": 3.549058395713285e-06, - "loss": 0.4435, - "step": 671 - }, - { - "epoch": 3.6324324324324326, - "grad_norm": 3.8758301734924316, - "learning_rate": 3.54520334662321e-06, - "loss": 0.1455, - "step": 672 - }, - { - "epoch": 3.637837837837838, - "grad_norm": 3.1699628829956055, - "learning_rate": 3.5413452834378626e-06, - "loss": 0.3037, - "step": 673 - }, - { - "epoch": 3.6432432432432433, - "grad_norm": 3.8971962928771973, - "learning_rate": 3.5374842172828953e-06, - "loss": 0.4309, - "step": 674 - }, - { - "epoch": 3.6486486486486487, - "grad_norm": 3.3087549209594727, - "learning_rate": 3.533620159292621e-06, - "loss": 0.383, - "step": 675 - }, - { - "epoch": 3.654054054054054, - "grad_norm": 2.9413082599639893, - "learning_rate": 3.529753120609982e-06, - "loss": 0.1963, - "step": 676 - }, - { - "epoch": 3.6594594594594594, - "grad_norm": 3.309837818145752, - "learning_rate": 3.5258831123865136e-06, - "loss": 0.1922, - "step": 677 - }, - { - "epoch": 3.6648648648648647, - "grad_norm": 4.124879360198975, - "learning_rate": 3.5220101457823147e-06, - "loss": 0.5589, - "step": 678 - }, - { - "epoch": 3.6702702702702705, - "grad_norm": 3.2587103843688965, - "learning_rate": 3.5181342319660174e-06, - "loss": 0.1757, - "step": 679 - }, - { - "epoch": 3.6756756756756754, - "grad_norm": 4.179666042327881, - "learning_rate": 3.5142553821147498e-06, - "loss": 0.1208, - "step": 680 - }, - { - "epoch": 3.6810810810810812, - "grad_norm": 3.4041192531585693, - "learning_rate": 3.5103736074141106e-06, - "loss": 0.2416, - "step": 681 - }, - { - "epoch": 3.6864864864864866, - "grad_norm": 4.982706546783447, - "learning_rate": 3.5064889190581293e-06, - "loss": 0.3841, - "step": 682 - }, - { - "epoch": 3.691891891891892, - "grad_norm": 3.5895309448242188, - "learning_rate": 3.5026013282492406e-06, - "loss": 0.3723, - "step": 683 - }, - { - "epoch": 3.6972972972972973, - "grad_norm": 3.4824306964874268, - "learning_rate": 3.498710846198247e-06, - "loss": 0.4403, - "step": 684 - }, - { - "epoch": 3.7027027027027026, - "grad_norm": 3.501023054122925, - "learning_rate": 3.494817484124289e-06, - "loss": 0.2813, - "step": 685 - }, - { - "epoch": 3.708108108108108, - "grad_norm": 3.934908151626587, - "learning_rate": 3.490921253254813e-06, - "loss": 0.4287, - "step": 686 - }, - { - "epoch": 3.7135135135135133, - "grad_norm": 3.24141526222229, - "learning_rate": 3.487022164825539e-06, - "loss": 0.234, - "step": 687 - }, - { - "epoch": 3.718918918918919, - "grad_norm": 3.3419880867004395, - "learning_rate": 3.4831202300804246e-06, - "loss": 0.2135, - "step": 688 - }, - { - "epoch": 3.7243243243243245, - "grad_norm": 3.923778772354126, - "learning_rate": 3.479215460271638e-06, - "loss": 0.2725, - "step": 689 - }, - { - "epoch": 3.72972972972973, - "grad_norm": 3.2432096004486084, - "learning_rate": 3.475307866659522e-06, - "loss": 0.228, - "step": 690 - }, - { - "epoch": 3.735135135135135, - "grad_norm": 3.0307705402374268, - "learning_rate": 3.4713974605125634e-06, - "loss": 0.0985, - "step": 691 - }, - { - "epoch": 3.7405405405405405, - "grad_norm": 2.778942346572876, - "learning_rate": 3.4674842531073587e-06, - "loss": 0.2137, - "step": 692 - }, - { - "epoch": 3.745945945945946, - "grad_norm": 3.711315155029297, - "learning_rate": 3.4635682557285833e-06, - "loss": 0.1707, - "step": 693 - }, - { - "epoch": 3.7513513513513512, - "grad_norm": 3.165668487548828, - "learning_rate": 3.459649479668956e-06, - "loss": 0.3021, - "step": 694 - }, - { - "epoch": 3.756756756756757, - "grad_norm": 3.7491254806518555, - "learning_rate": 3.4557279362292117e-06, - "loss": 0.3457, - "step": 695 - }, - { - "epoch": 3.762162162162162, - "grad_norm": 3.271603584289551, - "learning_rate": 3.451803636718064e-06, - "loss": 0.1193, - "step": 696 - }, - { - "epoch": 3.7675675675675677, - "grad_norm": 3.872382402420044, - "learning_rate": 3.447876592452174e-06, - "loss": 0.2261, - "step": 697 - }, - { - "epoch": 3.772972972972973, - "grad_norm": 4.634008407592773, - "learning_rate": 3.4439468147561196e-06, - "loss": 0.5042, - "step": 698 - }, - { - "epoch": 3.7783783783783784, - "grad_norm": 3.6930148601531982, - "learning_rate": 3.440014314962358e-06, - "loss": 0.3481, - "step": 699 - }, - { - "epoch": 3.7837837837837838, - "grad_norm": 4.709466457366943, - "learning_rate": 3.4360791044112e-06, - "loss": 0.2317, - "step": 700 - }, - { - "epoch": 3.789189189189189, - "grad_norm": 4.37923002243042, - "learning_rate": 3.432141194450772e-06, - "loss": 0.395, - "step": 701 - }, - { - "epoch": 3.7945945945945945, - "grad_norm": 3.1600489616394043, - "learning_rate": 3.4282005964369836e-06, - "loss": 0.1767, - "step": 702 - }, - { - "epoch": 3.8, - "grad_norm": 3.9799487590789795, - "learning_rate": 3.424257321733497e-06, - "loss": 0.2146, - "step": 703 - }, - { - "epoch": 3.8054054054054056, - "grad_norm": 2.79176664352417, - "learning_rate": 3.4203113817116955e-06, - "loss": 0.1534, - "step": 704 - }, - { - "epoch": 3.810810810810811, - "grad_norm": 3.0024254322052, - "learning_rate": 3.4163627877506434e-06, - "loss": 0.2513, - "step": 705 - }, - { - "epoch": 3.8162162162162163, - "grad_norm": 2.924475908279419, - "learning_rate": 3.4124115512370636e-06, - "loss": 0.4154, - "step": 706 - }, - { - "epoch": 3.8216216216216217, - "grad_norm": 3.2713992595672607, - "learning_rate": 3.408457683565295e-06, - "loss": 0.1822, - "step": 707 - }, - { - "epoch": 3.827027027027027, - "grad_norm": 3.094003438949585, - "learning_rate": 3.4045011961372675e-06, - "loss": 0.3589, - "step": 708 - }, - { - "epoch": 3.8324324324324324, - "grad_norm": 3.423858404159546, - "learning_rate": 3.4005421003624637e-06, - "loss": 0.4615, - "step": 709 - }, - { - "epoch": 3.8378378378378377, - "grad_norm": 2.038792848587036, - "learning_rate": 3.3965804076578896e-06, - "loss": 0.1001, - "step": 710 - }, - { - "epoch": 3.8432432432432435, - "grad_norm": 2.6447055339813232, - "learning_rate": 3.392616129448039e-06, - "loss": 0.2788, - "step": 711 - }, - { - "epoch": 3.8486486486486484, - "grad_norm": 3.546876907348633, - "learning_rate": 3.3886492771648593e-06, - "loss": 0.2663, - "step": 712 - }, - { - "epoch": 3.854054054054054, - "grad_norm": 2.9587066173553467, - "learning_rate": 3.384679862247726e-06, - "loss": 0.3497, - "step": 713 - }, - { - "epoch": 3.8594594594594596, - "grad_norm": 3.7122113704681396, - "learning_rate": 3.3807078961434013e-06, - "loss": 0.3613, - "step": 714 - }, - { - "epoch": 3.864864864864865, - "grad_norm": 3.157294988632202, - "learning_rate": 3.376733390306004e-06, - "loss": 0.0783, - "step": 715 - }, - { - "epoch": 3.8702702702702703, - "grad_norm": 3.564279317855835, - "learning_rate": 3.372756356196979e-06, - "loss": 0.1617, - "step": 716 - }, - { - "epoch": 3.8756756756756756, - "grad_norm": 4.231864929199219, - "learning_rate": 3.3687768052850595e-06, - "loss": 0.6444, - "step": 717 - }, - { - "epoch": 3.881081081081081, - "grad_norm": 5.480365753173828, - "learning_rate": 3.364794749046239e-06, - "loss": 0.4858, - "step": 718 - }, - { - "epoch": 3.8864864864864863, - "grad_norm": 3.428140878677368, - "learning_rate": 3.3608101989637333e-06, - "loss": 0.3103, - "step": 719 - }, - { - "epoch": 3.891891891891892, - "grad_norm": 3.521989345550537, - "learning_rate": 3.356823166527952e-06, - "loss": 0.2501, - "step": 720 - }, - { - "epoch": 3.8972972972972975, - "grad_norm": 3.287081718444824, - "learning_rate": 3.352833663236463e-06, - "loss": 0.18, - "step": 721 - }, - { - "epoch": 3.902702702702703, - "grad_norm": 3.323146104812622, - "learning_rate": 3.348841700593956e-06, - "loss": 0.12, - "step": 722 - }, - { - "epoch": 3.908108108108108, - "grad_norm": 3.516693115234375, - "learning_rate": 3.3448472901122187e-06, - "loss": 0.2618, - "step": 723 - }, - { - "epoch": 3.9135135135135135, - "grad_norm": 3.8109545707702637, - "learning_rate": 3.340850443310092e-06, - "loss": 0.3689, - "step": 724 - }, - { - "epoch": 3.918918918918919, - "grad_norm": 3.8335933685302734, - "learning_rate": 3.336851171713447e-06, - "loss": 0.2195, - "step": 725 - }, - { - "epoch": 3.924324324324324, - "grad_norm": 3.9054670333862305, - "learning_rate": 3.3328494868551444e-06, - "loss": 0.2602, - "step": 726 - }, - { - "epoch": 3.92972972972973, - "grad_norm": 3.1380631923675537, - "learning_rate": 3.3288454002750046e-06, - "loss": 0.1561, - "step": 727 - }, - { - "epoch": 3.935135135135135, - "grad_norm": 4.304198741912842, - "learning_rate": 3.3248389235197764e-06, - "loss": 0.4469, - "step": 728 - }, - { - "epoch": 3.9405405405405407, - "grad_norm": 3.3321573734283447, - "learning_rate": 3.3208300681430967e-06, - "loss": 0.2246, - "step": 729 - }, - { - "epoch": 3.945945945945946, - "grad_norm": 3.89400315284729, - "learning_rate": 3.3168188457054656e-06, - "loss": 0.2743, - "step": 730 - }, - { - "epoch": 3.9513513513513514, - "grad_norm": 3.393209934234619, - "learning_rate": 3.312805267774209e-06, - "loss": 0.551, - "step": 731 - }, - { - "epoch": 3.9567567567567568, - "grad_norm": 3.711652994155884, - "learning_rate": 3.3087893459234423e-06, - "loss": 0.3522, - "step": 732 - }, - { - "epoch": 3.962162162162162, - "grad_norm": 3.6701200008392334, - "learning_rate": 3.304771091734043e-06, - "loss": 0.3084, - "step": 733 - }, - { - "epoch": 3.9675675675675675, - "grad_norm": 3.1742889881134033, - "learning_rate": 3.300750516793614e-06, - "loss": 0.3406, - "step": 734 - }, - { - "epoch": 3.972972972972973, - "grad_norm": 4.000397682189941, - "learning_rate": 3.2967276326964504e-06, - "loss": 0.3463, - "step": 735 - }, - { - "epoch": 3.9783783783783786, - "grad_norm": 3.7932708263397217, - "learning_rate": 3.2927024510435057e-06, - "loss": 0.3758, - "step": 736 - }, - { - "epoch": 3.983783783783784, - "grad_norm": 3.6258292198181152, - "learning_rate": 3.2886749834423587e-06, - "loss": 0.3328, - "step": 737 - }, - { - "epoch": 3.9891891891891893, - "grad_norm": 4.628194332122803, - "learning_rate": 3.284645241507183e-06, - "loss": 0.6213, - "step": 738 - }, - { - "epoch": 3.9945945945945946, - "grad_norm": 4.173697471618652, - "learning_rate": 3.280613236858707e-06, - "loss": 0.2463, - "step": 739 - }, - { - "epoch": 4.0, - "grad_norm": 2.9315719604492188, - "learning_rate": 3.2765789811241865e-06, - "loss": 0.3501, - "step": 740 - } - ], - "logging_steps": 1, - "max_steps": 1850, - "num_input_tokens_seen": 0, - "num_train_epochs": 10, - "save_steps": 206, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 1.9969033700062003e+17, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/chat_template.jinja b/metallama3_8b/limo_filtered_incorrect/checkpoint-925/chat_template.jinja deleted file mode 100644 index 39bd0c9f7fe30aea14eda194fee17703da4a4dbf..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/chat_template.jinja +++ /dev/null @@ -1,5 +0,0 @@ -{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> - -'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> - -' }}{% endif %} \ No newline at end of file diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/config.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-925/config.json deleted file mode 100644 index ec5612543540085e09eed37e81b17ae51d1a6973..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 128000, - "eos_token_id": 128009, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 8192, - "mlp_bias": false, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 500000.0, - "tie_word_embeddings": false, - "torch_dtype": "float32", - "transformers_version": "4.55.0", - "use_cache": false, - "vocab_size": 128256 -} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/generation_config.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-925/generation_config.json deleted file mode 100644 index f53ccb516e57388491adda6b9950bcfa872e93ae..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/generation_config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "_from_model_config": true, - "bos_token_id": 128000, - "eos_token_id": 128009, - "transformers_version": "4.55.0", - "use_cache": false -} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/model-00001-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-925/model-00001-of-00007.safetensors deleted file mode 100644 index d97b8423307036577d01018f5542945453fe126a..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/model-00001-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3799e59728718295ee8ab80098104906e712a7ede14cd3823d0ef32cbe14362d -size 4886466168 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/model-00002-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-925/model-00002-of-00007.safetensors deleted file mode 100644 index 03be5bfcd33844e4ffadc7c84d22cb174560fc22..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/model-00002-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:02318ab8873ba2e678c0f6b03d45a9a6ab990bedda3b9cdc57f2fdde1aa3859a -size 4832007448 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/model-00003-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-925/model-00003-of-00007.safetensors deleted file mode 100644 index f6db29fecbfe1f399e3d73111c7d0eef37d117c1..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/model-00003-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:68e133be9bec120da4d1432970885d334dfbdab33e06215aca2023f3b4a9dbf9 -size 4999813112 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/model-00004-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-925/model-00004-of-00007.safetensors deleted file mode 100644 index 8f601e09b3cd226ca520848560c0b46d868931bd..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/model-00004-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:43c7a03627e88191771e2ade876c0258def8f0d0028270c97fff88077e6ac970 -size 4999813128 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/model-00005-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-925/model-00005-of-00007.safetensors deleted file mode 100644 index 09fa6290b4de3f4493ea7222301b273f20a544a8..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/model-00005-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4ff199d2ac57e213762a56382806621aad99c847a76b84e801488336a0b260e3 -size 4832007496 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/model-00006-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-925/model-00006-of-00007.safetensors deleted file mode 100644 index 3524f9681e01c90a4cc28d90dcb3c560806ebb0c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/model-00006-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9d8355e95cb0ac75b5870095e952ff7a40a6290ffd4d195e51662ea7f38e74ed -size 4999813120 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/model-00007-of-00007.safetensors b/metallama3_8b/limo_filtered_incorrect/checkpoint-925/model-00007-of-00007.safetensors deleted file mode 100644 index d6ee39d3ed40c2b4f049be9b0fe281dfc2e2c6f6..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/model-00007-of-00007.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:89a8144dcf28f6a8a726f616117368c87c86c57390973ea6f3b55a069b80b53e -size 2571158184 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/model.safetensors.index.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-925/model.safetensors.index.json deleted file mode 100644 index 30d31d54f352f0c71ad48745af612a088822fa48..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/model.safetensors.index.json +++ /dev/null @@ -1,299 +0,0 @@ -{ - "metadata": { - "total_parameters": 2007565312, - "total_size": 32121044992 - }, - "weight_map": { - "lm_head.weight": "model-00007-of-00007.safetensors", - "model.embed_tokens.weight": "model-00001-of-00007.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", - "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", - "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", - "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", - "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", - "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", - "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", - "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", - "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", - "model.norm.weight": "model-00007-of-00007.safetensors" - } -} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/rng_state_0.pth b/metallama3_8b/limo_filtered_incorrect/checkpoint-925/rng_state_0.pth deleted file mode 100644 index 3fb9a88bbbee1d828823dc0792895d385b4be47e..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5c5e18f922d0af74d820247ae97bee506ab412554a58345ddf2558abc94ee3e3 -size 15024 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/rng_state_1.pth b/metallama3_8b/limo_filtered_incorrect/checkpoint-925/rng_state_1.pth deleted file mode 100644 index cc3d4a3c6ff4b588e0b24552f5cc78610d1a3f42..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2a2dcca6d9741f46592359768ea2212b9321da6408d1fd7d3a80b017bf37f434 -size 15024 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/rng_state_2.pth b/metallama3_8b/limo_filtered_incorrect/checkpoint-925/rng_state_2.pth deleted file mode 100644 index 0ea7e83be3a9fc39999b7084bcf14ba0f491317b..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:69420ece2c255923c5cbb3c6c9c4a6b9cb38fb57e5d3033c8b7d436a1faf6f13 -size 15024 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/rng_state_3.pth b/metallama3_8b/limo_filtered_incorrect/checkpoint-925/rng_state_3.pth deleted file mode 100644 index 88e70a1e21ef6d40a7016a6221703385b6c1cdc6..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:66f278b40a1e23b88a657c4e5d03afa8dbbbe14dfeb16f6b4beedaece6cdd0b9 -size 15024 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/scheduler.pt b/metallama3_8b/limo_filtered_incorrect/checkpoint-925/scheduler.pt deleted file mode 100644 index 9cf3516994a67db9c7e368b039b8c1c6e062efb2..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e5db8a3cadd9dfb5533b63ed4da0f06ff21ff40fc8598a60edc89cc2c5b94f10 -size 1064 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/special_tokens_map.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-925/special_tokens_map.json deleted file mode 100644 index 14daf4588e61b4e4983af0fccaba4d5500c0977c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/special_tokens_map.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "additional_special_tokens": [ - { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } - ], - "bos_token": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "pad_token": "<|eot_id|>" -} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/tokenizer.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-925/tokenizer.json deleted file mode 100644 index 172311123ab62378f1f6d90f3068a676b7d939ed..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8c1dcab308e7cf5970ea38815e0a62887d705c5b436f869ca27a5dcdd40c36a6 -size 17210148 diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/tokenizer_config.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-925/tokenizer_config.json deleted file mode 100644 index 6739fcd129e717b71b64001dcb25a03c143d66f5..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/tokenizer_config.json +++ /dev/null @@ -1,2076 +0,0 @@ -{ - "added_tokens_decoder": { - "128000": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128001": { - "content": "<|end_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128002": { - "content": "<|reserved_special_token_0|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128003": { - "content": "<|reserved_special_token_1|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128004": { - "content": "<|reserved_special_token_2|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128005": { - "content": "<|reserved_special_token_3|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128006": { - "content": "<|start_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128007": { - "content": "<|end_header_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128008": { - "content": "<|reserved_special_token_4|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128009": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128010": { - "content": "<|reserved_special_token_5|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128011": { - "content": "<|reserved_special_token_6|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128012": { - "content": "<|reserved_special_token_7|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128013": { - "content": "<|reserved_special_token_8|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128014": { - "content": "<|reserved_special_token_9|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128015": { - "content": "<|reserved_special_token_10|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128016": { - "content": "<|reserved_special_token_11|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128017": { - "content": "<|reserved_special_token_12|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128018": { - "content": "<|reserved_special_token_13|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128019": { - "content": "<|reserved_special_token_14|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128020": { - "content": "<|reserved_special_token_15|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128021": { - "content": "<|reserved_special_token_16|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128022": { - "content": "<|reserved_special_token_17|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128023": { - "content": "<|reserved_special_token_18|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128024": { - "content": "<|reserved_special_token_19|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128025": { - "content": "<|reserved_special_token_20|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128026": { - "content": "<|reserved_special_token_21|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128027": { - "content": "<|reserved_special_token_22|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128028": { - "content": "<|reserved_special_token_23|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128029": { - "content": "<|reserved_special_token_24|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128030": { - "content": "<|reserved_special_token_25|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128031": { - "content": "<|reserved_special_token_26|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128032": { - "content": "<|reserved_special_token_27|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128033": { - "content": "<|reserved_special_token_28|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128034": { - "content": "<|reserved_special_token_29|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128035": { - "content": "<|reserved_special_token_30|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128036": { - "content": "<|reserved_special_token_31|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128037": { - "content": "<|reserved_special_token_32|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128038": { - "content": "<|reserved_special_token_33|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128039": { - "content": "<|reserved_special_token_34|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128040": { - "content": "<|reserved_special_token_35|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128041": { - "content": "<|reserved_special_token_36|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128042": { - "content": "<|reserved_special_token_37|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128043": { - "content": "<|reserved_special_token_38|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128044": { - "content": "<|reserved_special_token_39|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128045": { - "content": "<|reserved_special_token_40|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128046": { - "content": "<|reserved_special_token_41|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128047": { - "content": "<|reserved_special_token_42|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128048": { - "content": "<|reserved_special_token_43|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128049": { - "content": "<|reserved_special_token_44|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128050": { - "content": "<|reserved_special_token_45|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128051": { - "content": "<|reserved_special_token_46|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128052": { - "content": "<|reserved_special_token_47|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128053": { - "content": "<|reserved_special_token_48|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128054": { - "content": "<|reserved_special_token_49|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128055": { - "content": "<|reserved_special_token_50|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128056": { - "content": "<|reserved_special_token_51|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128057": { - "content": "<|reserved_special_token_52|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128058": { - "content": "<|reserved_special_token_53|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128059": { - "content": "<|reserved_special_token_54|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128060": { - "content": "<|reserved_special_token_55|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128061": { - "content": "<|reserved_special_token_56|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128062": { - "content": "<|reserved_special_token_57|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128063": { - "content": "<|reserved_special_token_58|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128064": { - "content": "<|reserved_special_token_59|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128065": { - "content": "<|reserved_special_token_60|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128066": { - "content": "<|reserved_special_token_61|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128067": { - "content": "<|reserved_special_token_62|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128068": { - "content": "<|reserved_special_token_63|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128069": { - "content": "<|reserved_special_token_64|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128070": { - "content": "<|reserved_special_token_65|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128071": { - "content": "<|reserved_special_token_66|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128072": { - "content": "<|reserved_special_token_67|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128073": { - "content": "<|reserved_special_token_68|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128074": { - "content": "<|reserved_special_token_69|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128075": { - "content": "<|reserved_special_token_70|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128076": { - "content": "<|reserved_special_token_71|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128077": { - "content": "<|reserved_special_token_72|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128078": { - "content": "<|reserved_special_token_73|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128079": { - "content": "<|reserved_special_token_74|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128080": { - "content": "<|reserved_special_token_75|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128081": { - "content": "<|reserved_special_token_76|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128082": { - "content": "<|reserved_special_token_77|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128083": { - "content": "<|reserved_special_token_78|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128084": { - "content": "<|reserved_special_token_79|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128085": { - "content": "<|reserved_special_token_80|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128086": { - "content": "<|reserved_special_token_81|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128087": { - "content": "<|reserved_special_token_82|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128088": { - "content": "<|reserved_special_token_83|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128089": { - "content": "<|reserved_special_token_84|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128090": { - "content": "<|reserved_special_token_85|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128091": { - "content": "<|reserved_special_token_86|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128092": { - "content": "<|reserved_special_token_87|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128093": { - "content": "<|reserved_special_token_88|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128094": { - "content": "<|reserved_special_token_89|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128095": { - "content": "<|reserved_special_token_90|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128096": { - "content": "<|reserved_special_token_91|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128097": { - "content": "<|reserved_special_token_92|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128098": { - "content": "<|reserved_special_token_93|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128099": { - "content": "<|reserved_special_token_94|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128100": { - "content": "<|reserved_special_token_95|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128101": { - "content": "<|reserved_special_token_96|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128102": { - "content": "<|reserved_special_token_97|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128103": { - "content": "<|reserved_special_token_98|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128104": { - "content": "<|reserved_special_token_99|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128105": { - "content": "<|reserved_special_token_100|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128106": { - "content": "<|reserved_special_token_101|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128107": { - "content": "<|reserved_special_token_102|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128108": { - "content": "<|reserved_special_token_103|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128109": { - "content": "<|reserved_special_token_104|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128110": { - "content": "<|reserved_special_token_105|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128111": { - "content": "<|reserved_special_token_106|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128112": { - "content": "<|reserved_special_token_107|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128113": { - "content": "<|reserved_special_token_108|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128114": { - "content": "<|reserved_special_token_109|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128115": { - "content": "<|reserved_special_token_110|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128116": { - "content": "<|reserved_special_token_111|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128117": { - "content": "<|reserved_special_token_112|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128118": { - "content": "<|reserved_special_token_113|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128119": { - "content": "<|reserved_special_token_114|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128120": { - "content": "<|reserved_special_token_115|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128121": { - "content": "<|reserved_special_token_116|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128122": { - "content": "<|reserved_special_token_117|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128123": { - "content": "<|reserved_special_token_118|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128124": { - "content": "<|reserved_special_token_119|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128125": { - "content": "<|reserved_special_token_120|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128126": { - "content": "<|reserved_special_token_121|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128127": { - "content": "<|reserved_special_token_122|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128128": { - "content": "<|reserved_special_token_123|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128129": { - "content": "<|reserved_special_token_124|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128130": { - "content": "<|reserved_special_token_125|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128131": { - "content": "<|reserved_special_token_126|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128132": { - "content": "<|reserved_special_token_127|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128133": { - "content": "<|reserved_special_token_128|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128134": { - "content": "<|reserved_special_token_129|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128135": { - "content": "<|reserved_special_token_130|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128136": { - "content": "<|reserved_special_token_131|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128137": { - "content": "<|reserved_special_token_132|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128138": { - "content": "<|reserved_special_token_133|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128139": { - "content": "<|reserved_special_token_134|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128140": { - "content": "<|reserved_special_token_135|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128141": { - "content": "<|reserved_special_token_136|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128142": { - "content": "<|reserved_special_token_137|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128143": { - "content": "<|reserved_special_token_138|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128144": { - "content": "<|reserved_special_token_139|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128145": { - "content": "<|reserved_special_token_140|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128146": { - "content": "<|reserved_special_token_141|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128147": { - "content": "<|reserved_special_token_142|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128148": { - "content": "<|reserved_special_token_143|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128149": { - "content": "<|reserved_special_token_144|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128150": { - "content": "<|reserved_special_token_145|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128151": { - "content": "<|reserved_special_token_146|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128152": { - "content": "<|reserved_special_token_147|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128153": { - "content": "<|reserved_special_token_148|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128154": { - "content": "<|reserved_special_token_149|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128155": { - "content": "<|reserved_special_token_150|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128156": { - "content": "<|reserved_special_token_151|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128157": { - "content": "<|reserved_special_token_152|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128158": { - "content": "<|reserved_special_token_153|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128159": { - "content": "<|reserved_special_token_154|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128160": { - "content": "<|reserved_special_token_155|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128161": { - "content": "<|reserved_special_token_156|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128162": { - "content": "<|reserved_special_token_157|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128163": { - "content": "<|reserved_special_token_158|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128164": { - "content": "<|reserved_special_token_159|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128165": { - "content": "<|reserved_special_token_160|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128166": { - "content": "<|reserved_special_token_161|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128167": { - "content": "<|reserved_special_token_162|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128168": { - "content": "<|reserved_special_token_163|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128169": { - "content": "<|reserved_special_token_164|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128170": { - "content": "<|reserved_special_token_165|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128171": { - "content": "<|reserved_special_token_166|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128172": { - "content": "<|reserved_special_token_167|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128173": { - "content": "<|reserved_special_token_168|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128174": { - "content": "<|reserved_special_token_169|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128175": { - "content": "<|reserved_special_token_170|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128176": { - "content": "<|reserved_special_token_171|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128177": { - "content": "<|reserved_special_token_172|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128178": { - "content": "<|reserved_special_token_173|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128179": { - "content": "<|reserved_special_token_174|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128180": { - "content": "<|reserved_special_token_175|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128181": { - "content": "<|reserved_special_token_176|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128182": { - "content": "<|reserved_special_token_177|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128183": { - "content": "<|reserved_special_token_178|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128184": { - "content": "<|reserved_special_token_179|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128185": { - "content": "<|reserved_special_token_180|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128186": { - "content": "<|reserved_special_token_181|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128187": { - "content": "<|reserved_special_token_182|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128188": { - "content": "<|reserved_special_token_183|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128189": { - "content": "<|reserved_special_token_184|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128190": { - "content": "<|reserved_special_token_185|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128191": { - "content": "<|reserved_special_token_186|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128192": { - "content": "<|reserved_special_token_187|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128193": { - "content": "<|reserved_special_token_188|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128194": { - "content": "<|reserved_special_token_189|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128195": { - "content": "<|reserved_special_token_190|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128196": { - "content": "<|reserved_special_token_191|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128197": { - "content": "<|reserved_special_token_192|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128198": { - "content": "<|reserved_special_token_193|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128199": { - "content": "<|reserved_special_token_194|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128200": { - "content": "<|reserved_special_token_195|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128201": { - "content": "<|reserved_special_token_196|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128202": { - "content": "<|reserved_special_token_197|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128203": { - "content": "<|reserved_special_token_198|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128204": { - "content": "<|reserved_special_token_199|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128205": { - "content": "<|reserved_special_token_200|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128206": { - "content": "<|reserved_special_token_201|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128207": { - "content": "<|reserved_special_token_202|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128208": { - "content": "<|reserved_special_token_203|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128209": { - "content": "<|reserved_special_token_204|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128210": { - "content": "<|reserved_special_token_205|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128211": { - "content": "<|reserved_special_token_206|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128212": { - "content": "<|reserved_special_token_207|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128213": { - "content": "<|reserved_special_token_208|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128214": { - "content": "<|reserved_special_token_209|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128215": { - "content": "<|reserved_special_token_210|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128216": { - "content": "<|reserved_special_token_211|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128217": { - "content": "<|reserved_special_token_212|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128218": { - "content": "<|reserved_special_token_213|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128219": { - "content": "<|reserved_special_token_214|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128220": { - "content": "<|reserved_special_token_215|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128221": { - "content": "<|reserved_special_token_216|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128222": { - "content": "<|reserved_special_token_217|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128223": { - "content": "<|reserved_special_token_218|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128224": { - "content": "<|reserved_special_token_219|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128225": { - "content": "<|reserved_special_token_220|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128226": { - "content": "<|reserved_special_token_221|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128227": { - "content": "<|reserved_special_token_222|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128228": { - "content": "<|reserved_special_token_223|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128229": { - "content": "<|reserved_special_token_224|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128230": { - "content": "<|reserved_special_token_225|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128231": { - "content": "<|reserved_special_token_226|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128232": { - "content": "<|reserved_special_token_227|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128233": { - "content": "<|reserved_special_token_228|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128234": { - "content": "<|reserved_special_token_229|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128235": { - "content": "<|reserved_special_token_230|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128236": { - "content": "<|reserved_special_token_231|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128237": { - "content": "<|reserved_special_token_232|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128238": { - "content": "<|reserved_special_token_233|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128239": { - "content": "<|reserved_special_token_234|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128240": { - "content": "<|reserved_special_token_235|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128241": { - "content": "<|reserved_special_token_236|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128242": { - "content": "<|reserved_special_token_237|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128243": { - "content": "<|reserved_special_token_238|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128244": { - "content": "<|reserved_special_token_239|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128245": { - "content": "<|reserved_special_token_240|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128246": { - "content": "<|reserved_special_token_241|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128247": { - "content": "<|reserved_special_token_242|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128248": { - "content": "<|reserved_special_token_243|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128249": { - "content": "<|reserved_special_token_244|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128250": { - "content": "<|reserved_special_token_245|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128251": { - "content": "<|reserved_special_token_246|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128252": { - "content": "<|reserved_special_token_247|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128253": { - "content": "<|reserved_special_token_248|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128254": { - "content": "<|reserved_special_token_249|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128255": { - "content": "<|reserved_special_token_250|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - }, - "128256": { - "content": "<|eom_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false, - "special": true - } - }, - "additional_special_tokens": [ - "<|eom_id|>" - ], - "bos_token": "<|begin_of_text|>", - "clean_up_tokenization_spaces": true, - "eos_token": "<|eot_id|>", - "extra_special_tokens": {}, - "model_input_names": [ - "input_ids", - "attention_mask" - ], - "model_max_length": 1000000000000000019884624838656, - "pad_token": "<|eot_id|>", - "padding_side": "right", - "split_special_tokens": false, - "tokenizer_class": "PreTrainedTokenizerFast" -} diff --git a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/trainer_state.json b/metallama3_8b/limo_filtered_incorrect/checkpoint-925/trainer_state.json deleted file mode 100644 index bcff23dca6783edd2fd5334ab6ca46456e43ff6c..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/checkpoint-925/trainer_state.json +++ /dev/null @@ -1,6509 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 5.0, - "eval_steps": 500, - "global_step": 925, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.005405405405405406, - "grad_norm": 72.60939025878906, - "learning_rate": 5e-06, - "loss": 2.9165, - "step": 1 - }, - { - "epoch": 0.010810810810810811, - "grad_norm": 29.01830291748047, - "learning_rate": 4.999996395324314e-06, - "loss": 1.9314, - "step": 2 - }, - { - "epoch": 0.016216216216216217, - "grad_norm": 21.44908332824707, - "learning_rate": 4.99998558130765e-06, - "loss": 1.5709, - "step": 3 - }, - { - "epoch": 0.021621621621621623, - "grad_norm": 4.490907669067383, - "learning_rate": 4.999967557981192e-06, - "loss": 0.8099, - "step": 4 - }, - { - "epoch": 0.02702702702702703, - "grad_norm": 4.000796794891357, - "learning_rate": 4.999942325396917e-06, - "loss": 0.9021, - "step": 5 - }, - { - "epoch": 0.032432432432432434, - "grad_norm": 18.513282775878906, - "learning_rate": 4.999909883627588e-06, - "loss": 1.7972, - "step": 6 - }, - { - "epoch": 0.03783783783783784, - "grad_norm": 3.5735981464385986, - "learning_rate": 4.999870232766757e-06, - "loss": 1.4306, - "step": 7 - }, - { - "epoch": 0.043243243243243246, - "grad_norm": 3.1145193576812744, - "learning_rate": 4.9998233729287696e-06, - "loss": 1.051, - "step": 8 - }, - { - "epoch": 0.04864864864864865, - "grad_norm": 3.856376886367798, - "learning_rate": 4.999769304248755e-06, - "loss": 0.8089, - "step": 9 - }, - { - "epoch": 0.05405405405405406, - "grad_norm": 4.05589485168457, - "learning_rate": 4.9997080268826344e-06, - "loss": 1.0999, - "step": 10 - }, - { - "epoch": 0.05945945945945946, - "grad_norm": 13.784229278564453, - "learning_rate": 4.9996395410071165e-06, - "loss": 1.2831, - "step": 11 - }, - { - "epoch": 0.06486486486486487, - "grad_norm": 6.079237937927246, - "learning_rate": 4.999563846819696e-06, - "loss": 1.2874, - "step": 12 - }, - { - "epoch": 0.07027027027027027, - "grad_norm": 4.5971245765686035, - "learning_rate": 4.999480944538655e-06, - "loss": 0.96, - "step": 13 - }, - { - "epoch": 0.07567567567567568, - "grad_norm": 4.916017532348633, - "learning_rate": 4.999390834403063e-06, - "loss": 0.9869, - "step": 14 - }, - { - "epoch": 0.08108108108108109, - "grad_norm": 3.2311055660247803, - "learning_rate": 4.999293516672773e-06, - "loss": 0.9293, - "step": 15 - }, - { - "epoch": 0.08648648648648649, - "grad_norm": 3.3040921688079834, - "learning_rate": 4.9991889916284255e-06, - "loss": 0.8914, - "step": 16 - }, - { - "epoch": 0.0918918918918919, - "grad_norm": 3.794267416000366, - "learning_rate": 4.999077259571442e-06, - "loss": 1.0176, - "step": 17 - }, - { - "epoch": 0.0972972972972973, - "grad_norm": 4.788509845733643, - "learning_rate": 4.998958320824031e-06, - "loss": 1.0259, - "step": 18 - }, - { - "epoch": 0.10270270270270271, - "grad_norm": 10.027527809143066, - "learning_rate": 4.998832175729179e-06, - "loss": 1.3356, - "step": 19 - }, - { - "epoch": 0.10810810810810811, - "grad_norm": 4.612483978271484, - "learning_rate": 4.998698824650656e-06, - "loss": 1.4486, - "step": 20 - }, - { - "epoch": 0.11351351351351352, - "grad_norm": 3.8676936626434326, - "learning_rate": 4.998558267973014e-06, - "loss": 0.8372, - "step": 21 - }, - { - "epoch": 0.11891891891891893, - "grad_norm": 2.9611001014709473, - "learning_rate": 4.998410506101579e-06, - "loss": 0.7931, - "step": 22 - }, - { - "epoch": 0.12432432432432433, - "grad_norm": 5.508745193481445, - "learning_rate": 4.9982555394624595e-06, - "loss": 1.3022, - "step": 23 - }, - { - "epoch": 0.12972972972972974, - "grad_norm": 3.434845209121704, - "learning_rate": 4.998093368502539e-06, - "loss": 0.9739, - "step": 24 - }, - { - "epoch": 0.13513513513513514, - "grad_norm": 4.736802101135254, - "learning_rate": 4.9979239936894765e-06, - "loss": 1.1154, - "step": 25 - }, - { - "epoch": 0.14054054054054055, - "grad_norm": 3.69411039352417, - "learning_rate": 4.997747415511705e-06, - "loss": 0.7543, - "step": 26 - }, - { - "epoch": 0.14594594594594595, - "grad_norm": 2.8646645545959473, - "learning_rate": 4.997563634478428e-06, - "loss": 0.7278, - "step": 27 - }, - { - "epoch": 0.15135135135135136, - "grad_norm": 6.56904935836792, - "learning_rate": 4.997372651119626e-06, - "loss": 0.8167, - "step": 28 - }, - { - "epoch": 0.15675675675675677, - "grad_norm": 2.955914258956909, - "learning_rate": 4.997174465986044e-06, - "loss": 0.8031, - "step": 29 - }, - { - "epoch": 0.16216216216216217, - "grad_norm": 2.5714259147644043, - "learning_rate": 4.996969079649196e-06, - "loss": 0.689, - "step": 30 - }, - { - "epoch": 0.16756756756756758, - "grad_norm": 3.5165364742279053, - "learning_rate": 4.996756492701362e-06, - "loss": 0.8059, - "step": 31 - }, - { - "epoch": 0.17297297297297298, - "grad_norm": 3.2861921787261963, - "learning_rate": 4.996536705755591e-06, - "loss": 0.9658, - "step": 32 - }, - { - "epoch": 0.1783783783783784, - "grad_norm": 2.962470531463623, - "learning_rate": 4.996309719445687e-06, - "loss": 0.8349, - "step": 33 - }, - { - "epoch": 0.1837837837837838, - "grad_norm": 2.7694804668426514, - "learning_rate": 4.996075534426223e-06, - "loss": 0.8287, - "step": 34 - }, - { - "epoch": 0.1891891891891892, - "grad_norm": 3.405071258544922, - "learning_rate": 4.995834151372526e-06, - "loss": 1.1211, - "step": 35 - }, - { - "epoch": 0.1945945945945946, - "grad_norm": 2.8680710792541504, - "learning_rate": 4.995585570980685e-06, - "loss": 1.0841, - "step": 36 - }, - { - "epoch": 0.2, - "grad_norm": 3.341021776199341, - "learning_rate": 4.995329793967537e-06, - "loss": 0.6182, - "step": 37 - }, - { - "epoch": 0.20540540540540542, - "grad_norm": 3.0639379024505615, - "learning_rate": 4.9950668210706795e-06, - "loss": 0.7647, - "step": 38 - }, - { - "epoch": 0.21081081081081082, - "grad_norm": 3.225759983062744, - "learning_rate": 4.994796653048457e-06, - "loss": 0.8691, - "step": 39 - }, - { - "epoch": 0.21621621621621623, - "grad_norm": 4.56926155090332, - "learning_rate": 4.994519290679965e-06, - "loss": 1.0404, - "step": 40 - }, - { - "epoch": 0.22162162162162163, - "grad_norm": 4.871571063995361, - "learning_rate": 4.994234734765043e-06, - "loss": 1.1877, - "step": 41 - }, - { - "epoch": 0.22702702702702704, - "grad_norm": 3.672215700149536, - "learning_rate": 4.993942986124278e-06, - "loss": 0.959, - "step": 42 - }, - { - "epoch": 0.23243243243243245, - "grad_norm": 3.184683322906494, - "learning_rate": 4.9936440455989975e-06, - "loss": 0.9249, - "step": 43 - }, - { - "epoch": 0.23783783783783785, - "grad_norm": 2.7092034816741943, - "learning_rate": 4.993337914051266e-06, - "loss": 0.6899, - "step": 44 - }, - { - "epoch": 0.24324324324324326, - "grad_norm": 3.153764486312866, - "learning_rate": 4.99302459236389e-06, - "loss": 0.9075, - "step": 45 - }, - { - "epoch": 0.24864864864864866, - "grad_norm": 3.3629748821258545, - "learning_rate": 4.992704081440407e-06, - "loss": 0.785, - "step": 46 - }, - { - "epoch": 0.25405405405405407, - "grad_norm": 4.478365898132324, - "learning_rate": 4.992376382205088e-06, - "loss": 1.008, - "step": 47 - }, - { - "epoch": 0.2594594594594595, - "grad_norm": 3.4001641273498535, - "learning_rate": 4.992041495602932e-06, - "loss": 0.7751, - "step": 48 - }, - { - "epoch": 0.2648648648648649, - "grad_norm": 2.522662878036499, - "learning_rate": 4.991699422599664e-06, - "loss": 0.9022, - "step": 49 - }, - { - "epoch": 0.2702702702702703, - "grad_norm": 2.764458179473877, - "learning_rate": 4.991350164181735e-06, - "loss": 0.8801, - "step": 50 - }, - { - "epoch": 0.2756756756756757, - "grad_norm": 2.814859628677368, - "learning_rate": 4.990993721356317e-06, - "loss": 0.7045, - "step": 51 - }, - { - "epoch": 0.2810810810810811, - "grad_norm": 2.441311836242676, - "learning_rate": 4.990630095151296e-06, - "loss": 0.7312, - "step": 52 - }, - { - "epoch": 0.2864864864864865, - "grad_norm": 2.4443013668060303, - "learning_rate": 4.9902592866152765e-06, - "loss": 0.9609, - "step": 53 - }, - { - "epoch": 0.2918918918918919, - "grad_norm": 2.2934701442718506, - "learning_rate": 4.989881296817575e-06, - "loss": 0.5753, - "step": 54 - }, - { - "epoch": 0.2972972972972973, - "grad_norm": 2.6286847591400146, - "learning_rate": 4.989496126848215e-06, - "loss": 0.5118, - "step": 55 - }, - { - "epoch": 0.3027027027027027, - "grad_norm": 3.6817069053649902, - "learning_rate": 4.989103777817928e-06, - "loss": 1.1261, - "step": 56 - }, - { - "epoch": 0.3081081081081081, - "grad_norm": 3.011197566986084, - "learning_rate": 4.988704250858145e-06, - "loss": 0.7823, - "step": 57 - }, - { - "epoch": 0.31351351351351353, - "grad_norm": 2.5490806102752686, - "learning_rate": 4.988297547121e-06, - "loss": 0.6019, - "step": 58 - }, - { - "epoch": 0.31891891891891894, - "grad_norm": 3.0803146362304688, - "learning_rate": 4.98788366777932e-06, - "loss": 0.825, - "step": 59 - }, - { - "epoch": 0.32432432432432434, - "grad_norm": 3.015730619430542, - "learning_rate": 4.987462614026625e-06, - "loss": 0.7667, - "step": 60 - }, - { - "epoch": 0.32972972972972975, - "grad_norm": 2.5371594429016113, - "learning_rate": 4.987034387077126e-06, - "loss": 0.8051, - "step": 61 - }, - { - "epoch": 0.33513513513513515, - "grad_norm": 2.6414010524749756, - "learning_rate": 4.986598988165718e-06, - "loss": 0.6895, - "step": 62 - }, - { - "epoch": 0.34054054054054056, - "grad_norm": 3.065131187438965, - "learning_rate": 4.9861564185479785e-06, - "loss": 0.9268, - "step": 63 - }, - { - "epoch": 0.34594594594594597, - "grad_norm": 2.5708694458007812, - "learning_rate": 4.985706679500163e-06, - "loss": 0.9854, - "step": 64 - }, - { - "epoch": 0.35135135135135137, - "grad_norm": 2.768915891647339, - "learning_rate": 4.9852497723192025e-06, - "loss": 0.8083, - "step": 65 - }, - { - "epoch": 0.3567567567567568, - "grad_norm": 2.567901849746704, - "learning_rate": 4.9847856983227e-06, - "loss": 0.9098, - "step": 66 - }, - { - "epoch": 0.3621621621621622, - "grad_norm": 2.5766549110412598, - "learning_rate": 4.984314458848923e-06, - "loss": 0.8881, - "step": 67 - }, - { - "epoch": 0.3675675675675676, - "grad_norm": 2.9778389930725098, - "learning_rate": 4.983836055256804e-06, - "loss": 0.9877, - "step": 68 - }, - { - "epoch": 0.372972972972973, - "grad_norm": 2.7225165367126465, - "learning_rate": 4.983350488925935e-06, - "loss": 0.8282, - "step": 69 - }, - { - "epoch": 0.3783783783783784, - "grad_norm": 2.702287197113037, - "learning_rate": 4.982857761256564e-06, - "loss": 1.1756, - "step": 70 - }, - { - "epoch": 0.3837837837837838, - "grad_norm": 2.9815568923950195, - "learning_rate": 4.982357873669589e-06, - "loss": 0.8114, - "step": 71 - }, - { - "epoch": 0.3891891891891892, - "grad_norm": 3.27150297164917, - "learning_rate": 4.981850827606556e-06, - "loss": 0.6763, - "step": 72 - }, - { - "epoch": 0.3945945945945946, - "grad_norm": 2.568423271179199, - "learning_rate": 4.981336624529655e-06, - "loss": 0.9372, - "step": 73 - }, - { - "epoch": 0.4, - "grad_norm": 2.621175527572632, - "learning_rate": 4.980815265921714e-06, - "loss": 1.0155, - "step": 74 - }, - { - "epoch": 0.40540540540540543, - "grad_norm": 2.62827205657959, - "learning_rate": 4.980286753286196e-06, - "loss": 0.949, - "step": 75 - }, - { - "epoch": 0.41081081081081083, - "grad_norm": 2.9462146759033203, - "learning_rate": 4.979751088147192e-06, - "loss": 1.0134, - "step": 76 - }, - { - "epoch": 0.41621621621621624, - "grad_norm": 2.814852714538574, - "learning_rate": 4.979208272049425e-06, - "loss": 0.9722, - "step": 77 - }, - { - "epoch": 0.42162162162162165, - "grad_norm": 4.177679538726807, - "learning_rate": 4.978658306558235e-06, - "loss": 1.2259, - "step": 78 - }, - { - "epoch": 0.42702702702702705, - "grad_norm": 2.813084125518799, - "learning_rate": 4.978101193259578e-06, - "loss": 0.834, - "step": 79 - }, - { - "epoch": 0.43243243243243246, - "grad_norm": 2.71824049949646, - "learning_rate": 4.977536933760025e-06, - "loss": 0.6151, - "step": 80 - }, - { - "epoch": 0.43783783783783786, - "grad_norm": 4.992153167724609, - "learning_rate": 4.976965529686755e-06, - "loss": 1.0475, - "step": 81 - }, - { - "epoch": 0.44324324324324327, - "grad_norm": 2.4810822010040283, - "learning_rate": 4.976386982687548e-06, - "loss": 0.8324, - "step": 82 - }, - { - "epoch": 0.4486486486486487, - "grad_norm": 4.509149074554443, - "learning_rate": 4.9758012944307845e-06, - "loss": 0.997, - "step": 83 - }, - { - "epoch": 0.4540540540540541, - "grad_norm": 3.114325761795044, - "learning_rate": 4.975208466605436e-06, - "loss": 1.2024, - "step": 84 - }, - { - "epoch": 0.4594594594594595, - "grad_norm": 3.297091007232666, - "learning_rate": 4.974608500921064e-06, - "loss": 0.9146, - "step": 85 - }, - { - "epoch": 0.4648648648648649, - "grad_norm": 2.824475049972534, - "learning_rate": 4.974001399107816e-06, - "loss": 0.7181, - "step": 86 - }, - { - "epoch": 0.4702702702702703, - "grad_norm": 20.262290954589844, - "learning_rate": 4.973387162916415e-06, - "loss": 0.8599, - "step": 87 - }, - { - "epoch": 0.4756756756756757, - "grad_norm": 4.015744686126709, - "learning_rate": 4.972765794118158e-06, - "loss": 0.6081, - "step": 88 - }, - { - "epoch": 0.4810810810810811, - "grad_norm": 2.8033058643341064, - "learning_rate": 4.9721372945049114e-06, - "loss": 0.8764, - "step": 89 - }, - { - "epoch": 0.4864864864864865, - "grad_norm": 5.271846294403076, - "learning_rate": 4.971501665889107e-06, - "loss": 0.8622, - "step": 90 - }, - { - "epoch": 0.4918918918918919, - "grad_norm": 2.557264804840088, - "learning_rate": 4.9708589101037306e-06, - "loss": 0.5523, - "step": 91 - }, - { - "epoch": 0.4972972972972973, - "grad_norm": 4.342173099517822, - "learning_rate": 4.970209029002325e-06, - "loss": 0.8922, - "step": 92 - }, - { - "epoch": 0.5027027027027027, - "grad_norm": 2.950364351272583, - "learning_rate": 4.969552024458977e-06, - "loss": 0.9455, - "step": 93 - }, - { - "epoch": 0.5081081081081081, - "grad_norm": 2.6453042030334473, - "learning_rate": 4.968887898368318e-06, - "loss": 0.8342, - "step": 94 - }, - { - "epoch": 0.5135135135135135, - "grad_norm": 3.486766815185547, - "learning_rate": 4.968216652645515e-06, - "loss": 0.8476, - "step": 95 - }, - { - "epoch": 0.518918918918919, - "grad_norm": 2.884152889251709, - "learning_rate": 4.967538289226268e-06, - "loss": 0.8879, - "step": 96 - }, - { - "epoch": 0.5243243243243243, - "grad_norm": 2.4130594730377197, - "learning_rate": 4.966852810066798e-06, - "loss": 0.7114, - "step": 97 - }, - { - "epoch": 0.5297297297297298, - "grad_norm": 3.182410955429077, - "learning_rate": 4.9661602171438524e-06, - "loss": 0.6757, - "step": 98 - }, - { - "epoch": 0.5351351351351351, - "grad_norm": 2.5027542114257812, - "learning_rate": 4.965460512454687e-06, - "loss": 0.8029, - "step": 99 - }, - { - "epoch": 0.5405405405405406, - "grad_norm": 2.3096024990081787, - "learning_rate": 4.964753698017071e-06, - "loss": 0.842, - "step": 100 - }, - { - "epoch": 0.5459459459459459, - "grad_norm": 2.875657081604004, - "learning_rate": 4.964039775869271e-06, - "loss": 0.6339, - "step": 101 - }, - { - "epoch": 0.5513513513513514, - "grad_norm": 2.505406141281128, - "learning_rate": 4.963318748070056e-06, - "loss": 0.7743, - "step": 102 - }, - { - "epoch": 0.5567567567567567, - "grad_norm": 3.552562713623047, - "learning_rate": 4.9625906166986815e-06, - "loss": 0.926, - "step": 103 - }, - { - "epoch": 0.5621621621621622, - "grad_norm": 2.717942476272583, - "learning_rate": 4.961855383854889e-06, - "loss": 0.7037, - "step": 104 - }, - { - "epoch": 0.5675675675675675, - "grad_norm": 2.5049386024475098, - "learning_rate": 4.961113051658901e-06, - "loss": 0.561, - "step": 105 - }, - { - "epoch": 0.572972972972973, - "grad_norm": 2.3112900257110596, - "learning_rate": 4.96036362225141e-06, - "loss": 0.7316, - "step": 106 - }, - { - "epoch": 0.5783783783783784, - "grad_norm": 2.470257520675659, - "learning_rate": 4.959607097793575e-06, - "loss": 0.6426, - "step": 107 - }, - { - "epoch": 0.5837837837837838, - "grad_norm": 3.8040788173675537, - "learning_rate": 4.9588434804670176e-06, - "loss": 1.0044, - "step": 108 - }, - { - "epoch": 0.5891891891891892, - "grad_norm": 3.143547296524048, - "learning_rate": 4.958072772473812e-06, - "loss": 0.9219, - "step": 109 - }, - { - "epoch": 0.5945945945945946, - "grad_norm": 3.5052590370178223, - "learning_rate": 4.9572949760364795e-06, - "loss": 0.6056, - "step": 110 - }, - { - "epoch": 0.6, - "grad_norm": 3.064009428024292, - "learning_rate": 4.9565100933979835e-06, - "loss": 0.6346, - "step": 111 - }, - { - "epoch": 0.6054054054054054, - "grad_norm": 2.694610595703125, - "learning_rate": 4.9557181268217225e-06, - "loss": 0.9856, - "step": 112 - }, - { - "epoch": 0.6108108108108108, - "grad_norm": 2.5885775089263916, - "learning_rate": 4.954919078591521e-06, - "loss": 0.8669, - "step": 113 - }, - { - "epoch": 0.6162162162162163, - "grad_norm": 2.593609571456909, - "learning_rate": 4.954112951011628e-06, - "loss": 0.7201, - "step": 114 - }, - { - "epoch": 0.6216216216216216, - "grad_norm": 3.3045759201049805, - "learning_rate": 4.9532997464067065e-06, - "loss": 0.9095, - "step": 115 - }, - { - "epoch": 0.6270270270270271, - "grad_norm": 2.8144869804382324, - "learning_rate": 4.952479467121828e-06, - "loss": 1.0213, - "step": 116 - }, - { - "epoch": 0.6324324324324324, - "grad_norm": 2.5460312366485596, - "learning_rate": 4.951652115522463e-06, - "loss": 1.1154, - "step": 117 - }, - { - "epoch": 0.6378378378378379, - "grad_norm": 2.795137405395508, - "learning_rate": 4.950817693994481e-06, - "loss": 0.691, - "step": 118 - }, - { - "epoch": 0.6432432432432432, - "grad_norm": 2.4979195594787598, - "learning_rate": 4.949976204944135e-06, - "loss": 0.7224, - "step": 119 - }, - { - "epoch": 0.6486486486486487, - "grad_norm": 3.3131983280181885, - "learning_rate": 4.949127650798063e-06, - "loss": 0.9256, - "step": 120 - }, - { - "epoch": 0.654054054054054, - "grad_norm": 2.9060285091400146, - "learning_rate": 4.948272034003275e-06, - "loss": 0.6892, - "step": 121 - }, - { - "epoch": 0.6594594594594595, - "grad_norm": 3.695594549179077, - "learning_rate": 4.947409357027148e-06, - "loss": 0.5878, - "step": 122 - }, - { - "epoch": 0.6648648648648648, - "grad_norm": 3.1250460147857666, - "learning_rate": 4.9465396223574165e-06, - "loss": 0.9904, - "step": 123 - }, - { - "epoch": 0.6702702702702703, - "grad_norm": 4.024891376495361, - "learning_rate": 4.945662832502172e-06, - "loss": 1.1592, - "step": 124 - }, - { - "epoch": 0.6756756756756757, - "grad_norm": 2.6886494159698486, - "learning_rate": 4.944778989989847e-06, - "loss": 1.0041, - "step": 125 - }, - { - "epoch": 0.6810810810810811, - "grad_norm": 2.366912841796875, - "learning_rate": 4.943888097369216e-06, - "loss": 0.7045, - "step": 126 - }, - { - "epoch": 0.6864864864864865, - "grad_norm": 2.394932270050049, - "learning_rate": 4.942990157209381e-06, - "loss": 0.6685, - "step": 127 - }, - { - "epoch": 0.6918918918918919, - "grad_norm": 2.61933970451355, - "learning_rate": 4.9420851720997674e-06, - "loss": 0.8812, - "step": 128 - }, - { - "epoch": 0.6972972972972973, - "grad_norm": 2.7395646572113037, - "learning_rate": 4.94117314465012e-06, - "loss": 1.3014, - "step": 129 - }, - { - "epoch": 0.7027027027027027, - "grad_norm": 3.065484046936035, - "learning_rate": 4.940254077490487e-06, - "loss": 0.6978, - "step": 130 - }, - { - "epoch": 0.7081081081081081, - "grad_norm": 2.895038366317749, - "learning_rate": 4.939327973271222e-06, - "loss": 0.6249, - "step": 131 - }, - { - "epoch": 0.7135135135135136, - "grad_norm": 3.1773312091827393, - "learning_rate": 4.9383948346629665e-06, - "loss": 0.6423, - "step": 132 - }, - { - "epoch": 0.7189189189189189, - "grad_norm": 2.2378008365631104, - "learning_rate": 4.937454664356652e-06, - "loss": 0.7193, - "step": 133 - }, - { - "epoch": 0.7243243243243244, - "grad_norm": 2.5673701763153076, - "learning_rate": 4.9365074650634855e-06, - "loss": 0.7065, - "step": 134 - }, - { - "epoch": 0.7297297297297297, - "grad_norm": 2.7348387241363525, - "learning_rate": 4.9355532395149445e-06, - "loss": 1.0046, - "step": 135 - }, - { - "epoch": 0.7351351351351352, - "grad_norm": 2.391741991043091, - "learning_rate": 4.9345919904627655e-06, - "loss": 0.6771, - "step": 136 - }, - { - "epoch": 0.7405405405405405, - "grad_norm": 2.2096705436706543, - "learning_rate": 4.933623720678944e-06, - "loss": 0.6589, - "step": 137 - }, - { - "epoch": 0.745945945945946, - "grad_norm": 3.0840072631835938, - "learning_rate": 4.932648432955718e-06, - "loss": 0.8755, - "step": 138 - }, - { - "epoch": 0.7513513513513513, - "grad_norm": 2.4970428943634033, - "learning_rate": 4.931666130105564e-06, - "loss": 0.6685, - "step": 139 - }, - { - "epoch": 0.7567567567567568, - "grad_norm": 4.315455436706543, - "learning_rate": 4.930676814961189e-06, - "loss": 0.8101, - "step": 140 - }, - { - "epoch": 0.7621621621621621, - "grad_norm": 5.388065814971924, - "learning_rate": 4.92968049037552e-06, - "loss": 0.8193, - "step": 141 - }, - { - "epoch": 0.7675675675675676, - "grad_norm": 2.6107139587402344, - "learning_rate": 4.9286771592217005e-06, - "loss": 0.7852, - "step": 142 - }, - { - "epoch": 0.772972972972973, - "grad_norm": 3.936556577682495, - "learning_rate": 4.927666824393076e-06, - "loss": 1.0388, - "step": 143 - }, - { - "epoch": 0.7783783783783784, - "grad_norm": 2.74424409866333, - "learning_rate": 4.926649488803191e-06, - "loss": 0.8266, - "step": 144 - }, - { - "epoch": 0.7837837837837838, - "grad_norm": 2.8998451232910156, - "learning_rate": 4.925625155385776e-06, - "loss": 0.4895, - "step": 145 - }, - { - "epoch": 0.7891891891891892, - "grad_norm": 3.0631520748138428, - "learning_rate": 4.924593827094743e-06, - "loss": 0.8759, - "step": 146 - }, - { - "epoch": 0.7945945945945946, - "grad_norm": 3.233267307281494, - "learning_rate": 4.923555506904176e-06, - "loss": 0.701, - "step": 147 - }, - { - "epoch": 0.8, - "grad_norm": 2.87701416015625, - "learning_rate": 4.922510197808321e-06, - "loss": 1.1327, - "step": 148 - }, - { - "epoch": 0.8054054054054054, - "grad_norm": 3.650576114654541, - "learning_rate": 4.921457902821578e-06, - "loss": 0.7587, - "step": 149 - }, - { - "epoch": 0.8108108108108109, - "grad_norm": 3.232112407684326, - "learning_rate": 4.920398624978493e-06, - "loss": 1.2158, - "step": 150 - }, - { - "epoch": 0.8162162162162162, - "grad_norm": 2.468384027481079, - "learning_rate": 4.919332367333748e-06, - "loss": 0.6852, - "step": 151 - }, - { - "epoch": 0.8216216216216217, - "grad_norm": 2.5947415828704834, - "learning_rate": 4.918259132962154e-06, - "loss": 0.6611, - "step": 152 - }, - { - "epoch": 0.827027027027027, - "grad_norm": 3.0171427726745605, - "learning_rate": 4.917178924958638e-06, - "loss": 0.7327, - "step": 153 - }, - { - "epoch": 0.8324324324324325, - "grad_norm": 3.293184518814087, - "learning_rate": 4.916091746438243e-06, - "loss": 0.8528, - "step": 154 - }, - { - "epoch": 0.8378378378378378, - "grad_norm": 4.0570969581604, - "learning_rate": 4.9149976005361085e-06, - "loss": 0.9141, - "step": 155 - }, - { - "epoch": 0.8432432432432433, - "grad_norm": 2.8782784938812256, - "learning_rate": 4.913896490407467e-06, - "loss": 1.1132, - "step": 156 - }, - { - "epoch": 0.8486486486486486, - "grad_norm": 2.5671517848968506, - "learning_rate": 4.912788419227635e-06, - "loss": 0.7587, - "step": 157 - }, - { - "epoch": 0.8540540540540541, - "grad_norm": 2.9445390701293945, - "learning_rate": 4.911673390192002e-06, - "loss": 0.9227, - "step": 158 - }, - { - "epoch": 0.8594594594594595, - "grad_norm": 2.472595453262329, - "learning_rate": 4.910551406516023e-06, - "loss": 0.8154, - "step": 159 - }, - { - "epoch": 0.8648648648648649, - "grad_norm": 2.5233397483825684, - "learning_rate": 4.909422471435207e-06, - "loss": 0.9897, - "step": 160 - }, - { - "epoch": 0.8702702702702703, - "grad_norm": 3.3919546604156494, - "learning_rate": 4.90828658820511e-06, - "loss": 0.6162, - "step": 161 - }, - { - "epoch": 0.8756756756756757, - "grad_norm": 3.060908555984497, - "learning_rate": 4.907143760101325e-06, - "loss": 0.5734, - "step": 162 - }, - { - "epoch": 0.8810810810810811, - "grad_norm": 3.4584782123565674, - "learning_rate": 4.905993990419472e-06, - "loss": 0.8328, - "step": 163 - }, - { - "epoch": 0.8864864864864865, - "grad_norm": 2.936570644378662, - "learning_rate": 4.904837282475187e-06, - "loss": 0.6787, - "step": 164 - }, - { - "epoch": 0.8918918918918919, - "grad_norm": 2.564837694168091, - "learning_rate": 4.9036736396041165e-06, - "loss": 0.9658, - "step": 165 - }, - { - "epoch": 0.8972972972972973, - "grad_norm": 3.2509360313415527, - "learning_rate": 4.902503065161905e-06, - "loss": 0.7899, - "step": 166 - }, - { - "epoch": 0.9027027027027027, - "grad_norm": 2.9730329513549805, - "learning_rate": 4.901325562524185e-06, - "loss": 0.9476, - "step": 167 - }, - { - "epoch": 0.9081081081081082, - "grad_norm": 3.044980049133301, - "learning_rate": 4.900141135086569e-06, - "loss": 0.7589, - "step": 168 - }, - { - "epoch": 0.9135135135135135, - "grad_norm": 3.030585527420044, - "learning_rate": 4.898949786264638e-06, - "loss": 0.6724, - "step": 169 - }, - { - "epoch": 0.918918918918919, - "grad_norm": 2.249122142791748, - "learning_rate": 4.897751519493933e-06, - "loss": 0.6968, - "step": 170 - }, - { - "epoch": 0.9243243243243243, - "grad_norm": 2.9816982746124268, - "learning_rate": 4.896546338229945e-06, - "loss": 0.7984, - "step": 171 - }, - { - "epoch": 0.9297297297297298, - "grad_norm": 2.415736675262451, - "learning_rate": 4.8953342459481034e-06, - "loss": 0.6109, - "step": 172 - }, - { - "epoch": 0.9351351351351351, - "grad_norm": 2.740518808364868, - "learning_rate": 4.894115246143768e-06, - "loss": 0.8126, - "step": 173 - }, - { - "epoch": 0.9405405405405406, - "grad_norm": 2.7610201835632324, - "learning_rate": 4.892889342332218e-06, - "loss": 0.6862, - "step": 174 - }, - { - "epoch": 0.9459459459459459, - "grad_norm": 3.057025194168091, - "learning_rate": 4.891656538048642e-06, - "loss": 0.9895, - "step": 175 - }, - { - "epoch": 0.9513513513513514, - "grad_norm": 2.569751262664795, - "learning_rate": 4.890416836848128e-06, - "loss": 0.8481, - "step": 176 - }, - { - "epoch": 0.9567567567567568, - "grad_norm": 2.4443397521972656, - "learning_rate": 4.889170242305652e-06, - "loss": 0.6478, - "step": 177 - }, - { - "epoch": 0.9621621621621622, - "grad_norm": 2.5009846687316895, - "learning_rate": 4.887916758016069e-06, - "loss": 0.9714, - "step": 178 - }, - { - "epoch": 0.9675675675675676, - "grad_norm": 3.101975202560425, - "learning_rate": 4.886656387594104e-06, - "loss": 1.1264, - "step": 179 - }, - { - "epoch": 0.972972972972973, - "grad_norm": 2.6144704818725586, - "learning_rate": 4.885389134674338e-06, - "loss": 0.7664, - "step": 180 - }, - { - "epoch": 0.9783783783783784, - "grad_norm": 2.5834381580352783, - "learning_rate": 4.884115002911197e-06, - "loss": 0.6131, - "step": 181 - }, - { - "epoch": 0.9837837837837838, - "grad_norm": 2.5378055572509766, - "learning_rate": 4.88283399597895e-06, - "loss": 0.8733, - "step": 182 - }, - { - "epoch": 0.9891891891891892, - "grad_norm": 2.4095377922058105, - "learning_rate": 4.881546117571686e-06, - "loss": 0.643, - "step": 183 - }, - { - "epoch": 0.9945945945945946, - "grad_norm": 2.9554507732391357, - "learning_rate": 4.8802513714033135e-06, - "loss": 0.7287, - "step": 184 - }, - { - "epoch": 1.0, - "grad_norm": 2.8279213905334473, - "learning_rate": 4.878949761207545e-06, - "loss": 0.9927, - "step": 185 - }, - { - "epoch": 1.0054054054054054, - "grad_norm": 2.9361412525177, - "learning_rate": 4.8776412907378845e-06, - "loss": 0.66, - "step": 186 - }, - { - "epoch": 1.0108108108108107, - "grad_norm": 3.392244338989258, - "learning_rate": 4.876325963767623e-06, - "loss": 0.594, - "step": 187 - }, - { - "epoch": 1.0162162162162163, - "grad_norm": 2.6276044845581055, - "learning_rate": 4.875003784089822e-06, - "loss": 0.5825, - "step": 188 - }, - { - "epoch": 1.0216216216216216, - "grad_norm": 2.2875545024871826, - "learning_rate": 4.873674755517305e-06, - "loss": 0.6594, - "step": 189 - }, - { - "epoch": 1.027027027027027, - "grad_norm": 2.8086795806884766, - "learning_rate": 4.872338881882645e-06, - "loss": 0.7536, - "step": 190 - }, - { - "epoch": 1.0324324324324325, - "grad_norm": 2.3685200214385986, - "learning_rate": 4.870996167038154e-06, - "loss": 0.4849, - "step": 191 - }, - { - "epoch": 1.037837837837838, - "grad_norm": 3.0264766216278076, - "learning_rate": 4.869646614855877e-06, - "loss": 0.3771, - "step": 192 - }, - { - "epoch": 1.0432432432432432, - "grad_norm": 4.335122108459473, - "learning_rate": 4.868290229227567e-06, - "loss": 0.8545, - "step": 193 - }, - { - "epoch": 1.0486486486486486, - "grad_norm": 3.442172050476074, - "learning_rate": 4.866927014064692e-06, - "loss": 0.3698, - "step": 194 - }, - { - "epoch": 1.054054054054054, - "grad_norm": 3.326539993286133, - "learning_rate": 4.86555697329841e-06, - "loss": 0.8468, - "step": 195 - }, - { - "epoch": 1.0594594594594595, - "grad_norm": 3.0372447967529297, - "learning_rate": 4.864180110879562e-06, - "loss": 0.8232, - "step": 196 - }, - { - "epoch": 1.0648648648648649, - "grad_norm": 2.955343008041382, - "learning_rate": 4.862796430778663e-06, - "loss": 0.4097, - "step": 197 - }, - { - "epoch": 1.0702702702702702, - "grad_norm": 2.4095399379730225, - "learning_rate": 4.861405936985889e-06, - "loss": 0.6746, - "step": 198 - }, - { - "epoch": 1.0756756756756758, - "grad_norm": 2.763500452041626, - "learning_rate": 4.860008633511059e-06, - "loss": 0.6605, - "step": 199 - }, - { - "epoch": 1.0810810810810811, - "grad_norm": 2.6751155853271484, - "learning_rate": 4.8586045243836384e-06, - "loss": 0.471, - "step": 200 - }, - { - "epoch": 1.0864864864864865, - "grad_norm": 3.3507862091064453, - "learning_rate": 4.857193613652711e-06, - "loss": 0.7665, - "step": 201 - }, - { - "epoch": 1.0918918918918918, - "grad_norm": 3.3064827919006348, - "learning_rate": 4.8557759053869775e-06, - "loss": 0.6436, - "step": 202 - }, - { - "epoch": 1.0972972972972972, - "grad_norm": 2.571828603744507, - "learning_rate": 4.854351403674741e-06, - "loss": 0.4642, - "step": 203 - }, - { - "epoch": 1.1027027027027028, - "grad_norm": 2.883220911026001, - "learning_rate": 4.852920112623895e-06, - "loss": 0.5737, - "step": 204 - }, - { - "epoch": 1.1081081081081081, - "grad_norm": 3.026144027709961, - "learning_rate": 4.851482036361912e-06, - "loss": 0.7302, - "step": 205 - }, - { - "epoch": 1.1135135135135135, - "grad_norm": 2.6689612865448, - "learning_rate": 4.850037179035829e-06, - "loss": 0.5229, - "step": 206 - }, - { - "epoch": 1.118918918918919, - "grad_norm": 2.4019956588745117, - "learning_rate": 4.8485855448122425e-06, - "loss": 0.5529, - "step": 207 - }, - { - "epoch": 1.1243243243243244, - "grad_norm": 2.3546230792999268, - "learning_rate": 4.847127137877286e-06, - "loss": 0.3635, - "step": 208 - }, - { - "epoch": 1.1297297297297297, - "grad_norm": 2.999096393585205, - "learning_rate": 4.8456619624366285e-06, - "loss": 0.8149, - "step": 209 - }, - { - "epoch": 1.135135135135135, - "grad_norm": 10.072900772094727, - "learning_rate": 4.844190022715456e-06, - "loss": 0.8333, - "step": 210 - }, - { - "epoch": 1.1405405405405404, - "grad_norm": 2.222123384475708, - "learning_rate": 4.84271132295846e-06, - "loss": 0.3717, - "step": 211 - }, - { - "epoch": 1.145945945945946, - "grad_norm": 2.8751113414764404, - "learning_rate": 4.841225867429826e-06, - "loss": 0.5994, - "step": 212 - }, - { - "epoch": 1.1513513513513514, - "grad_norm": 2.9580111503601074, - "learning_rate": 4.839733660413224e-06, - "loss": 0.8382, - "step": 213 - }, - { - "epoch": 1.1567567567567567, - "grad_norm": 4.628892421722412, - "learning_rate": 4.838234706211792e-06, - "loss": 0.818, - "step": 214 - }, - { - "epoch": 1.1621621621621623, - "grad_norm": 2.5103509426116943, - "learning_rate": 4.836729009148124e-06, - "loss": 0.4267, - "step": 215 - }, - { - "epoch": 1.1675675675675676, - "grad_norm": 2.6093738079071045, - "learning_rate": 4.835216573564261e-06, - "loss": 0.3472, - "step": 216 - }, - { - "epoch": 1.172972972972973, - "grad_norm": 3.0792338848114014, - "learning_rate": 4.833697403821672e-06, - "loss": 0.6323, - "step": 217 - }, - { - "epoch": 1.1783783783783783, - "grad_norm": 2.845163345336914, - "learning_rate": 4.8321715043012516e-06, - "loss": 0.6831, - "step": 218 - }, - { - "epoch": 1.1837837837837837, - "grad_norm": 3.0433948040008545, - "learning_rate": 4.830638879403296e-06, - "loss": 0.3682, - "step": 219 - }, - { - "epoch": 1.1891891891891893, - "grad_norm": 2.6533594131469727, - "learning_rate": 4.8290995335475e-06, - "loss": 0.4154, - "step": 220 - }, - { - "epoch": 1.1945945945945946, - "grad_norm": 2.9271352291107178, - "learning_rate": 4.827553471172935e-06, - "loss": 0.3991, - "step": 221 - }, - { - "epoch": 1.2, - "grad_norm": 2.9243528842926025, - "learning_rate": 4.826000696738045e-06, - "loss": 0.4538, - "step": 222 - }, - { - "epoch": 1.2054054054054055, - "grad_norm": 2.537332534790039, - "learning_rate": 4.824441214720629e-06, - "loss": 0.7692, - "step": 223 - }, - { - "epoch": 1.2108108108108109, - "grad_norm": 3.9193246364593506, - "learning_rate": 4.8228750296178275e-06, - "loss": 0.6038, - "step": 224 - }, - { - "epoch": 1.2162162162162162, - "grad_norm": 2.6646728515625, - "learning_rate": 4.821302145946113e-06, - "loss": 0.4147, - "step": 225 - }, - { - "epoch": 1.2216216216216216, - "grad_norm": 2.6519482135772705, - "learning_rate": 4.819722568241274e-06, - "loss": 0.5398, - "step": 226 - }, - { - "epoch": 1.227027027027027, - "grad_norm": 2.2018048763275146, - "learning_rate": 4.818136301058401e-06, - "loss": 0.3864, - "step": 227 - }, - { - "epoch": 1.2324324324324325, - "grad_norm": 2.5660712718963623, - "learning_rate": 4.816543348971879e-06, - "loss": 0.5712, - "step": 228 - }, - { - "epoch": 1.2378378378378379, - "grad_norm": 3.237663745880127, - "learning_rate": 4.814943716575368e-06, - "loss": 0.662, - "step": 229 - }, - { - "epoch": 1.2432432432432432, - "grad_norm": 2.5570430755615234, - "learning_rate": 4.813337408481793e-06, - "loss": 0.8661, - "step": 230 - }, - { - "epoch": 1.2486486486486488, - "grad_norm": 2.9231269359588623, - "learning_rate": 4.811724429323329e-06, - "loss": 0.9218, - "step": 231 - }, - { - "epoch": 1.2540540540540541, - "grad_norm": 3.637084722518921, - "learning_rate": 4.810104783751389e-06, - "loss": 0.5597, - "step": 232 - }, - { - "epoch": 1.2594594594594595, - "grad_norm": 3.0218842029571533, - "learning_rate": 4.8084784764366125e-06, - "loss": 0.4786, - "step": 233 - }, - { - "epoch": 1.2648648648648648, - "grad_norm": 2.770214080810547, - "learning_rate": 4.806845512068846e-06, - "loss": 0.5219, - "step": 234 - }, - { - "epoch": 1.2702702702702702, - "grad_norm": 3.093053102493286, - "learning_rate": 4.805205895357137e-06, - "loss": 0.643, - "step": 235 - }, - { - "epoch": 1.2756756756756757, - "grad_norm": 2.6373348236083984, - "learning_rate": 4.803559631029713e-06, - "loss": 0.5858, - "step": 236 - }, - { - "epoch": 1.281081081081081, - "grad_norm": 2.452030897140503, - "learning_rate": 4.801906723833973e-06, - "loss": 0.4185, - "step": 237 - }, - { - "epoch": 1.2864864864864864, - "grad_norm": 2.72564697265625, - "learning_rate": 4.8002471785364734e-06, - "loss": 0.4917, - "step": 238 - }, - { - "epoch": 1.291891891891892, - "grad_norm": 3.0389158725738525, - "learning_rate": 4.798580999922913e-06, - "loss": 0.645, - "step": 239 - }, - { - "epoch": 1.2972972972972974, - "grad_norm": 3.7002289295196533, - "learning_rate": 4.796908192798117e-06, - "loss": 0.5378, - "step": 240 - }, - { - "epoch": 1.3027027027027027, - "grad_norm": 2.1876111030578613, - "learning_rate": 4.7952287619860276e-06, - "loss": 0.5197, - "step": 241 - }, - { - "epoch": 1.308108108108108, - "grad_norm": 3.903337240219116, - "learning_rate": 4.793542712329689e-06, - "loss": 1.0226, - "step": 242 - }, - { - "epoch": 1.3135135135135134, - "grad_norm": 2.3623552322387695, - "learning_rate": 4.791850048691228e-06, - "loss": 0.5502, - "step": 243 - }, - { - "epoch": 1.318918918918919, - "grad_norm": 3.0669031143188477, - "learning_rate": 4.79015077595185e-06, - "loss": 0.6976, - "step": 244 - }, - { - "epoch": 1.3243243243243243, - "grad_norm": 3.1480472087860107, - "learning_rate": 4.788444899011816e-06, - "loss": 0.4795, - "step": 245 - }, - { - "epoch": 1.3297297297297297, - "grad_norm": 3.7051920890808105, - "learning_rate": 4.786732422790432e-06, - "loss": 0.6526, - "step": 246 - }, - { - "epoch": 1.3351351351351353, - "grad_norm": 3.4358389377593994, - "learning_rate": 4.785013352226036e-06, - "loss": 0.5551, - "step": 247 - }, - { - "epoch": 1.3405405405405406, - "grad_norm": 2.3789355754852295, - "learning_rate": 4.7832876922759805e-06, - "loss": 0.3151, - "step": 248 - }, - { - "epoch": 1.345945945945946, - "grad_norm": 2.4843716621398926, - "learning_rate": 4.781555447916622e-06, - "loss": 0.6713, - "step": 249 - }, - { - "epoch": 1.3513513513513513, - "grad_norm": 3.0176303386688232, - "learning_rate": 4.779816624143302e-06, - "loss": 0.437, - "step": 250 - }, - { - "epoch": 1.3567567567567567, - "grad_norm": 2.868350028991699, - "learning_rate": 4.77807122597034e-06, - "loss": 0.7632, - "step": 251 - }, - { - "epoch": 1.3621621621621622, - "grad_norm": 2.4629738330841064, - "learning_rate": 4.776319258431009e-06, - "loss": 0.4894, - "step": 252 - }, - { - "epoch": 1.3675675675675676, - "grad_norm": 2.798297882080078, - "learning_rate": 4.77456072657753e-06, - "loss": 0.4456, - "step": 253 - }, - { - "epoch": 1.372972972972973, - "grad_norm": 3.2977547645568848, - "learning_rate": 4.772795635481053e-06, - "loss": 0.5381, - "step": 254 - }, - { - "epoch": 1.3783783783783785, - "grad_norm": 4.1061906814575195, - "learning_rate": 4.77102399023164e-06, - "loss": 1.0302, - "step": 255 - }, - { - "epoch": 1.3837837837837839, - "grad_norm": 3.943284511566162, - "learning_rate": 4.769245795938261e-06, - "loss": 0.4875, - "step": 256 - }, - { - "epoch": 1.3891891891891892, - "grad_norm": 2.6420533657073975, - "learning_rate": 4.767461057728763e-06, - "loss": 0.4923, - "step": 257 - }, - { - "epoch": 1.3945945945945946, - "grad_norm": 3.3152263164520264, - "learning_rate": 4.76566978074987e-06, - "loss": 0.6699, - "step": 258 - }, - { - "epoch": 1.4, - "grad_norm": 2.6928882598876953, - "learning_rate": 4.7638719701671586e-06, - "loss": 0.6117, - "step": 259 - }, - { - "epoch": 1.4054054054054055, - "grad_norm": 2.706597328186035, - "learning_rate": 4.762067631165049e-06, - "loss": 0.8534, - "step": 260 - }, - { - "epoch": 1.4108108108108108, - "grad_norm": 2.9912848472595215, - "learning_rate": 4.760256768946787e-06, - "loss": 0.5057, - "step": 261 - }, - { - "epoch": 1.4162162162162162, - "grad_norm": 2.7098443508148193, - "learning_rate": 4.758439388734429e-06, - "loss": 0.7286, - "step": 262 - }, - { - "epoch": 1.4216216216216218, - "grad_norm": 3.1288092136383057, - "learning_rate": 4.7566154957688276e-06, - "loss": 0.9827, - "step": 263 - }, - { - "epoch": 1.427027027027027, - "grad_norm": 3.0505919456481934, - "learning_rate": 4.754785095309617e-06, - "loss": 0.7042, - "step": 264 - }, - { - "epoch": 1.4324324324324325, - "grad_norm": 2.6800339221954346, - "learning_rate": 4.752948192635199e-06, - "loss": 0.5179, - "step": 265 - }, - { - "epoch": 1.4378378378378378, - "grad_norm": 2.2246861457824707, - "learning_rate": 4.751104793042722e-06, - "loss": 0.8527, - "step": 266 - }, - { - "epoch": 1.4432432432432432, - "grad_norm": 2.4242751598358154, - "learning_rate": 4.7492549018480725e-06, - "loss": 0.5627, - "step": 267 - }, - { - "epoch": 1.4486486486486487, - "grad_norm": 2.763244152069092, - "learning_rate": 4.747398524385858e-06, - "loss": 0.8981, - "step": 268 - }, - { - "epoch": 1.454054054054054, - "grad_norm": 2.856595993041992, - "learning_rate": 4.745535666009389e-06, - "loss": 0.5455, - "step": 269 - }, - { - "epoch": 1.4594594594594594, - "grad_norm": 2.4168624877929688, - "learning_rate": 4.743666332090664e-06, - "loss": 0.4348, - "step": 270 - }, - { - "epoch": 1.464864864864865, - "grad_norm": 2.5408060550689697, - "learning_rate": 4.74179052802036e-06, - "loss": 0.5524, - "step": 271 - }, - { - "epoch": 1.4702702702702704, - "grad_norm": 2.6216673851013184, - "learning_rate": 4.739908259207807e-06, - "loss": 0.7469, - "step": 272 - }, - { - "epoch": 1.4756756756756757, - "grad_norm": 5.397300720214844, - "learning_rate": 4.738019531080981e-06, - "loss": 0.7216, - "step": 273 - }, - { - "epoch": 1.481081081081081, - "grad_norm": 3.3481080532073975, - "learning_rate": 4.7361243490864825e-06, - "loss": 0.7527, - "step": 274 - }, - { - "epoch": 1.4864864864864864, - "grad_norm": 2.7943873405456543, - "learning_rate": 4.734222718689527e-06, - "loss": 0.7437, - "step": 275 - }, - { - "epoch": 1.491891891891892, - "grad_norm": 2.206890344619751, - "learning_rate": 4.732314645373922e-06, - "loss": 0.5187, - "step": 276 - }, - { - "epoch": 1.4972972972972973, - "grad_norm": 2.76442813873291, - "learning_rate": 4.730400134642055e-06, - "loss": 0.7186, - "step": 277 - }, - { - "epoch": 1.5027027027027027, - "grad_norm": 3.4754087924957275, - "learning_rate": 4.728479192014879e-06, - "loss": 0.9655, - "step": 278 - }, - { - "epoch": 1.5081081081081082, - "grad_norm": 2.923779249191284, - "learning_rate": 4.726551823031895e-06, - "loss": 0.6251, - "step": 279 - }, - { - "epoch": 1.5135135135135136, - "grad_norm": 3.1142773628234863, - "learning_rate": 4.7246180332511335e-06, - "loss": 0.4805, - "step": 280 - }, - { - "epoch": 1.518918918918919, - "grad_norm": 2.3477070331573486, - "learning_rate": 4.722677828249142e-06, - "loss": 1.0939, - "step": 281 - }, - { - "epoch": 1.5243243243243243, - "grad_norm": 2.8418569564819336, - "learning_rate": 4.720731213620972e-06, - "loss": 0.9485, - "step": 282 - }, - { - "epoch": 1.5297297297297296, - "grad_norm": 2.462710380554199, - "learning_rate": 4.718778194980152e-06, - "loss": 0.5805, - "step": 283 - }, - { - "epoch": 1.535135135135135, - "grad_norm": 3.2379209995269775, - "learning_rate": 4.7168187779586805e-06, - "loss": 0.77, - "step": 284 - }, - { - "epoch": 1.5405405405405406, - "grad_norm": 3.0701661109924316, - "learning_rate": 4.71485296820701e-06, - "loss": 0.5932, - "step": 285 - }, - { - "epoch": 1.545945945945946, - "grad_norm": 4.099547386169434, - "learning_rate": 4.7128807713940245e-06, - "loss": 0.6296, - "step": 286 - }, - { - "epoch": 1.5513513513513515, - "grad_norm": 2.5529167652130127, - "learning_rate": 4.710902193207028e-06, - "loss": 0.6201, - "step": 287 - }, - { - "epoch": 1.5567567567567568, - "grad_norm": 2.794926881790161, - "learning_rate": 4.708917239351727e-06, - "loss": 0.5682, - "step": 288 - }, - { - "epoch": 1.5621621621621622, - "grad_norm": 3.2522501945495605, - "learning_rate": 4.706925915552214e-06, - "loss": 0.8877, - "step": 289 - }, - { - "epoch": 1.5675675675675675, - "grad_norm": 2.811847448348999, - "learning_rate": 4.704928227550949e-06, - "loss": 0.6521, - "step": 290 - }, - { - "epoch": 1.572972972972973, - "grad_norm": 2.7060673236846924, - "learning_rate": 4.702924181108745e-06, - "loss": 0.4929, - "step": 291 - }, - { - "epoch": 1.5783783783783782, - "grad_norm": 2.5009031295776367, - "learning_rate": 4.700913782004755e-06, - "loss": 0.4515, - "step": 292 - }, - { - "epoch": 1.5837837837837838, - "grad_norm": 2.6722700595855713, - "learning_rate": 4.698897036036446e-06, - "loss": 0.5477, - "step": 293 - }, - { - "epoch": 1.5891891891891892, - "grad_norm": 3.3333957195281982, - "learning_rate": 4.696873949019591e-06, - "loss": 0.9589, - "step": 294 - }, - { - "epoch": 1.5945945945945947, - "grad_norm": 2.4862897396087646, - "learning_rate": 4.694844526788248e-06, - "loss": 0.4425, - "step": 295 - }, - { - "epoch": 1.6, - "grad_norm": 2.78708553314209, - "learning_rate": 4.692808775194745e-06, - "loss": 0.4899, - "step": 296 - }, - { - "epoch": 1.6054054054054054, - "grad_norm": 2.9121289253234863, - "learning_rate": 4.690766700109659e-06, - "loss": 0.4884, - "step": 297 - }, - { - "epoch": 1.6108108108108108, - "grad_norm": 4.692054271697998, - "learning_rate": 4.688718307421807e-06, - "loss": 0.8977, - "step": 298 - }, - { - "epoch": 1.6162162162162161, - "grad_norm": 3.1290926933288574, - "learning_rate": 4.686663603038222e-06, - "loss": 0.6833, - "step": 299 - }, - { - "epoch": 1.6216216216216215, - "grad_norm": 3.5091123580932617, - "learning_rate": 4.6846025928841365e-06, - "loss": 0.9141, - "step": 300 - }, - { - "epoch": 1.627027027027027, - "grad_norm": 2.5466184616088867, - "learning_rate": 4.6825352829029705e-06, - "loss": 0.5121, - "step": 301 - }, - { - "epoch": 1.6324324324324324, - "grad_norm": 2.7833092212677, - "learning_rate": 4.68046167905631e-06, - "loss": 0.5399, - "step": 302 - }, - { - "epoch": 1.637837837837838, - "grad_norm": 3.05135440826416, - "learning_rate": 4.678381787323889e-06, - "loss": 0.7921, - "step": 303 - }, - { - "epoch": 1.6432432432432433, - "grad_norm": 2.2391726970672607, - "learning_rate": 4.676295613703577e-06, - "loss": 0.7178, - "step": 304 - }, - { - "epoch": 1.6486486486486487, - "grad_norm": 2.3654022216796875, - "learning_rate": 4.674203164211357e-06, - "loss": 0.7162, - "step": 305 - }, - { - "epoch": 1.654054054054054, - "grad_norm": 2.436009645462036, - "learning_rate": 4.67210444488131e-06, - "loss": 0.6539, - "step": 306 - }, - { - "epoch": 1.6594594594594594, - "grad_norm": 2.6034209728240967, - "learning_rate": 4.669999461765599e-06, - "loss": 0.7214, - "step": 307 - }, - { - "epoch": 1.6648648648648647, - "grad_norm": 2.804229497909546, - "learning_rate": 4.6678882209344474e-06, - "loss": 0.7451, - "step": 308 - }, - { - "epoch": 1.6702702702702703, - "grad_norm": 2.6239655017852783, - "learning_rate": 4.665770728476127e-06, - "loss": 0.6464, - "step": 309 - }, - { - "epoch": 1.6756756756756757, - "grad_norm": 2.9320099353790283, - "learning_rate": 4.663646990496939e-06, - "loss": 0.6669, - "step": 310 - }, - { - "epoch": 1.6810810810810812, - "grad_norm": 3.09713077545166, - "learning_rate": 4.661517013121189e-06, - "loss": 0.8972, - "step": 311 - }, - { - "epoch": 1.6864864864864866, - "grad_norm": 3.6576132774353027, - "learning_rate": 4.659380802491181e-06, - "loss": 0.6286, - "step": 312 - }, - { - "epoch": 1.691891891891892, - "grad_norm": 2.9320433139801025, - "learning_rate": 4.6572383647671915e-06, - "loss": 0.3631, - "step": 313 - }, - { - "epoch": 1.6972972972972973, - "grad_norm": 3.399357557296753, - "learning_rate": 4.655089706127457e-06, - "loss": 0.5682, - "step": 314 - }, - { - "epoch": 1.7027027027027026, - "grad_norm": 2.7667412757873535, - "learning_rate": 4.652934832768148e-06, - "loss": 0.5457, - "step": 315 - }, - { - "epoch": 1.708108108108108, - "grad_norm": 2.3023321628570557, - "learning_rate": 4.650773750903363e-06, - "loss": 0.6601, - "step": 316 - }, - { - "epoch": 1.7135135135135136, - "grad_norm": 2.6584670543670654, - "learning_rate": 4.6486064667651005e-06, - "loss": 0.5882, - "step": 317 - }, - { - "epoch": 1.718918918918919, - "grad_norm": 5.528168678283691, - "learning_rate": 4.646432986603245e-06, - "loss": 0.7628, - "step": 318 - }, - { - "epoch": 1.7243243243243245, - "grad_norm": 3.054884195327759, - "learning_rate": 4.644253316685552e-06, - "loss": 0.6877, - "step": 319 - }, - { - "epoch": 1.7297297297297298, - "grad_norm": 3.2672388553619385, - "learning_rate": 4.6420674632976205e-06, - "loss": 0.7026, - "step": 320 - }, - { - "epoch": 1.7351351351351352, - "grad_norm": 3.109384536743164, - "learning_rate": 4.639875432742886e-06, - "loss": 0.5236, - "step": 321 - }, - { - "epoch": 1.7405405405405405, - "grad_norm": 3.3593883514404297, - "learning_rate": 4.6376772313425975e-06, - "loss": 0.6463, - "step": 322 - }, - { - "epoch": 1.7459459459459459, - "grad_norm": 2.6352698802948, - "learning_rate": 4.635472865435795e-06, - "loss": 0.6903, - "step": 323 - }, - { - "epoch": 1.7513513513513512, - "grad_norm": 2.751690149307251, - "learning_rate": 4.6332623413792995e-06, - "loss": 0.7342, - "step": 324 - }, - { - "epoch": 1.7567567567567568, - "grad_norm": 2.670915126800537, - "learning_rate": 4.6310456655476874e-06, - "loss": 0.4302, - "step": 325 - }, - { - "epoch": 1.7621621621621621, - "grad_norm": 2.7648138999938965, - "learning_rate": 4.6288228443332786e-06, - "loss": 0.5108, - "step": 326 - }, - { - "epoch": 1.7675675675675677, - "grad_norm": 2.7451536655426025, - "learning_rate": 4.626593884146111e-06, - "loss": 0.7646, - "step": 327 - }, - { - "epoch": 1.772972972972973, - "grad_norm": 2.4656403064727783, - "learning_rate": 4.624358791413928e-06, - "loss": 0.5529, - "step": 328 - }, - { - "epoch": 1.7783783783783784, - "grad_norm": 2.5987517833709717, - "learning_rate": 4.622117572582159e-06, - "loss": 0.609, - "step": 329 - }, - { - "epoch": 1.7837837837837838, - "grad_norm": 3.3843371868133545, - "learning_rate": 4.619870234113894e-06, - "loss": 0.9146, - "step": 330 - }, - { - "epoch": 1.7891891891891891, - "grad_norm": 2.3542068004608154, - "learning_rate": 4.617616782489878e-06, - "loss": 0.6887, - "step": 331 - }, - { - "epoch": 1.7945945945945945, - "grad_norm": 2.2049715518951416, - "learning_rate": 4.615357224208477e-06, - "loss": 0.505, - "step": 332 - }, - { - "epoch": 1.8, - "grad_norm": 2.453920364379883, - "learning_rate": 4.613091565785674e-06, - "loss": 0.8384, - "step": 333 - }, - { - "epoch": 1.8054054054054054, - "grad_norm": 2.5751583576202393, - "learning_rate": 4.610819813755038e-06, - "loss": 0.5512, - "step": 334 - }, - { - "epoch": 1.810810810810811, - "grad_norm": 2.524075984954834, - "learning_rate": 4.608541974667714e-06, - "loss": 0.4877, - "step": 335 - }, - { - "epoch": 1.8162162162162163, - "grad_norm": 2.2856955528259277, - "learning_rate": 4.606258055092397e-06, - "loss": 0.5583, - "step": 336 - }, - { - "epoch": 1.8216216216216217, - "grad_norm": 2.2773683071136475, - "learning_rate": 4.603968061615321e-06, - "loss": 0.5421, - "step": 337 - }, - { - "epoch": 1.827027027027027, - "grad_norm": 4.085512161254883, - "learning_rate": 4.601672000840231e-06, - "loss": 0.942, - "step": 338 - }, - { - "epoch": 1.8324324324324324, - "grad_norm": 2.3710968494415283, - "learning_rate": 4.5993698793883715e-06, - "loss": 0.3773, - "step": 339 - }, - { - "epoch": 1.8378378378378377, - "grad_norm": 2.745534658432007, - "learning_rate": 4.597061703898462e-06, - "loss": 0.9694, - "step": 340 - }, - { - "epoch": 1.8432432432432433, - "grad_norm": 2.463207244873047, - "learning_rate": 4.594747481026685e-06, - "loss": 0.4667, - "step": 341 - }, - { - "epoch": 1.8486486486486486, - "grad_norm": 2.7216601371765137, - "learning_rate": 4.592427217446656e-06, - "loss": 0.4267, - "step": 342 - }, - { - "epoch": 1.8540540540540542, - "grad_norm": 2.545664072036743, - "learning_rate": 4.590100919849413e-06, - "loss": 0.9245, - "step": 343 - }, - { - "epoch": 1.8594594594594596, - "grad_norm": 3.692840337753296, - "learning_rate": 4.587768594943396e-06, - "loss": 0.7502, - "step": 344 - }, - { - "epoch": 1.864864864864865, - "grad_norm": 2.993229627609253, - "learning_rate": 4.585430249454426e-06, - "loss": 0.4689, - "step": 345 - }, - { - "epoch": 1.8702702702702703, - "grad_norm": 2.162867546081543, - "learning_rate": 4.583085890125682e-06, - "loss": 0.6188, - "step": 346 - }, - { - "epoch": 1.8756756756756756, - "grad_norm": 2.2169792652130127, - "learning_rate": 4.5807355237176896e-06, - "loss": 0.6352, - "step": 347 - }, - { - "epoch": 1.881081081081081, - "grad_norm": 3.978985548019409, - "learning_rate": 4.578379157008296e-06, - "loss": 0.464, - "step": 348 - }, - { - "epoch": 1.8864864864864865, - "grad_norm": 2.236682653427124, - "learning_rate": 4.57601679679265e-06, - "loss": 0.5943, - "step": 349 - }, - { - "epoch": 1.8918918918918919, - "grad_norm": 2.528754472732544, - "learning_rate": 4.573648449883188e-06, - "loss": 0.6949, - "step": 350 - }, - { - "epoch": 1.8972972972972975, - "grad_norm": 2.7673721313476562, - "learning_rate": 4.571274123109606e-06, - "loss": 0.4333, - "step": 351 - }, - { - "epoch": 1.9027027027027028, - "grad_norm": 2.698012351989746, - "learning_rate": 4.568893823318847e-06, - "loss": 0.6796, - "step": 352 - }, - { - "epoch": 1.9081081081081082, - "grad_norm": 2.9640560150146484, - "learning_rate": 4.566507557375077e-06, - "loss": 0.6139, - "step": 353 - }, - { - "epoch": 1.9135135135135135, - "grad_norm": 2.417628526687622, - "learning_rate": 4.5641153321596684e-06, - "loss": 0.4515, - "step": 354 - }, - { - "epoch": 1.9189189189189189, - "grad_norm": 2.676739454269409, - "learning_rate": 4.56171715457118e-06, - "loss": 0.8426, - "step": 355 - }, - { - "epoch": 1.9243243243243242, - "grad_norm": 2.8428189754486084, - "learning_rate": 4.559313031525331e-06, - "loss": 0.5806, - "step": 356 - }, - { - "epoch": 1.9297297297297298, - "grad_norm": 2.6817944049835205, - "learning_rate": 4.55690296995499e-06, - "loss": 0.5927, - "step": 357 - }, - { - "epoch": 1.9351351351351351, - "grad_norm": 3.5939931869506836, - "learning_rate": 4.554486976810149e-06, - "loss": 0.9986, - "step": 358 - }, - { - "epoch": 1.9405405405405407, - "grad_norm": 2.86688494682312, - "learning_rate": 4.552065059057906e-06, - "loss": 0.6813, - "step": 359 - }, - { - "epoch": 1.945945945945946, - "grad_norm": 2.9295246601104736, - "learning_rate": 4.549637223682441e-06, - "loss": 1.0832, - "step": 360 - }, - { - "epoch": 1.9513513513513514, - "grad_norm": 2.6939451694488525, - "learning_rate": 4.547203477685005e-06, - "loss": 0.7377, - "step": 361 - }, - { - "epoch": 1.9567567567567568, - "grad_norm": 2.226055145263672, - "learning_rate": 4.544763828083888e-06, - "loss": 0.5412, - "step": 362 - }, - { - "epoch": 1.962162162162162, - "grad_norm": 2.490187406539917, - "learning_rate": 4.542318281914405e-06, - "loss": 0.6955, - "step": 363 - }, - { - "epoch": 1.9675675675675675, - "grad_norm": 2.9241302013397217, - "learning_rate": 4.53986684622888e-06, - "loss": 0.6774, - "step": 364 - }, - { - "epoch": 1.972972972972973, - "grad_norm": 2.988084554672241, - "learning_rate": 4.537409528096615e-06, - "loss": 0.5832, - "step": 365 - }, - { - "epoch": 1.9783783783783784, - "grad_norm": 2.9380626678466797, - "learning_rate": 4.534946334603879e-06, - "loss": 0.606, - "step": 366 - }, - { - "epoch": 1.983783783783784, - "grad_norm": 2.667588710784912, - "learning_rate": 4.532477272853882e-06, - "loss": 0.4991, - "step": 367 - }, - { - "epoch": 1.9891891891891893, - "grad_norm": 2.9711899757385254, - "learning_rate": 4.530002349966759e-06, - "loss": 0.4442, - "step": 368 - }, - { - "epoch": 1.9945945945945946, - "grad_norm": 3.443957805633545, - "learning_rate": 4.5275215730795445e-06, - "loss": 0.6566, - "step": 369 - }, - { - "epoch": 2.0, - "grad_norm": 3.590317487716675, - "learning_rate": 4.525034949346156e-06, - "loss": 0.5687, - "step": 370 - }, - { - "epoch": 2.0054054054054054, - "grad_norm": 3.678600549697876, - "learning_rate": 4.522542485937369e-06, - "loss": 0.4458, - "step": 371 - }, - { - "epoch": 2.0108108108108107, - "grad_norm": 3.803563356399536, - "learning_rate": 4.5200441900408045e-06, - "loss": 0.4418, - "step": 372 - }, - { - "epoch": 2.016216216216216, - "grad_norm": 2.9187233448028564, - "learning_rate": 4.517540068860898e-06, - "loss": 0.7057, - "step": 373 - }, - { - "epoch": 2.0216216216216214, - "grad_norm": 2.693603515625, - "learning_rate": 4.515030129618884e-06, - "loss": 0.4491, - "step": 374 - }, - { - "epoch": 2.027027027027027, - "grad_norm": 2.3883047103881836, - "learning_rate": 4.512514379552779e-06, - "loss": 0.3571, - "step": 375 - }, - { - "epoch": 2.0324324324324325, - "grad_norm": 4.558557033538818, - "learning_rate": 4.509992825917352e-06, - "loss": 0.5056, - "step": 376 - }, - { - "epoch": 2.037837837837838, - "grad_norm": 3.9574761390686035, - "learning_rate": 4.507465475984109e-06, - "loss": 0.6834, - "step": 377 - }, - { - "epoch": 2.0432432432432432, - "grad_norm": 5.34630012512207, - "learning_rate": 4.504932337041272e-06, - "loss": 0.6726, - "step": 378 - }, - { - "epoch": 2.0486486486486486, - "grad_norm": 3.198740243911743, - "learning_rate": 4.502393416393757e-06, - "loss": 0.4032, - "step": 379 - }, - { - "epoch": 2.054054054054054, - "grad_norm": 3.347480297088623, - "learning_rate": 4.4998487213631515e-06, - "loss": 0.5442, - "step": 380 - }, - { - "epoch": 2.0594594594594593, - "grad_norm": 3.940531015396118, - "learning_rate": 4.497298259287696e-06, - "loss": 0.6181, - "step": 381 - }, - { - "epoch": 2.064864864864865, - "grad_norm": 3.0910496711730957, - "learning_rate": 4.494742037522261e-06, - "loss": 0.3829, - "step": 382 - }, - { - "epoch": 2.0702702702702704, - "grad_norm": 4.060451984405518, - "learning_rate": 4.4921800634383295e-06, - "loss": 0.4953, - "step": 383 - }, - { - "epoch": 2.075675675675676, - "grad_norm": 3.1667511463165283, - "learning_rate": 4.4896123444239655e-06, - "loss": 0.3254, - "step": 384 - }, - { - "epoch": 2.081081081081081, - "grad_norm": 3.0239670276641846, - "learning_rate": 4.487038887883809e-06, - "loss": 0.555, - "step": 385 - }, - { - "epoch": 2.0864864864864865, - "grad_norm": 2.8815383911132812, - "learning_rate": 4.484459701239038e-06, - "loss": 0.665, - "step": 386 - }, - { - "epoch": 2.091891891891892, - "grad_norm": 3.615537166595459, - "learning_rate": 4.481874791927358e-06, - "loss": 0.2652, - "step": 387 - }, - { - "epoch": 2.097297297297297, - "grad_norm": 3.407407283782959, - "learning_rate": 4.479284167402977e-06, - "loss": 0.3811, - "step": 388 - }, - { - "epoch": 2.1027027027027025, - "grad_norm": 2.6651623249053955, - "learning_rate": 4.476687835136585e-06, - "loss": 0.2463, - "step": 389 - }, - { - "epoch": 2.108108108108108, - "grad_norm": 3.5145862102508545, - "learning_rate": 4.47408580261533e-06, - "loss": 0.5507, - "step": 390 - }, - { - "epoch": 2.1135135135135137, - "grad_norm": 3.0952725410461426, - "learning_rate": 4.471478077342798e-06, - "loss": 0.288, - "step": 391 - }, - { - "epoch": 2.118918918918919, - "grad_norm": 2.634775400161743, - "learning_rate": 4.468864666838994e-06, - "loss": 0.5169, - "step": 392 - }, - { - "epoch": 2.1243243243243244, - "grad_norm": 3.7388594150543213, - "learning_rate": 4.4662455786403125e-06, - "loss": 0.3327, - "step": 393 - }, - { - "epoch": 2.1297297297297297, - "grad_norm": 3.8197360038757324, - "learning_rate": 4.463620820299528e-06, - "loss": 0.3877, - "step": 394 - }, - { - "epoch": 2.135135135135135, - "grad_norm": 3.0073485374450684, - "learning_rate": 4.4609903993857606e-06, - "loss": 0.5425, - "step": 395 - }, - { - "epoch": 2.1405405405405404, - "grad_norm": 2.6923868656158447, - "learning_rate": 4.458354323484462e-06, - "loss": 0.5257, - "step": 396 - }, - { - "epoch": 2.145945945945946, - "grad_norm": 3.2151331901550293, - "learning_rate": 4.45571260019739e-06, - "loss": 0.3914, - "step": 397 - }, - { - "epoch": 2.1513513513513516, - "grad_norm": 3.4031248092651367, - "learning_rate": 4.453065237142592e-06, - "loss": 0.3455, - "step": 398 - }, - { - "epoch": 2.156756756756757, - "grad_norm": 3.012275457382202, - "learning_rate": 4.4504122419543745e-06, - "loss": 0.4652, - "step": 399 - }, - { - "epoch": 2.1621621621621623, - "grad_norm": 3.3084208965301514, - "learning_rate": 4.4477536222832865e-06, - "loss": 0.6343, - "step": 400 - }, - { - "epoch": 2.1675675675675676, - "grad_norm": 3.115206241607666, - "learning_rate": 4.445089385796099e-06, - "loss": 0.6975, - "step": 401 - }, - { - "epoch": 2.172972972972973, - "grad_norm": 2.893930435180664, - "learning_rate": 4.442419540175778e-06, - "loss": 0.5779, - "step": 402 - }, - { - "epoch": 2.1783783783783783, - "grad_norm": 3.0549168586730957, - "learning_rate": 4.439744093121465e-06, - "loss": 0.4541, - "step": 403 - }, - { - "epoch": 2.1837837837837837, - "grad_norm": 3.1189024448394775, - "learning_rate": 4.437063052348457e-06, - "loss": 0.4078, - "step": 404 - }, - { - "epoch": 2.189189189189189, - "grad_norm": 6.644659042358398, - "learning_rate": 4.434376425588179e-06, - "loss": 0.6759, - "step": 405 - }, - { - "epoch": 2.1945945945945944, - "grad_norm": 2.807554006576538, - "learning_rate": 4.431684220588163e-06, - "loss": 0.2938, - "step": 406 - }, - { - "epoch": 2.2, - "grad_norm": 3.6900999546051025, - "learning_rate": 4.428986445112034e-06, - "loss": 0.676, - "step": 407 - }, - { - "epoch": 2.2054054054054055, - "grad_norm": 2.0721664428710938, - "learning_rate": 4.426283106939474e-06, - "loss": 0.1859, - "step": 408 - }, - { - "epoch": 2.210810810810811, - "grad_norm": 2.953388214111328, - "learning_rate": 4.423574213866209e-06, - "loss": 0.2955, - "step": 409 - }, - { - "epoch": 2.2162162162162162, - "grad_norm": 3.049050807952881, - "learning_rate": 4.420859773703985e-06, - "loss": 0.2262, - "step": 410 - }, - { - "epoch": 2.2216216216216216, - "grad_norm": 3.319796323776245, - "learning_rate": 4.418139794280542e-06, - "loss": 0.2273, - "step": 411 - }, - { - "epoch": 2.227027027027027, - "grad_norm": 2.4133522510528564, - "learning_rate": 4.415414283439595e-06, - "loss": 0.3282, - "step": 412 - }, - { - "epoch": 2.2324324324324323, - "grad_norm": 2.9842193126678467, - "learning_rate": 4.4126832490408116e-06, - "loss": 0.3651, - "step": 413 - }, - { - "epoch": 2.237837837837838, - "grad_norm": 2.759531259536743, - "learning_rate": 4.409946698959784e-06, - "loss": 0.4052, - "step": 414 - }, - { - "epoch": 2.2432432432432434, - "grad_norm": 3.045485019683838, - "learning_rate": 4.4072046410880145e-06, - "loss": 0.4638, - "step": 415 - }, - { - "epoch": 2.2486486486486488, - "grad_norm": 3.0058295726776123, - "learning_rate": 4.404457083332887e-06, - "loss": 0.517, - "step": 416 - }, - { - "epoch": 2.254054054054054, - "grad_norm": 3.025688409805298, - "learning_rate": 4.401704033617643e-06, - "loss": 0.6902, - "step": 417 - }, - { - "epoch": 2.2594594594594595, - "grad_norm": 3.3047802448272705, - "learning_rate": 4.398945499881366e-06, - "loss": 0.3552, - "step": 418 - }, - { - "epoch": 2.264864864864865, - "grad_norm": 3.0683655738830566, - "learning_rate": 4.396181490078949e-06, - "loss": 0.286, - "step": 419 - }, - { - "epoch": 2.27027027027027, - "grad_norm": 3.627681016921997, - "learning_rate": 4.393412012181082e-06, - "loss": 0.4036, - "step": 420 - }, - { - "epoch": 2.2756756756756755, - "grad_norm": 4.552238941192627, - "learning_rate": 4.390637074174219e-06, - "loss": 0.8037, - "step": 421 - }, - { - "epoch": 2.281081081081081, - "grad_norm": 2.8688855171203613, - "learning_rate": 4.387856684060561e-06, - "loss": 0.2553, - "step": 422 - }, - { - "epoch": 2.2864864864864867, - "grad_norm": 4.21850061416626, - "learning_rate": 4.385070849858033e-06, - "loss": 0.6222, - "step": 423 - }, - { - "epoch": 2.291891891891892, - "grad_norm": 3.038433790206909, - "learning_rate": 4.382279579600257e-06, - "loss": 0.5326, - "step": 424 - }, - { - "epoch": 2.2972972972972974, - "grad_norm": 3.297300338745117, - "learning_rate": 4.379482881336532e-06, - "loss": 0.5515, - "step": 425 - }, - { - "epoch": 2.3027027027027027, - "grad_norm": 7.162952423095703, - "learning_rate": 4.376680763131811e-06, - "loss": 0.6948, - "step": 426 - }, - { - "epoch": 2.308108108108108, - "grad_norm": 3.2403595447540283, - "learning_rate": 4.373873233066676e-06, - "loss": 0.2947, - "step": 427 - }, - { - "epoch": 2.3135135135135134, - "grad_norm": 3.2969906330108643, - "learning_rate": 4.371060299237315e-06, - "loss": 0.2261, - "step": 428 - }, - { - "epoch": 2.3189189189189188, - "grad_norm": 2.669058322906494, - "learning_rate": 4.368241969755499e-06, - "loss": 0.5398, - "step": 429 - }, - { - "epoch": 2.3243243243243246, - "grad_norm": 2.7643518447875977, - "learning_rate": 4.36541825274856e-06, - "loss": 0.3301, - "step": 430 - }, - { - "epoch": 2.32972972972973, - "grad_norm": 3.6037657260894775, - "learning_rate": 4.3625891563593635e-06, - "loss": 0.6064, - "step": 431 - }, - { - "epoch": 2.3351351351351353, - "grad_norm": 2.8805618286132812, - "learning_rate": 4.35975468874629e-06, - "loss": 0.3897, - "step": 432 - }, - { - "epoch": 2.3405405405405406, - "grad_norm": 2.642402172088623, - "learning_rate": 4.356914858083211e-06, - "loss": 0.271, - "step": 433 - }, - { - "epoch": 2.345945945945946, - "grad_norm": 2.916337490081787, - "learning_rate": 4.354069672559458e-06, - "loss": 0.3681, - "step": 434 - }, - { - "epoch": 2.3513513513513513, - "grad_norm": 3.3312325477600098, - "learning_rate": 4.35121914037981e-06, - "loss": 0.298, - "step": 435 - }, - { - "epoch": 2.3567567567567567, - "grad_norm": 2.980583906173706, - "learning_rate": 4.348363269764462e-06, - "loss": 0.3618, - "step": 436 - }, - { - "epoch": 2.362162162162162, - "grad_norm": 3.5010197162628174, - "learning_rate": 4.345502068949003e-06, - "loss": 0.8972, - "step": 437 - }, - { - "epoch": 2.3675675675675674, - "grad_norm": 2.7187814712524414, - "learning_rate": 4.342635546184394e-06, - "loss": 0.3939, - "step": 438 - }, - { - "epoch": 2.372972972972973, - "grad_norm": 2.8368170261383057, - "learning_rate": 4.339763709736944e-06, - "loss": 0.5462, - "step": 439 - }, - { - "epoch": 2.3783783783783785, - "grad_norm": 2.6989636421203613, - "learning_rate": 4.336886567888283e-06, - "loss": 0.5932, - "step": 440 - }, - { - "epoch": 2.383783783783784, - "grad_norm": 3.2514829635620117, - "learning_rate": 4.334004128935342e-06, - "loss": 0.4622, - "step": 441 - }, - { - "epoch": 2.389189189189189, - "grad_norm": 5.242766857147217, - "learning_rate": 4.331116401190327e-06, - "loss": 0.5997, - "step": 442 - }, - { - "epoch": 2.3945945945945946, - "grad_norm": 3.492724657058716, - "learning_rate": 4.328223392980696e-06, - "loss": 0.3072, - "step": 443 - }, - { - "epoch": 2.4, - "grad_norm": 4.074132442474365, - "learning_rate": 4.325325112649134e-06, - "loss": 0.5338, - "step": 444 - }, - { - "epoch": 2.4054054054054053, - "grad_norm": 2.7208468914031982, - "learning_rate": 4.322421568553529e-06, - "loss": 0.3266, - "step": 445 - }, - { - "epoch": 2.410810810810811, - "grad_norm": 2.929180383682251, - "learning_rate": 4.3195127690669494e-06, - "loss": 0.4064, - "step": 446 - }, - { - "epoch": 2.4162162162162164, - "grad_norm": 2.848353624343872, - "learning_rate": 4.3165987225776186e-06, - "loss": 0.3856, - "step": 447 - }, - { - "epoch": 2.4216216216216218, - "grad_norm": 3.946488618850708, - "learning_rate": 4.313679437488889e-06, - "loss": 0.4261, - "step": 448 - }, - { - "epoch": 2.427027027027027, - "grad_norm": 5.781888961791992, - "learning_rate": 4.310754922219223e-06, - "loss": 0.4943, - "step": 449 - }, - { - "epoch": 2.4324324324324325, - "grad_norm": 2.8406941890716553, - "learning_rate": 4.307825185202164e-06, - "loss": 0.2874, - "step": 450 - }, - { - "epoch": 2.437837837837838, - "grad_norm": 3.2017335891723633, - "learning_rate": 4.3048902348863116e-06, - "loss": 0.4218, - "step": 451 - }, - { - "epoch": 2.443243243243243, - "grad_norm": 3.8355906009674072, - "learning_rate": 4.301950079735303e-06, - "loss": 0.4204, - "step": 452 - }, - { - "epoch": 2.4486486486486485, - "grad_norm": 4.783357620239258, - "learning_rate": 4.299004728227782e-06, - "loss": 0.5593, - "step": 453 - }, - { - "epoch": 2.454054054054054, - "grad_norm": 3.014080762863159, - "learning_rate": 4.2960541888573774e-06, - "loss": 0.4187, - "step": 454 - }, - { - "epoch": 2.4594594594594597, - "grad_norm": 3.5906598567962646, - "learning_rate": 4.29309847013268e-06, - "loss": 0.4193, - "step": 455 - }, - { - "epoch": 2.464864864864865, - "grad_norm": 3.9043331146240234, - "learning_rate": 4.290137580577216e-06, - "loss": 0.7035, - "step": 456 - }, - { - "epoch": 2.4702702702702704, - "grad_norm": 3.139753580093384, - "learning_rate": 4.287171528729423e-06, - "loss": 0.5877, - "step": 457 - }, - { - "epoch": 2.4756756756756757, - "grad_norm": 2.9091074466705322, - "learning_rate": 4.284200323142623e-06, - "loss": 0.5309, - "step": 458 - }, - { - "epoch": 2.481081081081081, - "grad_norm": 3.1253795623779297, - "learning_rate": 4.281223972385004e-06, - "loss": 0.448, - "step": 459 - }, - { - "epoch": 2.4864864864864864, - "grad_norm": 2.65510892868042, - "learning_rate": 4.27824248503959e-06, - "loss": 0.4453, - "step": 460 - }, - { - "epoch": 2.4918918918918918, - "grad_norm": 3.2135510444641113, - "learning_rate": 4.275255869704214e-06, - "loss": 0.5582, - "step": 461 - }, - { - "epoch": 2.4972972972972975, - "grad_norm": 2.452545404434204, - "learning_rate": 4.272264134991503e-06, - "loss": 0.423, - "step": 462 - }, - { - "epoch": 2.5027027027027025, - "grad_norm": 2.6370208263397217, - "learning_rate": 4.269267289528843e-06, - "loss": 0.271, - "step": 463 - }, - { - "epoch": 2.5081081081081082, - "grad_norm": 3.31266450881958, - "learning_rate": 4.266265341958356e-06, - "loss": 0.6459, - "step": 464 - }, - { - "epoch": 2.5135135135135136, - "grad_norm": 3.2743148803710938, - "learning_rate": 4.263258300936882e-06, - "loss": 0.2959, - "step": 465 - }, - { - "epoch": 2.518918918918919, - "grad_norm": 2.883549690246582, - "learning_rate": 4.260246175135948e-06, - "loss": 0.3418, - "step": 466 - }, - { - "epoch": 2.5243243243243243, - "grad_norm": 2.7019498348236084, - "learning_rate": 4.257228973241742e-06, - "loss": 0.3459, - "step": 467 - }, - { - "epoch": 2.5297297297297296, - "grad_norm": 3.8166959285736084, - "learning_rate": 4.254206703955092e-06, - "loss": 0.4769, - "step": 468 - }, - { - "epoch": 2.535135135135135, - "grad_norm": 3.264763593673706, - "learning_rate": 4.251179375991438e-06, - "loss": 0.6487, - "step": 469 - }, - { - "epoch": 2.5405405405405403, - "grad_norm": 2.7936933040618896, - "learning_rate": 4.248146998080808e-06, - "loss": 0.5547, - "step": 470 - }, - { - "epoch": 2.545945945945946, - "grad_norm": 3.21852707862854, - "learning_rate": 4.2451095789677945e-06, - "loss": 0.2965, - "step": 471 - }, - { - "epoch": 2.5513513513513515, - "grad_norm": 3.4528985023498535, - "learning_rate": 4.242067127411525e-06, - "loss": 0.3831, - "step": 472 - }, - { - "epoch": 2.556756756756757, - "grad_norm": 4.317023754119873, - "learning_rate": 4.239019652185642e-06, - "loss": 0.1756, - "step": 473 - }, - { - "epoch": 2.562162162162162, - "grad_norm": 3.677452325820923, - "learning_rate": 4.2359671620782725e-06, - "loss": 0.5136, - "step": 474 - }, - { - "epoch": 2.5675675675675675, - "grad_norm": 3.7563393115997314, - "learning_rate": 4.232909665892005e-06, - "loss": 0.6554, - "step": 475 - }, - { - "epoch": 2.572972972972973, - "grad_norm": 3.5125508308410645, - "learning_rate": 4.229847172443866e-06, - "loss": 0.3804, - "step": 476 - }, - { - "epoch": 2.5783783783783782, - "grad_norm": 2.8835806846618652, - "learning_rate": 4.2267796905652926e-06, - "loss": 0.3338, - "step": 477 - }, - { - "epoch": 2.583783783783784, - "grad_norm": 3.2136261463165283, - "learning_rate": 4.223707229102105e-06, - "loss": 0.6163, - "step": 478 - }, - { - "epoch": 2.589189189189189, - "grad_norm": 3.467475175857544, - "learning_rate": 4.220629796914487e-06, - "loss": 0.3005, - "step": 479 - }, - { - "epoch": 2.5945945945945947, - "grad_norm": 3.597490072250366, - "learning_rate": 4.217547402876954e-06, - "loss": 0.56, - "step": 480 - }, - { - "epoch": 2.6, - "grad_norm": 3.2377140522003174, - "learning_rate": 4.214460055878329e-06, - "loss": 0.4512, - "step": 481 - }, - { - "epoch": 2.6054054054054054, - "grad_norm": 2.577746868133545, - "learning_rate": 4.211367764821722e-06, - "loss": 0.3074, - "step": 482 - }, - { - "epoch": 2.610810810810811, - "grad_norm": 3.6584155559539795, - "learning_rate": 4.208270538624497e-06, - "loss": 0.6752, - "step": 483 - }, - { - "epoch": 2.616216216216216, - "grad_norm": 2.602778434753418, - "learning_rate": 4.205168386218251e-06, - "loss": 0.2347, - "step": 484 - }, - { - "epoch": 2.6216216216216215, - "grad_norm": 3.587503433227539, - "learning_rate": 4.2020613165487865e-06, - "loss": 0.5189, - "step": 485 - }, - { - "epoch": 2.627027027027027, - "grad_norm": 3.9341986179351807, - "learning_rate": 4.198949338576086e-06, - "loss": 0.7739, - "step": 486 - }, - { - "epoch": 2.6324324324324326, - "grad_norm": 2.9211957454681396, - "learning_rate": 4.1958324612742875e-06, - "loss": 0.3495, - "step": 487 - }, - { - "epoch": 2.637837837837838, - "grad_norm": 3.29193115234375, - "learning_rate": 4.1927106936316564e-06, - "loss": 0.2257, - "step": 488 - }, - { - "epoch": 2.6432432432432433, - "grad_norm": 3.3687057495117188, - "learning_rate": 4.189584044650559e-06, - "loss": 0.6708, - "step": 489 - }, - { - "epoch": 2.6486486486486487, - "grad_norm": 3.096428155899048, - "learning_rate": 4.186452523347441e-06, - "loss": 0.3126, - "step": 490 - }, - { - "epoch": 2.654054054054054, - "grad_norm": 3.0865559577941895, - "learning_rate": 4.183316138752799e-06, - "loss": 0.4219, - "step": 491 - }, - { - "epoch": 2.6594594594594594, - "grad_norm": 3.389827013015747, - "learning_rate": 4.180174899911149e-06, - "loss": 0.3937, - "step": 492 - }, - { - "epoch": 2.6648648648648647, - "grad_norm": 3.044360637664795, - "learning_rate": 4.177028815881012e-06, - "loss": 0.4098, - "step": 493 - }, - { - "epoch": 2.6702702702702705, - "grad_norm": 2.813094139099121, - "learning_rate": 4.173877895734875e-06, - "loss": 0.3597, - "step": 494 - }, - { - "epoch": 2.6756756756756754, - "grad_norm": 2.4037158489227295, - "learning_rate": 4.1707221485591764e-06, - "loss": 0.3284, - "step": 495 - }, - { - "epoch": 2.6810810810810812, - "grad_norm": 3.049436092376709, - "learning_rate": 4.167561583454272e-06, - "loss": 0.257, - "step": 496 - }, - { - "epoch": 2.6864864864864866, - "grad_norm": 3.458923816680908, - "learning_rate": 4.164396209534411e-06, - "loss": 0.1819, - "step": 497 - }, - { - "epoch": 2.691891891891892, - "grad_norm": 3.3084232807159424, - "learning_rate": 4.161226035927711e-06, - "loss": 0.7109, - "step": 498 - }, - { - "epoch": 2.6972972972972973, - "grad_norm": 3.034550189971924, - "learning_rate": 4.15805107177613e-06, - "loss": 0.6297, - "step": 499 - }, - { - "epoch": 2.7027027027027026, - "grad_norm": 3.5786449909210205, - "learning_rate": 4.15487132623544e-06, - "loss": 0.5195, - "step": 500 - }, - { - "epoch": 2.708108108108108, - "grad_norm": 3.4477646350860596, - "learning_rate": 4.151686808475204e-06, - "loss": 0.2528, - "step": 501 - }, - { - "epoch": 2.7135135135135133, - "grad_norm": 3.0256869792938232, - "learning_rate": 4.148497527678744e-06, - "loss": 0.5013, - "step": 502 - }, - { - "epoch": 2.718918918918919, - "grad_norm": 2.875121593475342, - "learning_rate": 4.145303493043118e-06, - "loss": 0.4109, - "step": 503 - }, - { - "epoch": 2.7243243243243245, - "grad_norm": 2.7204222679138184, - "learning_rate": 4.1421047137790935e-06, - "loss": 0.3197, - "step": 504 - }, - { - "epoch": 2.72972972972973, - "grad_norm": 3.350482702255249, - "learning_rate": 4.13890119911112e-06, - "loss": 0.6369, - "step": 505 - }, - { - "epoch": 2.735135135135135, - "grad_norm": 3.096774101257324, - "learning_rate": 4.135692958277303e-06, - "loss": 0.4581, - "step": 506 - }, - { - "epoch": 2.7405405405405405, - "grad_norm": 2.8896536827087402, - "learning_rate": 4.132480000529375e-06, - "loss": 0.6217, - "step": 507 - }, - { - "epoch": 2.745945945945946, - "grad_norm": 2.643932580947876, - "learning_rate": 4.129262335132676e-06, - "loss": 0.4951, - "step": 508 - }, - { - "epoch": 2.7513513513513512, - "grad_norm": 2.6077864170074463, - "learning_rate": 4.126039971366114e-06, - "loss": 0.2185, - "step": 509 - }, - { - "epoch": 2.756756756756757, - "grad_norm": 2.531507968902588, - "learning_rate": 4.122812918522154e-06, - "loss": 0.5428, - "step": 510 - }, - { - "epoch": 2.762162162162162, - "grad_norm": 4.125836372375488, - "learning_rate": 4.119581185906776e-06, - "loss": 0.5466, - "step": 511 - }, - { - "epoch": 2.7675675675675677, - "grad_norm": 2.9921016693115234, - "learning_rate": 4.1163447828394595e-06, - "loss": 0.3803, - "step": 512 - }, - { - "epoch": 2.772972972972973, - "grad_norm": 2.9517931938171387, - "learning_rate": 4.113103718653152e-06, - "loss": 0.2722, - "step": 513 - }, - { - "epoch": 2.7783783783783784, - "grad_norm": 2.8333382606506348, - "learning_rate": 4.10985800269424e-06, - "loss": 0.333, - "step": 514 - }, - { - "epoch": 2.7837837837837838, - "grad_norm": 2.94168758392334, - "learning_rate": 4.106607644322529e-06, - "loss": 0.2186, - "step": 515 - }, - { - "epoch": 2.789189189189189, - "grad_norm": 3.2743892669677734, - "learning_rate": 4.103352652911207e-06, - "loss": 0.6365, - "step": 516 - }, - { - "epoch": 2.7945945945945945, - "grad_norm": 4.692770004272461, - "learning_rate": 4.100093037846825e-06, - "loss": 0.7261, - "step": 517 - }, - { - "epoch": 2.8, - "grad_norm": 3.2157247066497803, - "learning_rate": 4.0968288085292675e-06, - "loss": 0.2767, - "step": 518 - }, - { - "epoch": 2.8054054054054056, - "grad_norm": 3.196887731552124, - "learning_rate": 4.093559974371725e-06, - "loss": 0.4743, - "step": 519 - }, - { - "epoch": 2.810810810810811, - "grad_norm": 2.406752586364746, - "learning_rate": 4.090286544800667e-06, - "loss": 0.3789, - "step": 520 - }, - { - "epoch": 2.8162162162162163, - "grad_norm": 3.1769447326660156, - "learning_rate": 4.087008529255815e-06, - "loss": 0.6252, - "step": 521 - }, - { - "epoch": 2.8216216216216217, - "grad_norm": 3.068370819091797, - "learning_rate": 4.083725937190115e-06, - "loss": 0.3467, - "step": 522 - }, - { - "epoch": 2.827027027027027, - "grad_norm": 3.2665855884552, - "learning_rate": 4.0804387780697114e-06, - "loss": 0.3857, - "step": 523 - }, - { - "epoch": 2.8324324324324324, - "grad_norm": 3.368759870529175, - "learning_rate": 4.077147061373918e-06, - "loss": 0.4679, - "step": 524 - }, - { - "epoch": 2.8378378378378377, - "grad_norm": 3.989163875579834, - "learning_rate": 4.073850796595192e-06, - "loss": 0.2439, - "step": 525 - }, - { - "epoch": 2.8432432432432435, - "grad_norm": 3.6244685649871826, - "learning_rate": 4.070549993239106e-06, - "loss": 0.435, - "step": 526 - }, - { - "epoch": 2.8486486486486484, - "grad_norm": 3.585151195526123, - "learning_rate": 4.06724466082432e-06, - "loss": 0.5022, - "step": 527 - }, - { - "epoch": 2.854054054054054, - "grad_norm": 3.2420976161956787, - "learning_rate": 4.063934808882555e-06, - "loss": 0.4282, - "step": 528 - }, - { - "epoch": 2.8594594594594596, - "grad_norm": 3.1674294471740723, - "learning_rate": 4.0606204469585656e-06, - "loss": 0.3436, - "step": 529 - }, - { - "epoch": 2.864864864864865, - "grad_norm": 2.6856706142425537, - "learning_rate": 4.057301584610112e-06, - "loss": 0.3889, - "step": 530 - }, - { - "epoch": 2.8702702702702703, - "grad_norm": 3.0438942909240723, - "learning_rate": 4.053978231407931e-06, - "loss": 0.4828, - "step": 531 - }, - { - "epoch": 2.8756756756756756, - "grad_norm": 3.3561246395111084, - "learning_rate": 4.0506503969357115e-06, - "loss": 0.5814, - "step": 532 - }, - { - "epoch": 2.881081081081081, - "grad_norm": 2.5318350791931152, - "learning_rate": 4.047318090790065e-06, - "loss": 0.4768, - "step": 533 - }, - { - "epoch": 2.8864864864864863, - "grad_norm": 2.587224006652832, - "learning_rate": 4.043981322580498e-06, - "loss": 0.4262, - "step": 534 - }, - { - "epoch": 2.891891891891892, - "grad_norm": 2.73926043510437, - "learning_rate": 4.040640101929384e-06, - "loss": 0.421, - "step": 535 - }, - { - "epoch": 2.8972972972972975, - "grad_norm": 3.53908371925354, - "learning_rate": 4.037294438471936e-06, - "loss": 0.4019, - "step": 536 - }, - { - "epoch": 2.902702702702703, - "grad_norm": 3.0980448722839355, - "learning_rate": 4.033944341856181e-06, - "loss": 0.4322, - "step": 537 - }, - { - "epoch": 2.908108108108108, - "grad_norm": 2.9265666007995605, - "learning_rate": 4.030589821742926e-06, - "loss": 0.3841, - "step": 538 - }, - { - "epoch": 2.9135135135135135, - "grad_norm": 3.4082043170928955, - "learning_rate": 4.0272308878057385e-06, - "loss": 0.7083, - "step": 539 - }, - { - "epoch": 2.918918918918919, - "grad_norm": 3.297515630722046, - "learning_rate": 4.023867549730912e-06, - "loss": 0.5688, - "step": 540 - }, - { - "epoch": 2.924324324324324, - "grad_norm": 3.0538225173950195, - "learning_rate": 4.020499817217441e-06, - "loss": 0.5979, - "step": 541 - }, - { - "epoch": 2.92972972972973, - "grad_norm": 3.1792757511138916, - "learning_rate": 4.017127699976992e-06, - "loss": 0.5034, - "step": 542 - }, - { - "epoch": 2.935135135135135, - "grad_norm": 3.1574482917785645, - "learning_rate": 4.013751207733877e-06, - "loss": 0.6656, - "step": 543 - }, - { - "epoch": 2.9405405405405407, - "grad_norm": 2.523123264312744, - "learning_rate": 4.010370350225023e-06, - "loss": 0.2789, - "step": 544 - }, - { - "epoch": 2.945945945945946, - "grad_norm": 3.1950793266296387, - "learning_rate": 4.006985137199945e-06, - "loss": 0.2163, - "step": 545 - }, - { - "epoch": 2.9513513513513514, - "grad_norm": 3.2089648246765137, - "learning_rate": 4.00359557842072e-06, - "loss": 0.4179, - "step": 546 - }, - { - "epoch": 2.9567567567567568, - "grad_norm": 3.852578639984131, - "learning_rate": 4.000201683661958e-06, - "loss": 0.4683, - "step": 547 - }, - { - "epoch": 2.962162162162162, - "grad_norm": 2.7612597942352295, - "learning_rate": 3.996803462710766e-06, - "loss": 0.3506, - "step": 548 - }, - { - "epoch": 2.9675675675675675, - "grad_norm": 4.811823844909668, - "learning_rate": 3.993400925366736e-06, - "loss": 0.6582, - "step": 549 - }, - { - "epoch": 2.972972972972973, - "grad_norm": 3.0135858058929443, - "learning_rate": 3.989994081441902e-06, - "loss": 0.504, - "step": 550 - }, - { - "epoch": 2.9783783783783786, - "grad_norm": 2.710277795791626, - "learning_rate": 3.986582940760717e-06, - "loss": 0.7362, - "step": 551 - }, - { - "epoch": 2.983783783783784, - "grad_norm": 3.175443649291992, - "learning_rate": 3.983167513160025e-06, - "loss": 0.4116, - "step": 552 - }, - { - "epoch": 2.9891891891891893, - "grad_norm": 3.101109743118286, - "learning_rate": 3.979747808489036e-06, - "loss": 0.2188, - "step": 553 - }, - { - "epoch": 2.9945945945945946, - "grad_norm": 3.2320079803466797, - "learning_rate": 3.976323836609289e-06, - "loss": 0.7558, - "step": 554 - }, - { - "epoch": 3.0, - "grad_norm": 3.6071934700012207, - "learning_rate": 3.9728956073946305e-06, - "loss": 0.6491, - "step": 555 - }, - { - "epoch": 3.0054054054054054, - "grad_norm": 3.1119353771209717, - "learning_rate": 3.969463130731183e-06, - "loss": 0.1625, - "step": 556 - }, - { - "epoch": 3.0108108108108107, - "grad_norm": 3.0440328121185303, - "learning_rate": 3.966026416517321e-06, - "loss": 0.311, - "step": 557 - }, - { - "epoch": 3.016216216216216, - "grad_norm": 4.069122791290283, - "learning_rate": 3.962585474663636e-06, - "loss": 0.5299, - "step": 558 - }, - { - "epoch": 3.0216216216216214, - "grad_norm": 2.878645896911621, - "learning_rate": 3.959140315092911e-06, - "loss": 0.2718, - "step": 559 - }, - { - "epoch": 3.027027027027027, - "grad_norm": 3.526695966720581, - "learning_rate": 3.955690947740092e-06, - "loss": 0.2954, - "step": 560 - }, - { - "epoch": 3.0324324324324325, - "grad_norm": 3.25087308883667, - "learning_rate": 3.95223738255226e-06, - "loss": 0.2388, - "step": 561 - }, - { - "epoch": 3.037837837837838, - "grad_norm": 3.5467700958251953, - "learning_rate": 3.9487796294886015e-06, - "loss": 0.2014, - "step": 562 - }, - { - "epoch": 3.0432432432432432, - "grad_norm": 4.397517681121826, - "learning_rate": 3.945317698520379e-06, - "loss": 0.2102, - "step": 563 - }, - { - "epoch": 3.0486486486486486, - "grad_norm": 3.7297182083129883, - "learning_rate": 3.941851599630903e-06, - "loss": 0.499, - "step": 564 - }, - { - "epoch": 3.054054054054054, - "grad_norm": 4.417158603668213, - "learning_rate": 3.938381342815503e-06, - "loss": 0.3392, - "step": 565 - }, - { - "epoch": 3.0594594594594593, - "grad_norm": 4.6037421226501465, - "learning_rate": 3.934906938081499e-06, - "loss": 0.1942, - "step": 566 - }, - { - "epoch": 3.064864864864865, - "grad_norm": 3.5600531101226807, - "learning_rate": 3.931428395448174e-06, - "loss": 0.1753, - "step": 567 - }, - { - "epoch": 3.0702702702702704, - "grad_norm": 2.868013381958008, - "learning_rate": 3.927945724946743e-06, - "loss": 0.2959, - "step": 568 - }, - { - "epoch": 3.075675675675676, - "grad_norm": 3.5543227195739746, - "learning_rate": 3.924458936620322e-06, - "loss": 0.4625, - "step": 569 - }, - { - "epoch": 3.081081081081081, - "grad_norm": 8.972922325134277, - "learning_rate": 3.920968040523904e-06, - "loss": 0.2571, - "step": 570 - }, - { - "epoch": 3.0864864864864865, - "grad_norm": 3.037388324737549, - "learning_rate": 3.917473046724329e-06, - "loss": 0.1438, - "step": 571 - }, - { - "epoch": 3.091891891891892, - "grad_norm": 3.3261702060699463, - "learning_rate": 3.9139739653002525e-06, - "loss": 0.3572, - "step": 572 - }, - { - "epoch": 3.097297297297297, - "grad_norm": 2.425293207168579, - "learning_rate": 3.910470806342117e-06, - "loss": 0.165, - "step": 573 - }, - { - "epoch": 3.1027027027027025, - "grad_norm": 3.5718603134155273, - "learning_rate": 3.9069635799521245e-06, - "loss": 0.3209, - "step": 574 - }, - { - "epoch": 3.108108108108108, - "grad_norm": 3.8211171627044678, - "learning_rate": 3.903452296244204e-06, - "loss": 0.1976, - "step": 575 - }, - { - "epoch": 3.1135135135135137, - "grad_norm": 5.944535255432129, - "learning_rate": 3.899936965343989e-06, - "loss": 0.6074, - "step": 576 - }, - { - "epoch": 3.118918918918919, - "grad_norm": 6.603860378265381, - "learning_rate": 3.89641759738878e-06, - "loss": 0.4051, - "step": 577 - }, - { - "epoch": 3.1243243243243244, - "grad_norm": 6.712981700897217, - "learning_rate": 3.892894202527523e-06, - "loss": 0.3787, - "step": 578 - }, - { - "epoch": 3.1297297297297297, - "grad_norm": 3.267186403274536, - "learning_rate": 3.8893667909207735e-06, - "loss": 0.0927, - "step": 579 - }, - { - "epoch": 3.135135135135135, - "grad_norm": 4.476837158203125, - "learning_rate": 3.88583537274067e-06, - "loss": 0.4706, - "step": 580 - }, - { - "epoch": 3.1405405405405404, - "grad_norm": 4.272335052490234, - "learning_rate": 3.8822999581709085e-06, - "loss": 0.3949, - "step": 581 - }, - { - "epoch": 3.145945945945946, - "grad_norm": 3.6685309410095215, - "learning_rate": 3.878760557406708e-06, - "loss": 0.1971, - "step": 582 - }, - { - "epoch": 3.1513513513513516, - "grad_norm": 3.9899449348449707, - "learning_rate": 3.875217180654779e-06, - "loss": 0.5156, - "step": 583 - }, - { - "epoch": 3.156756756756757, - "grad_norm": 3.866804361343384, - "learning_rate": 3.871669838133303e-06, - "loss": 0.3552, - "step": 584 - }, - { - "epoch": 3.1621621621621623, - "grad_norm": 3.565648317337036, - "learning_rate": 3.868118540071894e-06, - "loss": 0.4369, - "step": 585 - }, - { - "epoch": 3.1675675675675676, - "grad_norm": 3.5073986053466797, - "learning_rate": 3.8645632967115755e-06, - "loss": 0.3694, - "step": 586 - }, - { - "epoch": 3.172972972972973, - "grad_norm": 3.7636868953704834, - "learning_rate": 3.861004118304746e-06, - "loss": 0.3404, - "step": 587 - }, - { - "epoch": 3.1783783783783783, - "grad_norm": 2.940094232559204, - "learning_rate": 3.857441015115154e-06, - "loss": 0.3086, - "step": 588 - }, - { - "epoch": 3.1837837837837837, - "grad_norm": 3.727414608001709, - "learning_rate": 3.8538739974178635e-06, - "loss": 0.253, - "step": 589 - }, - { - "epoch": 3.189189189189189, - "grad_norm": 3.5140156745910645, - "learning_rate": 3.850303075499227e-06, - "loss": 0.2436, - "step": 590 - }, - { - "epoch": 3.1945945945945944, - "grad_norm": 3.545952558517456, - "learning_rate": 3.84672825965686e-06, - "loss": 0.328, - "step": 591 - }, - { - "epoch": 3.2, - "grad_norm": 3.534240484237671, - "learning_rate": 3.843149560199601e-06, - "loss": 0.2687, - "step": 592 - }, - { - "epoch": 3.2054054054054055, - "grad_norm": 2.8464927673339844, - "learning_rate": 3.839566987447492e-06, - "loss": 0.1417, - "step": 593 - }, - { - "epoch": 3.210810810810811, - "grad_norm": 4.138559818267822, - "learning_rate": 3.835980551731743e-06, - "loss": 0.2106, - "step": 594 - }, - { - "epoch": 3.2162162162162162, - "grad_norm": 2.917670249938965, - "learning_rate": 3.8323902633947045e-06, - "loss": 0.3154, - "step": 595 - }, - { - "epoch": 3.2216216216216216, - "grad_norm": 3.029660224914551, - "learning_rate": 3.828796132789835e-06, - "loss": 0.1218, - "step": 596 - }, - { - "epoch": 3.227027027027027, - "grad_norm": 3.2845771312713623, - "learning_rate": 3.825198170281677e-06, - "loss": 0.1336, - "step": 597 - }, - { - "epoch": 3.2324324324324323, - "grad_norm": 3.1375670433044434, - "learning_rate": 3.821596386245819e-06, - "loss": 0.2518, - "step": 598 - }, - { - "epoch": 3.237837837837838, - "grad_norm": 3.0021941661834717, - "learning_rate": 3.817990791068874e-06, - "loss": 0.2762, - "step": 599 - }, - { - "epoch": 3.2432432432432434, - "grad_norm": 4.141000747680664, - "learning_rate": 3.81438139514844e-06, - "loss": 0.2722, - "step": 600 - }, - { - "epoch": 3.2486486486486488, - "grad_norm": 3.9065279960632324, - "learning_rate": 3.8107682088930797e-06, - "loss": 0.3542, - "step": 601 - }, - { - "epoch": 3.254054054054054, - "grad_norm": 3.718417167663574, - "learning_rate": 3.807151242722286e-06, - "loss": 0.344, - "step": 602 - }, - { - "epoch": 3.2594594594594595, - "grad_norm": 4.013717174530029, - "learning_rate": 3.8035305070664484e-06, - "loss": 0.1625, - "step": 603 - }, - { - "epoch": 3.264864864864865, - "grad_norm": 3.348888397216797, - "learning_rate": 3.7999060123668318e-06, - "loss": 0.2925, - "step": 604 - }, - { - "epoch": 3.27027027027027, - "grad_norm": 3.496079206466675, - "learning_rate": 3.7962777690755364e-06, - "loss": 0.1523, - "step": 605 - }, - { - "epoch": 3.2756756756756755, - "grad_norm": 3.07607102394104, - "learning_rate": 3.792645787655476e-06, - "loss": 0.1674, - "step": 606 - }, - { - "epoch": 3.281081081081081, - "grad_norm": 3.4036154747009277, - "learning_rate": 3.7890100785803425e-06, - "loss": 0.2856, - "step": 607 - }, - { - "epoch": 3.2864864864864867, - "grad_norm": 6.092559337615967, - "learning_rate": 3.785370652334577e-06, - "loss": 0.1094, - "step": 608 - }, - { - "epoch": 3.291891891891892, - "grad_norm": 3.9322001934051514, - "learning_rate": 3.7817275194133403e-06, - "loss": 0.2611, - "step": 609 - }, - { - "epoch": 3.2972972972972974, - "grad_norm": 3.189563274383545, - "learning_rate": 3.778080690322483e-06, - "loss": 0.1315, - "step": 610 - }, - { - "epoch": 3.3027027027027027, - "grad_norm": 4.304934024810791, - "learning_rate": 3.774430175578514e-06, - "loss": 0.1686, - "step": 611 - }, - { - "epoch": 3.308108108108108, - "grad_norm": 2.9030067920684814, - "learning_rate": 3.7707759857085706e-06, - "loss": 0.4642, - "step": 612 - }, - { - "epoch": 3.3135135135135134, - "grad_norm": 3.7485930919647217, - "learning_rate": 3.7671181312503886e-06, - "loss": 0.1987, - "step": 613 - }, - { - "epoch": 3.3189189189189188, - "grad_norm": 3.4700896739959717, - "learning_rate": 3.763456622752271e-06, - "loss": 0.3307, - "step": 614 - }, - { - "epoch": 3.3243243243243246, - "grad_norm": 3.0079376697540283, - "learning_rate": 3.7597914707730583e-06, - "loss": 0.1731, - "step": 615 - }, - { - "epoch": 3.32972972972973, - "grad_norm": 3.155235767364502, - "learning_rate": 3.7561226858820984e-06, - "loss": 0.2003, - "step": 616 - }, - { - "epoch": 3.3351351351351353, - "grad_norm": 3.847895622253418, - "learning_rate": 3.7524502786592143e-06, - "loss": 0.4014, - "step": 617 - }, - { - "epoch": 3.3405405405405406, - "grad_norm": 2.7505502700805664, - "learning_rate": 3.7487742596946753e-06, - "loss": 0.205, - "step": 618 - }, - { - "epoch": 3.345945945945946, - "grad_norm": 3.654529571533203, - "learning_rate": 3.7450946395891674e-06, - "loss": 0.2932, - "step": 619 - }, - { - "epoch": 3.3513513513513513, - "grad_norm": 2.9763967990875244, - "learning_rate": 3.7414114289537593e-06, - "loss": 0.2748, - "step": 620 - }, - { - "epoch": 3.3567567567567567, - "grad_norm": 3.889683961868286, - "learning_rate": 3.7377246384098763e-06, - "loss": 0.3665, - "step": 621 - }, - { - "epoch": 3.362162162162162, - "grad_norm": 4.193166732788086, - "learning_rate": 3.7340342785892645e-06, - "loss": 0.3453, - "step": 622 - }, - { - "epoch": 3.3675675675675674, - "grad_norm": 3.4371488094329834, - "learning_rate": 3.7303403601339646e-06, - "loss": 0.473, - "step": 623 - }, - { - "epoch": 3.372972972972973, - "grad_norm": 3.6939027309417725, - "learning_rate": 3.726642893696279e-06, - "loss": 0.3017, - "step": 624 - }, - { - "epoch": 3.3783783783783785, - "grad_norm": 4.904304504394531, - "learning_rate": 3.7229418899387414e-06, - "loss": 0.4841, - "step": 625 - }, - { - "epoch": 3.383783783783784, - "grad_norm": 3.6373438835144043, - "learning_rate": 3.719237359534087e-06, - "loss": 0.3879, - "step": 626 - }, - { - "epoch": 3.389189189189189, - "grad_norm": 3.403676986694336, - "learning_rate": 3.71552931316522e-06, - "loss": 0.3876, - "step": 627 - }, - { - "epoch": 3.3945945945945946, - "grad_norm": 3.2292237281799316, - "learning_rate": 3.7118177615251834e-06, - "loss": 0.4491, - "step": 628 - }, - { - "epoch": 3.4, - "grad_norm": 3.317850351333618, - "learning_rate": 3.70810271531713e-06, - "loss": 0.3763, - "step": 629 - }, - { - "epoch": 3.4054054054054053, - "grad_norm": 3.664735794067383, - "learning_rate": 3.7043841852542884e-06, - "loss": 0.4171, - "step": 630 - }, - { - "epoch": 3.410810810810811, - "grad_norm": 3.781569242477417, - "learning_rate": 3.700662182059936e-06, - "loss": 0.2445, - "step": 631 - }, - { - "epoch": 3.4162162162162164, - "grad_norm": 2.878260850906372, - "learning_rate": 3.696936716467363e-06, - "loss": 0.1347, - "step": 632 - }, - { - "epoch": 3.4216216216216218, - "grad_norm": 2.8670761585235596, - "learning_rate": 3.693207799219846e-06, - "loss": 0.2822, - "step": 633 - }, - { - "epoch": 3.427027027027027, - "grad_norm": 3.9338245391845703, - "learning_rate": 3.689475441070615e-06, - "loss": 0.3425, - "step": 634 - }, - { - "epoch": 3.4324324324324325, - "grad_norm": 3.3172149658203125, - "learning_rate": 3.685739652782822e-06, - "loss": 0.3315, - "step": 635 - }, - { - "epoch": 3.437837837837838, - "grad_norm": 3.9986648559570312, - "learning_rate": 3.682000445129512e-06, - "loss": 0.1841, - "step": 636 - }, - { - "epoch": 3.443243243243243, - "grad_norm": 3.4503986835479736, - "learning_rate": 3.6782578288935896e-06, - "loss": 0.3151, - "step": 637 - }, - { - "epoch": 3.4486486486486485, - "grad_norm": 3.8826167583465576, - "learning_rate": 3.6745118148677882e-06, - "loss": 0.1272, - "step": 638 - }, - { - "epoch": 3.454054054054054, - "grad_norm": 3.0585904121398926, - "learning_rate": 3.6707624138546414e-06, - "loss": 0.2436, - "step": 639 - }, - { - "epoch": 3.4594594594594597, - "grad_norm": 3.8409557342529297, - "learning_rate": 3.6670096366664477e-06, - "loss": 0.6321, - "step": 640 - }, - { - "epoch": 3.464864864864865, - "grad_norm": 3.7260093688964844, - "learning_rate": 3.663253494125244e-06, - "loss": 0.1262, - "step": 641 - }, - { - "epoch": 3.4702702702702704, - "grad_norm": 3.195587396621704, - "learning_rate": 3.6594939970627706e-06, - "loss": 0.2669, - "step": 642 - }, - { - "epoch": 3.4756756756756757, - "grad_norm": 2.565070629119873, - "learning_rate": 3.655731156320441e-06, - "loss": 0.1228, - "step": 643 - }, - { - "epoch": 3.481081081081081, - "grad_norm": 3.745422124862671, - "learning_rate": 3.651964982749312e-06, - "loss": 0.1759, - "step": 644 - }, - { - "epoch": 3.4864864864864864, - "grad_norm": 4.96168327331543, - "learning_rate": 3.648195487210051e-06, - "loss": 0.5677, - "step": 645 - }, - { - "epoch": 3.4918918918918918, - "grad_norm": 3.514446496963501, - "learning_rate": 3.644422680572906e-06, - "loss": 0.1874, - "step": 646 - }, - { - "epoch": 3.4972972972972975, - "grad_norm": 3.1427719593048096, - "learning_rate": 3.640646573717671e-06, - "loss": 0.3225, - "step": 647 - }, - { - "epoch": 3.5027027027027025, - "grad_norm": 3.32208514213562, - "learning_rate": 3.63686717753366e-06, - "loss": 0.102, - "step": 648 - }, - { - "epoch": 3.5081081081081082, - "grad_norm": 3.409299373626709, - "learning_rate": 3.6330845029196697e-06, - "loss": 0.1585, - "step": 649 - }, - { - "epoch": 3.5135135135135136, - "grad_norm": 2.827052116394043, - "learning_rate": 3.629298560783952e-06, - "loss": 0.3046, - "step": 650 - }, - { - "epoch": 3.518918918918919, - "grad_norm": 3.541518211364746, - "learning_rate": 3.6255093620441835e-06, - "loss": 0.2037, - "step": 651 - }, - { - "epoch": 3.5243243243243243, - "grad_norm": 3.067040205001831, - "learning_rate": 3.6217169176274293e-06, - "loss": 0.1784, - "step": 652 - }, - { - "epoch": 3.5297297297297296, - "grad_norm": 4.001040935516357, - "learning_rate": 3.6179212384701146e-06, - "loss": 0.1974, - "step": 653 - }, - { - "epoch": 3.535135135135135, - "grad_norm": 4.03037691116333, - "learning_rate": 3.6141223355179946e-06, - "loss": 0.2161, - "step": 654 - }, - { - "epoch": 3.5405405405405403, - "grad_norm": 3.303591728210449, - "learning_rate": 3.610320219726118e-06, - "loss": 0.1487, - "step": 655 - }, - { - "epoch": 3.545945945945946, - "grad_norm": 4.183008193969727, - "learning_rate": 3.606514902058802e-06, - "loss": 0.2231, - "step": 656 - }, - { - "epoch": 3.5513513513513515, - "grad_norm": 4.2100300788879395, - "learning_rate": 3.602706393489594e-06, - "loss": 0.5068, - "step": 657 - }, - { - "epoch": 3.556756756756757, - "grad_norm": 4.521003246307373, - "learning_rate": 3.598894705001246e-06, - "loss": 0.4621, - "step": 658 - }, - { - "epoch": 3.562162162162162, - "grad_norm": 3.452348470687866, - "learning_rate": 3.5950798475856783e-06, - "loss": 0.285, - "step": 659 - }, - { - "epoch": 3.5675675675675675, - "grad_norm": 3.468987464904785, - "learning_rate": 3.5912618322439487e-06, - "loss": 0.4277, - "step": 660 - }, - { - "epoch": 3.572972972972973, - "grad_norm": 3.431551933288574, - "learning_rate": 3.587440669986224e-06, - "loss": 0.1993, - "step": 661 - }, - { - "epoch": 3.5783783783783782, - "grad_norm": 3.017648220062256, - "learning_rate": 3.5836163718317453e-06, - "loss": 0.272, - "step": 662 - }, - { - "epoch": 3.583783783783784, - "grad_norm": 3.837244987487793, - "learning_rate": 3.5797889488087946e-06, - "loss": 0.6019, - "step": 663 - }, - { - "epoch": 3.589189189189189, - "grad_norm": 3.221762180328369, - "learning_rate": 3.575958411954668e-06, - "loss": 0.3603, - "step": 664 - }, - { - "epoch": 3.5945945945945947, - "grad_norm": 4.279484272003174, - "learning_rate": 3.5721247723156393e-06, - "loss": 0.4656, - "step": 665 - }, - { - "epoch": 3.6, - "grad_norm": 3.723459243774414, - "learning_rate": 3.5682880409469316e-06, - "loss": 0.2466, - "step": 666 - }, - { - "epoch": 3.6054054054054054, - "grad_norm": 2.7260632514953613, - "learning_rate": 3.564448228912682e-06, - "loss": 0.1848, - "step": 667 - }, - { - "epoch": 3.610810810810811, - "grad_norm": 3.6656649112701416, - "learning_rate": 3.5606053472859124e-06, - "loss": 0.4968, - "step": 668 - }, - { - "epoch": 3.616216216216216, - "grad_norm": 4.570294380187988, - "learning_rate": 3.556759407148496e-06, - "loss": 0.316, - "step": 669 - }, - { - "epoch": 3.6216216216216215, - "grad_norm": 3.174433946609497, - "learning_rate": 3.5529104195911258e-06, - "loss": 0.2232, - "step": 670 - }, - { - "epoch": 3.627027027027027, - "grad_norm": 4.481954574584961, - "learning_rate": 3.549058395713285e-06, - "loss": 0.4435, - "step": 671 - }, - { - "epoch": 3.6324324324324326, - "grad_norm": 3.8758301734924316, - "learning_rate": 3.54520334662321e-06, - "loss": 0.1455, - "step": 672 - }, - { - "epoch": 3.637837837837838, - "grad_norm": 3.1699628829956055, - "learning_rate": 3.5413452834378626e-06, - "loss": 0.3037, - "step": 673 - }, - { - "epoch": 3.6432432432432433, - "grad_norm": 3.8971962928771973, - "learning_rate": 3.5374842172828953e-06, - "loss": 0.4309, - "step": 674 - }, - { - "epoch": 3.6486486486486487, - "grad_norm": 3.3087549209594727, - "learning_rate": 3.533620159292621e-06, - "loss": 0.383, - "step": 675 - }, - { - "epoch": 3.654054054054054, - "grad_norm": 2.9413082599639893, - "learning_rate": 3.529753120609982e-06, - "loss": 0.1963, - "step": 676 - }, - { - "epoch": 3.6594594594594594, - "grad_norm": 3.309837818145752, - "learning_rate": 3.5258831123865136e-06, - "loss": 0.1922, - "step": 677 - }, - { - "epoch": 3.6648648648648647, - "grad_norm": 4.124879360198975, - "learning_rate": 3.5220101457823147e-06, - "loss": 0.5589, - "step": 678 - }, - { - "epoch": 3.6702702702702705, - "grad_norm": 3.2587103843688965, - "learning_rate": 3.5181342319660174e-06, - "loss": 0.1757, - "step": 679 - }, - { - "epoch": 3.6756756756756754, - "grad_norm": 4.179666042327881, - "learning_rate": 3.5142553821147498e-06, - "loss": 0.1208, - "step": 680 - }, - { - "epoch": 3.6810810810810812, - "grad_norm": 3.4041192531585693, - "learning_rate": 3.5103736074141106e-06, - "loss": 0.2416, - "step": 681 - }, - { - "epoch": 3.6864864864864866, - "grad_norm": 4.982706546783447, - "learning_rate": 3.5064889190581293e-06, - "loss": 0.3841, - "step": 682 - }, - { - "epoch": 3.691891891891892, - "grad_norm": 3.5895309448242188, - "learning_rate": 3.5026013282492406e-06, - "loss": 0.3723, - "step": 683 - }, - { - "epoch": 3.6972972972972973, - "grad_norm": 3.4824306964874268, - "learning_rate": 3.498710846198247e-06, - "loss": 0.4403, - "step": 684 - }, - { - "epoch": 3.7027027027027026, - "grad_norm": 3.501023054122925, - "learning_rate": 3.494817484124289e-06, - "loss": 0.2813, - "step": 685 - }, - { - "epoch": 3.708108108108108, - "grad_norm": 3.934908151626587, - "learning_rate": 3.490921253254813e-06, - "loss": 0.4287, - "step": 686 - }, - { - "epoch": 3.7135135135135133, - "grad_norm": 3.24141526222229, - "learning_rate": 3.487022164825539e-06, - "loss": 0.234, - "step": 687 - }, - { - "epoch": 3.718918918918919, - "grad_norm": 3.3419880867004395, - "learning_rate": 3.4831202300804246e-06, - "loss": 0.2135, - "step": 688 - }, - { - "epoch": 3.7243243243243245, - "grad_norm": 3.923778772354126, - "learning_rate": 3.479215460271638e-06, - "loss": 0.2725, - "step": 689 - }, - { - "epoch": 3.72972972972973, - "grad_norm": 3.2432096004486084, - "learning_rate": 3.475307866659522e-06, - "loss": 0.228, - "step": 690 - }, - { - "epoch": 3.735135135135135, - "grad_norm": 3.0307705402374268, - "learning_rate": 3.4713974605125634e-06, - "loss": 0.0985, - "step": 691 - }, - { - "epoch": 3.7405405405405405, - "grad_norm": 2.778942346572876, - "learning_rate": 3.4674842531073587e-06, - "loss": 0.2137, - "step": 692 - }, - { - "epoch": 3.745945945945946, - "grad_norm": 3.711315155029297, - "learning_rate": 3.4635682557285833e-06, - "loss": 0.1707, - "step": 693 - }, - { - "epoch": 3.7513513513513512, - "grad_norm": 3.165668487548828, - "learning_rate": 3.459649479668956e-06, - "loss": 0.3021, - "step": 694 - }, - { - "epoch": 3.756756756756757, - "grad_norm": 3.7491254806518555, - "learning_rate": 3.4557279362292117e-06, - "loss": 0.3457, - "step": 695 - }, - { - "epoch": 3.762162162162162, - "grad_norm": 3.271603584289551, - "learning_rate": 3.451803636718064e-06, - "loss": 0.1193, - "step": 696 - }, - { - "epoch": 3.7675675675675677, - "grad_norm": 3.872382402420044, - "learning_rate": 3.447876592452174e-06, - "loss": 0.2261, - "step": 697 - }, - { - "epoch": 3.772972972972973, - "grad_norm": 4.634008407592773, - "learning_rate": 3.4439468147561196e-06, - "loss": 0.5042, - "step": 698 - }, - { - "epoch": 3.7783783783783784, - "grad_norm": 3.6930148601531982, - "learning_rate": 3.440014314962358e-06, - "loss": 0.3481, - "step": 699 - }, - { - "epoch": 3.7837837837837838, - "grad_norm": 4.709466457366943, - "learning_rate": 3.4360791044112e-06, - "loss": 0.2317, - "step": 700 - }, - { - "epoch": 3.789189189189189, - "grad_norm": 4.37923002243042, - "learning_rate": 3.432141194450772e-06, - "loss": 0.395, - "step": 701 - }, - { - "epoch": 3.7945945945945945, - "grad_norm": 3.1600489616394043, - "learning_rate": 3.4282005964369836e-06, - "loss": 0.1767, - "step": 702 - }, - { - "epoch": 3.8, - "grad_norm": 3.9799487590789795, - "learning_rate": 3.424257321733497e-06, - "loss": 0.2146, - "step": 703 - }, - { - "epoch": 3.8054054054054056, - "grad_norm": 2.79176664352417, - "learning_rate": 3.4203113817116955e-06, - "loss": 0.1534, - "step": 704 - }, - { - "epoch": 3.810810810810811, - "grad_norm": 3.0024254322052, - "learning_rate": 3.4163627877506434e-06, - "loss": 0.2513, - "step": 705 - }, - { - "epoch": 3.8162162162162163, - "grad_norm": 2.924475908279419, - "learning_rate": 3.4124115512370636e-06, - "loss": 0.4154, - "step": 706 - }, - { - "epoch": 3.8216216216216217, - "grad_norm": 3.2713992595672607, - "learning_rate": 3.408457683565295e-06, - "loss": 0.1822, - "step": 707 - }, - { - "epoch": 3.827027027027027, - "grad_norm": 3.094003438949585, - "learning_rate": 3.4045011961372675e-06, - "loss": 0.3589, - "step": 708 - }, - { - "epoch": 3.8324324324324324, - "grad_norm": 3.423858404159546, - "learning_rate": 3.4005421003624637e-06, - "loss": 0.4615, - "step": 709 - }, - { - "epoch": 3.8378378378378377, - "grad_norm": 2.038792848587036, - "learning_rate": 3.3965804076578896e-06, - "loss": 0.1001, - "step": 710 - }, - { - "epoch": 3.8432432432432435, - "grad_norm": 2.6447055339813232, - "learning_rate": 3.392616129448039e-06, - "loss": 0.2788, - "step": 711 - }, - { - "epoch": 3.8486486486486484, - "grad_norm": 3.546876907348633, - "learning_rate": 3.3886492771648593e-06, - "loss": 0.2663, - "step": 712 - }, - { - "epoch": 3.854054054054054, - "grad_norm": 2.9587066173553467, - "learning_rate": 3.384679862247726e-06, - "loss": 0.3497, - "step": 713 - }, - { - "epoch": 3.8594594594594596, - "grad_norm": 3.7122113704681396, - "learning_rate": 3.3807078961434013e-06, - "loss": 0.3613, - "step": 714 - }, - { - "epoch": 3.864864864864865, - "grad_norm": 3.157294988632202, - "learning_rate": 3.376733390306004e-06, - "loss": 0.0783, - "step": 715 - }, - { - "epoch": 3.8702702702702703, - "grad_norm": 3.564279317855835, - "learning_rate": 3.372756356196979e-06, - "loss": 0.1617, - "step": 716 - }, - { - "epoch": 3.8756756756756756, - "grad_norm": 4.231864929199219, - "learning_rate": 3.3687768052850595e-06, - "loss": 0.6444, - "step": 717 - }, - { - "epoch": 3.881081081081081, - "grad_norm": 5.480365753173828, - "learning_rate": 3.364794749046239e-06, - "loss": 0.4858, - "step": 718 - }, - { - "epoch": 3.8864864864864863, - "grad_norm": 3.428140878677368, - "learning_rate": 3.3608101989637333e-06, - "loss": 0.3103, - "step": 719 - }, - { - "epoch": 3.891891891891892, - "grad_norm": 3.521989345550537, - "learning_rate": 3.356823166527952e-06, - "loss": 0.2501, - "step": 720 - }, - { - "epoch": 3.8972972972972975, - "grad_norm": 3.287081718444824, - "learning_rate": 3.352833663236463e-06, - "loss": 0.18, - "step": 721 - }, - { - "epoch": 3.902702702702703, - "grad_norm": 3.323146104812622, - "learning_rate": 3.348841700593956e-06, - "loss": 0.12, - "step": 722 - }, - { - "epoch": 3.908108108108108, - "grad_norm": 3.516693115234375, - "learning_rate": 3.3448472901122187e-06, - "loss": 0.2618, - "step": 723 - }, - { - "epoch": 3.9135135135135135, - "grad_norm": 3.8109545707702637, - "learning_rate": 3.340850443310092e-06, - "loss": 0.3689, - "step": 724 - }, - { - "epoch": 3.918918918918919, - "grad_norm": 3.8335933685302734, - "learning_rate": 3.336851171713447e-06, - "loss": 0.2195, - "step": 725 - }, - { - "epoch": 3.924324324324324, - "grad_norm": 3.9054670333862305, - "learning_rate": 3.3328494868551444e-06, - "loss": 0.2602, - "step": 726 - }, - { - "epoch": 3.92972972972973, - "grad_norm": 3.1380631923675537, - "learning_rate": 3.3288454002750046e-06, - "loss": 0.1561, - "step": 727 - }, - { - "epoch": 3.935135135135135, - "grad_norm": 4.304198741912842, - "learning_rate": 3.3248389235197764e-06, - "loss": 0.4469, - "step": 728 - }, - { - "epoch": 3.9405405405405407, - "grad_norm": 3.3321573734283447, - "learning_rate": 3.3208300681430967e-06, - "loss": 0.2246, - "step": 729 - }, - { - "epoch": 3.945945945945946, - "grad_norm": 3.89400315284729, - "learning_rate": 3.3168188457054656e-06, - "loss": 0.2743, - "step": 730 - }, - { - "epoch": 3.9513513513513514, - "grad_norm": 3.393209934234619, - "learning_rate": 3.312805267774209e-06, - "loss": 0.551, - "step": 731 - }, - { - "epoch": 3.9567567567567568, - "grad_norm": 3.711652994155884, - "learning_rate": 3.3087893459234423e-06, - "loss": 0.3522, - "step": 732 - }, - { - "epoch": 3.962162162162162, - "grad_norm": 3.6701200008392334, - "learning_rate": 3.304771091734043e-06, - "loss": 0.3084, - "step": 733 - }, - { - "epoch": 3.9675675675675675, - "grad_norm": 3.1742889881134033, - "learning_rate": 3.300750516793614e-06, - "loss": 0.3406, - "step": 734 - }, - { - "epoch": 3.972972972972973, - "grad_norm": 4.000397682189941, - "learning_rate": 3.2967276326964504e-06, - "loss": 0.3463, - "step": 735 - }, - { - "epoch": 3.9783783783783786, - "grad_norm": 3.7932708263397217, - "learning_rate": 3.2927024510435057e-06, - "loss": 0.3758, - "step": 736 - }, - { - "epoch": 3.983783783783784, - "grad_norm": 3.6258292198181152, - "learning_rate": 3.2886749834423587e-06, - "loss": 0.3328, - "step": 737 - }, - { - "epoch": 3.9891891891891893, - "grad_norm": 4.628194332122803, - "learning_rate": 3.284645241507183e-06, - "loss": 0.6213, - "step": 738 - }, - { - "epoch": 3.9945945945945946, - "grad_norm": 4.173697471618652, - "learning_rate": 3.280613236858707e-06, - "loss": 0.2463, - "step": 739 - }, - { - "epoch": 4.0, - "grad_norm": 2.9315719604492188, - "learning_rate": 3.2765789811241865e-06, - "loss": 0.3501, - "step": 740 - }, - { - "epoch": 4.005405405405406, - "grad_norm": 3.7292938232421875, - "learning_rate": 3.272542485937369e-06, - "loss": 0.1753, - "step": 741 - }, - { - "epoch": 4.010810810810811, - "grad_norm": 3.627298593521118, - "learning_rate": 3.2685037629384587e-06, - "loss": 0.0722, - "step": 742 - }, - { - "epoch": 4.0162162162162165, - "grad_norm": 3.7558975219726562, - "learning_rate": 3.264462823774085e-06, - "loss": 0.2475, - "step": 743 - }, - { - "epoch": 4.021621621621621, - "grad_norm": 2.991217851638794, - "learning_rate": 3.260419680097268e-06, - "loss": 0.1163, - "step": 744 - }, - { - "epoch": 4.027027027027027, - "grad_norm": 3.315901517868042, - "learning_rate": 3.2563743435673855e-06, - "loss": 0.1325, - "step": 745 - }, - { - "epoch": 4.032432432432432, - "grad_norm": 2.9405429363250732, - "learning_rate": 3.252326825850139e-06, - "loss": 0.0466, - "step": 746 - }, - { - "epoch": 4.037837837837838, - "grad_norm": 4.078726291656494, - "learning_rate": 3.2482771386175173e-06, - "loss": 0.1861, - "step": 747 - }, - { - "epoch": 4.043243243243243, - "grad_norm": 3.6752545833587646, - "learning_rate": 3.24422529354777e-06, - "loss": 0.1637, - "step": 748 - }, - { - "epoch": 4.048648648648649, - "grad_norm": 4.471213340759277, - "learning_rate": 3.2401713023253646e-06, - "loss": 0.1379, - "step": 749 - }, - { - "epoch": 4.054054054054054, - "grad_norm": 4.609938144683838, - "learning_rate": 3.2361151766409628e-06, - "loss": 0.1099, - "step": 750 - }, - { - "epoch": 4.059459459459459, - "grad_norm": 3.7480030059814453, - "learning_rate": 3.232056928191376e-06, - "loss": 0.1422, - "step": 751 - }, - { - "epoch": 4.064864864864865, - "grad_norm": 4.23753547668457, - "learning_rate": 3.2279965686795424e-06, - "loss": 0.2716, - "step": 752 - }, - { - "epoch": 4.07027027027027, - "grad_norm": 4.59039306640625, - "learning_rate": 3.2239341098144833e-06, - "loss": 0.3849, - "step": 753 - }, - { - "epoch": 4.075675675675676, - "grad_norm": 2.9332475662231445, - "learning_rate": 3.219869563311277e-06, - "loss": 0.0768, - "step": 754 - }, - { - "epoch": 4.081081081081081, - "grad_norm": 3.8387272357940674, - "learning_rate": 3.2158029408910213e-06, - "loss": 0.112, - "step": 755 - }, - { - "epoch": 4.0864864864864865, - "grad_norm": 2.5676164627075195, - "learning_rate": 3.2117342542807995e-06, - "loss": 0.1054, - "step": 756 - }, - { - "epoch": 4.091891891891892, - "grad_norm": 3.4695913791656494, - "learning_rate": 3.207663515213648e-06, - "loss": 0.1754, - "step": 757 - }, - { - "epoch": 4.097297297297297, - "grad_norm": 3.531060218811035, - "learning_rate": 3.2035907354285234e-06, - "loss": 0.191, - "step": 758 - }, - { - "epoch": 4.102702702702703, - "grad_norm": 3.8944122791290283, - "learning_rate": 3.1995159266702648e-06, - "loss": 0.1083, - "step": 759 - }, - { - "epoch": 4.108108108108108, - "grad_norm": 3.572751998901367, - "learning_rate": 3.1954391006895635e-06, - "loss": 0.0609, - "step": 760 - }, - { - "epoch": 4.113513513513514, - "grad_norm": 3.533867120742798, - "learning_rate": 3.191360269242928e-06, - "loss": 0.049, - "step": 761 - }, - { - "epoch": 4.118918918918919, - "grad_norm": 3.742013454437256, - "learning_rate": 3.18727944409265e-06, - "loss": 0.1642, - "step": 762 - }, - { - "epoch": 4.124324324324324, - "grad_norm": 3.918525457382202, - "learning_rate": 3.1831966370067714e-06, - "loss": 0.1513, - "step": 763 - }, - { - "epoch": 4.12972972972973, - "grad_norm": 4.906899929046631, - "learning_rate": 3.1791118597590467e-06, - "loss": 0.3276, - "step": 764 - }, - { - "epoch": 4.135135135135135, - "grad_norm": 5.704930305480957, - "learning_rate": 3.1750251241289148e-06, - "loss": 0.4011, - "step": 765 - }, - { - "epoch": 4.140540540540541, - "grad_norm": 4.278724193572998, - "learning_rate": 3.1709364419014615e-06, - "loss": 0.2274, - "step": 766 - }, - { - "epoch": 4.145945945945946, - "grad_norm": 3.7831263542175293, - "learning_rate": 3.166845824867384e-06, - "loss": 0.118, - "step": 767 - }, - { - "epoch": 4.151351351351352, - "grad_norm": 3.6355350017547607, - "learning_rate": 3.162753284822962e-06, - "loss": 0.1109, - "step": 768 - }, - { - "epoch": 4.1567567567567565, - "grad_norm": 4.063662052154541, - "learning_rate": 3.1586588335700176e-06, - "loss": 0.1754, - "step": 769 - }, - { - "epoch": 4.162162162162162, - "grad_norm": 3.404348611831665, - "learning_rate": 3.1545624829158873e-06, - "loss": 0.1155, - "step": 770 - }, - { - "epoch": 4.167567567567567, - "grad_norm": 2.7452480792999268, - "learning_rate": 3.1504642446733828e-06, - "loss": 0.0635, - "step": 771 - }, - { - "epoch": 4.172972972972973, - "grad_norm": 2.4755163192749023, - "learning_rate": 3.146364130660761e-06, - "loss": 0.1068, - "step": 772 - }, - { - "epoch": 4.178378378378379, - "grad_norm": 3.0338311195373535, - "learning_rate": 3.142262152701685e-06, - "loss": 0.0637, - "step": 773 - }, - { - "epoch": 4.183783783783784, - "grad_norm": 4.566886901855469, - "learning_rate": 3.138158322625197e-06, - "loss": 0.2703, - "step": 774 - }, - { - "epoch": 4.1891891891891895, - "grad_norm": 4.614205360412598, - "learning_rate": 3.1340526522656765e-06, - "loss": 0.2769, - "step": 775 - }, - { - "epoch": 4.194594594594594, - "grad_norm": 3.4197700023651123, - "learning_rate": 3.1299451534628134e-06, - "loss": 0.1192, - "step": 776 - }, - { - "epoch": 4.2, - "grad_norm": 3.2838752269744873, - "learning_rate": 3.1258358380615674e-06, - "loss": 0.1244, - "step": 777 - }, - { - "epoch": 4.205405405405405, - "grad_norm": 4.484423637390137, - "learning_rate": 3.121724717912138e-06, - "loss": 0.2819, - "step": 778 - }, - { - "epoch": 4.210810810810811, - "grad_norm": 2.6898670196533203, - "learning_rate": 3.1176118048699283e-06, - "loss": 0.1018, - "step": 779 - }, - { - "epoch": 4.216216216216216, - "grad_norm": 3.3304710388183594, - "learning_rate": 3.113497110795514e-06, - "loss": 0.1842, - "step": 780 - }, - { - "epoch": 4.221621621621622, - "grad_norm": 3.29425311088562, - "learning_rate": 3.1093806475546046e-06, - "loss": 0.2299, - "step": 781 - }, - { - "epoch": 4.227027027027027, - "grad_norm": 3.0818686485290527, - "learning_rate": 3.1052624270180116e-06, - "loss": 0.1397, - "step": 782 - }, - { - "epoch": 4.232432432432432, - "grad_norm": 4.569559097290039, - "learning_rate": 3.1011424610616153e-06, - "loss": 0.2236, - "step": 783 - }, - { - "epoch": 4.237837837837838, - "grad_norm": 3.2377943992614746, - "learning_rate": 3.097020761566328e-06, - "loss": 0.1417, - "step": 784 - }, - { - "epoch": 4.243243243243243, - "grad_norm": 5.442404270172119, - "learning_rate": 3.092897340418062e-06, - "loss": 0.1317, - "step": 785 - }, - { - "epoch": 4.248648648648649, - "grad_norm": 4.14007568359375, - "learning_rate": 3.088772209507694e-06, - "loss": 0.1869, - "step": 786 - }, - { - "epoch": 4.254054054054054, - "grad_norm": 3.024740695953369, - "learning_rate": 3.0846453807310317e-06, - "loss": 0.0967, - "step": 787 - }, - { - "epoch": 4.2594594594594595, - "grad_norm": 3.463261365890503, - "learning_rate": 3.080516865988778e-06, - "loss": 0.0731, - "step": 788 - }, - { - "epoch": 4.264864864864865, - "grad_norm": 3.398139715194702, - "learning_rate": 3.076386677186498e-06, - "loss": 0.1912, - "step": 789 - }, - { - "epoch": 4.27027027027027, - "grad_norm": 3.934204339981079, - "learning_rate": 3.0722548262345854e-06, - "loss": 0.2133, - "step": 790 - }, - { - "epoch": 4.275675675675676, - "grad_norm": 5.5322041511535645, - "learning_rate": 3.0681213250482255e-06, - "loss": 0.4454, - "step": 791 - }, - { - "epoch": 4.281081081081081, - "grad_norm": 5.381092071533203, - "learning_rate": 3.0639861855473637e-06, - "loss": 0.3645, - "step": 792 - }, - { - "epoch": 4.286486486486487, - "grad_norm": 4.104682445526123, - "learning_rate": 3.05984941965667e-06, - "loss": 0.1331, - "step": 793 - }, - { - "epoch": 4.291891891891892, - "grad_norm": 3.032749652862549, - "learning_rate": 3.055711039305503e-06, - "loss": 0.0863, - "step": 794 - }, - { - "epoch": 4.297297297297297, - "grad_norm": 3.1181957721710205, - "learning_rate": 3.051571056427879e-06, - "loss": 0.1988, - "step": 795 - }, - { - "epoch": 4.302702702702703, - "grad_norm": 4.8824944496154785, - "learning_rate": 3.047429482962433e-06, - "loss": 0.2307, - "step": 796 - }, - { - "epoch": 4.308108108108108, - "grad_norm": 3.5564794540405273, - "learning_rate": 3.0432863308523903e-06, - "loss": 0.1614, - "step": 797 - }, - { - "epoch": 4.313513513513514, - "grad_norm": 2.928267240524292, - "learning_rate": 3.039141612045525e-06, - "loss": 0.0683, - "step": 798 - }, - { - "epoch": 4.318918918918919, - "grad_norm": 2.846242666244507, - "learning_rate": 3.034995338494131e-06, - "loss": 0.1784, - "step": 799 - }, - { - "epoch": 4.324324324324325, - "grad_norm": 2.8273985385894775, - "learning_rate": 3.0308475221549868e-06, - "loss": 0.0451, - "step": 800 - }, - { - "epoch": 4.3297297297297295, - "grad_norm": 3.0229880809783936, - "learning_rate": 3.026698174989316e-06, - "loss": 0.0618, - "step": 801 - }, - { - "epoch": 4.335135135135135, - "grad_norm": 3.555338144302368, - "learning_rate": 3.0225473089627617e-06, - "loss": 0.1529, - "step": 802 - }, - { - "epoch": 4.34054054054054, - "grad_norm": 3.7206318378448486, - "learning_rate": 3.0183949360453442e-06, - "loss": 0.4177, - "step": 803 - }, - { - "epoch": 4.345945945945946, - "grad_norm": 4.038993835449219, - "learning_rate": 3.014241068211428e-06, - "loss": 0.1394, - "step": 804 - }, - { - "epoch": 4.351351351351352, - "grad_norm": 3.723766565322876, - "learning_rate": 3.0100857174396926e-06, - "loss": 0.04, - "step": 805 - }, - { - "epoch": 4.356756756756757, - "grad_norm": 4.745445728302002, - "learning_rate": 3.0059288957130893e-06, - "loss": 0.2705, - "step": 806 - }, - { - "epoch": 4.3621621621621625, - "grad_norm": 3.245249032974243, - "learning_rate": 3.001770615018815e-06, - "loss": 0.2208, - "step": 807 - }, - { - "epoch": 4.367567567567567, - "grad_norm": 4.631863594055176, - "learning_rate": 2.9976108873482725e-06, - "loss": 0.2068, - "step": 808 - }, - { - "epoch": 4.372972972972973, - "grad_norm": 3.4944963455200195, - "learning_rate": 2.9934497246970357e-06, - "loss": 0.1253, - "step": 809 - }, - { - "epoch": 4.378378378378378, - "grad_norm": 3.393252372741699, - "learning_rate": 2.989287139064819e-06, - "loss": 0.1721, - "step": 810 - }, - { - "epoch": 4.383783783783784, - "grad_norm": 3.2354531288146973, - "learning_rate": 2.9851231424554385e-06, - "loss": 0.134, - "step": 811 - }, - { - "epoch": 4.389189189189189, - "grad_norm": 3.8997225761413574, - "learning_rate": 2.9809577468767813e-06, - "loss": 0.0818, - "step": 812 - }, - { - "epoch": 4.394594594594595, - "grad_norm": 3.4745192527770996, - "learning_rate": 2.9767909643407676e-06, - "loss": 0.1797, - "step": 813 - }, - { - "epoch": 4.4, - "grad_norm": 2.8166556358337402, - "learning_rate": 2.9726228068633155e-06, - "loss": 0.145, - "step": 814 - }, - { - "epoch": 4.405405405405405, - "grad_norm": 3.4947283267974854, - "learning_rate": 2.9684532864643123e-06, - "loss": 0.079, - "step": 815 - }, - { - "epoch": 4.410810810810811, - "grad_norm": 3.8058624267578125, - "learning_rate": 2.9642824151675702e-06, - "loss": 0.1763, - "step": 816 - }, - { - "epoch": 4.416216216216216, - "grad_norm": 3.161440134048462, - "learning_rate": 2.9601102050008016e-06, - "loss": 0.2654, - "step": 817 - }, - { - "epoch": 4.421621621621622, - "grad_norm": 2.7620294094085693, - "learning_rate": 2.955936667995578e-06, - "loss": 0.0779, - "step": 818 - }, - { - "epoch": 4.427027027027027, - "grad_norm": 3.2293593883514404, - "learning_rate": 2.9517618161872974e-06, - "loss": 0.0587, - "step": 819 - }, - { - "epoch": 4.4324324324324325, - "grad_norm": 2.753647565841675, - "learning_rate": 2.9475856616151487e-06, - "loss": 0.0835, - "step": 820 - }, - { - "epoch": 4.437837837837838, - "grad_norm": 3.744755744934082, - "learning_rate": 2.9434082163220773e-06, - "loss": 0.1748, - "step": 821 - }, - { - "epoch": 4.443243243243243, - "grad_norm": 3.5458850860595703, - "learning_rate": 2.9392294923547543e-06, - "loss": 0.119, - "step": 822 - }, - { - "epoch": 4.448648648648649, - "grad_norm": 4.037010192871094, - "learning_rate": 2.9350495017635334e-06, - "loss": 0.1535, - "step": 823 - }, - { - "epoch": 4.454054054054054, - "grad_norm": 3.704439401626587, - "learning_rate": 2.9308682566024228e-06, - "loss": 0.2561, - "step": 824 - }, - { - "epoch": 4.45945945945946, - "grad_norm": 2.9537882804870605, - "learning_rate": 2.92668576892905e-06, - "loss": 0.2024, - "step": 825 - }, - { - "epoch": 4.464864864864865, - "grad_norm": 3.1923575401306152, - "learning_rate": 2.9225020508046233e-06, - "loss": 0.0436, - "step": 826 - }, - { - "epoch": 4.47027027027027, - "grad_norm": 3.304884195327759, - "learning_rate": 2.9183171142939002e-06, - "loss": 0.1636, - "step": 827 - }, - { - "epoch": 4.475675675675676, - "grad_norm": 3.5481832027435303, - "learning_rate": 2.9141309714651528e-06, - "loss": 0.0962, - "step": 828 - }, - { - "epoch": 4.481081081081081, - "grad_norm": 4.0650153160095215, - "learning_rate": 2.9099436343901306e-06, - "loss": 0.2129, - "step": 829 - }, - { - "epoch": 4.486486486486487, - "grad_norm": 4.274670124053955, - "learning_rate": 2.9057551151440266e-06, - "loss": 0.2872, - "step": 830 - }, - { - "epoch": 4.491891891891892, - "grad_norm": 4.45655632019043, - "learning_rate": 2.9015654258054433e-06, - "loss": 0.3254, - "step": 831 - }, - { - "epoch": 4.4972972972972975, - "grad_norm": 3.2205746173858643, - "learning_rate": 2.8973745784563596e-06, - "loss": 0.1417, - "step": 832 - }, - { - "epoch": 4.5027027027027025, - "grad_norm": 3.994489908218384, - "learning_rate": 2.8931825851820904e-06, - "loss": 0.2513, - "step": 833 - }, - { - "epoch": 4.508108108108108, - "grad_norm": 2.8250539302825928, - "learning_rate": 2.8889894580712574e-06, - "loss": 0.1785, - "step": 834 - }, - { - "epoch": 4.513513513513513, - "grad_norm": 3.526552200317383, - "learning_rate": 2.884795209215751e-06, - "loss": 0.2853, - "step": 835 - }, - { - "epoch": 4.518918918918919, - "grad_norm": 3.8975565433502197, - "learning_rate": 2.880599850710696e-06, - "loss": 0.2947, - "step": 836 - }, - { - "epoch": 4.524324324324324, - "grad_norm": 2.86104154586792, - "learning_rate": 2.8764033946544197e-06, - "loss": 0.177, - "step": 837 - }, - { - "epoch": 4.52972972972973, - "grad_norm": 3.967454433441162, - "learning_rate": 2.8722058531484105e-06, - "loss": 0.2786, - "step": 838 - }, - { - "epoch": 4.535135135135135, - "grad_norm": 3.9122490882873535, - "learning_rate": 2.86800723829729e-06, - "loss": 0.1881, - "step": 839 - }, - { - "epoch": 4.54054054054054, - "grad_norm": 3.9732089042663574, - "learning_rate": 2.8638075622087747e-06, - "loss": 0.3541, - "step": 840 - }, - { - "epoch": 4.545945945945946, - "grad_norm": 3.7056405544281006, - "learning_rate": 2.8596068369936386e-06, - "loss": 0.3094, - "step": 841 - }, - { - "epoch": 4.551351351351351, - "grad_norm": 3.5056777000427246, - "learning_rate": 2.8554050747656862e-06, - "loss": 0.1162, - "step": 842 - }, - { - "epoch": 4.556756756756757, - "grad_norm": 3.1131439208984375, - "learning_rate": 2.851202287641709e-06, - "loss": 0.1079, - "step": 843 - }, - { - "epoch": 4.562162162162162, - "grad_norm": 3.6517693996429443, - "learning_rate": 2.8469984877414525e-06, - "loss": 0.4462, - "step": 844 - }, - { - "epoch": 4.5675675675675675, - "grad_norm": 3.0627806186676025, - "learning_rate": 2.842793687187588e-06, - "loss": 0.0851, - "step": 845 - }, - { - "epoch": 4.572972972972973, - "grad_norm": 4.0370893478393555, - "learning_rate": 2.8385878981056663e-06, - "loss": 0.1268, - "step": 846 - }, - { - "epoch": 4.578378378378378, - "grad_norm": 3.486156463623047, - "learning_rate": 2.8343811326240944e-06, - "loss": 0.3187, - "step": 847 - }, - { - "epoch": 4.583783783783784, - "grad_norm": 2.4388604164123535, - "learning_rate": 2.830173402874091e-06, - "loss": 0.1315, - "step": 848 - }, - { - "epoch": 4.589189189189189, - "grad_norm": 3.5970475673675537, - "learning_rate": 2.8259647209896573e-06, - "loss": 0.301, - "step": 849 - }, - { - "epoch": 4.594594594594595, - "grad_norm": 3.657775402069092, - "learning_rate": 2.821755099107541e-06, - "loss": 0.1478, - "step": 850 - }, - { - "epoch": 4.6, - "grad_norm": 3.2040653228759766, - "learning_rate": 2.817544549367197e-06, - "loss": 0.2029, - "step": 851 - }, - { - "epoch": 4.605405405405405, - "grad_norm": 2.778747081756592, - "learning_rate": 2.813333083910761e-06, - "loss": 0.0549, - "step": 852 - }, - { - "epoch": 4.610810810810811, - "grad_norm": 3.661921977996826, - "learning_rate": 2.8091207148830046e-06, - "loss": 0.1508, - "step": 853 - }, - { - "epoch": 4.616216216216216, - "grad_norm": 2.7028398513793945, - "learning_rate": 2.8049074544313094e-06, - "loss": 0.1094, - "step": 854 - }, - { - "epoch": 4.621621621621622, - "grad_norm": 3.3319056034088135, - "learning_rate": 2.8006933147056236e-06, - "loss": 0.0799, - "step": 855 - }, - { - "epoch": 4.627027027027027, - "grad_norm": 3.3194944858551025, - "learning_rate": 2.7964783078584336e-06, - "loss": 0.123, - "step": 856 - }, - { - "epoch": 4.632432432432433, - "grad_norm": 2.4618616104125977, - "learning_rate": 2.792262446044725e-06, - "loss": 0.0692, - "step": 857 - }, - { - "epoch": 4.6378378378378375, - "grad_norm": 4.007084846496582, - "learning_rate": 2.788045741421949e-06, - "loss": 0.1596, - "step": 858 - }, - { - "epoch": 4.643243243243243, - "grad_norm": 2.6852214336395264, - "learning_rate": 2.78382820614999e-06, - "loss": 0.047, - "step": 859 - }, - { - "epoch": 4.648648648648649, - "grad_norm": 3.249666690826416, - "learning_rate": 2.779609852391123e-06, - "loss": 0.1561, - "step": 860 - }, - { - "epoch": 4.654054054054054, - "grad_norm": 7.2313337326049805, - "learning_rate": 2.775390692309987e-06, - "loss": 0.2157, - "step": 861 - }, - { - "epoch": 4.65945945945946, - "grad_norm": 3.1866044998168945, - "learning_rate": 2.7711707380735443e-06, - "loss": 0.0782, - "step": 862 - }, - { - "epoch": 4.664864864864865, - "grad_norm": 3.714812755584717, - "learning_rate": 2.766950001851049e-06, - "loss": 0.2994, - "step": 863 - }, - { - "epoch": 4.6702702702702705, - "grad_norm": 3.0355515480041504, - "learning_rate": 2.7627284958140084e-06, - "loss": 0.109, - "step": 864 - }, - { - "epoch": 4.675675675675675, - "grad_norm": 2.8177638053894043, - "learning_rate": 2.7585062321361517e-06, - "loss": 0.2557, - "step": 865 - }, - { - "epoch": 4.681081081081081, - "grad_norm": 3.7162227630615234, - "learning_rate": 2.75428322299339e-06, - "loss": 0.0413, - "step": 866 - }, - { - "epoch": 4.686486486486486, - "grad_norm": 3.008643627166748, - "learning_rate": 2.7500594805637882e-06, - "loss": 0.0402, - "step": 867 - }, - { - "epoch": 4.691891891891892, - "grad_norm": 3.1683881282806396, - "learning_rate": 2.745835017027522e-06, - "loss": 0.1481, - "step": 868 - }, - { - "epoch": 4.697297297297297, - "grad_norm": 3.2899327278137207, - "learning_rate": 2.74160984456685e-06, - "loss": 0.2242, - "step": 869 - }, - { - "epoch": 4.702702702702703, - "grad_norm": 5.386324882507324, - "learning_rate": 2.737383975366071e-06, - "loss": 0.4693, - "step": 870 - }, - { - "epoch": 4.708108108108108, - "grad_norm": 3.0007741451263428, - "learning_rate": 2.7331574216114963e-06, - "loss": 0.1353, - "step": 871 - }, - { - "epoch": 4.713513513513513, - "grad_norm": 2.7533962726593018, - "learning_rate": 2.728930195491411e-06, - "loss": 0.157, - "step": 872 - }, - { - "epoch": 4.718918918918919, - "grad_norm": 3.349351167678833, - "learning_rate": 2.724702309196038e-06, - "loss": 0.1863, - "step": 873 - }, - { - "epoch": 4.724324324324324, - "grad_norm": 3.2562623023986816, - "learning_rate": 2.720473774917505e-06, - "loss": 0.2874, - "step": 874 - }, - { - "epoch": 4.72972972972973, - "grad_norm": 3.4865262508392334, - "learning_rate": 2.716244604849807e-06, - "loss": 0.1021, - "step": 875 - }, - { - "epoch": 4.735135135135135, - "grad_norm": 3.793647289276123, - "learning_rate": 2.7120148111887732e-06, - "loss": 0.1046, - "step": 876 - }, - { - "epoch": 4.7405405405405405, - "grad_norm": 3.8841137886047363, - "learning_rate": 2.707784406132032e-06, - "loss": 0.0971, - "step": 877 - }, - { - "epoch": 4.745945945945946, - "grad_norm": 3.45615816116333, - "learning_rate": 2.703553401878972e-06, - "loss": 0.0507, - "step": 878 - }, - { - "epoch": 4.751351351351351, - "grad_norm": 3.578495502471924, - "learning_rate": 2.6993218106307146e-06, - "loss": 0.0616, - "step": 879 - }, - { - "epoch": 4.756756756756757, - "grad_norm": 4.271491527557373, - "learning_rate": 2.6950896445900685e-06, - "loss": 0.0908, - "step": 880 - }, - { - "epoch": 4.762162162162162, - "grad_norm": 3.889042615890503, - "learning_rate": 2.690856915961504e-06, - "loss": 0.2426, - "step": 881 - }, - { - "epoch": 4.767567567567568, - "grad_norm": 3.8519232273101807, - "learning_rate": 2.686623636951112e-06, - "loss": 0.1881, - "step": 882 - }, - { - "epoch": 4.772972972972973, - "grad_norm": 3.819518804550171, - "learning_rate": 2.6823898197665703e-06, - "loss": 0.1385, - "step": 883 - }, - { - "epoch": 4.778378378378378, - "grad_norm": 4.091328144073486, - "learning_rate": 2.6781554766171104e-06, - "loss": 0.2913, - "step": 884 - }, - { - "epoch": 4.783783783783784, - "grad_norm": 2.60793399810791, - "learning_rate": 2.673920619713478e-06, - "loss": 0.0874, - "step": 885 - }, - { - "epoch": 4.789189189189189, - "grad_norm": 4.59322452545166, - "learning_rate": 2.6696852612679024e-06, - "loss": 0.2703, - "step": 886 - }, - { - "epoch": 4.794594594594595, - "grad_norm": 3.4631619453430176, - "learning_rate": 2.6654494134940586e-06, - "loss": 0.121, - "step": 887 - }, - { - "epoch": 4.8, - "grad_norm": 3.8556058406829834, - "learning_rate": 2.6612130886070313e-06, - "loss": 0.1853, - "step": 888 - }, - { - "epoch": 4.805405405405406, - "grad_norm": 2.932152271270752, - "learning_rate": 2.6569762988232838e-06, - "loss": 0.0533, - "step": 889 - }, - { - "epoch": 4.8108108108108105, - "grad_norm": 4.647441387176514, - "learning_rate": 2.652739056360618e-06, - "loss": 0.3178, - "step": 890 - }, - { - "epoch": 4.816216216216216, - "grad_norm": 4.682106018066406, - "learning_rate": 2.648501373438142e-06, - "loss": 0.1735, - "step": 891 - }, - { - "epoch": 4.821621621621622, - "grad_norm": 3.1454825401306152, - "learning_rate": 2.644263262276234e-06, - "loss": 0.062, - "step": 892 - }, - { - "epoch": 4.827027027027027, - "grad_norm": 3.579653739929199, - "learning_rate": 2.640024735096507e-06, - "loss": 0.1336, - "step": 893 - }, - { - "epoch": 4.832432432432433, - "grad_norm": 2.558265447616577, - "learning_rate": 2.6357858041217733e-06, - "loss": 0.1404, - "step": 894 - }, - { - "epoch": 4.837837837837838, - "grad_norm": 2.3879470825195312, - "learning_rate": 2.6315464815760104e-06, - "loss": 0.0373, - "step": 895 - }, - { - "epoch": 4.8432432432432435, - "grad_norm": 4.418992042541504, - "learning_rate": 2.6273067796843242e-06, - "loss": 0.3068, - "step": 896 - }, - { - "epoch": 4.848648648648648, - "grad_norm": 3.08585786819458, - "learning_rate": 2.6230667106729157e-06, - "loss": 0.2221, - "step": 897 - }, - { - "epoch": 4.854054054054054, - "grad_norm": 2.9488885402679443, - "learning_rate": 2.618826286769043e-06, - "loss": 0.1431, - "step": 898 - }, - { - "epoch": 4.859459459459459, - "grad_norm": 4.123927116394043, - "learning_rate": 2.614585520200989e-06, - "loss": 0.196, - "step": 899 - }, - { - "epoch": 4.864864864864865, - "grad_norm": 4.289125919342041, - "learning_rate": 2.6103444231980233e-06, - "loss": 0.2509, - "step": 900 - }, - { - "epoch": 4.87027027027027, - "grad_norm": 3.0358095169067383, - "learning_rate": 2.606103007990371e-06, - "loss": 0.0747, - "step": 901 - }, - { - "epoch": 4.875675675675676, - "grad_norm": 3.6471376419067383, - "learning_rate": 2.601861286809172e-06, - "loss": 0.0494, - "step": 902 - }, - { - "epoch": 4.881081081081081, - "grad_norm": 3.424712896347046, - "learning_rate": 2.5976192718864497e-06, - "loss": 0.0901, - "step": 903 - }, - { - "epoch": 4.886486486486486, - "grad_norm": 4.047586441040039, - "learning_rate": 2.593376975455075e-06, - "loss": 0.0465, - "step": 904 - }, - { - "epoch": 4.891891891891892, - "grad_norm": 4.448032379150391, - "learning_rate": 2.5891344097487294e-06, - "loss": 0.0616, - "step": 905 - }, - { - "epoch": 4.897297297297297, - "grad_norm": 3.3522684574127197, - "learning_rate": 2.584891587001872e-06, - "loss": 0.087, - "step": 906 - }, - { - "epoch": 4.902702702702703, - "grad_norm": 2.979238986968994, - "learning_rate": 2.580648519449704e-06, - "loss": 0.053, - "step": 907 - }, - { - "epoch": 4.908108108108108, - "grad_norm": 6.049450397491455, - "learning_rate": 2.5764052193281287e-06, - "loss": 0.2707, - "step": 908 - }, - { - "epoch": 4.9135135135135135, - "grad_norm": 6.647163391113281, - "learning_rate": 2.5721616988737254e-06, - "loss": 0.3679, - "step": 909 - }, - { - "epoch": 4.918918918918919, - "grad_norm": 3.764979839324951, - "learning_rate": 2.567917970323704e-06, - "loss": 0.1929, - "step": 910 - }, - { - "epoch": 4.924324324324324, - "grad_norm": 3.5592362880706787, - "learning_rate": 2.5636740459158776e-06, - "loss": 0.2461, - "step": 911 - }, - { - "epoch": 4.92972972972973, - "grad_norm": 4.4554762840271, - "learning_rate": 2.559429937888624e-06, - "loss": 0.2484, - "step": 912 - }, - { - "epoch": 4.935135135135135, - "grad_norm": 3.358375072479248, - "learning_rate": 2.5551856584808483e-06, - "loss": 0.1886, - "step": 913 - }, - { - "epoch": 4.940540540540541, - "grad_norm": 3.5831756591796875, - "learning_rate": 2.5509412199319515e-06, - "loss": 0.1789, - "step": 914 - }, - { - "epoch": 4.945945945945946, - "grad_norm": 2.4555728435516357, - "learning_rate": 2.5466966344817927e-06, - "loss": 0.1072, - "step": 915 - }, - { - "epoch": 4.951351351351351, - "grad_norm": 4.581109046936035, - "learning_rate": 2.542451914370656e-06, - "loss": 0.2624, - "step": 916 - }, - { - "epoch": 4.956756756756757, - "grad_norm": 2.9763975143432617, - "learning_rate": 2.538207071839213e-06, - "loss": 0.0639, - "step": 917 - }, - { - "epoch": 4.962162162162162, - "grad_norm": 3.516282796859741, - "learning_rate": 2.533962119128487e-06, - "loss": 0.1281, - "step": 918 - }, - { - "epoch": 4.967567567567568, - "grad_norm": 3.0369791984558105, - "learning_rate": 2.529717068479821e-06, - "loss": 0.1771, - "step": 919 - }, - { - "epoch": 4.972972972972973, - "grad_norm": 2.998521327972412, - "learning_rate": 2.5254719321348392e-06, - "loss": 0.2582, - "step": 920 - }, - { - "epoch": 4.978378378378379, - "grad_norm": 3.002901792526245, - "learning_rate": 2.5212267223354143e-06, - "loss": 0.3016, - "step": 921 - }, - { - "epoch": 4.9837837837837835, - "grad_norm": 3.564932346343994, - "learning_rate": 2.5169814513236296e-06, - "loss": 0.2775, - "step": 922 - }, - { - "epoch": 4.989189189189189, - "grad_norm": 3.726227283477783, - "learning_rate": 2.5127361313417447e-06, - "loss": 0.1246, - "step": 923 - }, - { - "epoch": 4.994594594594595, - "grad_norm": 4.766391754150391, - "learning_rate": 2.508490774632162e-06, - "loss": 0.1732, - "step": 924 - }, - { - "epoch": 5.0, - "grad_norm": 2.9859752655029297, - "learning_rate": 2.5042453934373874e-06, - "loss": 0.1107, - "step": 925 - } - ], - "logging_steps": 1, - "max_steps": 1850, - "num_input_tokens_seen": 0, - "num_train_epochs": 10, - "save_steps": 206, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 2.495984431173468e+17, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/metallama3_8b/limo_filtered_incorrect/trainer_log.jsonl b/metallama3_8b/limo_filtered_incorrect/trainer_log.jsonl deleted file mode 100644 index 5c6f520e97e534cfb4519495aecb59859b577893..0000000000000000000000000000000000000000 --- a/metallama3_8b/limo_filtered_incorrect/trainer_log.jsonl +++ /dev/null @@ -1,1290 +0,0 @@ -{"current_steps": 1, "total_steps": 1850, "loss": 2.9165, "lr": 5e-06, "epoch": 0.005405405405405406, "percentage": 0.05, "elapsed_time": "0:00:03", "remaining_time": "1:56:42"} -{"current_steps": 2, "total_steps": 1850, "loss": 1.9314, "lr": 4.999996395324314e-06, "epoch": 0.010810810810810811, "percentage": 0.11, "elapsed_time": "0:00:07", "remaining_time": "1:49:49"} -{"current_steps": 3, "total_steps": 1850, "loss": 1.5709, "lr": 4.99998558130765e-06, "epoch": 0.016216216216216217, "percentage": 0.16, "elapsed_time": "0:00:13", "remaining_time": "2:15:03"} -{"current_steps": 4, "total_steps": 1850, "loss": 0.8099, "lr": 4.999967557981192e-06, "epoch": 0.021621621621621623, "percentage": 0.22, "elapsed_time": "0:00:15", "remaining_time": "2:02:01"} -{"current_steps": 5, "total_steps": 1850, "loss": 0.9021, "lr": 4.999942325396917e-06, "epoch": 0.02702702702702703, "percentage": 0.27, "elapsed_time": "0:00:18", "remaining_time": "1:54:34"} -{"current_steps": 6, "total_steps": 1850, "loss": 1.7972, "lr": 4.999909883627588e-06, "epoch": 0.032432432432432434, "percentage": 0.32, "elapsed_time": "0:00:23", "remaining_time": "2:00:53"} -{"current_steps": 7, "total_steps": 1850, "loss": 1.4306, "lr": 4.999870232766757e-06, "epoch": 0.03783783783783784, "percentage": 0.38, "elapsed_time": "0:00:26", "remaining_time": "1:55:27"} -{"current_steps": 8, "total_steps": 1850, "loss": 1.051, "lr": 4.9998233729287696e-06, "epoch": 0.043243243243243246, "percentage": 0.43, "elapsed_time": "0:00:29", "remaining_time": "1:53:49"} -{"current_steps": 9, "total_steps": 1850, "loss": 0.8089, "lr": 4.999769304248755e-06, "epoch": 0.04864864864864865, "percentage": 0.49, "elapsed_time": "0:00:31", "remaining_time": "1:49:02"} -{"current_steps": 10, "total_steps": 1850, "loss": 1.0999, "lr": 4.9997080268826344e-06, "epoch": 0.05405405405405406, "percentage": 0.54, "elapsed_time": "0:00:35", "remaining_time": "1:50:03"} -{"current_steps": 11, "total_steps": 1850, "loss": 1.2831, "lr": 4.9996395410071165e-06, "epoch": 0.05945945945945946, "percentage": 0.59, "elapsed_time": "0:00:39", "remaining_time": "1:49:01"} -{"current_steps": 12, "total_steps": 1850, "loss": 1.2874, "lr": 4.999563846819696e-06, "epoch": 0.06486486486486487, "percentage": 0.65, "elapsed_time": "0:00:44", "remaining_time": "1:54:29"} -{"current_steps": 13, "total_steps": 1850, "loss": 0.96, "lr": 4.999480944538655e-06, "epoch": 0.07027027027027027, "percentage": 0.7, "elapsed_time": "0:00:46", "remaining_time": "1:48:43"} -{"current_steps": 14, "total_steps": 1850, "loss": 0.9869, "lr": 4.999390834403063e-06, "epoch": 0.07567567567567568, "percentage": 0.76, "elapsed_time": "0:00:50", "remaining_time": "1:49:43"} -{"current_steps": 15, "total_steps": 1850, "loss": 0.9293, "lr": 4.999293516672773e-06, "epoch": 0.08108108108108109, "percentage": 0.81, "elapsed_time": "0:00:51", "remaining_time": "1:45:18"} -{"current_steps": 16, "total_steps": 1850, "loss": 0.8914, "lr": 4.9991889916284255e-06, "epoch": 0.08648648648648649, "percentage": 0.86, "elapsed_time": "0:00:52", "remaining_time": "1:40:36"} -{"current_steps": 17, "total_steps": 1850, "loss": 1.0176, "lr": 4.999077259571442e-06, "epoch": 0.0918918918918919, "percentage": 0.92, "elapsed_time": "0:00:53", "remaining_time": "1:36:26"} -{"current_steps": 18, "total_steps": 1850, "loss": 1.0259, "lr": 4.998958320824031e-06, "epoch": 0.0972972972972973, "percentage": 0.97, "elapsed_time": "0:00:58", "remaining_time": "1:39:01"} -{"current_steps": 19, "total_steps": 1850, "loss": 1.3356, "lr": 4.998832175729179e-06, "epoch": 0.10270270270270271, "percentage": 1.03, "elapsed_time": "0:01:01", "remaining_time": "1:39:12"} -{"current_steps": 20, "total_steps": 1850, "loss": 1.4486, "lr": 4.998698824650656e-06, "epoch": 0.10810810810810811, "percentage": 1.08, "elapsed_time": "0:01:04", "remaining_time": "1:39:07"} -{"current_steps": 21, "total_steps": 1850, "loss": 0.8372, "lr": 4.998558267973014e-06, "epoch": 0.11351351351351352, "percentage": 1.14, "elapsed_time": "0:01:08", "remaining_time": "1:38:47"} -{"current_steps": 22, "total_steps": 1850, "loss": 0.7931, "lr": 4.998410506101579e-06, "epoch": 0.11891891891891893, "percentage": 1.19, "elapsed_time": "0:01:09", "remaining_time": "1:36:45"} -{"current_steps": 23, "total_steps": 1850, "loss": 1.3022, "lr": 4.9982555394624595e-06, "epoch": 0.12432432432432433, "percentage": 1.24, "elapsed_time": "0:01:15", "remaining_time": "1:40:32"} -{"current_steps": 24, "total_steps": 1850, "loss": 0.9739, "lr": 4.998093368502539e-06, "epoch": 0.12972972972972974, "percentage": 1.3, "elapsed_time": "0:01:17", "remaining_time": "1:38:20"} -{"current_steps": 25, "total_steps": 1850, "loss": 1.1154, "lr": 4.9979239936894765e-06, "epoch": 0.13513513513513514, "percentage": 1.35, "elapsed_time": "0:01:24", "remaining_time": "1:42:18"} -{"current_steps": 26, "total_steps": 1850, "loss": 0.7543, "lr": 4.997747415511705e-06, "epoch": 0.14054054054054055, "percentage": 1.41, "elapsed_time": "0:01:27", "remaining_time": "1:42:48"} -{"current_steps": 27, "total_steps": 1850, "loss": 0.7278, "lr": 4.997563634478428e-06, "epoch": 0.14594594594594595, "percentage": 1.46, "elapsed_time": "0:01:29", "remaining_time": "1:41:05"} -{"current_steps": 28, "total_steps": 1850, "loss": 0.8167, "lr": 4.997372651119626e-06, "epoch": 0.15135135135135136, "percentage": 1.51, "elapsed_time": "0:01:33", "remaining_time": "1:41:49"} -{"current_steps": 29, "total_steps": 1850, "loss": 0.8031, "lr": 4.997174465986044e-06, "epoch": 0.15675675675675677, "percentage": 1.57, "elapsed_time": "0:01:37", "remaining_time": "1:41:53"} -{"current_steps": 30, "total_steps": 1850, "loss": 0.689, "lr": 4.996969079649196e-06, "epoch": 0.16216216216216217, "percentage": 1.62, "elapsed_time": "0:01:41", "remaining_time": "1:43:06"} -{"current_steps": 31, "total_steps": 1850, "loss": 0.8059, "lr": 4.996756492701362e-06, "epoch": 0.16756756756756758, "percentage": 1.68, "elapsed_time": "0:01:43", "remaining_time": "1:41:03"} -{"current_steps": 32, "total_steps": 1850, "loss": 0.9658, "lr": 4.996536705755591e-06, "epoch": 0.17297297297297298, "percentage": 1.73, "elapsed_time": "0:01:48", "remaining_time": "1:42:46"} -{"current_steps": 33, "total_steps": 1850, "loss": 0.8349, "lr": 4.996309719445687e-06, "epoch": 0.1783783783783784, "percentage": 1.78, "elapsed_time": "0:01:50", "remaining_time": "1:40:59"} -{"current_steps": 34, "total_steps": 1850, "loss": 0.8287, "lr": 4.996075534426223e-06, "epoch": 0.1837837837837838, "percentage": 1.84, "elapsed_time": "0:01:54", "remaining_time": "1:41:33"} -{"current_steps": 35, "total_steps": 1850, "loss": 1.1211, "lr": 4.995834151372526e-06, "epoch": 0.1891891891891892, "percentage": 1.89, "elapsed_time": "0:01:57", "remaining_time": "1:41:55"} -{"current_steps": 36, "total_steps": 1850, "loss": 1.0841, "lr": 4.995585570980685e-06, "epoch": 0.1945945945945946, "percentage": 1.95, "elapsed_time": "0:02:00", "remaining_time": "1:40:51"} -{"current_steps": 37, "total_steps": 1850, "loss": 0.6182, "lr": 4.995329793967537e-06, "epoch": 0.2, "percentage": 2.0, "elapsed_time": "0:02:01", "remaining_time": "1:38:49"} -{"current_steps": 38, "total_steps": 1850, "loss": 0.7647, "lr": 4.9950668210706795e-06, "epoch": 0.20540540540540542, "percentage": 2.05, "elapsed_time": "0:02:02", "remaining_time": "1:37:12"} -{"current_steps": 39, "total_steps": 1850, "loss": 0.8691, "lr": 4.994796653048457e-06, "epoch": 0.21081081081081082, "percentage": 2.11, "elapsed_time": "0:02:06", "remaining_time": "1:38:09"} -{"current_steps": 40, "total_steps": 1850, "loss": 1.0404, "lr": 4.994519290679965e-06, "epoch": 0.21621621621621623, "percentage": 2.16, "elapsed_time": "0:02:10", "remaining_time": "1:38:08"} -{"current_steps": 41, "total_steps": 1850, "loss": 1.1877, "lr": 4.994234734765043e-06, "epoch": 0.22162162162162163, "percentage": 2.22, "elapsed_time": "0:02:15", "remaining_time": "1:39:50"} -{"current_steps": 42, "total_steps": 1850, "loss": 0.959, "lr": 4.993942986124278e-06, "epoch": 0.22702702702702704, "percentage": 2.27, "elapsed_time": "0:02:19", "remaining_time": "1:40:25"} -{"current_steps": 43, "total_steps": 1850, "loss": 0.9249, "lr": 4.9936440455989975e-06, "epoch": 0.23243243243243245, "percentage": 2.32, "elapsed_time": "0:02:22", "remaining_time": "1:40:08"} -{"current_steps": 44, "total_steps": 1850, "loss": 0.6899, "lr": 4.993337914051266e-06, "epoch": 0.23783783783783785, "percentage": 2.38, "elapsed_time": "0:02:25", "remaining_time": "1:39:23"} -{"current_steps": 45, "total_steps": 1850, "loss": 0.9075, "lr": 4.99302459236389e-06, "epoch": 0.24324324324324326, "percentage": 2.43, "elapsed_time": "0:02:31", "remaining_time": "1:41:02"} -{"current_steps": 46, "total_steps": 1850, "loss": 0.785, "lr": 4.992704081440407e-06, "epoch": 0.24864864864864866, "percentage": 2.49, "elapsed_time": "0:02:32", "remaining_time": "1:39:44"} -{"current_steps": 47, "total_steps": 1850, "loss": 1.008, "lr": 4.992376382205088e-06, "epoch": 0.25405405405405407, "percentage": 2.54, "elapsed_time": "0:02:35", "remaining_time": "1:39:41"} -{"current_steps": 48, "total_steps": 1850, "loss": 0.7751, "lr": 4.992041495602932e-06, "epoch": 0.2594594594594595, "percentage": 2.59, "elapsed_time": "0:02:38", "remaining_time": "1:39:07"} -{"current_steps": 49, "total_steps": 1850, "loss": 0.9022, "lr": 4.991699422599664e-06, "epoch": 0.2648648648648649, "percentage": 2.65, "elapsed_time": "0:02:40", "remaining_time": "1:38:33"} -{"current_steps": 50, "total_steps": 1850, "loss": 0.8801, "lr": 4.991350164181735e-06, "epoch": 0.2702702702702703, "percentage": 2.7, "elapsed_time": "0:02:44", "remaining_time": "1:38:37"} -{"current_steps": 51, "total_steps": 1850, "loss": 0.7045, "lr": 4.990993721356317e-06, "epoch": 0.2756756756756757, "percentage": 2.76, "elapsed_time": "0:02:46", "remaining_time": "1:37:42"} -{"current_steps": 52, "total_steps": 1850, "loss": 0.7312, "lr": 4.990630095151296e-06, "epoch": 0.2810810810810811, "percentage": 2.81, "elapsed_time": "0:02:48", "remaining_time": "1:37:06"} -{"current_steps": 53, "total_steps": 1850, "loss": 0.9609, "lr": 4.9902592866152765e-06, "epoch": 0.2864864864864865, "percentage": 2.86, "elapsed_time": "0:02:51", "remaining_time": "1:36:53"} -{"current_steps": 54, "total_steps": 1850, "loss": 0.5753, "lr": 4.989881296817575e-06, "epoch": 0.2918918918918919, "percentage": 2.92, "elapsed_time": "0:02:53", "remaining_time": "1:36:16"} -{"current_steps": 55, "total_steps": 1850, "loss": 0.5118, "lr": 4.989496126848215e-06, "epoch": 0.2972972972972973, "percentage": 2.97, "elapsed_time": "0:02:55", "remaining_time": "1:35:36"} -{"current_steps": 56, "total_steps": 1850, "loss": 1.1261, "lr": 4.989103777817928e-06, "epoch": 0.3027027027027027, "percentage": 3.03, "elapsed_time": "0:03:02", "remaining_time": "1:37:15"} -{"current_steps": 57, "total_steps": 1850, "loss": 0.7823, "lr": 4.988704250858145e-06, "epoch": 0.3081081081081081, "percentage": 3.08, "elapsed_time": "0:03:04", "remaining_time": "1:36:57"} -{"current_steps": 58, "total_steps": 1850, "loss": 0.6019, "lr": 4.988297547121e-06, "epoch": 0.31351351351351353, "percentage": 3.14, "elapsed_time": "0:03:09", "remaining_time": "1:37:39"} -{"current_steps": 59, "total_steps": 1850, "loss": 0.825, "lr": 4.98788366777932e-06, "epoch": 0.31891891891891894, "percentage": 3.19, "elapsed_time": "0:03:11", "remaining_time": "1:36:47"} -{"current_steps": 60, "total_steps": 1850, "loss": 0.7667, "lr": 4.987462614026625e-06, "epoch": 0.32432432432432434, "percentage": 3.24, "elapsed_time": "0:03:13", "remaining_time": "1:36:13"} -{"current_steps": 61, "total_steps": 1850, "loss": 0.8051, "lr": 4.987034387077126e-06, "epoch": 0.32972972972972975, "percentage": 3.3, "elapsed_time": "0:03:16", "remaining_time": "1:36:16"} -{"current_steps": 62, "total_steps": 1850, "loss": 0.6895, "lr": 4.986598988165718e-06, "epoch": 0.33513513513513515, "percentage": 3.35, "elapsed_time": "0:03:19", "remaining_time": "1:36:07"} -{"current_steps": 63, "total_steps": 1850, "loss": 0.9268, "lr": 4.9861564185479785e-06, "epoch": 0.34054054054054056, "percentage": 3.41, "elapsed_time": "0:03:25", "remaining_time": "1:36:57"} -{"current_steps": 64, "total_steps": 1850, "loss": 0.9854, "lr": 4.985706679500163e-06, "epoch": 0.34594594594594597, "percentage": 3.46, "elapsed_time": "0:03:28", "remaining_time": "1:36:45"} -{"current_steps": 65, "total_steps": 1850, "loss": 0.8083, "lr": 4.9852497723192025e-06, "epoch": 0.35135135135135137, "percentage": 3.51, "elapsed_time": "0:03:29", "remaining_time": "1:35:40"} -{"current_steps": 66, "total_steps": 1850, "loss": 0.9098, "lr": 4.9847856983227e-06, "epoch": 0.3567567567567568, "percentage": 3.57, "elapsed_time": "0:03:31", "remaining_time": "1:35:11"} -{"current_steps": 67, "total_steps": 1850, "loss": 0.8881, "lr": 4.984314458848923e-06, "epoch": 0.3621621621621622, "percentage": 3.62, "elapsed_time": "0:03:34", "remaining_time": "1:35:02"} -{"current_steps": 68, "total_steps": 1850, "loss": 0.9877, "lr": 4.983836055256804e-06, "epoch": 0.3675675675675676, "percentage": 3.68, "elapsed_time": "0:03:38", "remaining_time": "1:35:16"} -{"current_steps": 69, "total_steps": 1850, "loss": 0.8282, "lr": 4.983350488925935e-06, "epoch": 0.372972972972973, "percentage": 3.73, "elapsed_time": "0:03:40", "remaining_time": "1:34:50"} -{"current_steps": 70, "total_steps": 1850, "loss": 1.1756, "lr": 4.982857761256564e-06, "epoch": 0.3783783783783784, "percentage": 3.78, "elapsed_time": "0:03:44", "remaining_time": "1:35:15"} -{"current_steps": 71, "total_steps": 1850, "loss": 0.8114, "lr": 4.982357873669589e-06, "epoch": 0.3837837837837838, "percentage": 3.84, "elapsed_time": "0:03:46", "remaining_time": "1:34:40"} -{"current_steps": 72, "total_steps": 1850, "loss": 0.6763, "lr": 4.981850827606556e-06, "epoch": 0.3891891891891892, "percentage": 3.89, "elapsed_time": "0:03:48", "remaining_time": "1:34:12"} -{"current_steps": 73, "total_steps": 1850, "loss": 0.9372, "lr": 4.981336624529655e-06, "epoch": 0.3945945945945946, "percentage": 3.95, "elapsed_time": "0:03:50", "remaining_time": "1:33:40"} -{"current_steps": 74, "total_steps": 1850, "loss": 1.0155, "lr": 4.980815265921714e-06, "epoch": 0.4, "percentage": 4.0, "elapsed_time": "0:03:53", "remaining_time": "1:33:34"} -{"current_steps": 75, "total_steps": 1850, "loss": 0.949, "lr": 4.980286753286196e-06, "epoch": 0.40540540540540543, "percentage": 4.05, "elapsed_time": "0:03:59", "remaining_time": "1:34:33"} -{"current_steps": 76, "total_steps": 1850, "loss": 1.0134, "lr": 4.979751088147192e-06, "epoch": 0.41081081081081083, "percentage": 4.11, "elapsed_time": "0:04:02", "remaining_time": "1:34:21"} -{"current_steps": 77, "total_steps": 1850, "loss": 0.9722, "lr": 4.979208272049425e-06, "epoch": 0.41621621621621624, "percentage": 4.16, "elapsed_time": "0:04:04", "remaining_time": "1:33:55"} -{"current_steps": 78, "total_steps": 1850, "loss": 1.2259, "lr": 4.978658306558235e-06, "epoch": 0.42162162162162165, "percentage": 4.22, "elapsed_time": "0:04:08", "remaining_time": "1:34:03"} -{"current_steps": 79, "total_steps": 1850, "loss": 0.834, "lr": 4.978101193259578e-06, "epoch": 0.42702702702702705, "percentage": 4.27, "elapsed_time": "0:04:09", "remaining_time": "1:33:12"} -{"current_steps": 80, "total_steps": 1850, "loss": 0.6151, "lr": 4.977536933760025e-06, "epoch": 0.43243243243243246, "percentage": 4.32, "elapsed_time": "0:04:11", "remaining_time": "1:32:49"} -{"current_steps": 81, "total_steps": 1850, "loss": 1.0475, "lr": 4.976965529686755e-06, "epoch": 0.43783783783783786, "percentage": 4.38, "elapsed_time": "0:04:15", "remaining_time": "1:32:49"} -{"current_steps": 82, "total_steps": 1850, "loss": 0.8324, "lr": 4.976386982687548e-06, "epoch": 0.44324324324324327, "percentage": 4.43, "elapsed_time": "0:04:16", "remaining_time": "1:32:20"} -{"current_steps": 83, "total_steps": 1850, "loss": 0.997, "lr": 4.9758012944307845e-06, "epoch": 0.4486486486486487, "percentage": 4.49, "elapsed_time": "0:04:22", "remaining_time": "1:33:03"} -{"current_steps": 84, "total_steps": 1850, "loss": 1.2024, "lr": 4.975208466605436e-06, "epoch": 0.4540540540540541, "percentage": 4.54, "elapsed_time": "0:04:24", "remaining_time": "1:32:49"} -{"current_steps": 85, "total_steps": 1850, "loss": 0.9146, "lr": 4.974608500921064e-06, "epoch": 0.4594594594594595, "percentage": 4.59, "elapsed_time": "0:04:27", "remaining_time": "1:32:27"} -{"current_steps": 86, "total_steps": 1850, "loss": 0.7181, "lr": 4.974001399107816e-06, "epoch": 0.4648648648648649, "percentage": 4.65, "elapsed_time": "0:04:29", "remaining_time": "1:31:58"} -{"current_steps": 87, "total_steps": 1850, "loss": 0.8599, "lr": 4.973387162916415e-06, "epoch": 0.4702702702702703, "percentage": 4.7, "elapsed_time": "0:04:33", "remaining_time": "1:32:19"} -{"current_steps": 88, "total_steps": 1850, "loss": 0.6081, "lr": 4.972765794118158e-06, "epoch": 0.4756756756756757, "percentage": 4.76, "elapsed_time": "0:04:34", "remaining_time": "1:31:29"} -{"current_steps": 89, "total_steps": 1850, "loss": 0.8764, "lr": 4.9721372945049114e-06, "epoch": 0.4810810810810811, "percentage": 4.81, "elapsed_time": "0:04:37", "remaining_time": "1:31:35"} -{"current_steps": 90, "total_steps": 1850, "loss": 0.8622, "lr": 4.971501665889107e-06, "epoch": 0.4864864864864865, "percentage": 4.86, "elapsed_time": "0:04:45", "remaining_time": "1:32:53"} -{"current_steps": 91, "total_steps": 1850, "loss": 0.5523, "lr": 4.9708589101037306e-06, "epoch": 0.4918918918918919, "percentage": 4.92, "elapsed_time": "0:04:48", "remaining_time": "1:32:52"} -{"current_steps": 92, "total_steps": 1850, "loss": 0.8922, "lr": 4.970209029002325e-06, "epoch": 0.4972972972972973, "percentage": 4.97, "elapsed_time": "0:04:55", "remaining_time": "1:34:06"} -{"current_steps": 93, "total_steps": 1850, "loss": 0.9455, "lr": 4.969552024458977e-06, "epoch": 0.5027027027027027, "percentage": 5.03, "elapsed_time": "0:04:59", "remaining_time": "1:34:25"} -{"current_steps": 94, "total_steps": 1850, "loss": 0.8342, "lr": 4.968887898368318e-06, "epoch": 0.5081081081081081, "percentage": 5.08, "elapsed_time": "0:05:05", "remaining_time": "1:35:14"} -{"current_steps": 95, "total_steps": 1850, "loss": 0.8476, "lr": 4.968216652645515e-06, "epoch": 0.5135135135135135, "percentage": 5.14, "elapsed_time": "0:05:11", "remaining_time": "1:35:56"} -{"current_steps": 96, "total_steps": 1850, "loss": 0.8879, "lr": 4.967538289226268e-06, "epoch": 0.518918918918919, "percentage": 5.19, "elapsed_time": "0:05:13", "remaining_time": "1:35:31"} -{"current_steps": 97, "total_steps": 1850, "loss": 0.7114, "lr": 4.966852810066798e-06, "epoch": 0.5243243243243243, "percentage": 5.24, "elapsed_time": "0:05:16", "remaining_time": "1:35:28"} -{"current_steps": 98, "total_steps": 1850, "loss": 0.6757, "lr": 4.9661602171438524e-06, "epoch": 0.5297297297297298, "percentage": 5.3, "elapsed_time": "0:05:18", "remaining_time": "1:34:47"} -{"current_steps": 99, "total_steps": 1850, "loss": 0.8029, "lr": 4.965460512454687e-06, "epoch": 0.5351351351351351, "percentage": 5.35, "elapsed_time": "0:05:20", "remaining_time": "1:34:27"} -{"current_steps": 100, "total_steps": 1850, "loss": 0.842, "lr": 4.964753698017071e-06, "epoch": 0.5405405405405406, "percentage": 5.41, "elapsed_time": "0:05:22", "remaining_time": "1:34:08"} -{"current_steps": 101, "total_steps": 1850, "loss": 0.6339, "lr": 4.964039775869271e-06, "epoch": 0.5459459459459459, "percentage": 5.46, "elapsed_time": "0:05:24", "remaining_time": "1:33:32"} -{"current_steps": 102, "total_steps": 1850, "loss": 0.7743, "lr": 4.963318748070056e-06, "epoch": 0.5513513513513514, "percentage": 5.51, "elapsed_time": "0:05:26", "remaining_time": "1:33:22"} -{"current_steps": 103, "total_steps": 1850, "loss": 0.926, "lr": 4.9625906166986815e-06, "epoch": 0.5567567567567567, "percentage": 5.57, "elapsed_time": "0:05:33", "remaining_time": "1:34:16"} -{"current_steps": 104, "total_steps": 1850, "loss": 0.7037, "lr": 4.961855383854889e-06, "epoch": 0.5621621621621622, "percentage": 5.62, "elapsed_time": "0:05:36", "remaining_time": "1:34:08"} -{"current_steps": 105, "total_steps": 1850, "loss": 0.561, "lr": 4.961113051658901e-06, "epoch": 0.5675675675675675, "percentage": 5.68, "elapsed_time": "0:05:38", "remaining_time": "1:33:46"} -{"current_steps": 106, "total_steps": 1850, "loss": 0.7316, "lr": 4.96036362225141e-06, "epoch": 0.572972972972973, "percentage": 5.73, "elapsed_time": "0:05:40", "remaining_time": "1:33:30"} -{"current_steps": 107, "total_steps": 1850, "loss": 0.6426, "lr": 4.959607097793575e-06, "epoch": 0.5783783783783784, "percentage": 5.78, "elapsed_time": "0:05:43", "remaining_time": "1:33:14"} -{"current_steps": 108, "total_steps": 1850, "loss": 1.0044, "lr": 4.9588434804670176e-06, "epoch": 0.5837837837837838, "percentage": 5.84, "elapsed_time": "0:05:50", "remaining_time": "1:34:17"} -{"current_steps": 109, "total_steps": 1850, "loss": 0.9219, "lr": 4.958072772473812e-06, "epoch": 0.5891891891891892, "percentage": 5.89, "elapsed_time": "0:05:54", "remaining_time": "1:34:14"} -{"current_steps": 110, "total_steps": 1850, "loss": 0.6056, "lr": 4.9572949760364795e-06, "epoch": 0.5945945945945946, "percentage": 5.95, "elapsed_time": "0:05:54", "remaining_time": "1:33:33"} -{"current_steps": 111, "total_steps": 1850, "loss": 0.6346, "lr": 4.9565100933979835e-06, "epoch": 0.6, "percentage": 6.0, "elapsed_time": "0:05:56", "remaining_time": "1:33:08"} -{"current_steps": 112, "total_steps": 1850, "loss": 0.9856, "lr": 4.9557181268217225e-06, "epoch": 0.6054054054054054, "percentage": 6.05, "elapsed_time": "0:05:58", "remaining_time": "1:32:46"} -{"current_steps": 113, "total_steps": 1850, "loss": 0.8669, "lr": 4.954919078591521e-06, "epoch": 0.6108108108108108, "percentage": 6.11, "elapsed_time": "0:06:00", "remaining_time": "1:32:27"} -{"current_steps": 114, "total_steps": 1850, "loss": 0.7201, "lr": 4.954112951011628e-06, "epoch": 0.6162162162162163, "percentage": 6.16, "elapsed_time": "0:06:04", "remaining_time": "1:32:28"} -{"current_steps": 115, "total_steps": 1850, "loss": 0.9095, "lr": 4.9532997464067065e-06, "epoch": 0.6216216216216216, "percentage": 6.22, "elapsed_time": "0:06:06", "remaining_time": "1:32:08"} -{"current_steps": 116, "total_steps": 1850, "loss": 1.0213, "lr": 4.952479467121828e-06, "epoch": 0.6270270270270271, "percentage": 6.27, "elapsed_time": "0:06:08", "remaining_time": "1:31:47"} -{"current_steps": 117, "total_steps": 1850, "loss": 1.1154, "lr": 4.951652115522463e-06, "epoch": 0.6324324324324324, "percentage": 6.32, "elapsed_time": "0:06:10", "remaining_time": "1:31:24"} -{"current_steps": 118, "total_steps": 1850, "loss": 0.691, "lr": 4.950817693994481e-06, "epoch": 0.6378378378378379, "percentage": 6.38, "elapsed_time": "0:06:13", "remaining_time": "1:31:29"} -{"current_steps": 119, "total_steps": 1850, "loss": 0.7224, "lr": 4.949976204944135e-06, "epoch": 0.6432432432432432, "percentage": 6.43, "elapsed_time": "0:06:17", "remaining_time": "1:31:24"} -{"current_steps": 120, "total_steps": 1850, "loss": 0.9256, "lr": 4.949127650798063e-06, "epoch": 0.6486486486486487, "percentage": 6.49, "elapsed_time": "0:06:18", "remaining_time": "1:30:49"} -{"current_steps": 121, "total_steps": 1850, "loss": 0.6892, "lr": 4.948272034003275e-06, "epoch": 0.654054054054054, "percentage": 6.54, "elapsed_time": "0:06:18", "remaining_time": "1:30:15"} -{"current_steps": 122, "total_steps": 1850, "loss": 0.5878, "lr": 4.947409357027148e-06, "epoch": 0.6594594594594595, "percentage": 6.59, "elapsed_time": "0:06:20", "remaining_time": "1:29:48"} -{"current_steps": 123, "total_steps": 1850, "loss": 0.9904, "lr": 4.9465396223574165e-06, "epoch": 0.6648648648648648, "percentage": 6.65, "elapsed_time": "0:06:25", "remaining_time": "1:30:08"} -{"current_steps": 124, "total_steps": 1850, "loss": 1.1592, "lr": 4.945662832502172e-06, "epoch": 0.6702702702702703, "percentage": 6.7, "elapsed_time": "0:06:31", "remaining_time": "1:30:51"} -{"current_steps": 125, "total_steps": 1850, "loss": 1.0041, "lr": 4.944778989989847e-06, "epoch": 0.6756756756756757, "percentage": 6.76, "elapsed_time": "0:06:36", "remaining_time": "1:31:07"} -{"current_steps": 126, "total_steps": 1850, "loss": 0.7045, "lr": 4.943888097369216e-06, "epoch": 0.6810810810810811, "percentage": 6.81, "elapsed_time": "0:06:39", "remaining_time": "1:31:09"} -{"current_steps": 127, "total_steps": 1850, "loss": 0.6685, "lr": 4.942990157209381e-06, "epoch": 0.6864864864864865, "percentage": 6.86, "elapsed_time": "0:06:41", "remaining_time": "1:30:48"} -{"current_steps": 128, "total_steps": 1850, "loss": 0.8812, "lr": 4.9420851720997674e-06, "epoch": 0.6918918918918919, "percentage": 6.92, "elapsed_time": "0:06:43", "remaining_time": "1:30:30"} -{"current_steps": 129, "total_steps": 1850, "loss": 1.3014, "lr": 4.94117314465012e-06, "epoch": 0.6972972972972973, "percentage": 6.97, "elapsed_time": "0:06:45", "remaining_time": "1:30:10"} -{"current_steps": 130, "total_steps": 1850, "loss": 0.6978, "lr": 4.940254077490487e-06, "epoch": 0.7027027027027027, "percentage": 7.03, "elapsed_time": "0:06:48", "remaining_time": "1:30:10"} -{"current_steps": 131, "total_steps": 1850, "loss": 0.6249, "lr": 4.939327973271222e-06, "epoch": 0.7081081081081081, "percentage": 7.08, "elapsed_time": "0:06:50", "remaining_time": "1:29:47"} -{"current_steps": 132, "total_steps": 1850, "loss": 0.6423, "lr": 4.9383948346629665e-06, "epoch": 0.7135135135135136, "percentage": 7.14, "elapsed_time": "0:06:51", "remaining_time": "1:29:21"} -{"current_steps": 133, "total_steps": 1850, "loss": 0.7193, "lr": 4.937454664356652e-06, "epoch": 0.7189189189189189, "percentage": 7.19, "elapsed_time": "0:06:53", "remaining_time": "1:29:02"} -{"current_steps": 134, "total_steps": 1850, "loss": 0.7065, "lr": 4.9365074650634855e-06, "epoch": 0.7243243243243244, "percentage": 7.24, "elapsed_time": "0:06:56", "remaining_time": "1:28:53"} -{"current_steps": 135, "total_steps": 1850, "loss": 1.0046, "lr": 4.9355532395149445e-06, "epoch": 0.7297297297297297, "percentage": 7.3, "elapsed_time": "0:06:59", "remaining_time": "1:28:43"} -{"current_steps": 136, "total_steps": 1850, "loss": 0.6771, "lr": 4.9345919904627655e-06, "epoch": 0.7351351351351352, "percentage": 7.35, "elapsed_time": "0:07:03", "remaining_time": "1:28:52"} -{"current_steps": 137, "total_steps": 1850, "loss": 0.6589, "lr": 4.933623720678944e-06, "epoch": 0.7405405405405405, "percentage": 7.41, "elapsed_time": "0:07:06", "remaining_time": "1:28:56"} -{"current_steps": 138, "total_steps": 1850, "loss": 0.8755, "lr": 4.932648432955718e-06, "epoch": 0.745945945945946, "percentage": 7.46, "elapsed_time": "0:07:10", "remaining_time": "1:28:55"} -{"current_steps": 139, "total_steps": 1850, "loss": 0.6685, "lr": 4.931666130105564e-06, "epoch": 0.7513513513513513, "percentage": 7.51, "elapsed_time": "0:07:13", "remaining_time": "1:28:59"} -{"current_steps": 140, "total_steps": 1850, "loss": 0.8101, "lr": 4.930676814961189e-06, "epoch": 0.7567567567567568, "percentage": 7.57, "elapsed_time": "0:07:15", "remaining_time": "1:28:35"} -{"current_steps": 141, "total_steps": 1850, "loss": 0.8193, "lr": 4.92968049037552e-06, "epoch": 0.7621621621621621, "percentage": 7.62, "elapsed_time": "0:07:18", "remaining_time": "1:28:35"} -{"current_steps": 142, "total_steps": 1850, "loss": 0.7852, "lr": 4.9286771592217005e-06, "epoch": 0.7675675675675676, "percentage": 7.68, "elapsed_time": "0:07:20", "remaining_time": "1:28:15"} -{"current_steps": 143, "total_steps": 1850, "loss": 1.0388, "lr": 4.927666824393076e-06, "epoch": 0.772972972972973, "percentage": 7.73, "elapsed_time": "0:07:23", "remaining_time": "1:28:14"} -{"current_steps": 144, "total_steps": 1850, "loss": 0.8266, "lr": 4.926649488803191e-06, "epoch": 0.7783783783783784, "percentage": 7.78, "elapsed_time": "0:07:27", "remaining_time": "1:28:20"} -{"current_steps": 145, "total_steps": 1850, "loss": 0.4895, "lr": 4.925625155385776e-06, "epoch": 0.7837837837837838, "percentage": 7.84, "elapsed_time": "0:07:30", "remaining_time": "1:28:13"} -{"current_steps": 146, "total_steps": 1850, "loss": 0.8759, "lr": 4.924593827094743e-06, "epoch": 0.7891891891891892, "percentage": 7.89, "elapsed_time": "0:07:31", "remaining_time": "1:27:52"} -{"current_steps": 147, "total_steps": 1850, "loss": 0.701, "lr": 4.923555506904176e-06, "epoch": 0.7945945945945946, "percentage": 7.95, "elapsed_time": "0:07:34", "remaining_time": "1:27:42"} -{"current_steps": 148, "total_steps": 1850, "loss": 1.1327, "lr": 4.922510197808321e-06, "epoch": 0.8, "percentage": 8.0, "elapsed_time": "0:07:36", "remaining_time": "1:27:29"} -{"current_steps": 149, "total_steps": 1850, "loss": 0.7587, "lr": 4.921457902821578e-06, "epoch": 0.8054054054054054, "percentage": 8.05, "elapsed_time": "0:07:41", "remaining_time": "1:27:44"} -{"current_steps": 150, "total_steps": 1850, "loss": 1.2158, "lr": 4.920398624978493e-06, "epoch": 0.8108108108108109, "percentage": 8.11, "elapsed_time": "0:07:43", "remaining_time": "1:27:33"} -{"current_steps": 151, "total_steps": 1850, "loss": 0.6852, "lr": 4.919332367333748e-06, "epoch": 0.8162162162162162, "percentage": 8.16, "elapsed_time": "0:07:46", "remaining_time": "1:27:32"} -{"current_steps": 152, "total_steps": 1850, "loss": 0.6611, "lr": 4.918259132962154e-06, "epoch": 0.8216216216216217, "percentage": 8.22, "elapsed_time": "0:07:49", "remaining_time": "1:27:19"} -{"current_steps": 153, "total_steps": 1850, "loss": 0.7327, "lr": 4.917178924958638e-06, "epoch": 0.827027027027027, "percentage": 8.27, "elapsed_time": "0:07:50", "remaining_time": "1:26:58"} -{"current_steps": 154, "total_steps": 1850, "loss": 0.8528, "lr": 4.916091746438243e-06, "epoch": 0.8324324324324325, "percentage": 8.32, "elapsed_time": "0:07:51", "remaining_time": "1:26:36"} -{"current_steps": 155, "total_steps": 1850, "loss": 0.9141, "lr": 4.9149976005361085e-06, "epoch": 0.8378378378378378, "percentage": 8.38, "elapsed_time": "0:07:55", "remaining_time": "1:26:35"} -{"current_steps": 156, "total_steps": 1850, "loss": 1.1132, "lr": 4.913896490407467e-06, "epoch": 0.8432432432432433, "percentage": 8.43, "elapsed_time": "0:07:57", "remaining_time": "1:26:27"} -{"current_steps": 157, "total_steps": 1850, "loss": 0.7587, "lr": 4.912788419227635e-06, "epoch": 0.8486486486486486, "percentage": 8.49, "elapsed_time": "0:08:01", "remaining_time": "1:26:29"} -{"current_steps": 158, "total_steps": 1850, "loss": 0.9227, "lr": 4.911673390192002e-06, "epoch": 0.8540540540540541, "percentage": 8.54, "elapsed_time": "0:08:03", "remaining_time": "1:26:16"} -{"current_steps": 159, "total_steps": 1850, "loss": 0.8154, "lr": 4.910551406516023e-06, "epoch": 0.8594594594594595, "percentage": 8.59, "elapsed_time": "0:08:07", "remaining_time": "1:26:23"} -{"current_steps": 160, "total_steps": 1850, "loss": 0.9897, "lr": 4.909422471435207e-06, "epoch": 0.8648648648648649, "percentage": 8.65, "elapsed_time": "0:08:09", "remaining_time": "1:26:14"} -{"current_steps": 161, "total_steps": 1850, "loss": 0.6162, "lr": 4.90828658820511e-06, "epoch": 0.8702702702702703, "percentage": 8.7, "elapsed_time": "0:08:11", "remaining_time": "1:25:52"} -{"current_steps": 162, "total_steps": 1850, "loss": 0.5734, "lr": 4.907143760101325e-06, "epoch": 0.8756756756756757, "percentage": 8.76, "elapsed_time": "0:08:12", "remaining_time": "1:25:31"} -{"current_steps": 163, "total_steps": 1850, "loss": 0.8328, "lr": 4.905993990419472e-06, "epoch": 0.8810810810810811, "percentage": 8.81, "elapsed_time": "0:08:19", "remaining_time": "1:26:09"} -{"current_steps": 164, "total_steps": 1850, "loss": 0.6787, "lr": 4.904837282475187e-06, "epoch": 0.8864864864864865, "percentage": 8.86, "elapsed_time": "0:08:21", "remaining_time": "1:25:51"} -{"current_steps": 165, "total_steps": 1850, "loss": 0.9658, "lr": 4.9036736396041165e-06, "epoch": 0.8918918918918919, "percentage": 8.92, "elapsed_time": "0:08:24", "remaining_time": "1:25:49"} -{"current_steps": 166, "total_steps": 1850, "loss": 0.7899, "lr": 4.902503065161905e-06, "epoch": 0.8972972972972973, "percentage": 8.97, "elapsed_time": "0:08:30", "remaining_time": "1:26:16"} -{"current_steps": 167, "total_steps": 1850, "loss": 0.9476, "lr": 4.901325562524185e-06, "epoch": 0.9027027027027027, "percentage": 9.03, "elapsed_time": "0:08:34", "remaining_time": "1:26:20"} -{"current_steps": 168, "total_steps": 1850, "loss": 0.7589, "lr": 4.900141135086569e-06, "epoch": 0.9081081081081082, "percentage": 9.08, "elapsed_time": "0:08:41", "remaining_time": "1:26:57"} -{"current_steps": 169, "total_steps": 1850, "loss": 0.6724, "lr": 4.898949786264638e-06, "epoch": 0.9135135135135135, "percentage": 9.14, "elapsed_time": "0:08:43", "remaining_time": "1:26:49"} -{"current_steps": 170, "total_steps": 1850, "loss": 0.6968, "lr": 4.897751519493933e-06, "epoch": 0.918918918918919, "percentage": 9.19, "elapsed_time": "0:08:47", "remaining_time": "1:26:57"} -{"current_steps": 171, "total_steps": 1850, "loss": 0.7984, "lr": 4.896546338229945e-06, "epoch": 0.9243243243243243, "percentage": 9.24, "elapsed_time": "0:08:50", "remaining_time": "1:26:46"} -{"current_steps": 172, "total_steps": 1850, "loss": 0.6109, "lr": 4.8953342459481034e-06, "epoch": 0.9297297297297298, "percentage": 9.3, "elapsed_time": "0:08:53", "remaining_time": "1:26:47"} -{"current_steps": 173, "total_steps": 1850, "loss": 0.8126, "lr": 4.894115246143768e-06, "epoch": 0.9351351351351351, "percentage": 9.35, "elapsed_time": "0:08:57", "remaining_time": "1:26:51"} -{"current_steps": 174, "total_steps": 1850, "loss": 0.6862, "lr": 4.892889342332218e-06, "epoch": 0.9405405405405406, "percentage": 9.41, "elapsed_time": "0:08:59", "remaining_time": "1:26:40"} -{"current_steps": 175, "total_steps": 1850, "loss": 0.9895, "lr": 4.891656538048642e-06, "epoch": 0.9459459459459459, "percentage": 9.46, "elapsed_time": "0:09:06", "remaining_time": "1:27:09"} -{"current_steps": 176, "total_steps": 1850, "loss": 0.8481, "lr": 4.890416836848128e-06, "epoch": 0.9513513513513514, "percentage": 9.51, "elapsed_time": "0:09:08", "remaining_time": "1:27:01"} -{"current_steps": 177, "total_steps": 1850, "loss": 0.6478, "lr": 4.889170242305652e-06, "epoch": 0.9567567567567568, "percentage": 9.57, "elapsed_time": "0:09:10", "remaining_time": "1:26:42"} -{"current_steps": 178, "total_steps": 1850, "loss": 0.9714, "lr": 4.887916758016069e-06, "epoch": 0.9621621621621622, "percentage": 9.62, "elapsed_time": "0:09:13", "remaining_time": "1:26:39"} -{"current_steps": 179, "total_steps": 1850, "loss": 1.1264, "lr": 4.886656387594104e-06, "epoch": 0.9675675675675676, "percentage": 9.68, "elapsed_time": "0:09:17", "remaining_time": "1:26:42"} -{"current_steps": 180, "total_steps": 1850, "loss": 0.7664, "lr": 4.885389134674338e-06, "epoch": 0.972972972972973, "percentage": 9.73, "elapsed_time": "0:09:21", "remaining_time": "1:26:50"} -{"current_steps": 181, "total_steps": 1850, "loss": 0.6131, "lr": 4.884115002911197e-06, "epoch": 0.9783783783783784, "percentage": 9.78, "elapsed_time": "0:09:23", "remaining_time": "1:26:32"} -{"current_steps": 182, "total_steps": 1850, "loss": 0.8733, "lr": 4.88283399597895e-06, "epoch": 0.9837837837837838, "percentage": 9.84, "elapsed_time": "0:09:27", "remaining_time": "1:26:45"} -{"current_steps": 183, "total_steps": 1850, "loss": 0.643, "lr": 4.881546117571686e-06, "epoch": 0.9891891891891892, "percentage": 9.89, "elapsed_time": "0:09:30", "remaining_time": "1:26:36"} -{"current_steps": 184, "total_steps": 1850, "loss": 0.7287, "lr": 4.8802513714033135e-06, "epoch": 0.9945945945945946, "percentage": 9.95, "elapsed_time": "0:09:35", "remaining_time": "1:26:53"} -{"current_steps": 185, "total_steps": 1850, "loss": 0.9927, "lr": 4.878949761207545e-06, "epoch": 1.0, "percentage": 10.0, "elapsed_time": "0:09:37", "remaining_time": "1:26:37"} -{"current_steps": 186, "total_steps": 1850, "loss": 0.66, "lr": 4.8776412907378845e-06, "epoch": 1.0054054054054054, "percentage": 10.05, "elapsed_time": "1:13:09", "remaining_time": "10:54:31"} -{"current_steps": 187, "total_steps": 1850, "loss": 0.594, "lr": 4.876325963767623e-06, "epoch": 1.0108108108108107, "percentage": 10.11, "elapsed_time": "1:13:11", "remaining_time": "10:50:52"} -{"current_steps": 188, "total_steps": 1850, "loss": 0.5825, "lr": 4.875003784089822e-06, "epoch": 1.0162162162162163, "percentage": 10.16, "elapsed_time": "1:13:15", "remaining_time": "10:47:35"} -{"current_steps": 189, "total_steps": 1850, "loss": 0.6594, "lr": 4.873674755517305e-06, "epoch": 1.0216216216216216, "percentage": 10.22, "elapsed_time": "1:13:17", "remaining_time": "10:44:06"} -{"current_steps": 190, "total_steps": 1850, "loss": 0.7536, "lr": 4.872338881882645e-06, "epoch": 1.027027027027027, "percentage": 10.27, "elapsed_time": "1:13:20", "remaining_time": "10:40:49"} -{"current_steps": 191, "total_steps": 1850, "loss": 0.4849, "lr": 4.870996167038154e-06, "epoch": 1.0324324324324325, "percentage": 10.32, "elapsed_time": "1:13:24", "remaining_time": "10:37:37"} -{"current_steps": 192, "total_steps": 1850, "loss": 0.3771, "lr": 4.869646614855877e-06, "epoch": 1.037837837837838, "percentage": 10.38, "elapsed_time": "1:13:26", "remaining_time": "10:34:12"} -{"current_steps": 193, "total_steps": 1850, "loss": 0.8545, "lr": 4.868290229227567e-06, "epoch": 1.0432432432432432, "percentage": 10.43, "elapsed_time": "1:13:33", "remaining_time": "10:31:28"} -{"current_steps": 194, "total_steps": 1850, "loss": 0.3698, "lr": 4.866927014064692e-06, "epoch": 1.0486486486486486, "percentage": 10.49, "elapsed_time": "1:13:34", "remaining_time": "10:28:05"} -{"current_steps": 195, "total_steps": 1850, "loss": 0.8468, "lr": 4.86555697329841e-06, "epoch": 1.054054054054054, "percentage": 10.54, "elapsed_time": "1:13:37", "remaining_time": "10:24:55"} -{"current_steps": 196, "total_steps": 1850, "loss": 0.8232, "lr": 4.864180110879562e-06, "epoch": 1.0594594594594595, "percentage": 10.59, "elapsed_time": "1:13:44", "remaining_time": "10:22:19"} -{"current_steps": 197, "total_steps": 1850, "loss": 0.4097, "lr": 4.862796430778663e-06, "epoch": 1.0648648648648649, "percentage": 10.65, "elapsed_time": "1:13:46", "remaining_time": "10:18:59"} -{"current_steps": 198, "total_steps": 1850, "loss": 0.6746, "lr": 4.861405936985889e-06, "epoch": 1.0702702702702702, "percentage": 10.7, "elapsed_time": "1:13:50", "remaining_time": "10:16:08"} -{"current_steps": 199, "total_steps": 1850, "loss": 0.6605, "lr": 4.860008633511059e-06, "epoch": 1.0756756756756758, "percentage": 10.76, "elapsed_time": "1:13:55", "remaining_time": "10:13:19"} -{"current_steps": 200, "total_steps": 1850, "loss": 0.471, "lr": 4.8586045243836384e-06, "epoch": 1.0810810810810811, "percentage": 10.81, "elapsed_time": "1:13:59", "remaining_time": "10:10:26"} -{"current_steps": 201, "total_steps": 1850, "loss": 0.7665, "lr": 4.857193613652711e-06, "epoch": 1.0864864864864865, "percentage": 10.86, "elapsed_time": "1:14:06", "remaining_time": "10:07:56"} -{"current_steps": 202, "total_steps": 1850, "loss": 0.6436, "lr": 4.8557759053869775e-06, "epoch": 1.0918918918918918, "percentage": 10.92, "elapsed_time": "1:14:07", "remaining_time": "10:04:41"} -{"current_steps": 203, "total_steps": 1850, "loss": 0.4642, "lr": 4.854351403674741e-06, "epoch": 1.0972972972972972, "percentage": 10.97, "elapsed_time": "1:14:09", "remaining_time": "10:01:36"} -{"current_steps": 204, "total_steps": 1850, "loss": 0.5737, "lr": 4.852920112623895e-06, "epoch": 1.1027027027027028, "percentage": 11.03, "elapsed_time": "1:14:10", "remaining_time": "9:58:28"} -{"current_steps": 205, "total_steps": 1850, "loss": 0.7302, "lr": 4.851482036361912e-06, "epoch": 1.1081081081081081, "percentage": 11.08, "elapsed_time": "1:14:11", "remaining_time": "9:55:19"} -{"current_steps": 206, "total_steps": 1850, "loss": 0.5229, "lr": 4.850037179035829e-06, "epoch": 1.1135135135135135, "percentage": 11.14, "elapsed_time": "1:14:13", "remaining_time": "9:52:21"} -{"current_steps": 207, "total_steps": 1850, "loss": 0.5529, "lr": 4.8485855448122425e-06, "epoch": 1.118918918918919, "percentage": 11.19, "elapsed_time": "1:14:15", "remaining_time": "9:49:24"} -{"current_steps": 208, "total_steps": 1850, "loss": 0.3635, "lr": 4.847127137877286e-06, "epoch": 1.1243243243243244, "percentage": 11.24, "elapsed_time": "1:14:17", "remaining_time": "9:46:27"} -{"current_steps": 209, "total_steps": 1850, "loss": 0.8149, "lr": 4.8456619624366285e-06, "epoch": 1.1297297297297297, "percentage": 11.3, "elapsed_time": "1:14:20", "remaining_time": "9:43:39"} -{"current_steps": 210, "total_steps": 1850, "loss": 0.8333, "lr": 4.844190022715456e-06, "epoch": 1.135135135135135, "percentage": 11.35, "elapsed_time": "1:14:23", "remaining_time": "9:40:55"} -{"current_steps": 211, "total_steps": 1850, "loss": 0.3717, "lr": 4.84271132295846e-06, "epoch": 1.1405405405405404, "percentage": 11.41, "elapsed_time": "1:14:26", "remaining_time": "9:38:16"} -{"current_steps": 212, "total_steps": 1850, "loss": 0.5994, "lr": 4.841225867429826e-06, "epoch": 1.145945945945946, "percentage": 11.46, "elapsed_time": "1:14:28", "remaining_time": "9:35:28"} -{"current_steps": 213, "total_steps": 1850, "loss": 0.8382, "lr": 4.839733660413224e-06, "epoch": 1.1513513513513514, "percentage": 11.51, "elapsed_time": "1:14:31", "remaining_time": "9:32:48"} -{"current_steps": 214, "total_steps": 1850, "loss": 0.818, "lr": 4.838234706211792e-06, "epoch": 1.1567567567567567, "percentage": 11.57, "elapsed_time": "1:14:35", "remaining_time": "9:30:11"} -{"current_steps": 215, "total_steps": 1850, "loss": 0.4267, "lr": 4.836729009148124e-06, "epoch": 1.1621621621621623, "percentage": 11.62, "elapsed_time": "1:14:38", "remaining_time": "9:27:37"} -{"current_steps": 216, "total_steps": 1850, "loss": 0.3472, "lr": 4.835216573564261e-06, "epoch": 1.1675675675675676, "percentage": 11.68, "elapsed_time": "1:14:41", "remaining_time": "9:25:04"} -{"current_steps": 217, "total_steps": 1850, "loss": 0.6323, "lr": 4.833697403821672e-06, "epoch": 1.172972972972973, "percentage": 11.73, "elapsed_time": "1:14:46", "remaining_time": "9:22:40"} -{"current_steps": 218, "total_steps": 1850, "loss": 0.6831, "lr": 4.8321715043012516e-06, "epoch": 1.1783783783783783, "percentage": 11.78, "elapsed_time": "1:14:49", "remaining_time": "9:20:08"} -{"current_steps": 219, "total_steps": 1850, "loss": 0.3682, "lr": 4.830638879403296e-06, "epoch": 1.1837837837837837, "percentage": 11.84, "elapsed_time": "1:14:51", "remaining_time": "9:17:26"} -{"current_steps": 220, "total_steps": 1850, "loss": 0.4154, "lr": 4.8290995335475e-06, "epoch": 1.1891891891891893, "percentage": 11.89, "elapsed_time": "1:14:53", "remaining_time": "9:14:51"} -{"current_steps": 221, "total_steps": 1850, "loss": 0.3991, "lr": 4.827553471172935e-06, "epoch": 1.1945945945945946, "percentage": 11.95, "elapsed_time": "1:14:54", "remaining_time": "9:12:11"} -{"current_steps": 222, "total_steps": 1850, "loss": 0.4538, "lr": 4.826000696738045e-06, "epoch": 1.2, "percentage": 12.0, "elapsed_time": "1:14:57", "remaining_time": "9:09:38"} -{"current_steps": 223, "total_steps": 1850, "loss": 0.7692, "lr": 4.824441214720629e-06, "epoch": 1.2054054054054055, "percentage": 12.05, "elapsed_time": "1:14:59", "remaining_time": "9:07:07"} -{"current_steps": 224, "total_steps": 1850, "loss": 0.6038, "lr": 4.8228750296178275e-06, "epoch": 1.2108108108108109, "percentage": 12.11, "elapsed_time": "1:15:02", "remaining_time": "9:04:44"} -{"current_steps": 225, "total_steps": 1850, "loss": 0.4147, "lr": 4.821302145946113e-06, "epoch": 1.2162162162162162, "percentage": 12.16, "elapsed_time": "1:15:05", "remaining_time": "9:02:16"} -{"current_steps": 226, "total_steps": 1850, "loss": 0.5398, "lr": 4.819722568241274e-06, "epoch": 1.2216216216216216, "percentage": 12.22, "elapsed_time": "1:15:07", "remaining_time": "8:59:53"} -{"current_steps": 227, "total_steps": 1850, "loss": 0.3864, "lr": 4.818136301058401e-06, "epoch": 1.227027027027027, "percentage": 12.27, "elapsed_time": "1:15:11", "remaining_time": "8:57:37"} -{"current_steps": 228, "total_steps": 1850, "loss": 0.5712, "lr": 4.816543348971879e-06, "epoch": 1.2324324324324325, "percentage": 12.32, "elapsed_time": "1:15:15", "remaining_time": "8:55:21"} -{"current_steps": 229, "total_steps": 1850, "loss": 0.662, "lr": 4.814943716575368e-06, "epoch": 1.2378378378378379, "percentage": 12.38, "elapsed_time": "1:15:16", "remaining_time": "8:52:49"} -{"current_steps": 230, "total_steps": 1850, "loss": 0.8661, "lr": 4.813337408481793e-06, "epoch": 1.2432432432432432, "percentage": 12.43, "elapsed_time": "1:15:19", "remaining_time": "8:50:32"} -{"current_steps": 231, "total_steps": 1850, "loss": 0.9218, "lr": 4.811724429323329e-06, "epoch": 1.2486486486486488, "percentage": 12.49, "elapsed_time": "1:15:21", "remaining_time": "8:48:08"} -{"current_steps": 232, "total_steps": 1850, "loss": 0.5597, "lr": 4.810104783751389e-06, "epoch": 1.2540540540540541, "percentage": 12.54, "elapsed_time": "1:15:22", "remaining_time": "8:45:43"} -{"current_steps": 233, "total_steps": 1850, "loss": 0.4786, "lr": 4.8084784764366125e-06, "epoch": 1.2594594594594595, "percentage": 12.59, "elapsed_time": "1:15:24", "remaining_time": "8:43:17"} -{"current_steps": 234, "total_steps": 1850, "loss": 0.5219, "lr": 4.806845512068846e-06, "epoch": 1.2648648648648648, "percentage": 12.65, "elapsed_time": "1:15:28", "remaining_time": "8:41:15"} -{"current_steps": 235, "total_steps": 1850, "loss": 0.643, "lr": 4.805205895357137e-06, "epoch": 1.2702702702702702, "percentage": 12.7, "elapsed_time": "1:15:29", "remaining_time": "8:38:50"} -{"current_steps": 236, "total_steps": 1850, "loss": 0.5858, "lr": 4.803559631029713e-06, "epoch": 1.2756756756756757, "percentage": 12.76, "elapsed_time": "1:15:32", "remaining_time": "8:36:35"} -{"current_steps": 237, "total_steps": 1850, "loss": 0.4185, "lr": 4.801906723833973e-06, "epoch": 1.281081081081081, "percentage": 12.81, "elapsed_time": "1:15:35", "remaining_time": "8:34:25"} -{"current_steps": 238, "total_steps": 1850, "loss": 0.4917, "lr": 4.8002471785364734e-06, "epoch": 1.2864864864864864, "percentage": 12.86, "elapsed_time": "1:15:36", "remaining_time": "8:32:08"} -{"current_steps": 239, "total_steps": 1850, "loss": 0.645, "lr": 4.798580999922913e-06, "epoch": 1.291891891891892, "percentage": 12.92, "elapsed_time": "1:15:38", "remaining_time": "8:29:52"} -{"current_steps": 240, "total_steps": 1850, "loss": 0.5378, "lr": 4.796908192798117e-06, "epoch": 1.2972972972972974, "percentage": 12.97, "elapsed_time": "1:15:39", "remaining_time": "8:27:31"} -{"current_steps": 241, "total_steps": 1850, "loss": 0.5197, "lr": 4.7952287619860276e-06, "epoch": 1.3027027027027027, "percentage": 13.03, "elapsed_time": "1:15:42", "remaining_time": "8:25:24"} -{"current_steps": 242, "total_steps": 1850, "loss": 1.0226, "lr": 4.793542712329689e-06, "epoch": 1.308108108108108, "percentage": 13.08, "elapsed_time": "1:15:45", "remaining_time": "8:23:21"} -{"current_steps": 243, "total_steps": 1850, "loss": 0.5502, "lr": 4.791850048691228e-06, "epoch": 1.3135135135135134, "percentage": 13.14, "elapsed_time": "1:15:49", "remaining_time": "8:21:27"} -{"current_steps": 244, "total_steps": 1850, "loss": 0.6976, "lr": 4.79015077595185e-06, "epoch": 1.318918918918919, "percentage": 13.19, "elapsed_time": "1:15:56", "remaining_time": "8:19:47"} -{"current_steps": 245, "total_steps": 1850, "loss": 0.4795, "lr": 4.788444899011816e-06, "epoch": 1.3243243243243243, "percentage": 13.24, "elapsed_time": "1:15:58", "remaining_time": "8:17:44"} -{"current_steps": 246, "total_steps": 1850, "loss": 0.6526, "lr": 4.786732422790432e-06, "epoch": 1.3297297297297297, "percentage": 13.3, "elapsed_time": "1:16:02", "remaining_time": "8:15:46"} -{"current_steps": 247, "total_steps": 1850, "loss": 0.5551, "lr": 4.785013352226036e-06, "epoch": 1.3351351351351353, "percentage": 13.35, "elapsed_time": "1:16:03", "remaining_time": "8:13:34"} -{"current_steps": 248, "total_steps": 1850, "loss": 0.3151, "lr": 4.7832876922759805e-06, "epoch": 1.3405405405405406, "percentage": 13.41, "elapsed_time": "1:16:05", "remaining_time": "8:11:31"} -{"current_steps": 249, "total_steps": 1850, "loss": 0.6713, "lr": 4.781555447916622e-06, "epoch": 1.345945945945946, "percentage": 13.46, "elapsed_time": "1:16:08", "remaining_time": "8:09:31"} -{"current_steps": 250, "total_steps": 1850, "loss": 0.437, "lr": 4.779816624143302e-06, "epoch": 1.3513513513513513, "percentage": 13.51, "elapsed_time": "1:16:09", "remaining_time": "8:07:25"} -{"current_steps": 251, "total_steps": 1850, "loss": 0.7632, "lr": 4.77807122597034e-06, "epoch": 1.3567567567567567, "percentage": 13.57, "elapsed_time": "1:16:13", "remaining_time": "8:05:34"} -{"current_steps": 252, "total_steps": 1850, "loss": 0.4894, "lr": 4.776319258431009e-06, "epoch": 1.3621621621621622, "percentage": 13.62, "elapsed_time": "1:16:17", "remaining_time": "8:03:45"} -{"current_steps": 253, "total_steps": 1850, "loss": 0.4456, "lr": 4.77456072657753e-06, "epoch": 1.3675675675675676, "percentage": 13.68, "elapsed_time": "1:16:19", "remaining_time": "8:01:46"} -{"current_steps": 254, "total_steps": 1850, "loss": 0.5381, "lr": 4.772795635481053e-06, "epoch": 1.372972972972973, "percentage": 13.73, "elapsed_time": "1:16:21", "remaining_time": "7:59:45"} -{"current_steps": 255, "total_steps": 1850, "loss": 1.0302, "lr": 4.77102399023164e-06, "epoch": 1.3783783783783785, "percentage": 13.78, "elapsed_time": "1:16:24", "remaining_time": "7:57:55"} -{"current_steps": 256, "total_steps": 1850, "loss": 0.4875, "lr": 4.769245795938261e-06, "epoch": 1.3837837837837839, "percentage": 13.84, "elapsed_time": "1:16:26", "remaining_time": "7:55:57"} -{"current_steps": 257, "total_steps": 1850, "loss": 0.4923, "lr": 4.767461057728763e-06, "epoch": 1.3891891891891892, "percentage": 13.89, "elapsed_time": "1:16:29", "remaining_time": "7:54:07"} -{"current_steps": 258, "total_steps": 1850, "loss": 0.6699, "lr": 4.76566978074987e-06, "epoch": 1.3945945945945946, "percentage": 13.95, "elapsed_time": "1:16:33", "remaining_time": "7:52:25"} -{"current_steps": 259, "total_steps": 1850, "loss": 0.6117, "lr": 4.7638719701671586e-06, "epoch": 1.4, "percentage": 14.0, "elapsed_time": "1:16:37", "remaining_time": "7:50:40"} -{"current_steps": 260, "total_steps": 1850, "loss": 0.8534, "lr": 4.762067631165049e-06, "epoch": 1.4054054054054055, "percentage": 14.05, "elapsed_time": "1:16:40", "remaining_time": "7:48:55"} -{"current_steps": 261, "total_steps": 1850, "loss": 0.5057, "lr": 4.760256768946787e-06, "epoch": 1.4108108108108108, "percentage": 14.11, "elapsed_time": "1:16:47", "remaining_time": "7:47:34"} -{"current_steps": 262, "total_steps": 1850, "loss": 0.7286, "lr": 4.758439388734429e-06, "epoch": 1.4162162162162162, "percentage": 14.16, "elapsed_time": "1:16:50", "remaining_time": "7:45:45"} -{"current_steps": 263, "total_steps": 1850, "loss": 0.9827, "lr": 4.7566154957688276e-06, "epoch": 1.4216216216216218, "percentage": 14.22, "elapsed_time": "1:16:52", "remaining_time": "7:43:50"} -{"current_steps": 264, "total_steps": 1850, "loss": 0.7042, "lr": 4.754785095309617e-06, "epoch": 1.427027027027027, "percentage": 14.27, "elapsed_time": "1:16:53", "remaining_time": "7:41:56"} -{"current_steps": 265, "total_steps": 1850, "loss": 0.5179, "lr": 4.752948192635199e-06, "epoch": 1.4324324324324325, "percentage": 14.32, "elapsed_time": "1:16:55", "remaining_time": "7:40:04"} -{"current_steps": 266, "total_steps": 1850, "loss": 0.8527, "lr": 4.751104793042722e-06, "epoch": 1.4378378378378378, "percentage": 14.38, "elapsed_time": "1:16:59", "remaining_time": "7:38:28"} -{"current_steps": 267, "total_steps": 1850, "loss": 0.5627, "lr": 4.7492549018480725e-06, "epoch": 1.4432432432432432, "percentage": 14.43, "elapsed_time": "1:17:02", "remaining_time": "7:36:46"} -{"current_steps": 268, "total_steps": 1850, "loss": 0.8981, "lr": 4.747398524385858e-06, "epoch": 1.4486486486486487, "percentage": 14.49, "elapsed_time": "1:17:05", "remaining_time": "7:35:01"} -{"current_steps": 269, "total_steps": 1850, "loss": 0.5455, "lr": 4.745535666009389e-06, "epoch": 1.454054054054054, "percentage": 14.54, "elapsed_time": "1:17:07", "remaining_time": "7:33:17"} -{"current_steps": 270, "total_steps": 1850, "loss": 0.4348, "lr": 4.743666332090664e-06, "epoch": 1.4594594594594594, "percentage": 14.59, "elapsed_time": "1:17:10", "remaining_time": "7:31:37"} -{"current_steps": 271, "total_steps": 1850, "loss": 0.5524, "lr": 4.74179052802036e-06, "epoch": 1.464864864864865, "percentage": 14.65, "elapsed_time": "1:17:13", "remaining_time": "7:29:54"} -{"current_steps": 272, "total_steps": 1850, "loss": 0.7469, "lr": 4.739908259207807e-06, "epoch": 1.4702702702702704, "percentage": 14.7, "elapsed_time": "1:17:14", "remaining_time": "7:28:09"} -{"current_steps": 273, "total_steps": 1850, "loss": 0.7216, "lr": 4.738019531080981e-06, "epoch": 1.4756756756756757, "percentage": 14.76, "elapsed_time": "1:17:18", "remaining_time": "7:26:32"} -{"current_steps": 274, "total_steps": 1850, "loss": 0.7527, "lr": 4.7361243490864825e-06, "epoch": 1.481081081081081, "percentage": 14.81, "elapsed_time": "1:17:23", "remaining_time": "7:25:10"} -{"current_steps": 275, "total_steps": 1850, "loss": 0.7437, "lr": 4.734222718689527e-06, "epoch": 1.4864864864864864, "percentage": 14.86, "elapsed_time": "1:17:29", "remaining_time": "7:23:47"} -{"current_steps": 276, "total_steps": 1850, "loss": 0.5187, "lr": 4.732314645373922e-06, "epoch": 1.491891891891892, "percentage": 14.92, "elapsed_time": "1:17:32", "remaining_time": "7:22:11"} -{"current_steps": 277, "total_steps": 1850, "loss": 0.7186, "lr": 4.730400134642055e-06, "epoch": 1.4972972972972973, "percentage": 14.97, "elapsed_time": "1:17:35", "remaining_time": "7:20:38"} -{"current_steps": 278, "total_steps": 1850, "loss": 0.9655, "lr": 4.728479192014879e-06, "epoch": 1.5027027027027027, "percentage": 15.03, "elapsed_time": "1:17:42", "remaining_time": "7:19:23"} -{"current_steps": 279, "total_steps": 1850, "loss": 0.6251, "lr": 4.726551823031895e-06, "epoch": 1.5081081081081082, "percentage": 15.08, "elapsed_time": "1:17:46", "remaining_time": "7:17:57"} -{"current_steps": 280, "total_steps": 1850, "loss": 0.4805, "lr": 4.7246180332511335e-06, "epoch": 1.5135135135135136, "percentage": 15.14, "elapsed_time": "1:17:49", "remaining_time": "7:16:19"} -{"current_steps": 281, "total_steps": 1850, "loss": 1.0939, "lr": 4.722677828249142e-06, "epoch": 1.518918918918919, "percentage": 15.19, "elapsed_time": "1:17:52", "remaining_time": "7:14:49"} -{"current_steps": 282, "total_steps": 1850, "loss": 0.9485, "lr": 4.720731213620972e-06, "epoch": 1.5243243243243243, "percentage": 15.24, "elapsed_time": "1:17:55", "remaining_time": "7:13:18"} -{"current_steps": 283, "total_steps": 1850, "loss": 0.5805, "lr": 4.718778194980152e-06, "epoch": 1.5297297297297296, "percentage": 15.3, "elapsed_time": "1:17:59", "remaining_time": "7:11:48"} -{"current_steps": 284, "total_steps": 1850, "loss": 0.77, "lr": 4.7168187779586805e-06, "epoch": 1.535135135135135, "percentage": 15.35, "elapsed_time": "1:18:02", "remaining_time": "7:10:18"} -{"current_steps": 285, "total_steps": 1850, "loss": 0.5932, "lr": 4.71485296820701e-06, "epoch": 1.5405405405405406, "percentage": 15.41, "elapsed_time": "1:18:04", "remaining_time": "7:08:41"} -{"current_steps": 286, "total_steps": 1850, "loss": 0.6296, "lr": 4.7128807713940245e-06, "epoch": 1.545945945945946, "percentage": 15.46, "elapsed_time": "1:18:09", "remaining_time": "7:07:26"} -{"current_steps": 287, "total_steps": 1850, "loss": 0.6201, "lr": 4.710902193207028e-06, "epoch": 1.5513513513513515, "percentage": 15.51, "elapsed_time": "1:18:15", "remaining_time": "7:06:12"} -{"current_steps": 288, "total_steps": 1850, "loss": 0.5682, "lr": 4.708917239351727e-06, "epoch": 1.5567567567567568, "percentage": 15.57, "elapsed_time": "1:18:22", "remaining_time": "7:05:06"} -{"current_steps": 289, "total_steps": 1850, "loss": 0.8877, "lr": 4.706925915552214e-06, "epoch": 1.5621621621621622, "percentage": 15.62, "elapsed_time": "1:18:24", "remaining_time": "7:03:29"} -{"current_steps": 290, "total_steps": 1850, "loss": 0.6521, "lr": 4.704928227550949e-06, "epoch": 1.5675675675675675, "percentage": 15.68, "elapsed_time": "1:18:28", "remaining_time": "7:02:07"} -{"current_steps": 291, "total_steps": 1850, "loss": 0.4929, "lr": 4.702924181108745e-06, "epoch": 1.572972972972973, "percentage": 15.73, "elapsed_time": "1:18:30", "remaining_time": "7:00:36"} -{"current_steps": 292, "total_steps": 1850, "loss": 0.4515, "lr": 4.700913782004755e-06, "epoch": 1.5783783783783782, "percentage": 15.78, "elapsed_time": "1:18:32", "remaining_time": "6:59:06"} -{"current_steps": 293, "total_steps": 1850, "loss": 0.5477, "lr": 4.698897036036446e-06, "epoch": 1.5837837837837838, "percentage": 15.84, "elapsed_time": "1:18:37", "remaining_time": "6:57:48"} -{"current_steps": 294, "total_steps": 1850, "loss": 0.9589, "lr": 4.696873949019591e-06, "epoch": 1.5891891891891892, "percentage": 15.89, "elapsed_time": "1:18:39", "remaining_time": "6:56:18"} -{"current_steps": 295, "total_steps": 1850, "loss": 0.4425, "lr": 4.694844526788248e-06, "epoch": 1.5945945945945947, "percentage": 15.95, "elapsed_time": "1:18:42", "remaining_time": "6:54:51"} -{"current_steps": 296, "total_steps": 1850, "loss": 0.4899, "lr": 4.692808775194745e-06, "epoch": 1.6, "percentage": 16.0, "elapsed_time": "1:18:48", "remaining_time": "6:53:43"} -{"current_steps": 297, "total_steps": 1850, "loss": 0.4884, "lr": 4.690766700109659e-06, "epoch": 1.6054054054054054, "percentage": 16.05, "elapsed_time": "1:18:53", "remaining_time": "6:52:29"} -{"current_steps": 298, "total_steps": 1850, "loss": 0.8977, "lr": 4.688718307421807e-06, "epoch": 1.6108108108108108, "percentage": 16.11, "elapsed_time": "1:18:55", "remaining_time": "6:51:02"} -{"current_steps": 299, "total_steps": 1850, "loss": 0.6833, "lr": 4.686663603038222e-06, "epoch": 1.6162162162162161, "percentage": 16.16, "elapsed_time": "1:18:56", "remaining_time": "6:49:29"} -{"current_steps": 300, "total_steps": 1850, "loss": 0.9141, "lr": 4.6846025928841365e-06, "epoch": 1.6216216216216215, "percentage": 16.22, "elapsed_time": "1:19:01", "remaining_time": "6:48:19"} -{"current_steps": 301, "total_steps": 1850, "loss": 0.5121, "lr": 4.6825352829029705e-06, "epoch": 1.627027027027027, "percentage": 16.27, "elapsed_time": "1:19:05", "remaining_time": "6:47:03"} -{"current_steps": 302, "total_steps": 1850, "loss": 0.5399, "lr": 4.68046167905631e-06, "epoch": 1.6324324324324324, "percentage": 16.32, "elapsed_time": "1:19:11", "remaining_time": "6:45:54"} -{"current_steps": 303, "total_steps": 1850, "loss": 0.7921, "lr": 4.678381787323889e-06, "epoch": 1.637837837837838, "percentage": 16.38, "elapsed_time": "1:19:15", "remaining_time": "6:44:42"} -{"current_steps": 304, "total_steps": 1850, "loss": 0.7178, "lr": 4.676295613703577e-06, "epoch": 1.6432432432432433, "percentage": 16.43, "elapsed_time": "1:19:19", "remaining_time": "6:43:26"} -{"current_steps": 305, "total_steps": 1850, "loss": 0.7162, "lr": 4.674203164211357e-06, "epoch": 1.6486486486486487, "percentage": 16.49, "elapsed_time": "1:19:22", "remaining_time": "6:42:02"} -{"current_steps": 306, "total_steps": 1850, "loss": 0.6539, "lr": 4.67210444488131e-06, "epoch": 1.654054054054054, "percentage": 16.54, "elapsed_time": "1:19:25", "remaining_time": "6:40:44"} -{"current_steps": 307, "total_steps": 1850, "loss": 0.7214, "lr": 4.669999461765599e-06, "epoch": 1.6594594594594594, "percentage": 16.59, "elapsed_time": "1:19:26", "remaining_time": "6:39:18"} -{"current_steps": 308, "total_steps": 1850, "loss": 0.7451, "lr": 4.6678882209344474e-06, "epoch": 1.6648648648648647, "percentage": 16.65, "elapsed_time": "1:19:28", "remaining_time": "6:37:54"} -{"current_steps": 309, "total_steps": 1850, "loss": 0.6464, "lr": 4.665770728476127e-06, "epoch": 1.6702702702702703, "percentage": 16.7, "elapsed_time": "1:19:32", "remaining_time": "6:36:39"} -{"current_steps": 310, "total_steps": 1850, "loss": 0.6669, "lr": 4.663646990496939e-06, "epoch": 1.6756756756756757, "percentage": 16.76, "elapsed_time": "1:19:37", "remaining_time": "6:35:35"} -{"current_steps": 311, "total_steps": 1850, "loss": 0.8972, "lr": 4.661517013121189e-06, "epoch": 1.6810810810810812, "percentage": 16.81, "elapsed_time": "1:19:41", "remaining_time": "6:34:19"} -{"current_steps": 312, "total_steps": 1850, "loss": 0.6286, "lr": 4.659380802491181e-06, "epoch": 1.6864864864864866, "percentage": 16.86, "elapsed_time": "1:19:41", "remaining_time": "6:32:51"} -{"current_steps": 313, "total_steps": 1850, "loss": 0.3631, "lr": 4.6572383647671915e-06, "epoch": 1.691891891891892, "percentage": 16.92, "elapsed_time": "1:19:44", "remaining_time": "6:31:32"} -{"current_steps": 314, "total_steps": 1850, "loss": 0.5682, "lr": 4.655089706127457e-06, "epoch": 1.6972972972972973, "percentage": 16.97, "elapsed_time": "1:19:46", "remaining_time": "6:30:16"} -{"current_steps": 315, "total_steps": 1850, "loss": 0.5457, "lr": 4.652934832768148e-06, "epoch": 1.7027027027027026, "percentage": 17.03, "elapsed_time": "1:19:52", "remaining_time": "6:29:14"} -{"current_steps": 316, "total_steps": 1850, "loss": 0.6601, "lr": 4.650773750903363e-06, "epoch": 1.708108108108108, "percentage": 17.08, "elapsed_time": "1:19:56", "remaining_time": "6:28:04"} -{"current_steps": 317, "total_steps": 1850, "loss": 0.5882, "lr": 4.6486064667651005e-06, "epoch": 1.7135135135135136, "percentage": 17.14, "elapsed_time": "1:19:58", "remaining_time": "6:26:46"} -{"current_steps": 318, "total_steps": 1850, "loss": 0.7628, "lr": 4.646432986603245e-06, "epoch": 1.718918918918919, "percentage": 17.19, "elapsed_time": "1:20:03", "remaining_time": "6:25:39"} -{"current_steps": 319, "total_steps": 1850, "loss": 0.6877, "lr": 4.644253316685552e-06, "epoch": 1.7243243243243245, "percentage": 17.24, "elapsed_time": "1:20:05", "remaining_time": "6:24:22"} -{"current_steps": 320, "total_steps": 1850, "loss": 0.7026, "lr": 4.6420674632976205e-06, "epoch": 1.7297297297297298, "percentage": 17.3, "elapsed_time": "1:20:08", "remaining_time": "6:23:10"} -{"current_steps": 321, "total_steps": 1850, "loss": 0.5236, "lr": 4.639875432742886e-06, "epoch": 1.7351351351351352, "percentage": 17.35, "elapsed_time": "1:20:09", "remaining_time": "6:21:48"} -{"current_steps": 322, "total_steps": 1850, "loss": 0.6463, "lr": 4.6376772313425975e-06, "epoch": 1.7405405405405405, "percentage": 17.41, "elapsed_time": "1:20:10", "remaining_time": "6:20:27"} -{"current_steps": 323, "total_steps": 1850, "loss": 0.6903, "lr": 4.635472865435795e-06, "epoch": 1.7459459459459459, "percentage": 17.46, "elapsed_time": "1:20:13", "remaining_time": "6:19:16"} -{"current_steps": 324, "total_steps": 1850, "loss": 0.7342, "lr": 4.6332623413792995e-06, "epoch": 1.7513513513513512, "percentage": 17.51, "elapsed_time": "1:20:16", "remaining_time": "6:18:02"} -{"current_steps": 325, "total_steps": 1850, "loss": 0.4302, "lr": 4.6310456655476874e-06, "epoch": 1.7567567567567568, "percentage": 17.57, "elapsed_time": "1:20:18", "remaining_time": "6:16:48"} -{"current_steps": 326, "total_steps": 1850, "loss": 0.5108, "lr": 4.6288228443332786e-06, "epoch": 1.7621621621621621, "percentage": 17.62, "elapsed_time": "1:20:20", "remaining_time": "6:15:32"} -{"current_steps": 327, "total_steps": 1850, "loss": 0.7646, "lr": 4.626593884146111e-06, "epoch": 1.7675675675675677, "percentage": 17.68, "elapsed_time": "1:20:23", "remaining_time": "6:14:25"} -{"current_steps": 328, "total_steps": 1850, "loss": 0.5529, "lr": 4.624358791413928e-06, "epoch": 1.772972972972973, "percentage": 17.73, "elapsed_time": "1:20:25", "remaining_time": "6:13:12"} -{"current_steps": 329, "total_steps": 1850, "loss": 0.609, "lr": 4.622117572582159e-06, "epoch": 1.7783783783783784, "percentage": 17.78, "elapsed_time": "1:20:29", "remaining_time": "6:12:07"} -{"current_steps": 330, "total_steps": 1850, "loss": 0.9146, "lr": 4.619870234113894e-06, "epoch": 1.7837837837837838, "percentage": 17.84, "elapsed_time": "1:20:30", "remaining_time": "6:10:50"} -{"current_steps": 331, "total_steps": 1850, "loss": 0.6887, "lr": 4.617616782489878e-06, "epoch": 1.7891891891891891, "percentage": 17.89, "elapsed_time": "1:20:36", "remaining_time": "6:09:53"} -{"current_steps": 332, "total_steps": 1850, "loss": 0.505, "lr": 4.615357224208477e-06, "epoch": 1.7945945945945945, "percentage": 17.95, "elapsed_time": "1:20:40", "remaining_time": "6:08:53"} -{"current_steps": 333, "total_steps": 1850, "loss": 0.8384, "lr": 4.613091565785674e-06, "epoch": 1.8, "percentage": 18.0, "elapsed_time": "1:20:44", "remaining_time": "6:07:47"} -{"current_steps": 334, "total_steps": 1850, "loss": 0.5512, "lr": 4.610819813755038e-06, "epoch": 1.8054054054054054, "percentage": 18.05, "elapsed_time": "1:20:50", "remaining_time": "6:06:56"} -{"current_steps": 335, "total_steps": 1850, "loss": 0.4877, "lr": 4.608541974667714e-06, "epoch": 1.810810810810811, "percentage": 18.11, "elapsed_time": "1:20:54", "remaining_time": "6:05:54"} -{"current_steps": 336, "total_steps": 1850, "loss": 0.5583, "lr": 4.606258055092397e-06, "epoch": 1.8162162162162163, "percentage": 18.16, "elapsed_time": "1:20:58", "remaining_time": "6:04:50"} -{"current_steps": 337, "total_steps": 1850, "loss": 0.5421, "lr": 4.603968061615321e-06, "epoch": 1.8216216216216217, "percentage": 18.22, "elapsed_time": "1:21:00", "remaining_time": "6:03:43"} -{"current_steps": 338, "total_steps": 1850, "loss": 0.942, "lr": 4.601672000840231e-06, "epoch": 1.827027027027027, "percentage": 18.27, "elapsed_time": "1:21:04", "remaining_time": "6:02:38"} -{"current_steps": 339, "total_steps": 1850, "loss": 0.3773, "lr": 4.5993698793883715e-06, "epoch": 1.8324324324324324, "percentage": 18.32, "elapsed_time": "1:21:06", "remaining_time": "6:01:30"} -{"current_steps": 340, "total_steps": 1850, "loss": 0.9694, "lr": 4.597061703898462e-06, "epoch": 1.8378378378378377, "percentage": 18.38, "elapsed_time": "1:21:08", "remaining_time": "6:00:21"} -{"current_steps": 341, "total_steps": 1850, "loss": 0.4667, "lr": 4.594747481026685e-06, "epoch": 1.8432432432432433, "percentage": 18.43, "elapsed_time": "1:21:12", "remaining_time": "5:59:20"} -{"current_steps": 342, "total_steps": 1850, "loss": 0.4267, "lr": 4.592427217446656e-06, "epoch": 1.8486486486486486, "percentage": 18.49, "elapsed_time": "1:21:13", "remaining_time": "5:58:08"} -{"current_steps": 343, "total_steps": 1850, "loss": 0.9245, "lr": 4.590100919849413e-06, "epoch": 1.8540540540540542, "percentage": 18.54, "elapsed_time": "1:21:16", "remaining_time": "5:57:03"} -{"current_steps": 344, "total_steps": 1850, "loss": 0.7502, "lr": 4.587768594943396e-06, "epoch": 1.8594594594594596, "percentage": 18.59, "elapsed_time": "1:21:23", "remaining_time": "5:56:19"} -{"current_steps": 345, "total_steps": 1850, "loss": 0.4689, "lr": 4.585430249454426e-06, "epoch": 1.864864864864865, "percentage": 18.65, "elapsed_time": "1:21:25", "remaining_time": "5:55:10"} -{"current_steps": 346, "total_steps": 1850, "loss": 0.6188, "lr": 4.583085890125682e-06, "epoch": 1.8702702702702703, "percentage": 18.7, "elapsed_time": "1:21:29", "remaining_time": "5:54:14"} -{"current_steps": 347, "total_steps": 1850, "loss": 0.6352, "lr": 4.5807355237176896e-06, "epoch": 1.8756756756756756, "percentage": 18.76, "elapsed_time": "1:21:35", "remaining_time": "5:53:25"} -{"current_steps": 348, "total_steps": 1850, "loss": 0.464, "lr": 4.578379157008296e-06, "epoch": 1.881081081081081, "percentage": 18.81, "elapsed_time": "1:21:38", "remaining_time": "5:52:24"} -{"current_steps": 349, "total_steps": 1850, "loss": 0.5943, "lr": 4.57601679679265e-06, "epoch": 1.8864864864864865, "percentage": 18.86, "elapsed_time": "1:21:46", "remaining_time": "5:51:41"} -{"current_steps": 350, "total_steps": 1850, "loss": 0.6949, "lr": 4.573648449883188e-06, "epoch": 1.8918918918918919, "percentage": 18.92, "elapsed_time": "1:21:49", "remaining_time": "5:50:41"} -{"current_steps": 351, "total_steps": 1850, "loss": 0.4333, "lr": 4.571274123109606e-06, "epoch": 1.8972972972972975, "percentage": 18.97, "elapsed_time": "1:21:51", "remaining_time": "5:49:33"} -{"current_steps": 352, "total_steps": 1850, "loss": 0.6796, "lr": 4.568893823318847e-06, "epoch": 1.9027027027027028, "percentage": 19.03, "elapsed_time": "1:21:57", "remaining_time": "5:48:47"} -{"current_steps": 353, "total_steps": 1850, "loss": 0.6139, "lr": 4.566507557375077e-06, "epoch": 1.9081081081081082, "percentage": 19.08, "elapsed_time": "1:22:01", "remaining_time": "5:47:50"} -{"current_steps": 354, "total_steps": 1850, "loss": 0.4515, "lr": 4.5641153321596684e-06, "epoch": 1.9135135135135135, "percentage": 19.14, "elapsed_time": "1:22:03", "remaining_time": "5:46:48"} -{"current_steps": 355, "total_steps": 1850, "loss": 0.8426, "lr": 4.56171715457118e-06, "epoch": 1.9189189189189189, "percentage": 19.19, "elapsed_time": "1:22:06", "remaining_time": "5:45:44"} -{"current_steps": 356, "total_steps": 1850, "loss": 0.5806, "lr": 4.559313031525331e-06, "epoch": 1.9243243243243242, "percentage": 19.24, "elapsed_time": "1:22:07", "remaining_time": "5:44:38"} -{"current_steps": 357, "total_steps": 1850, "loss": 0.5927, "lr": 4.55690296995499e-06, "epoch": 1.9297297297297298, "percentage": 19.3, "elapsed_time": "1:22:09", "remaining_time": "5:43:33"} -{"current_steps": 358, "total_steps": 1850, "loss": 0.9986, "lr": 4.554486976810149e-06, "epoch": 1.9351351351351351, "percentage": 19.35, "elapsed_time": "1:22:10", "remaining_time": "5:42:28"} -{"current_steps": 359, "total_steps": 1850, "loss": 0.6813, "lr": 4.552065059057906e-06, "epoch": 1.9405405405405407, "percentage": 19.41, "elapsed_time": "1:22:12", "remaining_time": "5:41:26"} -{"current_steps": 360, "total_steps": 1850, "loss": 1.0832, "lr": 4.549637223682441e-06, "epoch": 1.945945945945946, "percentage": 19.46, "elapsed_time": "1:22:14", "remaining_time": "5:40:24"} -{"current_steps": 361, "total_steps": 1850, "loss": 0.7377, "lr": 4.547203477685005e-06, "epoch": 1.9513513513513514, "percentage": 19.51, "elapsed_time": "1:22:17", "remaining_time": "5:39:25"} -{"current_steps": 362, "total_steps": 1850, "loss": 0.5412, "lr": 4.544763828083888e-06, "epoch": 1.9567567567567568, "percentage": 19.57, "elapsed_time": "1:22:22", "remaining_time": "5:38:36"} -{"current_steps": 363, "total_steps": 1850, "loss": 0.6955, "lr": 4.542318281914405e-06, "epoch": 1.962162162162162, "percentage": 19.62, "elapsed_time": "1:22:26", "remaining_time": "5:37:41"} -{"current_steps": 364, "total_steps": 1850, "loss": 0.6774, "lr": 4.53986684622888e-06, "epoch": 1.9675675675675675, "percentage": 19.68, "elapsed_time": "1:22:30", "remaining_time": "5:36:49"} -{"current_steps": 365, "total_steps": 1850, "loss": 0.5832, "lr": 4.537409528096615e-06, "epoch": 1.972972972972973, "percentage": 19.73, "elapsed_time": "1:22:31", "remaining_time": "5:35:46"} -{"current_steps": 366, "total_steps": 1850, "loss": 0.606, "lr": 4.534946334603879e-06, "epoch": 1.9783783783783784, "percentage": 19.78, "elapsed_time": "1:22:35", "remaining_time": "5:34:51"} -{"current_steps": 367, "total_steps": 1850, "loss": 0.4991, "lr": 4.532477272853882e-06, "epoch": 1.983783783783784, "percentage": 19.84, "elapsed_time": "1:22:38", "remaining_time": "5:33:57"} -{"current_steps": 368, "total_steps": 1850, "loss": 0.4442, "lr": 4.530002349966759e-06, "epoch": 1.9891891891891893, "percentage": 19.89, "elapsed_time": "1:22:40", "remaining_time": "5:32:55"} -{"current_steps": 369, "total_steps": 1850, "loss": 0.6566, "lr": 4.5275215730795445e-06, "epoch": 1.9945945945945946, "percentage": 19.95, "elapsed_time": "1:22:46", "remaining_time": "5:32:12"} -{"current_steps": 370, "total_steps": 1850, "loss": 0.5687, "lr": 4.525034949346156e-06, "epoch": 2.0, "percentage": 20.0, "elapsed_time": "1:22:47", "remaining_time": "5:31:09"} -{"current_steps": 371, "total_steps": 1850, "loss": 0.4458, "lr": 4.522542485937369e-06, "epoch": 2.0054054054054054, "percentage": 20.05, "elapsed_time": "1:48:41", "remaining_time": "7:13:18"} -{"current_steps": 372, "total_steps": 1850, "loss": 0.4418, "lr": 4.5200441900408045e-06, "epoch": 2.0108108108108107, "percentage": 20.11, "elapsed_time": "1:48:44", "remaining_time": "7:12:03"} -{"current_steps": 373, "total_steps": 1850, "loss": 0.7057, "lr": 4.517540068860898e-06, "epoch": 2.016216216216216, "percentage": 20.16, "elapsed_time": "1:48:47", "remaining_time": "7:10:48"} -{"current_steps": 374, "total_steps": 1850, "loss": 0.4491, "lr": 4.515030129618884e-06, "epoch": 2.0216216216216214, "percentage": 20.22, "elapsed_time": "1:48:54", "remaining_time": "7:09:50"} -{"current_steps": 375, "total_steps": 1850, "loss": 0.3571, "lr": 4.512514379552779e-06, "epoch": 2.027027027027027, "percentage": 20.27, "elapsed_time": "1:48:58", "remaining_time": "7:08:38"} -{"current_steps": 376, "total_steps": 1850, "loss": 0.5056, "lr": 4.509992825917352e-06, "epoch": 2.0324324324324325, "percentage": 20.32, "elapsed_time": "1:49:04", "remaining_time": "7:07:36"} -{"current_steps": 377, "total_steps": 1850, "loss": 0.6834, "lr": 4.507465475984109e-06, "epoch": 2.037837837837838, "percentage": 20.38, "elapsed_time": "1:49:06", "remaining_time": "7:06:18"} -{"current_steps": 378, "total_steps": 1850, "loss": 0.6726, "lr": 4.504932337041272e-06, "epoch": 2.0432432432432432, "percentage": 20.43, "elapsed_time": "1:49:09", "remaining_time": "7:05:05"} -{"current_steps": 379, "total_steps": 1850, "loss": 0.4032, "lr": 4.502393416393757e-06, "epoch": 2.0486486486486486, "percentage": 20.49, "elapsed_time": "1:49:11", "remaining_time": "7:03:49"} -{"current_steps": 380, "total_steps": 1850, "loss": 0.5442, "lr": 4.4998487213631515e-06, "epoch": 2.054054054054054, "percentage": 20.54, "elapsed_time": "1:49:15", "remaining_time": "7:02:37"} -{"current_steps": 381, "total_steps": 1850, "loss": 0.6181, "lr": 4.497298259287696e-06, "epoch": 2.0594594594594593, "percentage": 20.59, "elapsed_time": "1:49:16", "remaining_time": "7:01:19"} -{"current_steps": 382, "total_steps": 1850, "loss": 0.3829, "lr": 4.494742037522261e-06, "epoch": 2.064864864864865, "percentage": 20.65, "elapsed_time": "1:49:20", "remaining_time": "7:00:11"} -{"current_steps": 383, "total_steps": 1850, "loss": 0.4953, "lr": 4.4921800634383295e-06, "epoch": 2.0702702702702704, "percentage": 20.7, "elapsed_time": "1:49:22", "remaining_time": "6:58:54"} -{"current_steps": 384, "total_steps": 1850, "loss": 0.3254, "lr": 4.4896123444239655e-06, "epoch": 2.075675675675676, "percentage": 20.76, "elapsed_time": "1:49:23", "remaining_time": "6:57:38"} -{"current_steps": 385, "total_steps": 1850, "loss": 0.555, "lr": 4.487038887883809e-06, "epoch": 2.081081081081081, "percentage": 20.81, "elapsed_time": "1:49:27", "remaining_time": "6:56:29"} -{"current_steps": 386, "total_steps": 1850, "loss": 0.665, "lr": 4.484459701239038e-06, "epoch": 2.0864864864864865, "percentage": 20.86, "elapsed_time": "1:49:31", "remaining_time": "6:55:22"} -{"current_steps": 387, "total_steps": 1850, "loss": 0.2652, "lr": 4.481874791927358e-06, "epoch": 2.091891891891892, "percentage": 20.92, "elapsed_time": "1:49:34", "remaining_time": "6:54:13"} -{"current_steps": 388, "total_steps": 1850, "loss": 0.3811, "lr": 4.479284167402977e-06, "epoch": 2.097297297297297, "percentage": 20.97, "elapsed_time": "1:49:37", "remaining_time": "6:53:05"} -{"current_steps": 389, "total_steps": 1850, "loss": 0.2463, "lr": 4.476687835136585e-06, "epoch": 2.1027027027027025, "percentage": 21.03, "elapsed_time": "1:49:40", "remaining_time": "6:51:54"} -{"current_steps": 390, "total_steps": 1850, "loss": 0.5507, "lr": 4.47408580261533e-06, "epoch": 2.108108108108108, "percentage": 21.08, "elapsed_time": "1:49:45", "remaining_time": "6:50:54"} -{"current_steps": 391, "total_steps": 1850, "loss": 0.288, "lr": 4.471478077342798e-06, "epoch": 2.1135135135135137, "percentage": 21.14, "elapsed_time": "1:49:48", "remaining_time": "6:49:46"} -{"current_steps": 392, "total_steps": 1850, "loss": 0.5169, "lr": 4.468864666838994e-06, "epoch": 2.118918918918919, "percentage": 21.19, "elapsed_time": "1:49:52", "remaining_time": "6:48:41"} -{"current_steps": 393, "total_steps": 1850, "loss": 0.3327, "lr": 4.4662455786403125e-06, "epoch": 2.1243243243243244, "percentage": 21.24, "elapsed_time": "1:49:55", "remaining_time": "6:47:30"} -{"current_steps": 394, "total_steps": 1850, "loss": 0.3877, "lr": 4.463620820299528e-06, "epoch": 2.1297297297297297, "percentage": 21.3, "elapsed_time": "1:50:00", "remaining_time": "6:46:32"} -{"current_steps": 395, "total_steps": 1850, "loss": 0.5425, "lr": 4.4609903993857606e-06, "epoch": 2.135135135135135, "percentage": 21.35, "elapsed_time": "1:50:02", "remaining_time": "6:45:22"} -{"current_steps": 396, "total_steps": 1850, "loss": 0.5257, "lr": 4.458354323484462e-06, "epoch": 2.1405405405405404, "percentage": 21.41, "elapsed_time": "1:50:05", "remaining_time": "6:44:13"} -{"current_steps": 397, "total_steps": 1850, "loss": 0.3914, "lr": 4.45571260019739e-06, "epoch": 2.145945945945946, "percentage": 21.46, "elapsed_time": "1:50:08", "remaining_time": "6:43:05"} -{"current_steps": 398, "total_steps": 1850, "loss": 0.3455, "lr": 4.453065237142592e-06, "epoch": 2.1513513513513516, "percentage": 21.51, "elapsed_time": "1:50:09", "remaining_time": "6:41:54"} -{"current_steps": 399, "total_steps": 1850, "loss": 0.4652, "lr": 4.4504122419543745e-06, "epoch": 2.156756756756757, "percentage": 21.57, "elapsed_time": "1:50:15", "remaining_time": "6:40:58"} -{"current_steps": 400, "total_steps": 1850, "loss": 0.6343, "lr": 4.4477536222832865e-06, "epoch": 2.1621621621621623, "percentage": 21.62, "elapsed_time": "1:50:19", "remaining_time": "6:39:54"} -{"current_steps": 401, "total_steps": 1850, "loss": 0.6975, "lr": 4.445089385796099e-06, "epoch": 2.1675675675675676, "percentage": 21.68, "elapsed_time": "1:50:21", "remaining_time": "6:38:45"} -{"current_steps": 402, "total_steps": 1850, "loss": 0.5779, "lr": 4.442419540175778e-06, "epoch": 2.172972972972973, "percentage": 21.73, "elapsed_time": "1:50:24", "remaining_time": "6:37:41"} -{"current_steps": 403, "total_steps": 1850, "loss": 0.4541, "lr": 4.439744093121465e-06, "epoch": 2.1783783783783783, "percentage": 21.78, "elapsed_time": "1:50:30", "remaining_time": "6:36:45"} -{"current_steps": 404, "total_steps": 1850, "loss": 0.4078, "lr": 4.437063052348457e-06, "epoch": 2.1837837837837837, "percentage": 21.84, "elapsed_time": "1:50:32", "remaining_time": "6:35:37"} -{"current_steps": 405, "total_steps": 1850, "loss": 0.6759, "lr": 4.434376425588179e-06, "epoch": 2.189189189189189, "percentage": 21.89, "elapsed_time": "1:50:34", "remaining_time": "6:34:30"} -{"current_steps": 406, "total_steps": 1850, "loss": 0.2938, "lr": 4.431684220588163e-06, "epoch": 2.1945945945945944, "percentage": 21.95, "elapsed_time": "1:50:38", "remaining_time": "6:33:32"} -{"current_steps": 407, "total_steps": 1850, "loss": 0.676, "lr": 4.428986445112034e-06, "epoch": 2.2, "percentage": 22.0, "elapsed_time": "1:50:41", "remaining_time": "6:32:26"} -{"current_steps": 408, "total_steps": 1850, "loss": 0.1859, "lr": 4.426283106939474e-06, "epoch": 2.2054054054054055, "percentage": 22.05, "elapsed_time": "1:50:44", "remaining_time": "6:31:22"} -{"current_steps": 409, "total_steps": 1850, "loss": 0.2955, "lr": 4.423574213866209e-06, "epoch": 2.210810810810811, "percentage": 22.11, "elapsed_time": "1:50:45", "remaining_time": "6:30:13"} -{"current_steps": 410, "total_steps": 1850, "loss": 0.2262, "lr": 4.420859773703985e-06, "epoch": 2.2162162162162162, "percentage": 22.16, "elapsed_time": "1:50:48", "remaining_time": "6:29:10"} -{"current_steps": 411, "total_steps": 1850, "loss": 0.2273, "lr": 4.418139794280542e-06, "epoch": 2.2216216216216216, "percentage": 22.22, "elapsed_time": "1:50:49", "remaining_time": "6:28:00"} -{"current_steps": 412, "total_steps": 1850, "loss": 0.3282, "lr": 4.415414283439595e-06, "epoch": 2.227027027027027, "percentage": 22.27, "elapsed_time": "1:50:54", "remaining_time": "6:27:04"} -{"current_steps": 413, "total_steps": 1850, "loss": 0.3651, "lr": 4.4126832490408116e-06, "epoch": 2.2324324324324323, "percentage": 22.32, "elapsed_time": "1:51:00", "remaining_time": "6:26:14"} -{"current_steps": 414, "total_steps": 1850, "loss": 0.4052, "lr": 4.409946698959784e-06, "epoch": 2.237837837837838, "percentage": 22.38, "elapsed_time": "1:51:05", "remaining_time": "6:25:19"} -{"current_steps": 415, "total_steps": 1850, "loss": 0.4638, "lr": 4.4072046410880145e-06, "epoch": 2.2432432432432434, "percentage": 22.43, "elapsed_time": "1:51:07", "remaining_time": "6:24:15"} -{"current_steps": 416, "total_steps": 1850, "loss": 0.517, "lr": 4.404457083332887e-06, "epoch": 2.2486486486486488, "percentage": 22.49, "elapsed_time": "1:51:10", "remaining_time": "6:23:14"} -{"current_steps": 417, "total_steps": 1850, "loss": 0.6902, "lr": 4.401704033617643e-06, "epoch": 2.254054054054054, "percentage": 22.54, "elapsed_time": "1:51:13", "remaining_time": "6:22:14"} -{"current_steps": 418, "total_steps": 1850, "loss": 0.3552, "lr": 4.398945499881366e-06, "epoch": 2.2594594594594595, "percentage": 22.59, "elapsed_time": "1:51:18", "remaining_time": "6:21:17"} -{"current_steps": 419, "total_steps": 1850, "loss": 0.286, "lr": 4.396181490078949e-06, "epoch": 2.264864864864865, "percentage": 22.65, "elapsed_time": "1:51:22", "remaining_time": "6:20:21"} -{"current_steps": 420, "total_steps": 1850, "loss": 0.4036, "lr": 4.393412012181082e-06, "epoch": 2.27027027027027, "percentage": 22.7, "elapsed_time": "1:51:23", "remaining_time": "6:19:15"} -{"current_steps": 421, "total_steps": 1850, "loss": 0.8037, "lr": 4.390637074174219e-06, "epoch": 2.2756756756756755, "percentage": 22.76, "elapsed_time": "1:51:27", "remaining_time": "6:18:18"} -{"current_steps": 422, "total_steps": 1850, "loss": 0.2553, "lr": 4.387856684060561e-06, "epoch": 2.281081081081081, "percentage": 22.81, "elapsed_time": "1:51:29", "remaining_time": "6:17:16"} -{"current_steps": 423, "total_steps": 1850, "loss": 0.6222, "lr": 4.385070849858033e-06, "epoch": 2.2864864864864867, "percentage": 22.86, "elapsed_time": "1:51:32", "remaining_time": "6:16:17"} -{"current_steps": 424, "total_steps": 1850, "loss": 0.5326, "lr": 4.382279579600257e-06, "epoch": 2.291891891891892, "percentage": 22.92, "elapsed_time": "1:51:35", "remaining_time": "6:15:17"} -{"current_steps": 425, "total_steps": 1850, "loss": 0.5515, "lr": 4.379482881336532e-06, "epoch": 2.2972972972972974, "percentage": 22.97, "elapsed_time": "1:51:42", "remaining_time": "6:14:31"} -{"current_steps": 426, "total_steps": 1850, "loss": 0.6948, "lr": 4.376680763131811e-06, "epoch": 2.3027027027027027, "percentage": 23.03, "elapsed_time": "1:51:45", "remaining_time": "6:13:33"} -{"current_steps": 427, "total_steps": 1850, "loss": 0.2947, "lr": 4.373873233066676e-06, "epoch": 2.308108108108108, "percentage": 23.08, "elapsed_time": "1:51:49", "remaining_time": "6:12:39"} -{"current_steps": 428, "total_steps": 1850, "loss": 0.2261, "lr": 4.371060299237315e-06, "epoch": 2.3135135135135134, "percentage": 23.14, "elapsed_time": "1:51:50", "remaining_time": "6:11:35"} -{"current_steps": 429, "total_steps": 1850, "loss": 0.5398, "lr": 4.368241969755499e-06, "epoch": 2.3189189189189188, "percentage": 23.19, "elapsed_time": "1:51:53", "remaining_time": "6:10:38"} -{"current_steps": 430, "total_steps": 1850, "loss": 0.3301, "lr": 4.36541825274856e-06, "epoch": 2.3243243243243246, "percentage": 23.24, "elapsed_time": "1:51:56", "remaining_time": "6:09:39"} -{"current_steps": 431, "total_steps": 1850, "loss": 0.6064, "lr": 4.3625891563593635e-06, "epoch": 2.32972972972973, "percentage": 23.3, "elapsed_time": "1:51:59", "remaining_time": "6:08:43"} -{"current_steps": 432, "total_steps": 1850, "loss": 0.3897, "lr": 4.35975468874629e-06, "epoch": 2.3351351351351353, "percentage": 23.35, "elapsed_time": "1:52:05", "remaining_time": "6:07:54"} -{"current_steps": 433, "total_steps": 1850, "loss": 0.271, "lr": 4.356914858083211e-06, "epoch": 2.3405405405405406, "percentage": 23.41, "elapsed_time": "1:52:09", "remaining_time": "6:07:03"} -{"current_steps": 434, "total_steps": 1850, "loss": 0.3681, "lr": 4.354069672559458e-06, "epoch": 2.345945945945946, "percentage": 23.46, "elapsed_time": "1:52:12", "remaining_time": "6:06:06"} -{"current_steps": 435, "total_steps": 1850, "loss": 0.298, "lr": 4.35121914037981e-06, "epoch": 2.3513513513513513, "percentage": 23.51, "elapsed_time": "1:52:16", "remaining_time": "6:05:11"} -{"current_steps": 436, "total_steps": 1850, "loss": 0.3618, "lr": 4.348363269764462e-06, "epoch": 2.3567567567567567, "percentage": 23.57, "elapsed_time": "1:52:20", "remaining_time": "6:04:19"} -{"current_steps": 437, "total_steps": 1850, "loss": 0.8972, "lr": 4.345502068949003e-06, "epoch": 2.362162162162162, "percentage": 23.62, "elapsed_time": "1:52:23", "remaining_time": "6:03:23"} -{"current_steps": 438, "total_steps": 1850, "loss": 0.3939, "lr": 4.342635546184394e-06, "epoch": 2.3675675675675674, "percentage": 23.68, "elapsed_time": "1:52:29", "remaining_time": "6:02:38"} -{"current_steps": 439, "total_steps": 1850, "loss": 0.5462, "lr": 4.339763709736944e-06, "epoch": 2.372972972972973, "percentage": 23.73, "elapsed_time": "1:52:32", "remaining_time": "6:01:43"} -{"current_steps": 440, "total_steps": 1850, "loss": 0.5932, "lr": 4.336886567888283e-06, "epoch": 2.3783783783783785, "percentage": 23.78, "elapsed_time": "1:52:35", "remaining_time": "6:00:48"} -{"current_steps": 441, "total_steps": 1850, "loss": 0.4622, "lr": 4.334004128935342e-06, "epoch": 2.383783783783784, "percentage": 23.84, "elapsed_time": "1:52:37", "remaining_time": "5:59:50"} -{"current_steps": 442, "total_steps": 1850, "loss": 0.5997, "lr": 4.331116401190327e-06, "epoch": 2.389189189189189, "percentage": 23.89, "elapsed_time": "1:52:40", "remaining_time": "5:58:56"} -{"current_steps": 443, "total_steps": 1850, "loss": 0.3072, "lr": 4.328223392980696e-06, "epoch": 2.3945945945945946, "percentage": 23.95, "elapsed_time": "1:52:46", "remaining_time": "5:58:11"} -{"current_steps": 444, "total_steps": 1850, "loss": 0.5338, "lr": 4.325325112649134e-06, "epoch": 2.4, "percentage": 24.0, "elapsed_time": "1:52:48", "remaining_time": "5:57:14"} -{"current_steps": 445, "total_steps": 1850, "loss": 0.3266, "lr": 4.322421568553529e-06, "epoch": 2.4054054054054053, "percentage": 24.05, "elapsed_time": "1:52:51", "remaining_time": "5:56:20"} -{"current_steps": 446, "total_steps": 1850, "loss": 0.4064, "lr": 4.3195127690669494e-06, "epoch": 2.410810810810811, "percentage": 24.11, "elapsed_time": "1:52:55", "remaining_time": "5:55:27"} -{"current_steps": 447, "total_steps": 1850, "loss": 0.3856, "lr": 4.3165987225776186e-06, "epoch": 2.4162162162162164, "percentage": 24.16, "elapsed_time": "1:52:57", "remaining_time": "5:54:32"} -{"current_steps": 448, "total_steps": 1850, "loss": 0.4261, "lr": 4.313679437488889e-06, "epoch": 2.4216216216216218, "percentage": 24.22, "elapsed_time": "1:53:00", "remaining_time": "5:53:38"} -{"current_steps": 449, "total_steps": 1850, "loss": 0.4943, "lr": 4.310754922219223e-06, "epoch": 2.427027027027027, "percentage": 24.27, "elapsed_time": "1:53:03", "remaining_time": "5:52:45"} -{"current_steps": 450, "total_steps": 1850, "loss": 0.2874, "lr": 4.307825185202164e-06, "epoch": 2.4324324324324325, "percentage": 24.32, "elapsed_time": "1:53:07", "remaining_time": "5:51:55"} -{"current_steps": 451, "total_steps": 1850, "loss": 0.4218, "lr": 4.3048902348863116e-06, "epoch": 2.437837837837838, "percentage": 24.38, "elapsed_time": "1:53:10", "remaining_time": "5:51:05"} -{"current_steps": 452, "total_steps": 1850, "loss": 0.4204, "lr": 4.301950079735303e-06, "epoch": 2.443243243243243, "percentage": 24.43, "elapsed_time": "1:53:13", "remaining_time": "5:50:10"} -{"current_steps": 453, "total_steps": 1850, "loss": 0.5593, "lr": 4.299004728227782e-06, "epoch": 2.4486486486486485, "percentage": 24.49, "elapsed_time": "1:53:19", "remaining_time": "5:49:29"} -{"current_steps": 454, "total_steps": 1850, "loss": 0.4187, "lr": 4.2960541888573774e-06, "epoch": 2.454054054054054, "percentage": 24.54, "elapsed_time": "1:53:21", "remaining_time": "5:48:35"} -{"current_steps": 455, "total_steps": 1850, "loss": 0.4193, "lr": 4.29309847013268e-06, "epoch": 2.4594594594594597, "percentage": 24.59, "elapsed_time": "1:53:23", "remaining_time": "5:47:38"} -{"current_steps": 456, "total_steps": 1850, "loss": 0.7035, "lr": 4.290137580577216e-06, "epoch": 2.464864864864865, "percentage": 24.65, "elapsed_time": "1:53:24", "remaining_time": "5:46:41"} -{"current_steps": 457, "total_steps": 1850, "loss": 0.5877, "lr": 4.287171528729423e-06, "epoch": 2.4702702702702704, "percentage": 24.7, "elapsed_time": "1:53:28", "remaining_time": "5:45:53"} -{"current_steps": 458, "total_steps": 1850, "loss": 0.5309, "lr": 4.284200323142623e-06, "epoch": 2.4756756756756757, "percentage": 24.76, "elapsed_time": "1:53:30", "remaining_time": "5:45:00"} -{"current_steps": 459, "total_steps": 1850, "loss": 0.448, "lr": 4.281223972385004e-06, "epoch": 2.481081081081081, "percentage": 24.81, "elapsed_time": "1:53:32", "remaining_time": "5:44:04"} -{"current_steps": 460, "total_steps": 1850, "loss": 0.4453, "lr": 4.27824248503959e-06, "epoch": 2.4864864864864864, "percentage": 24.86, "elapsed_time": "1:53:36", "remaining_time": "5:43:18"} -{"current_steps": 461, "total_steps": 1850, "loss": 0.5582, "lr": 4.275255869704214e-06, "epoch": 2.4918918918918918, "percentage": 24.92, "elapsed_time": "1:53:42", "remaining_time": "5:42:36"} -{"current_steps": 462, "total_steps": 1850, "loss": 0.423, "lr": 4.272264134991503e-06, "epoch": 2.4972972972972975, "percentage": 24.97, "elapsed_time": "1:53:46", "remaining_time": "5:41:48"} -{"current_steps": 463, "total_steps": 1850, "loss": 0.271, "lr": 4.269267289528843e-06, "epoch": 2.5027027027027025, "percentage": 25.03, "elapsed_time": "1:53:50", "remaining_time": "5:41:00"} -{"current_steps": 464, "total_steps": 1850, "loss": 0.6459, "lr": 4.266265341958356e-06, "epoch": 2.5081081081081082, "percentage": 25.08, "elapsed_time": "1:53:51", "remaining_time": "5:40:06"} -{"current_steps": 465, "total_steps": 1850, "loss": 0.2959, "lr": 4.263258300936882e-06, "epoch": 2.5135135135135136, "percentage": 25.14, "elapsed_time": "1:53:52", "remaining_time": "5:39:11"} -{"current_steps": 466, "total_steps": 1850, "loss": 0.3418, "lr": 4.260246175135948e-06, "epoch": 2.518918918918919, "percentage": 25.19, "elapsed_time": "1:54:00", "remaining_time": "5:38:34"} -{"current_steps": 467, "total_steps": 1850, "loss": 0.3459, "lr": 4.257228973241742e-06, "epoch": 2.5243243243243243, "percentage": 25.24, "elapsed_time": "1:54:03", "remaining_time": "5:37:47"} -{"current_steps": 468, "total_steps": 1850, "loss": 0.4769, "lr": 4.254206703955092e-06, "epoch": 2.5297297297297296, "percentage": 25.3, "elapsed_time": "1:54:04", "remaining_time": "5:36:52"} -{"current_steps": 469, "total_steps": 1850, "loss": 0.6487, "lr": 4.251179375991438e-06, "epoch": 2.535135135135135, "percentage": 25.35, "elapsed_time": "1:54:08", "remaining_time": "5:36:05"} -{"current_steps": 470, "total_steps": 1850, "loss": 0.5547, "lr": 4.248146998080808e-06, "epoch": 2.5405405405405403, "percentage": 25.41, "elapsed_time": "1:54:11", "remaining_time": "5:35:17"} -{"current_steps": 471, "total_steps": 1850, "loss": 0.2965, "lr": 4.2451095789677945e-06, "epoch": 2.545945945945946, "percentage": 25.46, "elapsed_time": "1:54:15", "remaining_time": "5:34:30"} -{"current_steps": 472, "total_steps": 1850, "loss": 0.3831, "lr": 4.242067127411525e-06, "epoch": 2.5513513513513515, "percentage": 25.51, "elapsed_time": "1:54:17", "remaining_time": "5:33:40"} -{"current_steps": 473, "total_steps": 1850, "loss": 0.1756, "lr": 4.239019652185642e-06, "epoch": 2.556756756756757, "percentage": 25.57, "elapsed_time": "1:54:19", "remaining_time": "5:32:48"} -{"current_steps": 474, "total_steps": 1850, "loss": 0.5136, "lr": 4.2359671620782725e-06, "epoch": 2.562162162162162, "percentage": 25.62, "elapsed_time": "1:54:23", "remaining_time": "5:32:04"} -{"current_steps": 475, "total_steps": 1850, "loss": 0.6554, "lr": 4.232909665892005e-06, "epoch": 2.5675675675675675, "percentage": 25.68, "elapsed_time": "1:54:26", "remaining_time": "5:31:17"} -{"current_steps": 476, "total_steps": 1850, "loss": 0.3804, "lr": 4.229847172443866e-06, "epoch": 2.572972972972973, "percentage": 25.73, "elapsed_time": "1:54:28", "remaining_time": "5:30:25"} -{"current_steps": 477, "total_steps": 1850, "loss": 0.3338, "lr": 4.2267796905652926e-06, "epoch": 2.5783783783783782, "percentage": 25.78, "elapsed_time": "1:54:30", "remaining_time": "5:29:36"} -{"current_steps": 478, "total_steps": 1850, "loss": 0.6163, "lr": 4.223707229102105e-06, "epoch": 2.583783783783784, "percentage": 25.84, "elapsed_time": "1:54:32", "remaining_time": "5:28:46"} -{"current_steps": 479, "total_steps": 1850, "loss": 0.3005, "lr": 4.220629796914487e-06, "epoch": 2.589189189189189, "percentage": 25.89, "elapsed_time": "1:54:36", "remaining_time": "5:28:01"} -{"current_steps": 480, "total_steps": 1850, "loss": 0.56, "lr": 4.217547402876954e-06, "epoch": 2.5945945945945947, "percentage": 25.95, "elapsed_time": "1:54:37", "remaining_time": "5:27:10"} -{"current_steps": 481, "total_steps": 1850, "loss": 0.4512, "lr": 4.214460055878329e-06, "epoch": 2.6, "percentage": 26.0, "elapsed_time": "1:54:40", "remaining_time": "5:26:21"} -{"current_steps": 482, "total_steps": 1850, "loss": 0.3074, "lr": 4.211367764821722e-06, "epoch": 2.6054054054054054, "percentage": 26.05, "elapsed_time": "1:54:43", "remaining_time": "5:25:37"} -{"current_steps": 483, "total_steps": 1850, "loss": 0.6752, "lr": 4.208270538624497e-06, "epoch": 2.610810810810811, "percentage": 26.11, "elapsed_time": "1:54:48", "remaining_time": "5:24:55"} -{"current_steps": 484, "total_steps": 1850, "loss": 0.2347, "lr": 4.205168386218251e-06, "epoch": 2.616216216216216, "percentage": 26.16, "elapsed_time": "1:54:51", "remaining_time": "5:24:11"} -{"current_steps": 485, "total_steps": 1850, "loss": 0.5189, "lr": 4.2020613165487865e-06, "epoch": 2.6216216216216215, "percentage": 26.22, "elapsed_time": "1:54:53", "remaining_time": "5:23:22"} -{"current_steps": 486, "total_steps": 1850, "loss": 0.7739, "lr": 4.198949338576086e-06, "epoch": 2.627027027027027, "percentage": 26.27, "elapsed_time": "1:54:58", "remaining_time": "5:22:41"} -{"current_steps": 487, "total_steps": 1850, "loss": 0.3495, "lr": 4.1958324612742875e-06, "epoch": 2.6324324324324326, "percentage": 26.32, "elapsed_time": "1:55:03", "remaining_time": "5:22:00"} -{"current_steps": 488, "total_steps": 1850, "loss": 0.2257, "lr": 4.1927106936316564e-06, "epoch": 2.637837837837838, "percentage": 26.38, "elapsed_time": "1:55:04", "remaining_time": "5:21:11"} -{"current_steps": 489, "total_steps": 1850, "loss": 0.6708, "lr": 4.189584044650559e-06, "epoch": 2.6432432432432433, "percentage": 26.43, "elapsed_time": "1:55:07", "remaining_time": "5:20:24"} -{"current_steps": 490, "total_steps": 1850, "loss": 0.3126, "lr": 4.186452523347441e-06, "epoch": 2.6486486486486487, "percentage": 26.49, "elapsed_time": "1:55:08", "remaining_time": "5:19:35"} -{"current_steps": 491, "total_steps": 1850, "loss": 0.4219, "lr": 4.183316138752799e-06, "epoch": 2.654054054054054, "percentage": 26.54, "elapsed_time": "1:55:11", "remaining_time": "5:18:50"} -{"current_steps": 492, "total_steps": 1850, "loss": 0.3937, "lr": 4.180174899911149e-06, "epoch": 2.6594594594594594, "percentage": 26.59, "elapsed_time": "1:55:13", "remaining_time": "5:18:02"} -{"current_steps": 493, "total_steps": 1850, "loss": 0.4098, "lr": 4.177028815881012e-06, "epoch": 2.6648648648648647, "percentage": 26.65, "elapsed_time": "1:55:16", "remaining_time": "5:17:18"} -{"current_steps": 494, "total_steps": 1850, "loss": 0.3597, "lr": 4.173877895734875e-06, "epoch": 2.6702702702702705, "percentage": 26.7, "elapsed_time": "1:55:21", "remaining_time": "5:16:38"} -{"current_steps": 495, "total_steps": 1850, "loss": 0.3284, "lr": 4.1707221485591764e-06, "epoch": 2.6756756756756754, "percentage": 26.76, "elapsed_time": "1:55:25", "remaining_time": "5:15:56"} -{"current_steps": 496, "total_steps": 1850, "loss": 0.257, "lr": 4.167561583454272e-06, "epoch": 2.6810810810810812, "percentage": 26.81, "elapsed_time": "1:55:27", "remaining_time": "5:15:10"} -{"current_steps": 497, "total_steps": 1850, "loss": 0.1819, "lr": 4.164396209534411e-06, "epoch": 2.6864864864864866, "percentage": 26.86, "elapsed_time": "1:55:28", "remaining_time": "5:14:21"} -{"current_steps": 498, "total_steps": 1850, "loss": 0.7109, "lr": 4.161226035927711e-06, "epoch": 2.691891891891892, "percentage": 26.92, "elapsed_time": "1:55:33", "remaining_time": "5:13:43"} -{"current_steps": 499, "total_steps": 1850, "loss": 0.6297, "lr": 4.15805107177613e-06, "epoch": 2.6972972972972973, "percentage": 26.97, "elapsed_time": "1:55:36", "remaining_time": "5:13:01"} -{"current_steps": 500, "total_steps": 1850, "loss": 0.5195, "lr": 4.15487132623544e-06, "epoch": 2.7027027027027026, "percentage": 27.03, "elapsed_time": "1:55:39", "remaining_time": "5:12:17"} -{"current_steps": 501, "total_steps": 1850, "loss": 0.2528, "lr": 4.151686808475204e-06, "epoch": 2.708108108108108, "percentage": 27.08, "elapsed_time": "1:55:41", "remaining_time": "5:11:30"} -{"current_steps": 502, "total_steps": 1850, "loss": 0.5013, "lr": 4.148497527678744e-06, "epoch": 2.7135135135135133, "percentage": 27.14, "elapsed_time": "1:55:43", "remaining_time": "5:10:44"} -{"current_steps": 503, "total_steps": 1850, "loss": 0.4109, "lr": 4.145303493043118e-06, "epoch": 2.718918918918919, "percentage": 27.19, "elapsed_time": "1:55:47", "remaining_time": "5:10:05"} -{"current_steps": 504, "total_steps": 1850, "loss": 0.3197, "lr": 4.1421047137790935e-06, "epoch": 2.7243243243243245, "percentage": 27.24, "elapsed_time": "1:55:51", "remaining_time": "5:09:25"} -{"current_steps": 505, "total_steps": 1850, "loss": 0.6369, "lr": 4.13890119911112e-06, "epoch": 2.72972972972973, "percentage": 27.3, "elapsed_time": "1:55:54", "remaining_time": "5:08:43"} -{"current_steps": 506, "total_steps": 1850, "loss": 0.4581, "lr": 4.135692958277303e-06, "epoch": 2.735135135135135, "percentage": 27.35, "elapsed_time": "1:55:57", "remaining_time": "5:08:00"} -{"current_steps": 507, "total_steps": 1850, "loss": 0.6217, "lr": 4.132480000529375e-06, "epoch": 2.7405405405405405, "percentage": 27.41, "elapsed_time": "1:56:00", "remaining_time": "5:07:17"} -{"current_steps": 508, "total_steps": 1850, "loss": 0.4951, "lr": 4.129262335132676e-06, "epoch": 2.745945945945946, "percentage": 27.46, "elapsed_time": "1:56:07", "remaining_time": "5:06:44"} -{"current_steps": 509, "total_steps": 1850, "loss": 0.2185, "lr": 4.126039971366114e-06, "epoch": 2.7513513513513512, "percentage": 27.51, "elapsed_time": "1:56:09", "remaining_time": "5:06:02"} -{"current_steps": 510, "total_steps": 1850, "loss": 0.5428, "lr": 4.122812918522154e-06, "epoch": 2.756756756756757, "percentage": 27.57, "elapsed_time": "1:56:12", "remaining_time": "5:05:18"} -{"current_steps": 511, "total_steps": 1850, "loss": 0.5466, "lr": 4.119581185906776e-06, "epoch": 2.762162162162162, "percentage": 27.62, "elapsed_time": "1:56:13", "remaining_time": "5:04:32"} -{"current_steps": 512, "total_steps": 1850, "loss": 0.3803, "lr": 4.1163447828394595e-06, "epoch": 2.7675675675675677, "percentage": 27.68, "elapsed_time": "1:56:15", "remaining_time": "5:03:48"} -{"current_steps": 513, "total_steps": 1850, "loss": 0.2722, "lr": 4.113103718653152e-06, "epoch": 2.772972972972973, "percentage": 27.73, "elapsed_time": "1:56:18", "remaining_time": "5:03:07"} -{"current_steps": 514, "total_steps": 1850, "loss": 0.333, "lr": 4.10985800269424e-06, "epoch": 2.7783783783783784, "percentage": 27.78, "elapsed_time": "1:56:21", "remaining_time": "5:02:26"} -{"current_steps": 515, "total_steps": 1850, "loss": 0.2186, "lr": 4.106607644322529e-06, "epoch": 2.7837837837837838, "percentage": 27.84, "elapsed_time": "1:56:23", "remaining_time": "5:01:42"} -{"current_steps": 516, "total_steps": 1850, "loss": 0.6365, "lr": 4.103352652911207e-06, "epoch": 2.789189189189189, "percentage": 27.89, "elapsed_time": "1:56:25", "remaining_time": "5:00:58"} -{"current_steps": 517, "total_steps": 1850, "loss": 0.7261, "lr": 4.100093037846825e-06, "epoch": 2.7945945945945945, "percentage": 27.95, "elapsed_time": "1:56:29", "remaining_time": "5:00:22"} -{"current_steps": 518, "total_steps": 1850, "loss": 0.2767, "lr": 4.0968288085292675e-06, "epoch": 2.8, "percentage": 28.0, "elapsed_time": "1:56:32", "remaining_time": "4:59:40"} -{"current_steps": 519, "total_steps": 1850, "loss": 0.4743, "lr": 4.093559974371725e-06, "epoch": 2.8054054054054056, "percentage": 28.05, "elapsed_time": "1:56:35", "remaining_time": "4:59:00"} -{"current_steps": 520, "total_steps": 1850, "loss": 0.3789, "lr": 4.090286544800667e-06, "epoch": 2.810810810810811, "percentage": 28.11, "elapsed_time": "1:56:43", "remaining_time": "4:58:31"} -{"current_steps": 521, "total_steps": 1850, "loss": 0.6252, "lr": 4.087008529255815e-06, "epoch": 2.8162162162162163, "percentage": 28.16, "elapsed_time": "1:56:49", "remaining_time": "4:57:59"} -{"current_steps": 522, "total_steps": 1850, "loss": 0.3467, "lr": 4.083725937190115e-06, "epoch": 2.8216216216216217, "percentage": 28.22, "elapsed_time": "1:56:51", "remaining_time": "4:57:16"} -{"current_steps": 523, "total_steps": 1850, "loss": 0.3857, "lr": 4.0804387780697114e-06, "epoch": 2.827027027027027, "percentage": 28.27, "elapsed_time": "1:56:54", "remaining_time": "4:56:38"} -{"current_steps": 524, "total_steps": 1850, "loss": 0.4679, "lr": 4.077147061373918e-06, "epoch": 2.8324324324324324, "percentage": 28.32, "elapsed_time": "1:57:01", "remaining_time": "4:56:07"} -{"current_steps": 525, "total_steps": 1850, "loss": 0.2439, "lr": 4.073850796595192e-06, "epoch": 2.8378378378378377, "percentage": 28.38, "elapsed_time": "1:57:02", "remaining_time": "4:55:22"} -{"current_steps": 526, "total_steps": 1850, "loss": 0.435, "lr": 4.070549993239106e-06, "epoch": 2.8432432432432435, "percentage": 28.43, "elapsed_time": "1:57:04", "remaining_time": "4:54:41"} -{"current_steps": 527, "total_steps": 1850, "loss": 0.5022, "lr": 4.06724466082432e-06, "epoch": 2.8486486486486484, "percentage": 28.49, "elapsed_time": "1:57:09", "remaining_time": "4:54:07"} -{"current_steps": 528, "total_steps": 1850, "loss": 0.4282, "lr": 4.063934808882555e-06, "epoch": 2.854054054054054, "percentage": 28.54, "elapsed_time": "1:57:11", "remaining_time": "4:53:26"} -{"current_steps": 529, "total_steps": 1850, "loss": 0.3436, "lr": 4.0606204469585656e-06, "epoch": 2.8594594594594596, "percentage": 28.59, "elapsed_time": "1:57:13", "remaining_time": "4:52:43"} -{"current_steps": 530, "total_steps": 1850, "loss": 0.3889, "lr": 4.057301584610112e-06, "epoch": 2.864864864864865, "percentage": 28.65, "elapsed_time": "1:57:20", "remaining_time": "4:52:15"} -{"current_steps": 531, "total_steps": 1850, "loss": 0.4828, "lr": 4.053978231407931e-06, "epoch": 2.8702702702702703, "percentage": 28.7, "elapsed_time": "1:57:24", "remaining_time": "4:51:37"} -{"current_steps": 532, "total_steps": 1850, "loss": 0.5814, "lr": 4.0506503969357115e-06, "epoch": 2.8756756756756756, "percentage": 28.76, "elapsed_time": "1:57:29", "remaining_time": "4:51:04"} -{"current_steps": 533, "total_steps": 1850, "loss": 0.4768, "lr": 4.047318090790065e-06, "epoch": 2.881081081081081, "percentage": 28.81, "elapsed_time": "1:57:36", "remaining_time": "4:50:35"} -{"current_steps": 534, "total_steps": 1850, "loss": 0.4262, "lr": 4.043981322580498e-06, "epoch": 2.8864864864864863, "percentage": 28.86, "elapsed_time": "1:57:39", "remaining_time": "4:49:57"} -{"current_steps": 535, "total_steps": 1850, "loss": 0.421, "lr": 4.040640101929384e-06, "epoch": 2.891891891891892, "percentage": 28.92, "elapsed_time": "1:57:42", "remaining_time": "4:49:19"} -{"current_steps": 536, "total_steps": 1850, "loss": 0.4019, "lr": 4.037294438471936e-06, "epoch": 2.8972972972972975, "percentage": 28.97, "elapsed_time": "1:57:45", "remaining_time": "4:48:41"} -{"current_steps": 537, "total_steps": 1850, "loss": 0.4322, "lr": 4.033944341856181e-06, "epoch": 2.902702702702703, "percentage": 29.03, "elapsed_time": "1:57:47", "remaining_time": "4:48:01"} -{"current_steps": 538, "total_steps": 1850, "loss": 0.3841, "lr": 4.030589821742926e-06, "epoch": 2.908108108108108, "percentage": 29.08, "elapsed_time": "1:57:52", "remaining_time": "4:47:26"} -{"current_steps": 539, "total_steps": 1850, "loss": 0.7083, "lr": 4.0272308878057385e-06, "epoch": 2.9135135135135135, "percentage": 29.14, "elapsed_time": "1:57:55", "remaining_time": "4:46:49"} -{"current_steps": 540, "total_steps": 1850, "loss": 0.5688, "lr": 4.023867549730912e-06, "epoch": 2.918918918918919, "percentage": 29.19, "elapsed_time": "1:57:57", "remaining_time": "4:46:08"} -{"current_steps": 541, "total_steps": 1850, "loss": 0.5979, "lr": 4.020499817217441e-06, "epoch": 2.924324324324324, "percentage": 29.24, "elapsed_time": "1:57:58", "remaining_time": "4:45:28"} -{"current_steps": 542, "total_steps": 1850, "loss": 0.5034, "lr": 4.017127699976992e-06, "epoch": 2.92972972972973, "percentage": 29.3, "elapsed_time": "1:58:01", "remaining_time": "4:44:48"} -{"current_steps": 543, "total_steps": 1850, "loss": 0.6656, "lr": 4.013751207733877e-06, "epoch": 2.935135135135135, "percentage": 29.35, "elapsed_time": "1:58:05", "remaining_time": "4:44:15"} -{"current_steps": 544, "total_steps": 1850, "loss": 0.2789, "lr": 4.010370350225023e-06, "epoch": 2.9405405405405407, "percentage": 29.41, "elapsed_time": "1:58:08", "remaining_time": "4:43:38"} -{"current_steps": 545, "total_steps": 1850, "loss": 0.2163, "lr": 4.006985137199945e-06, "epoch": 2.945945945945946, "percentage": 29.46, "elapsed_time": "1:58:10", "remaining_time": "4:42:57"} -{"current_steps": 546, "total_steps": 1850, "loss": 0.4179, "lr": 4.00359557842072e-06, "epoch": 2.9513513513513514, "percentage": 29.51, "elapsed_time": "1:58:12", "remaining_time": "4:42:18"} -{"current_steps": 547, "total_steps": 1850, "loss": 0.4683, "lr": 4.000201683661958e-06, "epoch": 2.9567567567567568, "percentage": 29.57, "elapsed_time": "1:58:15", "remaining_time": "4:41:42"} -{"current_steps": 548, "total_steps": 1850, "loss": 0.3506, "lr": 3.996803462710766e-06, "epoch": 2.962162162162162, "percentage": 29.62, "elapsed_time": "1:58:19", "remaining_time": "4:41:08"} -{"current_steps": 549, "total_steps": 1850, "loss": 0.6582, "lr": 3.993400925366736e-06, "epoch": 2.9675675675675675, "percentage": 29.68, "elapsed_time": "1:58:21", "remaining_time": "4:40:29"} -{"current_steps": 550, "total_steps": 1850, "loss": 0.504, "lr": 3.989994081441902e-06, "epoch": 2.972972972972973, "percentage": 29.73, "elapsed_time": "1:58:28", "remaining_time": "4:40:01"} -{"current_steps": 551, "total_steps": 1850, "loss": 0.7362, "lr": 3.986582940760717e-06, "epoch": 2.9783783783783786, "percentage": 29.78, "elapsed_time": "1:58:31", "remaining_time": "4:39:24"} -{"current_steps": 552, "total_steps": 1850, "loss": 0.4116, "lr": 3.983167513160025e-06, "epoch": 2.983783783783784, "percentage": 29.84, "elapsed_time": "1:58:33", "remaining_time": "4:38:48"} -{"current_steps": 553, "total_steps": 1850, "loss": 0.2188, "lr": 3.979747808489036e-06, "epoch": 2.9891891891891893, "percentage": 29.89, "elapsed_time": "1:58:36", "remaining_time": "4:38:09"} -{"current_steps": 554, "total_steps": 1850, "loss": 0.7558, "lr": 3.976323836609289e-06, "epoch": 2.9945945945945946, "percentage": 29.95, "elapsed_time": "1:58:41", "remaining_time": "4:37:40"} -{"current_steps": 555, "total_steps": 1850, "loss": 0.6491, "lr": 3.9728956073946305e-06, "epoch": 3.0, "percentage": 30.0, "elapsed_time": "1:58:43", "remaining_time": "4:37:00"} -{"current_steps": 556, "total_steps": 1850, "loss": 0.1625, "lr": 3.969463130731183e-06, "epoch": 3.0054054054054054, "percentage": 30.05, "elapsed_time": "2:03:17", "remaining_time": "4:46:57"} -{"current_steps": 557, "total_steps": 1850, "loss": 0.311, "lr": 3.966026416517321e-06, "epoch": 3.0108108108108107, "percentage": 30.11, "elapsed_time": "2:03:20", "remaining_time": "4:46:19"} -{"current_steps": 558, "total_steps": 1850, "loss": 0.5299, "lr": 3.962585474663636e-06, "epoch": 3.016216216216216, "percentage": 30.16, "elapsed_time": "2:03:24", "remaining_time": "4:45:43"} -{"current_steps": 559, "total_steps": 1850, "loss": 0.2718, "lr": 3.959140315092911e-06, "epoch": 3.0216216216216214, "percentage": 30.22, "elapsed_time": "2:03:31", "remaining_time": "4:45:15"} -{"current_steps": 560, "total_steps": 1850, "loss": 0.2954, "lr": 3.955690947740092e-06, "epoch": 3.027027027027027, "percentage": 30.27, "elapsed_time": "2:03:32", "remaining_time": "4:44:36"} -{"current_steps": 561, "total_steps": 1850, "loss": 0.2388, "lr": 3.95223738255226e-06, "epoch": 3.0324324324324325, "percentage": 30.32, "elapsed_time": "2:03:39", "remaining_time": "4:44:07"} -{"current_steps": 562, "total_steps": 1850, "loss": 0.2014, "lr": 3.9487796294886015e-06, "epoch": 3.037837837837838, "percentage": 30.38, "elapsed_time": "2:03:42", "remaining_time": "4:43:32"} -{"current_steps": 563, "total_steps": 1850, "loss": 0.2102, "lr": 3.945317698520379e-06, "epoch": 3.0432432432432432, "percentage": 30.43, "elapsed_time": "2:03:45", "remaining_time": "4:42:55"} -{"current_steps": 564, "total_steps": 1850, "loss": 0.499, "lr": 3.941851599630903e-06, "epoch": 3.0486486486486486, "percentage": 30.49, "elapsed_time": "2:03:49", "remaining_time": "4:42:20"} -{"current_steps": 565, "total_steps": 1850, "loss": 0.3392, "lr": 3.938381342815503e-06, "epoch": 3.054054054054054, "percentage": 30.54, "elapsed_time": "2:03:52", "remaining_time": "4:41:44"} -{"current_steps": 566, "total_steps": 1850, "loss": 0.1942, "lr": 3.934906938081499e-06, "epoch": 3.0594594594594593, "percentage": 30.59, "elapsed_time": "2:03:53", "remaining_time": "4:41:04"} -{"current_steps": 567, "total_steps": 1850, "loss": 0.1753, "lr": 3.931428395448174e-06, "epoch": 3.064864864864865, "percentage": 30.65, "elapsed_time": "2:03:56", "remaining_time": "4:40:27"} -{"current_steps": 568, "total_steps": 1850, "loss": 0.2959, "lr": 3.927945724946743e-06, "epoch": 3.0702702702702704, "percentage": 30.7, "elapsed_time": "2:04:02", "remaining_time": "4:39:57"} -{"current_steps": 569, "total_steps": 1850, "loss": 0.4625, "lr": 3.924458936620322e-06, "epoch": 3.075675675675676, "percentage": 30.76, "elapsed_time": "2:04:07", "remaining_time": "4:39:26"} -{"current_steps": 570, "total_steps": 1850, "loss": 0.2571, "lr": 3.920968040523904e-06, "epoch": 3.081081081081081, "percentage": 30.81, "elapsed_time": "2:04:13", "remaining_time": "4:38:57"} -{"current_steps": 571, "total_steps": 1850, "loss": 0.1438, "lr": 3.917473046724329e-06, "epoch": 3.0864864864864865, "percentage": 30.86, "elapsed_time": "2:04:16", "remaining_time": "4:38:20"} -{"current_steps": 572, "total_steps": 1850, "loss": 0.3572, "lr": 3.9139739653002525e-06, "epoch": 3.091891891891892, "percentage": 30.92, "elapsed_time": "2:04:19", "remaining_time": "4:37:46"} -{"current_steps": 573, "total_steps": 1850, "loss": 0.165, "lr": 3.910470806342117e-06, "epoch": 3.097297297297297, "percentage": 30.97, "elapsed_time": "2:04:22", "remaining_time": "4:37:11"} -{"current_steps": 574, "total_steps": 1850, "loss": 0.3209, "lr": 3.9069635799521245e-06, "epoch": 3.1027027027027025, "percentage": 31.03, "elapsed_time": "2:04:24", "remaining_time": "4:36:34"} -{"current_steps": 575, "total_steps": 1850, "loss": 0.1976, "lr": 3.903452296244204e-06, "epoch": 3.108108108108108, "percentage": 31.08, "elapsed_time": "2:04:27", "remaining_time": "4:35:57"} -{"current_steps": 576, "total_steps": 1850, "loss": 0.6074, "lr": 3.899936965343989e-06, "epoch": 3.1135135135135137, "percentage": 31.14, "elapsed_time": "2:04:30", "remaining_time": "4:35:22"} -{"current_steps": 577, "total_steps": 1850, "loss": 0.4051, "lr": 3.89641759738878e-06, "epoch": 3.118918918918919, "percentage": 31.19, "elapsed_time": "2:04:31", "remaining_time": "4:34:44"} -{"current_steps": 578, "total_steps": 1850, "loss": 0.3787, "lr": 3.892894202527523e-06, "epoch": 3.1243243243243244, "percentage": 31.24, "elapsed_time": "2:04:34", "remaining_time": "4:34:09"} -{"current_steps": 579, "total_steps": 1850, "loss": 0.0927, "lr": 3.8893667909207735e-06, "epoch": 3.1297297297297297, "percentage": 31.3, "elapsed_time": "2:04:36", "remaining_time": "4:33:33"} -{"current_steps": 580, "total_steps": 1850, "loss": 0.4706, "lr": 3.88583537274067e-06, "epoch": 3.135135135135135, "percentage": 31.35, "elapsed_time": "2:04:42", "remaining_time": "4:33:03"} -{"current_steps": 581, "total_steps": 1850, "loss": 0.3949, "lr": 3.8822999581709085e-06, "epoch": 3.1405405405405404, "percentage": 31.41, "elapsed_time": "2:04:44", "remaining_time": "4:32:26"} -{"current_steps": 582, "total_steps": 1850, "loss": 0.1971, "lr": 3.878760557406708e-06, "epoch": 3.145945945945946, "percentage": 31.46, "elapsed_time": "2:04:48", "remaining_time": "4:31:54"} -{"current_steps": 583, "total_steps": 1850, "loss": 0.5156, "lr": 3.875217180654779e-06, "epoch": 3.1513513513513516, "percentage": 31.51, "elapsed_time": "2:04:50", "remaining_time": "4:31:18"} -{"current_steps": 584, "total_steps": 1850, "loss": 0.3552, "lr": 3.871669838133303e-06, "epoch": 3.156756756756757, "percentage": 31.57, "elapsed_time": "2:04:52", "remaining_time": "4:30:41"} -{"current_steps": 585, "total_steps": 1850, "loss": 0.4369, "lr": 3.868118540071894e-06, "epoch": 3.1621621621621623, "percentage": 31.62, "elapsed_time": "2:04:53", "remaining_time": "4:30:04"} -{"current_steps": 586, "total_steps": 1850, "loss": 0.3694, "lr": 3.8645632967115755e-06, "epoch": 3.1675675675675676, "percentage": 31.68, "elapsed_time": "2:04:56", "remaining_time": "4:29:28"} -{"current_steps": 587, "total_steps": 1850, "loss": 0.3404, "lr": 3.861004118304746e-06, "epoch": 3.172972972972973, "percentage": 31.73, "elapsed_time": "2:04:59", "remaining_time": "4:28:56"} -{"current_steps": 588, "total_steps": 1850, "loss": 0.3086, "lr": 3.857441015115154e-06, "epoch": 3.1783783783783783, "percentage": 31.78, "elapsed_time": "2:05:03", "remaining_time": "4:28:23"} -{"current_steps": 589, "total_steps": 1850, "loss": 0.253, "lr": 3.8538739974178635e-06, "epoch": 3.1837837837837837, "percentage": 31.84, "elapsed_time": "2:05:07", "remaining_time": "4:27:52"} -{"current_steps": 590, "total_steps": 1850, "loss": 0.2436, "lr": 3.850303075499227e-06, "epoch": 3.189189189189189, "percentage": 31.89, "elapsed_time": "2:05:09", "remaining_time": "4:27:17"} -{"current_steps": 591, "total_steps": 1850, "loss": 0.328, "lr": 3.84672825965686e-06, "epoch": 3.1945945945945944, "percentage": 31.95, "elapsed_time": "2:05:11", "remaining_time": "4:26:42"} -{"current_steps": 592, "total_steps": 1850, "loss": 0.2687, "lr": 3.843149560199601e-06, "epoch": 3.2, "percentage": 32.0, "elapsed_time": "2:05:14", "remaining_time": "4:26:09"} -{"current_steps": 593, "total_steps": 1850, "loss": 0.1417, "lr": 3.839566987447492e-06, "epoch": 3.2054054054054055, "percentage": 32.05, "elapsed_time": "2:05:17", "remaining_time": "4:25:34"} -{"current_steps": 594, "total_steps": 1850, "loss": 0.2106, "lr": 3.835980551731743e-06, "epoch": 3.210810810810811, "percentage": 32.11, "elapsed_time": "2:05:20", "remaining_time": "4:25:00"} -{"current_steps": 595, "total_steps": 1850, "loss": 0.3154, "lr": 3.8323902633947045e-06, "epoch": 3.2162162162162162, "percentage": 32.16, "elapsed_time": "2:05:25", "remaining_time": "4:24:33"} -{"current_steps": 596, "total_steps": 1850, "loss": 0.1218, "lr": 3.828796132789835e-06, "epoch": 3.2216216216216216, "percentage": 32.22, "elapsed_time": "2:05:27", "remaining_time": "4:23:58"} -{"current_steps": 597, "total_steps": 1850, "loss": 0.1336, "lr": 3.825198170281677e-06, "epoch": 3.227027027027027, "percentage": 32.27, "elapsed_time": "2:05:29", "remaining_time": "4:23:22"} -{"current_steps": 598, "total_steps": 1850, "loss": 0.2518, "lr": 3.821596386245819e-06, "epoch": 3.2324324324324323, "percentage": 32.32, "elapsed_time": "2:05:33", "remaining_time": "4:22:51"} -{"current_steps": 599, "total_steps": 1850, "loss": 0.2762, "lr": 3.817990791068874e-06, "epoch": 3.237837837837838, "percentage": 32.38, "elapsed_time": "2:05:39", "remaining_time": "4:22:26"} -{"current_steps": 600, "total_steps": 1850, "loss": 0.2722, "lr": 3.81438139514844e-06, "epoch": 3.2432432432432434, "percentage": 32.43, "elapsed_time": "2:05:44", "remaining_time": "4:21:57"} -{"current_steps": 601, "total_steps": 1850, "loss": 0.3542, "lr": 3.8107682088930797e-06, "epoch": 3.2486486486486488, "percentage": 32.49, "elapsed_time": "2:05:46", "remaining_time": "4:21:23"} -{"current_steps": 602, "total_steps": 1850, "loss": 0.344, "lr": 3.807151242722286e-06, "epoch": 3.254054054054054, "percentage": 32.54, "elapsed_time": "2:05:49", "remaining_time": "4:20:49"} -{"current_steps": 603, "total_steps": 1850, "loss": 0.1625, "lr": 3.8035305070664484e-06, "epoch": 3.2594594594594595, "percentage": 32.59, "elapsed_time": "2:05:52", "remaining_time": "4:20:17"} -{"current_steps": 604, "total_steps": 1850, "loss": 0.2925, "lr": 3.7999060123668318e-06, "epoch": 3.264864864864865, "percentage": 32.65, "elapsed_time": "2:05:57", "remaining_time": "4:19:51"} -{"current_steps": 605, "total_steps": 1850, "loss": 0.1523, "lr": 3.7962777690755364e-06, "epoch": 3.27027027027027, "percentage": 32.7, "elapsed_time": "2:05:59", "remaining_time": "4:19:15"} -{"current_steps": 606, "total_steps": 1850, "loss": 0.1674, "lr": 3.792645787655476e-06, "epoch": 3.2756756756756755, "percentage": 32.76, "elapsed_time": "2:06:03", "remaining_time": "4:18:46"} -{"current_steps": 607, "total_steps": 1850, "loss": 0.2856, "lr": 3.7890100785803425e-06, "epoch": 3.281081081081081, "percentage": 32.81, "elapsed_time": "2:06:06", "remaining_time": "4:18:15"} -{"current_steps": 608, "total_steps": 1850, "loss": 0.1094, "lr": 3.785370652334577e-06, "epoch": 3.2864864864864867, "percentage": 32.86, "elapsed_time": "2:06:09", "remaining_time": "4:17:41"} -{"current_steps": 609, "total_steps": 1850, "loss": 0.2611, "lr": 3.7817275194133403e-06, "epoch": 3.291891891891892, "percentage": 32.92, "elapsed_time": "2:06:12", "remaining_time": "4:17:10"} -{"current_steps": 610, "total_steps": 1850, "loss": 0.1315, "lr": 3.778080690322483e-06, "epoch": 3.2972972972972974, "percentage": 32.97, "elapsed_time": "2:06:14", "remaining_time": "4:16:37"} -{"current_steps": 611, "total_steps": 1850, "loss": 0.1686, "lr": 3.774430175578514e-06, "epoch": 3.3027027027027027, "percentage": 33.03, "elapsed_time": "2:06:15", "remaining_time": "4:16:01"} -{"current_steps": 612, "total_steps": 1850, "loss": 0.4642, "lr": 3.7707759857085706e-06, "epoch": 3.308108108108108, "percentage": 33.08, "elapsed_time": "2:06:17", "remaining_time": "4:15:28"} -{"current_steps": 613, "total_steps": 1850, "loss": 0.1987, "lr": 3.7671181312503886e-06, "epoch": 3.3135135135135134, "percentage": 33.14, "elapsed_time": "2:06:19", "remaining_time": "4:14:54"} -{"current_steps": 614, "total_steps": 1850, "loss": 0.3307, "lr": 3.763456622752271e-06, "epoch": 3.3189189189189188, "percentage": 33.19, "elapsed_time": "2:06:21", "remaining_time": "4:14:22"} -{"current_steps": 615, "total_steps": 1850, "loss": 0.1731, "lr": 3.7597914707730583e-06, "epoch": 3.3243243243243246, "percentage": 33.24, "elapsed_time": "2:06:26", "remaining_time": "4:13:54"} -{"current_steps": 616, "total_steps": 1850, "loss": 0.2003, "lr": 3.7561226858820984e-06, "epoch": 3.32972972972973, "percentage": 33.3, "elapsed_time": "2:06:28", "remaining_time": "4:13:20"} -{"current_steps": 617, "total_steps": 1850, "loss": 0.4014, "lr": 3.7524502786592143e-06, "epoch": 3.3351351351351353, "percentage": 33.35, "elapsed_time": "2:06:30", "remaining_time": "4:12:47"} -{"current_steps": 618, "total_steps": 1850, "loss": 0.205, "lr": 3.7487742596946753e-06, "epoch": 3.3405405405405406, "percentage": 33.41, "elapsed_time": "2:06:33", "remaining_time": "4:12:17"} -{"current_steps": 619, "total_steps": 1850, "loss": 0.2932, "lr": 3.7450946395891674e-06, "epoch": 3.345945945945946, "percentage": 33.46, "elapsed_time": "2:06:36", "remaining_time": "4:11:47"} -{"current_steps": 620, "total_steps": 1850, "loss": 0.2748, "lr": 3.7414114289537593e-06, "epoch": 3.3513513513513513, "percentage": 33.51, "elapsed_time": "2:06:39", "remaining_time": "4:11:16"} -{"current_steps": 621, "total_steps": 1850, "loss": 0.3665, "lr": 3.7377246384098763e-06, "epoch": 3.3567567567567567, "percentage": 33.57, "elapsed_time": "2:06:43", "remaining_time": "4:10:47"} -{"current_steps": 622, "total_steps": 1850, "loss": 0.3453, "lr": 3.7340342785892645e-06, "epoch": 3.362162162162162, "percentage": 33.62, "elapsed_time": "2:06:44", "remaining_time": "4:10:14"} -{"current_steps": 623, "total_steps": 1850, "loss": 0.473, "lr": 3.7303403601339646e-06, "epoch": 3.3675675675675674, "percentage": 33.68, "elapsed_time": "2:06:51", "remaining_time": "4:09:50"} -{"current_steps": 624, "total_steps": 1850, "loss": 0.3017, "lr": 3.726642893696279e-06, "epoch": 3.372972972972973, "percentage": 33.73, "elapsed_time": "2:06:54", "remaining_time": "4:09:20"} -{"current_steps": 625, "total_steps": 1850, "loss": 0.4841, "lr": 3.7229418899387414e-06, "epoch": 3.3783783783783785, "percentage": 33.78, "elapsed_time": "2:06:55", "remaining_time": "4:08:46"} -{"current_steps": 626, "total_steps": 1850, "loss": 0.3879, "lr": 3.719237359534087e-06, "epoch": 3.383783783783784, "percentage": 33.84, "elapsed_time": "2:06:57", "remaining_time": "4:08:14"} -{"current_steps": 627, "total_steps": 1850, "loss": 0.3876, "lr": 3.71552931316522e-06, "epoch": 3.389189189189189, "percentage": 33.89, "elapsed_time": "2:07:05", "remaining_time": "4:07:53"} -{"current_steps": 628, "total_steps": 1850, "loss": 0.4491, "lr": 3.7118177615251834e-06, "epoch": 3.3945945945945946, "percentage": 33.95, "elapsed_time": "2:07:10", "remaining_time": "4:07:28"} -{"current_steps": 629, "total_steps": 1850, "loss": 0.3763, "lr": 3.70810271531713e-06, "epoch": 3.4, "percentage": 34.0, "elapsed_time": "2:07:18", "remaining_time": "4:07:07"} -{"current_steps": 630, "total_steps": 1850, "loss": 0.4171, "lr": 3.7043841852542884e-06, "epoch": 3.4054054054054053, "percentage": 34.05, "elapsed_time": "2:07:20", "remaining_time": "4:06:35"} -{"current_steps": 631, "total_steps": 1850, "loss": 0.2445, "lr": 3.700662182059936e-06, "epoch": 3.410810810810811, "percentage": 34.11, "elapsed_time": "2:07:21", "remaining_time": "4:06:02"} -{"current_steps": 632, "total_steps": 1850, "loss": 0.1347, "lr": 3.696936716467363e-06, "epoch": 3.4162162162162164, "percentage": 34.16, "elapsed_time": "2:07:26", "remaining_time": "4:05:36"} -{"current_steps": 633, "total_steps": 1850, "loss": 0.2822, "lr": 3.693207799219846e-06, "epoch": 3.4216216216216218, "percentage": 34.22, "elapsed_time": "2:07:33", "remaining_time": "4:05:13"} -{"current_steps": 634, "total_steps": 1850, "loss": 0.3425, "lr": 3.689475441070615e-06, "epoch": 3.427027027027027, "percentage": 34.27, "elapsed_time": "2:07:35", "remaining_time": "4:04:43"} -{"current_steps": 635, "total_steps": 1850, "loss": 0.3315, "lr": 3.685739652782822e-06, "epoch": 3.4324324324324325, "percentage": 34.32, "elapsed_time": "2:07:42", "remaining_time": "4:04:21"} -{"current_steps": 636, "total_steps": 1850, "loss": 0.1841, "lr": 3.682000445129512e-06, "epoch": 3.437837837837838, "percentage": 34.38, "elapsed_time": "2:07:44", "remaining_time": "4:03:49"} -{"current_steps": 637, "total_steps": 1850, "loss": 0.3151, "lr": 3.6782578288935896e-06, "epoch": 3.443243243243243, "percentage": 34.43, "elapsed_time": "2:07:47", "remaining_time": "4:03:20"} -{"current_steps": 638, "total_steps": 1850, "loss": 0.1272, "lr": 3.6745118148677882e-06, "epoch": 3.4486486486486485, "percentage": 34.49, "elapsed_time": "2:07:48", "remaining_time": "4:02:47"} -{"current_steps": 639, "total_steps": 1850, "loss": 0.2436, "lr": 3.6707624138546414e-06, "epoch": 3.454054054054054, "percentage": 34.54, "elapsed_time": "2:07:53", "remaining_time": "4:02:22"} -{"current_steps": 640, "total_steps": 1850, "loss": 0.6321, "lr": 3.6670096366664477e-06, "epoch": 3.4594594594594597, "percentage": 34.59, "elapsed_time": "2:07:57", "remaining_time": "4:01:55"} -{"current_steps": 641, "total_steps": 1850, "loss": 0.1262, "lr": 3.663253494125244e-06, "epoch": 3.464864864864865, "percentage": 34.65, "elapsed_time": "2:07:58", "remaining_time": "4:01:22"} -{"current_steps": 642, "total_steps": 1850, "loss": 0.2669, "lr": 3.6594939970627706e-06, "epoch": 3.4702702702702704, "percentage": 34.7, "elapsed_time": "2:08:01", "remaining_time": "4:00:54"} -{"current_steps": 643, "total_steps": 1850, "loss": 0.1228, "lr": 3.655731156320441e-06, "epoch": 3.4756756756756757, "percentage": 34.76, "elapsed_time": "2:08:05", "remaining_time": "4:00:26"} -{"current_steps": 644, "total_steps": 1850, "loss": 0.1759, "lr": 3.651964982749312e-06, "epoch": 3.481081081081081, "percentage": 34.81, "elapsed_time": "2:08:08", "remaining_time": "3:59:58"} -{"current_steps": 645, "total_steps": 1850, "loss": 0.5677, "lr": 3.648195487210051e-06, "epoch": 3.4864864864864864, "percentage": 34.86, "elapsed_time": "2:08:10", "remaining_time": "3:59:28"} -{"current_steps": 646, "total_steps": 1850, "loss": 0.1874, "lr": 3.644422680572906e-06, "epoch": 3.4918918918918918, "percentage": 34.92, "elapsed_time": "2:08:13", "remaining_time": "3:58:58"} -{"current_steps": 647, "total_steps": 1850, "loss": 0.3225, "lr": 3.640646573717671e-06, "epoch": 3.4972972972972975, "percentage": 34.97, "elapsed_time": "2:08:19", "remaining_time": "3:58:36"} -{"current_steps": 648, "total_steps": 1850, "loss": 0.102, "lr": 3.63686717753366e-06, "epoch": 3.5027027027027025, "percentage": 35.03, "elapsed_time": "2:08:21", "remaining_time": "3:58:06"} -{"current_steps": 649, "total_steps": 1850, "loss": 0.1585, "lr": 3.6330845029196697e-06, "epoch": 3.5081081081081082, "percentage": 35.08, "elapsed_time": "2:08:26", "remaining_time": "3:57:40"} -{"current_steps": 650, "total_steps": 1850, "loss": 0.3046, "lr": 3.629298560783952e-06, "epoch": 3.5135135135135136, "percentage": 35.14, "elapsed_time": "2:08:29", "remaining_time": "3:57:12"} -{"current_steps": 651, "total_steps": 1850, "loss": 0.2037, "lr": 3.6255093620441835e-06, "epoch": 3.518918918918919, "percentage": 35.19, "elapsed_time": "2:08:31", "remaining_time": "3:56:42"} -{"current_steps": 652, "total_steps": 1850, "loss": 0.1784, "lr": 3.6217169176274293e-06, "epoch": 3.5243243243243243, "percentage": 35.24, "elapsed_time": "2:08:33", "remaining_time": "3:56:13"} -{"current_steps": 653, "total_steps": 1850, "loss": 0.1974, "lr": 3.6179212384701146e-06, "epoch": 3.5297297297297296, "percentage": 35.3, "elapsed_time": "2:08:35", "remaining_time": "3:55:43"} -{"current_steps": 654, "total_steps": 1850, "loss": 0.2161, "lr": 3.6141223355179946e-06, "epoch": 3.535135135135135, "percentage": 35.35, "elapsed_time": "2:08:38", "remaining_time": "3:55:14"} -{"current_steps": 655, "total_steps": 1850, "loss": 0.1487, "lr": 3.610320219726118e-06, "epoch": 3.5405405405405403, "percentage": 35.41, "elapsed_time": "2:08:42", "remaining_time": "3:54:48"} -{"current_steps": 656, "total_steps": 1850, "loss": 0.2231, "lr": 3.606514902058802e-06, "epoch": 3.545945945945946, "percentage": 35.46, "elapsed_time": "2:08:43", "remaining_time": "3:54:18"} -{"current_steps": 657, "total_steps": 1850, "loss": 0.5068, "lr": 3.602706393489594e-06, "epoch": 3.5513513513513515, "percentage": 35.51, "elapsed_time": "2:08:48", "remaining_time": "3:53:53"} -{"current_steps": 658, "total_steps": 1850, "loss": 0.4621, "lr": 3.598894705001246e-06, "epoch": 3.556756756756757, "percentage": 35.57, "elapsed_time": "2:08:53", "remaining_time": "3:53:30"} -{"current_steps": 659, "total_steps": 1850, "loss": 0.285, "lr": 3.5950798475856783e-06, "epoch": 3.562162162162162, "percentage": 35.62, "elapsed_time": "2:08:57", "remaining_time": "3:53:03"} -{"current_steps": 660, "total_steps": 1850, "loss": 0.4277, "lr": 3.5912618322439487e-06, "epoch": 3.5675675675675675, "percentage": 35.68, "elapsed_time": "2:09:00", "remaining_time": "3:52:37"} -{"current_steps": 661, "total_steps": 1850, "loss": 0.1993, "lr": 3.587440669986224e-06, "epoch": 3.572972972972973, "percentage": 35.73, "elapsed_time": "2:09:03", "remaining_time": "3:52:09"} -{"current_steps": 662, "total_steps": 1850, "loss": 0.272, "lr": 3.5836163718317453e-06, "epoch": 3.5783783783783782, "percentage": 35.78, "elapsed_time": "2:09:09", "remaining_time": "3:51:46"} -{"current_steps": 663, "total_steps": 1850, "loss": 0.6019, "lr": 3.5797889488087946e-06, "epoch": 3.583783783783784, "percentage": 35.84, "elapsed_time": "2:09:12", "remaining_time": "3:51:18"} -{"current_steps": 664, "total_steps": 1850, "loss": 0.3603, "lr": 3.575958411954668e-06, "epoch": 3.589189189189189, "percentage": 35.89, "elapsed_time": "2:09:15", "remaining_time": "3:50:52"} -{"current_steps": 665, "total_steps": 1850, "loss": 0.4656, "lr": 3.5721247723156393e-06, "epoch": 3.5945945945945947, "percentage": 35.95, "elapsed_time": "2:09:16", "remaining_time": "3:50:22"} -{"current_steps": 666, "total_steps": 1850, "loss": 0.2466, "lr": 3.5682880409469316e-06, "epoch": 3.6, "percentage": 36.0, "elapsed_time": "2:09:19", "remaining_time": "3:49:54"} -{"current_steps": 667, "total_steps": 1850, "loss": 0.1848, "lr": 3.564448228912682e-06, "epoch": 3.6054054054054054, "percentage": 36.05, "elapsed_time": "2:09:23", "remaining_time": "3:49:30"} -{"current_steps": 668, "total_steps": 1850, "loss": 0.4968, "lr": 3.5606053472859124e-06, "epoch": 3.610810810810811, "percentage": 36.11, "elapsed_time": "2:09:27", "remaining_time": "3:49:04"} -{"current_steps": 669, "total_steps": 1850, "loss": 0.316, "lr": 3.556759407148496e-06, "epoch": 3.616216216216216, "percentage": 36.16, "elapsed_time": "2:09:29", "remaining_time": "3:48:35"} -{"current_steps": 670, "total_steps": 1850, "loss": 0.2232, "lr": 3.5529104195911258e-06, "epoch": 3.6216216216216215, "percentage": 36.22, "elapsed_time": "2:09:32", "remaining_time": "3:48:08"} -{"current_steps": 671, "total_steps": 1850, "loss": 0.4435, "lr": 3.549058395713285e-06, "epoch": 3.627027027027027, "percentage": 36.27, "elapsed_time": "2:09:35", "remaining_time": "3:47:42"} -{"current_steps": 672, "total_steps": 1850, "loss": 0.1455, "lr": 3.54520334662321e-06, "epoch": 3.6324324324324326, "percentage": 36.32, "elapsed_time": "2:09:37", "remaining_time": "3:47:14"} -{"current_steps": 673, "total_steps": 1850, "loss": 0.3037, "lr": 3.5413452834378626e-06, "epoch": 3.637837837837838, "percentage": 36.38, "elapsed_time": "2:09:42", "remaining_time": "3:46:50"} -{"current_steps": 674, "total_steps": 1850, "loss": 0.4309, "lr": 3.5374842172828953e-06, "epoch": 3.6432432432432433, "percentage": 36.43, "elapsed_time": "2:09:44", "remaining_time": "3:46:22"} -{"current_steps": 675, "total_steps": 1850, "loss": 0.383, "lr": 3.533620159292621e-06, "epoch": 3.6486486486486487, "percentage": 36.49, "elapsed_time": "2:09:47", "remaining_time": "3:45:56"} -{"current_steps": 676, "total_steps": 1850, "loss": 0.1963, "lr": 3.529753120609982e-06, "epoch": 3.654054054054054, "percentage": 36.54, "elapsed_time": "2:09:51", "remaining_time": "3:45:31"} -{"current_steps": 677, "total_steps": 1850, "loss": 0.1922, "lr": 3.5258831123865136e-06, "epoch": 3.6594594594594594, "percentage": 36.59, "elapsed_time": "2:09:53", "remaining_time": "3:45:03"} -{"current_steps": 678, "total_steps": 1850, "loss": 0.5589, "lr": 3.5220101457823147e-06, "epoch": 3.6648648648648647, "percentage": 36.65, "elapsed_time": "2:09:56", "remaining_time": "3:44:37"} -{"current_steps": 679, "total_steps": 1850, "loss": 0.1757, "lr": 3.5181342319660174e-06, "epoch": 3.6702702702702705, "percentage": 36.7, "elapsed_time": "2:09:58", "remaining_time": "3:44:09"} -{"current_steps": 680, "total_steps": 1850, "loss": 0.1208, "lr": 3.5142553821147498e-06, "epoch": 3.6756756756756754, "percentage": 36.76, "elapsed_time": "2:09:59", "remaining_time": "3:43:39"} -{"current_steps": 681, "total_steps": 1850, "loss": 0.2416, "lr": 3.5103736074141106e-06, "epoch": 3.6810810810810812, "percentage": 36.81, "elapsed_time": "2:10:03", "remaining_time": "3:43:14"} -{"current_steps": 682, "total_steps": 1850, "loss": 0.3841, "lr": 3.5064889190581293e-06, "epoch": 3.6864864864864866, "percentage": 36.86, "elapsed_time": "2:10:04", "remaining_time": "3:42:46"} -{"current_steps": 683, "total_steps": 1850, "loss": 0.3723, "lr": 3.5026013282492406e-06, "epoch": 3.691891891891892, "percentage": 36.92, "elapsed_time": "2:10:07", "remaining_time": "3:42:21"} -{"current_steps": 684, "total_steps": 1850, "loss": 0.4403, "lr": 3.498710846198247e-06, "epoch": 3.6972972972972973, "percentage": 36.97, "elapsed_time": "2:10:12", "remaining_time": "3:41:57"} -{"current_steps": 685, "total_steps": 1850, "loss": 0.2813, "lr": 3.494817484124289e-06, "epoch": 3.7027027027027026, "percentage": 37.03, "elapsed_time": "2:10:16", "remaining_time": "3:41:34"} -{"current_steps": 686, "total_steps": 1850, "loss": 0.4287, "lr": 3.490921253254813e-06, "epoch": 3.708108108108108, "percentage": 37.08, "elapsed_time": "2:10:18", "remaining_time": "3:41:06"} -{"current_steps": 687, "total_steps": 1850, "loss": 0.234, "lr": 3.487022164825539e-06, "epoch": 3.7135135135135133, "percentage": 37.14, "elapsed_time": "2:10:23", "remaining_time": "3:40:44"} -{"current_steps": 688, "total_steps": 1850, "loss": 0.2135, "lr": 3.4831202300804246e-06, "epoch": 3.718918918918919, "percentage": 37.19, "elapsed_time": "2:10:29", "remaining_time": "3:40:23"} -{"current_steps": 689, "total_steps": 1850, "loss": 0.2725, "lr": 3.479215460271638e-06, "epoch": 3.7243243243243245, "percentage": 37.24, "elapsed_time": "2:10:32", "remaining_time": "3:39:58"} -{"current_steps": 690, "total_steps": 1850, "loss": 0.228, "lr": 3.475307866659522e-06, "epoch": 3.72972972972973, "percentage": 37.3, "elapsed_time": "2:10:36", "remaining_time": "3:39:33"} -{"current_steps": 691, "total_steps": 1850, "loss": 0.0985, "lr": 3.4713974605125634e-06, "epoch": 3.735135135135135, "percentage": 37.35, "elapsed_time": "2:10:39", "remaining_time": "3:39:08"} -{"current_steps": 692, "total_steps": 1850, "loss": 0.2137, "lr": 3.4674842531073587e-06, "epoch": 3.7405405405405405, "percentage": 37.41, "elapsed_time": "2:10:43", "remaining_time": "3:38:45"} -{"current_steps": 693, "total_steps": 1850, "loss": 0.1707, "lr": 3.4635682557285833e-06, "epoch": 3.745945945945946, "percentage": 37.46, "elapsed_time": "2:10:45", "remaining_time": "3:38:17"} -{"current_steps": 694, "total_steps": 1850, "loss": 0.3021, "lr": 3.459649479668956e-06, "epoch": 3.7513513513513512, "percentage": 37.51, "elapsed_time": "2:10:50", "remaining_time": "3:37:56"} -{"current_steps": 695, "total_steps": 1850, "loss": 0.3457, "lr": 3.4557279362292117e-06, "epoch": 3.756756756756757, "percentage": 37.57, "elapsed_time": "2:10:51", "remaining_time": "3:37:28"} -{"current_steps": 696, "total_steps": 1850, "loss": 0.1193, "lr": 3.451803636718064e-06, "epoch": 3.762162162162162, "percentage": 37.62, "elapsed_time": "2:10:53", "remaining_time": "3:37:00"} -{"current_steps": 697, "total_steps": 1850, "loss": 0.2261, "lr": 3.447876592452174e-06, "epoch": 3.7675675675675677, "percentage": 37.68, "elapsed_time": "2:10:54", "remaining_time": "3:36:33"} -{"current_steps": 698, "total_steps": 1850, "loss": 0.5042, "lr": 3.4439468147561196e-06, "epoch": 3.772972972972973, "percentage": 37.73, "elapsed_time": "2:10:56", "remaining_time": "3:36:07"} -{"current_steps": 699, "total_steps": 1850, "loss": 0.3481, "lr": 3.440014314962358e-06, "epoch": 3.7783783783783784, "percentage": 37.78, "elapsed_time": "2:11:00", "remaining_time": "3:35:42"} -{"current_steps": 700, "total_steps": 1850, "loss": 0.2317, "lr": 3.4360791044112e-06, "epoch": 3.7837837837837838, "percentage": 37.84, "elapsed_time": "2:11:02", "remaining_time": "3:35:16"} -{"current_steps": 701, "total_steps": 1850, "loss": 0.395, "lr": 3.432141194450772e-06, "epoch": 3.789189189189189, "percentage": 37.89, "elapsed_time": "2:11:04", "remaining_time": "3:34:49"} -{"current_steps": 702, "total_steps": 1850, "loss": 0.1767, "lr": 3.4282005964369836e-06, "epoch": 3.7945945945945945, "percentage": 37.95, "elapsed_time": "2:11:06", "remaining_time": "3:34:24"} -{"current_steps": 703, "total_steps": 1850, "loss": 0.2146, "lr": 3.424257321733497e-06, "epoch": 3.8, "percentage": 38.0, "elapsed_time": "2:11:08", "remaining_time": "3:33:58"} -{"current_steps": 704, "total_steps": 1850, "loss": 0.1534, "lr": 3.4203113817116955e-06, "epoch": 3.8054054054054056, "percentage": 38.05, "elapsed_time": "2:11:11", "remaining_time": "3:33:34"} -{"current_steps": 705, "total_steps": 1850, "loss": 0.2513, "lr": 3.4163627877506434e-06, "epoch": 3.810810810810811, "percentage": 38.11, "elapsed_time": "2:11:14", "remaining_time": "3:33:08"} -{"current_steps": 706, "total_steps": 1850, "loss": 0.4154, "lr": 3.4124115512370636e-06, "epoch": 3.8162162162162163, "percentage": 38.16, "elapsed_time": "2:11:16", "remaining_time": "3:32:43"} -{"current_steps": 707, "total_steps": 1850, "loss": 0.1822, "lr": 3.408457683565295e-06, "epoch": 3.8216216216216217, "percentage": 38.22, "elapsed_time": "2:11:20", "remaining_time": "3:32:20"} -{"current_steps": 708, "total_steps": 1850, "loss": 0.3589, "lr": 3.4045011961372675e-06, "epoch": 3.827027027027027, "percentage": 38.27, "elapsed_time": "2:11:24", "remaining_time": "3:31:57"} -{"current_steps": 709, "total_steps": 1850, "loss": 0.4615, "lr": 3.4005421003624637e-06, "epoch": 3.8324324324324324, "percentage": 38.32, "elapsed_time": "2:11:27", "remaining_time": "3:31:34"} -{"current_steps": 710, "total_steps": 1850, "loss": 0.1001, "lr": 3.3965804076578896e-06, "epoch": 3.8378378378378377, "percentage": 38.38, "elapsed_time": "2:11:31", "remaining_time": "3:31:11"} -{"current_steps": 711, "total_steps": 1850, "loss": 0.2788, "lr": 3.392616129448039e-06, "epoch": 3.8432432432432435, "percentage": 38.43, "elapsed_time": "2:11:38", "remaining_time": "3:30:53"} -{"current_steps": 712, "total_steps": 1850, "loss": 0.2663, "lr": 3.3886492771648593e-06, "epoch": 3.8486486486486484, "percentage": 38.49, "elapsed_time": "2:11:42", "remaining_time": "3:30:29"} -{"current_steps": 713, "total_steps": 1850, "loss": 0.3497, "lr": 3.384679862247726e-06, "epoch": 3.854054054054054, "percentage": 38.54, "elapsed_time": "2:11:45", "remaining_time": "3:30:06"} -{"current_steps": 714, "total_steps": 1850, "loss": 0.3613, "lr": 3.3807078961434013e-06, "epoch": 3.8594594594594596, "percentage": 38.59, "elapsed_time": "2:11:49", "remaining_time": "3:29:44"} -{"current_steps": 715, "total_steps": 1850, "loss": 0.0783, "lr": 3.376733390306004e-06, "epoch": 3.864864864864865, "percentage": 38.65, "elapsed_time": "2:11:50", "remaining_time": "3:29:17"} -{"current_steps": 716, "total_steps": 1850, "loss": 0.1617, "lr": 3.372756356196979e-06, "epoch": 3.8702702702702703, "percentage": 38.7, "elapsed_time": "2:11:54", "remaining_time": "3:28:54"} -{"current_steps": 717, "total_steps": 1850, "loss": 0.6444, "lr": 3.3687768052850595e-06, "epoch": 3.8756756756756756, "percentage": 38.76, "elapsed_time": "2:11:56", "remaining_time": "3:28:29"} -{"current_steps": 718, "total_steps": 1850, "loss": 0.4858, "lr": 3.364794749046239e-06, "epoch": 3.881081081081081, "percentage": 38.81, "elapsed_time": "2:11:58", "remaining_time": "3:28:03"} -{"current_steps": 719, "total_steps": 1850, "loss": 0.3103, "lr": 3.3608101989637333e-06, "epoch": 3.8864864864864863, "percentage": 38.86, "elapsed_time": "2:12:03", "remaining_time": "3:27:43"} -{"current_steps": 720, "total_steps": 1850, "loss": 0.2501, "lr": 3.356823166527952e-06, "epoch": 3.891891891891892, "percentage": 38.92, "elapsed_time": "2:12:09", "remaining_time": "3:27:24"} -{"current_steps": 721, "total_steps": 1850, "loss": 0.18, "lr": 3.352833663236463e-06, "epoch": 3.8972972972972975, "percentage": 38.97, "elapsed_time": "2:12:15", "remaining_time": "3:27:05"} -{"current_steps": 722, "total_steps": 1850, "loss": 0.12, "lr": 3.348841700593956e-06, "epoch": 3.902702702702703, "percentage": 39.03, "elapsed_time": "2:12:16", "remaining_time": "3:26:39"} -{"current_steps": 723, "total_steps": 1850, "loss": 0.2618, "lr": 3.3448472901122187e-06, "epoch": 3.908108108108108, "percentage": 39.08, "elapsed_time": "2:12:19", "remaining_time": "3:26:15"} -{"current_steps": 724, "total_steps": 1850, "loss": 0.3689, "lr": 3.340850443310092e-06, "epoch": 3.9135135135135135, "percentage": 39.14, "elapsed_time": "2:12:21", "remaining_time": "3:25:50"} -{"current_steps": 725, "total_steps": 1850, "loss": 0.2195, "lr": 3.336851171713447e-06, "epoch": 3.918918918918919, "percentage": 39.19, "elapsed_time": "2:12:23", "remaining_time": "3:25:26"} -{"current_steps": 726, "total_steps": 1850, "loss": 0.2602, "lr": 3.3328494868551444e-06, "epoch": 3.924324324324324, "percentage": 39.24, "elapsed_time": "2:12:27", "remaining_time": "3:25:04"} -{"current_steps": 727, "total_steps": 1850, "loss": 0.1561, "lr": 3.3288454002750046e-06, "epoch": 3.92972972972973, "percentage": 39.3, "elapsed_time": "2:12:29", "remaining_time": "3:24:40"} -{"current_steps": 728, "total_steps": 1850, "loss": 0.4469, "lr": 3.3248389235197764e-06, "epoch": 3.935135135135135, "percentage": 39.35, "elapsed_time": "2:12:31", "remaining_time": "3:24:15"} -{"current_steps": 729, "total_steps": 1850, "loss": 0.2246, "lr": 3.3208300681430967e-06, "epoch": 3.9405405405405407, "percentage": 39.41, "elapsed_time": "2:12:34", "remaining_time": "3:23:51"} -{"current_steps": 730, "total_steps": 1850, "loss": 0.2743, "lr": 3.3168188457054656e-06, "epoch": 3.945945945945946, "percentage": 39.46, "elapsed_time": "2:12:35", "remaining_time": "3:23:26"} -{"current_steps": 731, "total_steps": 1850, "loss": 0.551, "lr": 3.312805267774209e-06, "epoch": 3.9513513513513514, "percentage": 39.51, "elapsed_time": "2:12:39", "remaining_time": "3:23:03"} -{"current_steps": 732, "total_steps": 1850, "loss": 0.3522, "lr": 3.3087893459234423e-06, "epoch": 3.9567567567567568, "percentage": 39.57, "elapsed_time": "2:12:41", "remaining_time": "3:22:39"} -{"current_steps": 733, "total_steps": 1850, "loss": 0.3084, "lr": 3.304771091734043e-06, "epoch": 3.962162162162162, "percentage": 39.62, "elapsed_time": "2:12:42", "remaining_time": "3:22:14"} -{"current_steps": 734, "total_steps": 1850, "loss": 0.3406, "lr": 3.300750516793614e-06, "epoch": 3.9675675675675675, "percentage": 39.68, "elapsed_time": "2:12:49", "remaining_time": "3:21:56"} -{"current_steps": 735, "total_steps": 1850, "loss": 0.3463, "lr": 3.2967276326964504e-06, "epoch": 3.972972972972973, "percentage": 39.73, "elapsed_time": "2:12:52", "remaining_time": "3:21:34"} -{"current_steps": 736, "total_steps": 1850, "loss": 0.3758, "lr": 3.2927024510435057e-06, "epoch": 3.9783783783783786, "percentage": 39.78, "elapsed_time": "2:12:56", "remaining_time": "3:21:12"} -{"current_steps": 737, "total_steps": 1850, "loss": 0.3328, "lr": 3.2886749834423587e-06, "epoch": 3.983783783783784, "percentage": 39.84, "elapsed_time": "2:12:59", "remaining_time": "3:20:50"} -{"current_steps": 738, "total_steps": 1850, "loss": 0.6213, "lr": 3.284645241507183e-06, "epoch": 3.9891891891891893, "percentage": 39.89, "elapsed_time": "2:13:01", "remaining_time": "3:20:26"} -{"current_steps": 739, "total_steps": 1850, "loss": 0.2463, "lr": 3.280613236858707e-06, "epoch": 3.9945945945945946, "percentage": 39.95, "elapsed_time": "2:13:05", "remaining_time": "3:20:05"} -{"current_steps": 740, "total_steps": 1850, "loss": 0.3501, "lr": 3.2765789811241865e-06, "epoch": 4.0, "percentage": 40.0, "elapsed_time": "2:13:08", "remaining_time": "3:19:42"} -{"current_steps": 741, "total_steps": 1850, "loss": 0.1753, "lr": 3.272542485937369e-06, "epoch": 4.005405405405406, "percentage": 40.05, "elapsed_time": "2:19:14", "remaining_time": "3:28:22"} -{"current_steps": 742, "total_steps": 1850, "loss": 0.0722, "lr": 3.2685037629384587e-06, "epoch": 4.010810810810811, "percentage": 40.11, "elapsed_time": "2:19:14", "remaining_time": "3:27:55"} -{"current_steps": 743, "total_steps": 1850, "loss": 0.2475, "lr": 3.264462823774085e-06, "epoch": 4.0162162162162165, "percentage": 40.16, "elapsed_time": "2:19:19", "remaining_time": "3:27:34"} -{"current_steps": 744, "total_steps": 1850, "loss": 0.1163, "lr": 3.260419680097268e-06, "epoch": 4.021621621621621, "percentage": 40.22, "elapsed_time": "2:19:20", "remaining_time": "3:27:08"} -{"current_steps": 745, "total_steps": 1850, "loss": 0.1325, "lr": 3.2563743435673855e-06, "epoch": 4.027027027027027, "percentage": 40.27, "elapsed_time": "2:19:23", "remaining_time": "3:26:44"} -{"current_steps": 746, "total_steps": 1850, "loss": 0.0466, "lr": 3.252326825850139e-06, "epoch": 4.032432432432432, "percentage": 40.32, "elapsed_time": "2:19:24", "remaining_time": "3:26:19"} -{"current_steps": 747, "total_steps": 1850, "loss": 0.1861, "lr": 3.2482771386175173e-06, "epoch": 4.037837837837838, "percentage": 40.38, "elapsed_time": "2:19:28", "remaining_time": "3:25:57"} -{"current_steps": 748, "total_steps": 1850, "loss": 0.1637, "lr": 3.24422529354777e-06, "epoch": 4.043243243243243, "percentage": 40.43, "elapsed_time": "2:19:31", "remaining_time": "3:25:33"} -{"current_steps": 749, "total_steps": 1850, "loss": 0.1379, "lr": 3.2401713023253646e-06, "epoch": 4.048648648648649, "percentage": 40.49, "elapsed_time": "2:19:36", "remaining_time": "3:25:12"} -{"current_steps": 750, "total_steps": 1850, "loss": 0.1099, "lr": 3.2361151766409628e-06, "epoch": 4.054054054054054, "percentage": 40.54, "elapsed_time": "2:19:37", "remaining_time": "3:24:47"} -{"current_steps": 751, "total_steps": 1850, "loss": 0.1422, "lr": 3.232056928191376e-06, "epoch": 4.059459459459459, "percentage": 40.59, "elapsed_time": "2:19:41", "remaining_time": "3:24:24"} -{"current_steps": 752, "total_steps": 1850, "loss": 0.2716, "lr": 3.2279965686795424e-06, "epoch": 4.064864864864865, "percentage": 40.65, "elapsed_time": "2:19:43", "remaining_time": "3:24:00"} -{"current_steps": 753, "total_steps": 1850, "loss": 0.3849, "lr": 3.2239341098144833e-06, "epoch": 4.07027027027027, "percentage": 40.7, "elapsed_time": "2:19:49", "remaining_time": "3:23:42"} -{"current_steps": 754, "total_steps": 1850, "loss": 0.0768, "lr": 3.219869563311277e-06, "epoch": 4.075675675675676, "percentage": 40.76, "elapsed_time": "2:19:52", "remaining_time": "3:23:19"} -{"current_steps": 755, "total_steps": 1850, "loss": 0.112, "lr": 3.2158029408910213e-06, "epoch": 4.081081081081081, "percentage": 40.81, "elapsed_time": "2:19:54", "remaining_time": "3:22:55"} -{"current_steps": 756, "total_steps": 1850, "loss": 0.1054, "lr": 3.2117342542807995e-06, "epoch": 4.0864864864864865, "percentage": 40.86, "elapsed_time": "2:20:02", "remaining_time": "3:22:38"} -{"current_steps": 757, "total_steps": 1850, "loss": 0.1754, "lr": 3.207663515213648e-06, "epoch": 4.091891891891892, "percentage": 40.92, "elapsed_time": "2:20:06", "remaining_time": "3:22:17"} -{"current_steps": 758, "total_steps": 1850, "loss": 0.191, "lr": 3.2035907354285234e-06, "epoch": 4.097297297297297, "percentage": 40.97, "elapsed_time": "2:20:08", "remaining_time": "3:21:54"} -{"current_steps": 759, "total_steps": 1850, "loss": 0.1083, "lr": 3.1995159266702648e-06, "epoch": 4.102702702702703, "percentage": 41.03, "elapsed_time": "2:20:09", "remaining_time": "3:21:28"} -{"current_steps": 760, "total_steps": 1850, "loss": 0.0609, "lr": 3.1954391006895635e-06, "epoch": 4.108108108108108, "percentage": 41.08, "elapsed_time": "2:20:11", "remaining_time": "3:21:04"} -{"current_steps": 761, "total_steps": 1850, "loss": 0.049, "lr": 3.191360269242928e-06, "epoch": 4.113513513513514, "percentage": 41.14, "elapsed_time": "2:20:13", "remaining_time": "3:20:39"} -{"current_steps": 762, "total_steps": 1850, "loss": 0.1642, "lr": 3.18727944409265e-06, "epoch": 4.118918918918919, "percentage": 41.19, "elapsed_time": "2:20:16", "remaining_time": "3:20:17"} -{"current_steps": 763, "total_steps": 1850, "loss": 0.1513, "lr": 3.1831966370067714e-06, "epoch": 4.124324324324324, "percentage": 41.24, "elapsed_time": "2:20:21", "remaining_time": "3:19:57"} -{"current_steps": 764, "total_steps": 1850, "loss": 0.3276, "lr": 3.1791118597590467e-06, "epoch": 4.12972972972973, "percentage": 41.3, "elapsed_time": "2:20:27", "remaining_time": "3:19:39"} -{"current_steps": 765, "total_steps": 1850, "loss": 0.4011, "lr": 3.1750251241289148e-06, "epoch": 4.135135135135135, "percentage": 41.35, "elapsed_time": "2:20:33", "remaining_time": "3:19:21"} -{"current_steps": 766, "total_steps": 1850, "loss": 0.2274, "lr": 3.1709364419014615e-06, "epoch": 4.140540540540541, "percentage": 41.41, "elapsed_time": "2:20:35", "remaining_time": "3:18:57"} -{"current_steps": 767, "total_steps": 1850, "loss": 0.118, "lr": 3.166845824867384e-06, "epoch": 4.145945945945946, "percentage": 41.46, "elapsed_time": "2:20:38", "remaining_time": "3:18:34"} -{"current_steps": 768, "total_steps": 1850, "loss": 0.1109, "lr": 3.162753284822962e-06, "epoch": 4.151351351351352, "percentage": 41.51, "elapsed_time": "2:20:40", "remaining_time": "3:18:12"} -{"current_steps": 769, "total_steps": 1850, "loss": 0.1754, "lr": 3.1586588335700176e-06, "epoch": 4.1567567567567565, "percentage": 41.57, "elapsed_time": "2:20:42", "remaining_time": "3:17:47"} -{"current_steps": 770, "total_steps": 1850, "loss": 0.1155, "lr": 3.1545624829158873e-06, "epoch": 4.162162162162162, "percentage": 41.62, "elapsed_time": "2:20:46", "remaining_time": "3:17:26"} -{"current_steps": 771, "total_steps": 1850, "loss": 0.0635, "lr": 3.1504642446733828e-06, "epoch": 4.167567567567567, "percentage": 41.68, "elapsed_time": "2:20:49", "remaining_time": "3:17:05"} -{"current_steps": 772, "total_steps": 1850, "loss": 0.1068, "lr": 3.146364130660761e-06, "epoch": 4.172972972972973, "percentage": 41.73, "elapsed_time": "2:20:54", "remaining_time": "3:16:45"} -{"current_steps": 773, "total_steps": 1850, "loss": 0.0637, "lr": 3.142262152701685e-06, "epoch": 4.178378378378379, "percentage": 41.78, "elapsed_time": "2:20:56", "remaining_time": "3:16:22"} -{"current_steps": 774, "total_steps": 1850, "loss": 0.2703, "lr": 3.138158322625197e-06, "epoch": 4.183783783783784, "percentage": 41.84, "elapsed_time": "2:20:59", "remaining_time": "3:15:59"} -{"current_steps": 775, "total_steps": 1850, "loss": 0.2769, "lr": 3.1340526522656765e-06, "epoch": 4.1891891891891895, "percentage": 41.89, "elapsed_time": "2:21:01", "remaining_time": "3:15:37"} -{"current_steps": 776, "total_steps": 1850, "loss": 0.1192, "lr": 3.1299451534628134e-06, "epoch": 4.194594594594594, "percentage": 41.95, "elapsed_time": "2:21:04", "remaining_time": "3:15:15"} -{"current_steps": 777, "total_steps": 1850, "loss": 0.1244, "lr": 3.1258358380615674e-06, "epoch": 4.2, "percentage": 42.0, "elapsed_time": "2:21:10", "remaining_time": "3:14:56"} -{"current_steps": 778, "total_steps": 1850, "loss": 0.2819, "lr": 3.121724717912138e-06, "epoch": 4.205405405405405, "percentage": 42.05, "elapsed_time": "2:21:13", "remaining_time": "3:14:34"} -{"current_steps": 779, "total_steps": 1850, "loss": 0.1018, "lr": 3.1176118048699283e-06, "epoch": 4.210810810810811, "percentage": 42.11, "elapsed_time": "2:21:16", "remaining_time": "3:14:13"} -{"current_steps": 780, "total_steps": 1850, "loss": 0.1842, "lr": 3.113497110795514e-06, "epoch": 4.216216216216216, "percentage": 42.16, "elapsed_time": "2:21:22", "remaining_time": "3:13:56"} -{"current_steps": 781, "total_steps": 1850, "loss": 0.2299, "lr": 3.1093806475546046e-06, "epoch": 4.221621621621622, "percentage": 42.22, "elapsed_time": "2:21:29", "remaining_time": "3:13:40"} -{"current_steps": 782, "total_steps": 1850, "loss": 0.1397, "lr": 3.1052624270180116e-06, "epoch": 4.227027027027027, "percentage": 42.27, "elapsed_time": "2:21:36", "remaining_time": "3:13:23"} -{"current_steps": 783, "total_steps": 1850, "loss": 0.2236, "lr": 3.1011424610616153e-06, "epoch": 4.232432432432432, "percentage": 42.32, "elapsed_time": "2:21:38", "remaining_time": "3:13:00"} -{"current_steps": 784, "total_steps": 1850, "loss": 0.1417, "lr": 3.097020761566328e-06, "epoch": 4.237837837837838, "percentage": 42.38, "elapsed_time": "2:21:41", "remaining_time": "3:12:39"} -{"current_steps": 785, "total_steps": 1850, "loss": 0.1317, "lr": 3.092897340418062e-06, "epoch": 4.243243243243243, "percentage": 42.43, "elapsed_time": "2:21:42", "remaining_time": "3:12:15"} -{"current_steps": 786, "total_steps": 1850, "loss": 0.1869, "lr": 3.088772209507694e-06, "epoch": 4.248648648648649, "percentage": 42.49, "elapsed_time": "2:21:43", "remaining_time": "3:11:51"} -{"current_steps": 787, "total_steps": 1850, "loss": 0.0967, "lr": 3.0846453807310317e-06, "epoch": 4.254054054054054, "percentage": 42.54, "elapsed_time": "2:21:45", "remaining_time": "3:11:28"} -{"current_steps": 788, "total_steps": 1850, "loss": 0.0731, "lr": 3.080516865988778e-06, "epoch": 4.2594594594594595, "percentage": 42.59, "elapsed_time": "2:21:48", "remaining_time": "3:11:07"} -{"current_steps": 789, "total_steps": 1850, "loss": 0.1912, "lr": 3.076386677186498e-06, "epoch": 4.264864864864865, "percentage": 42.65, "elapsed_time": "2:21:50", "remaining_time": "3:10:44"} -{"current_steps": 790, "total_steps": 1850, "loss": 0.2133, "lr": 3.0722548262345854e-06, "epoch": 4.27027027027027, "percentage": 42.7, "elapsed_time": "2:21:54", "remaining_time": "3:10:25"} -{"current_steps": 791, "total_steps": 1850, "loss": 0.4454, "lr": 3.0681213250482255e-06, "epoch": 4.275675675675676, "percentage": 42.76, "elapsed_time": "2:21:56", "remaining_time": "3:10:01"} -{"current_steps": 792, "total_steps": 1850, "loss": 0.3645, "lr": 3.0639861855473637e-06, "epoch": 4.281081081081081, "percentage": 42.81, "elapsed_time": "2:21:58", "remaining_time": "3:09:39"} -{"current_steps": 793, "total_steps": 1850, "loss": 0.1331, "lr": 3.05984941965667e-06, "epoch": 4.286486486486487, "percentage": 42.86, "elapsed_time": "2:21:59", "remaining_time": "3:09:15"} -{"current_steps": 794, "total_steps": 1850, "loss": 0.0863, "lr": 3.055711039305503e-06, "epoch": 4.291891891891892, "percentage": 42.92, "elapsed_time": "2:22:03", "remaining_time": "3:08:55"} -{"current_steps": 795, "total_steps": 1850, "loss": 0.1988, "lr": 3.051571056427879e-06, "epoch": 4.297297297297297, "percentage": 42.97, "elapsed_time": "2:22:06", "remaining_time": "3:08:35"} -{"current_steps": 796, "total_steps": 1850, "loss": 0.2307, "lr": 3.047429482962433e-06, "epoch": 4.302702702702703, "percentage": 43.03, "elapsed_time": "2:22:07", "remaining_time": "3:08:11"} -{"current_steps": 797, "total_steps": 1850, "loss": 0.1614, "lr": 3.0432863308523903e-06, "epoch": 4.308108108108108, "percentage": 43.08, "elapsed_time": "2:22:09", "remaining_time": "3:07:49"} -{"current_steps": 798, "total_steps": 1850, "loss": 0.0683, "lr": 3.039141612045525e-06, "epoch": 4.313513513513514, "percentage": 43.14, "elapsed_time": "2:22:12", "remaining_time": "3:07:27"} -{"current_steps": 799, "total_steps": 1850, "loss": 0.1784, "lr": 3.034995338494131e-06, "epoch": 4.318918918918919, "percentage": 43.19, "elapsed_time": "2:22:15", "remaining_time": "3:07:06"} -{"current_steps": 800, "total_steps": 1850, "loss": 0.0451, "lr": 3.0308475221549868e-06, "epoch": 4.324324324324325, "percentage": 43.24, "elapsed_time": "2:22:16", "remaining_time": "3:06:44"} -{"current_steps": 801, "total_steps": 1850, "loss": 0.0618, "lr": 3.026698174989316e-06, "epoch": 4.3297297297297295, "percentage": 43.3, "elapsed_time": "2:22:20", "remaining_time": "3:06:24"} -{"current_steps": 802, "total_steps": 1850, "loss": 0.1529, "lr": 3.0225473089627617e-06, "epoch": 4.335135135135135, "percentage": 43.35, "elapsed_time": "2:22:25", "remaining_time": "3:06:06"} -{"current_steps": 803, "total_steps": 1850, "loss": 0.4177, "lr": 3.0183949360453442e-06, "epoch": 4.34054054054054, "percentage": 43.41, "elapsed_time": "2:22:30", "remaining_time": "3:05:48"} -{"current_steps": 804, "total_steps": 1850, "loss": 0.1394, "lr": 3.014241068211428e-06, "epoch": 4.345945945945946, "percentage": 43.46, "elapsed_time": "2:22:32", "remaining_time": "3:05:27"} -{"current_steps": 805, "total_steps": 1850, "loss": 0.04, "lr": 3.0100857174396926e-06, "epoch": 4.351351351351352, "percentage": 43.51, "elapsed_time": "2:22:34", "remaining_time": "3:05:04"} -{"current_steps": 806, "total_steps": 1850, "loss": 0.2705, "lr": 3.0059288957130893e-06, "epoch": 4.356756756756757, "percentage": 43.57, "elapsed_time": "2:22:37", "remaining_time": "3:04:44"} -{"current_steps": 807, "total_steps": 1850, "loss": 0.2208, "lr": 3.001770615018815e-06, "epoch": 4.3621621621621625, "percentage": 43.62, "elapsed_time": "2:22:42", "remaining_time": "3:04:26"} -{"current_steps": 808, "total_steps": 1850, "loss": 0.2068, "lr": 2.9976108873482725e-06, "epoch": 4.367567567567567, "percentage": 43.68, "elapsed_time": "2:22:45", "remaining_time": "3:04:06"} -{"current_steps": 809, "total_steps": 1850, "loss": 0.1253, "lr": 2.9934497246970357e-06, "epoch": 4.372972972972973, "percentage": 43.73, "elapsed_time": "2:22:47", "remaining_time": "3:03:44"} -{"current_steps": 810, "total_steps": 1850, "loss": 0.1721, "lr": 2.989287139064819e-06, "epoch": 4.378378378378378, "percentage": 43.78, "elapsed_time": "2:22:49", "remaining_time": "3:03:23"} -{"current_steps": 811, "total_steps": 1850, "loss": 0.134, "lr": 2.9851231424554385e-06, "epoch": 4.383783783783784, "percentage": 43.84, "elapsed_time": "2:22:51", "remaining_time": "3:03:01"} -{"current_steps": 812, "total_steps": 1850, "loss": 0.0818, "lr": 2.9809577468767813e-06, "epoch": 4.389189189189189, "percentage": 43.89, "elapsed_time": "2:22:53", "remaining_time": "3:02:39"} -{"current_steps": 813, "total_steps": 1850, "loss": 0.1797, "lr": 2.9767909643407676e-06, "epoch": 4.394594594594595, "percentage": 43.95, "elapsed_time": "2:22:56", "remaining_time": "3:02:19"} -{"current_steps": 814, "total_steps": 1850, "loss": 0.145, "lr": 2.9726228068633155e-06, "epoch": 4.4, "percentage": 44.0, "elapsed_time": "2:23:00", "remaining_time": "3:02:00"} -{"current_steps": 815, "total_steps": 1850, "loss": 0.079, "lr": 2.9684532864643123e-06, "epoch": 4.405405405405405, "percentage": 44.05, "elapsed_time": "2:23:03", "remaining_time": "3:01:40"} -{"current_steps": 816, "total_steps": 1850, "loss": 0.1763, "lr": 2.9642824151675702e-06, "epoch": 4.410810810810811, "percentage": 44.11, "elapsed_time": "2:23:05", "remaining_time": "3:01:19"} -{"current_steps": 817, "total_steps": 1850, "loss": 0.2654, "lr": 2.9601102050008016e-06, "epoch": 4.416216216216216, "percentage": 44.16, "elapsed_time": "2:23:09", "remaining_time": "3:01:00"} -{"current_steps": 818, "total_steps": 1850, "loss": 0.0779, "lr": 2.955936667995578e-06, "epoch": 4.421621621621622, "percentage": 44.22, "elapsed_time": "2:23:11", "remaining_time": "3:00:39"} -{"current_steps": 819, "total_steps": 1850, "loss": 0.0587, "lr": 2.9517618161872974e-06, "epoch": 4.427027027027027, "percentage": 44.27, "elapsed_time": "2:23:15", "remaining_time": "3:00:20"} -{"current_steps": 820, "total_steps": 1850, "loss": 0.0835, "lr": 2.9475856616151487e-06, "epoch": 4.4324324324324325, "percentage": 44.32, "elapsed_time": "2:23:19", "remaining_time": "3:00:01"} -{"current_steps": 821, "total_steps": 1850, "loss": 0.1748, "lr": 2.9434082163220773e-06, "epoch": 4.437837837837838, "percentage": 44.38, "elapsed_time": "2:23:22", "remaining_time": "2:59:41"} -{"current_steps": 822, "total_steps": 1850, "loss": 0.119, "lr": 2.9392294923547543e-06, "epoch": 4.443243243243243, "percentage": 44.43, "elapsed_time": "2:23:24", "remaining_time": "2:59:21"} -{"current_steps": 823, "total_steps": 1850, "loss": 0.1535, "lr": 2.9350495017635334e-06, "epoch": 4.448648648648649, "percentage": 44.49, "elapsed_time": "2:23:26", "remaining_time": "2:58:59"} -{"current_steps": 824, "total_steps": 1850, "loss": 0.2561, "lr": 2.9308682566024228e-06, "epoch": 4.454054054054054, "percentage": 44.54, "elapsed_time": "2:23:31", "remaining_time": "2:58:42"} -{"current_steps": 825, "total_steps": 1850, "loss": 0.2024, "lr": 2.92668576892905e-06, "epoch": 4.45945945945946, "percentage": 44.59, "elapsed_time": "2:23:35", "remaining_time": "2:58:24"} -{"current_steps": 826, "total_steps": 1850, "loss": 0.0436, "lr": 2.9225020508046233e-06, "epoch": 4.464864864864865, "percentage": 44.65, "elapsed_time": "2:23:37", "remaining_time": "2:58:02"} -{"current_steps": 827, "total_steps": 1850, "loss": 0.1636, "lr": 2.9183171142939002e-06, "epoch": 4.47027027027027, "percentage": 44.7, "elapsed_time": "2:23:39", "remaining_time": "2:57:42"} -{"current_steps": 828, "total_steps": 1850, "loss": 0.0962, "lr": 2.9141309714651528e-06, "epoch": 4.475675675675676, "percentage": 44.76, "elapsed_time": "2:23:42", "remaining_time": "2:57:22"} -{"current_steps": 829, "total_steps": 1850, "loss": 0.2129, "lr": 2.9099436343901306e-06, "epoch": 4.481081081081081, "percentage": 44.81, "elapsed_time": "2:23:47", "remaining_time": "2:57:05"} -{"current_steps": 830, "total_steps": 1850, "loss": 0.2872, "lr": 2.9057551151440266e-06, "epoch": 4.486486486486487, "percentage": 44.86, "elapsed_time": "2:23:49", "remaining_time": "2:56:44"} -{"current_steps": 831, "total_steps": 1850, "loss": 0.3254, "lr": 2.9015654258054433e-06, "epoch": 4.491891891891892, "percentage": 44.92, "elapsed_time": "2:23:55", "remaining_time": "2:56:29"} -{"current_steps": 832, "total_steps": 1850, "loss": 0.1417, "lr": 2.8973745784563596e-06, "epoch": 4.4972972972972975, "percentage": 44.97, "elapsed_time": "2:23:58", "remaining_time": "2:56:09"} -{"current_steps": 833, "total_steps": 1850, "loss": 0.2513, "lr": 2.8931825851820904e-06, "epoch": 4.5027027027027025, "percentage": 45.03, "elapsed_time": "2:24:01", "remaining_time": "2:55:50"} -{"current_steps": 834, "total_steps": 1850, "loss": 0.1785, "lr": 2.8889894580712574e-06, "epoch": 4.508108108108108, "percentage": 45.08, "elapsed_time": "2:24:04", "remaining_time": "2:55:31"} -{"current_steps": 835, "total_steps": 1850, "loss": 0.2853, "lr": 2.884795209215751e-06, "epoch": 4.513513513513513, "percentage": 45.14, "elapsed_time": "2:24:08", "remaining_time": "2:55:12"} -{"current_steps": 836, "total_steps": 1850, "loss": 0.2947, "lr": 2.880599850710696e-06, "epoch": 4.518918918918919, "percentage": 45.19, "elapsed_time": "2:24:11", "remaining_time": "2:54:53"} -{"current_steps": 837, "total_steps": 1850, "loss": 0.177, "lr": 2.8764033946544197e-06, "epoch": 4.524324324324324, "percentage": 45.24, "elapsed_time": "2:24:17", "remaining_time": "2:54:37"} -{"current_steps": 838, "total_steps": 1850, "loss": 0.2786, "lr": 2.8722058531484105e-06, "epoch": 4.52972972972973, "percentage": 45.3, "elapsed_time": "2:24:18", "remaining_time": "2:54:16"} -{"current_steps": 839, "total_steps": 1850, "loss": 0.1881, "lr": 2.86800723829729e-06, "epoch": 4.535135135135135, "percentage": 45.35, "elapsed_time": "2:24:20", "remaining_time": "2:53:56"} -{"current_steps": 840, "total_steps": 1850, "loss": 0.3541, "lr": 2.8638075622087747e-06, "epoch": 4.54054054054054, "percentage": 45.41, "elapsed_time": "2:24:23", "remaining_time": "2:53:36"} -{"current_steps": 841, "total_steps": 1850, "loss": 0.3094, "lr": 2.8596068369936386e-06, "epoch": 4.545945945945946, "percentage": 45.46, "elapsed_time": "2:24:26", "remaining_time": "2:53:18"} -{"current_steps": 842, "total_steps": 1850, "loss": 0.1162, "lr": 2.8554050747656862e-06, "epoch": 4.551351351351351, "percentage": 45.51, "elapsed_time": "2:24:29", "remaining_time": "2:52:59"} -{"current_steps": 843, "total_steps": 1850, "loss": 0.1079, "lr": 2.851202287641709e-06, "epoch": 4.556756756756757, "percentage": 45.57, "elapsed_time": "2:24:32", "remaining_time": "2:52:39"} -{"current_steps": 844, "total_steps": 1850, "loss": 0.4462, "lr": 2.8469984877414525e-06, "epoch": 4.562162162162162, "percentage": 45.62, "elapsed_time": "2:24:35", "remaining_time": "2:52:20"} -{"current_steps": 845, "total_steps": 1850, "loss": 0.0851, "lr": 2.842793687187588e-06, "epoch": 4.5675675675675675, "percentage": 45.68, "elapsed_time": "2:24:37", "remaining_time": "2:52:00"} -{"current_steps": 846, "total_steps": 1850, "loss": 0.1268, "lr": 2.8385878981056663e-06, "epoch": 4.572972972972973, "percentage": 45.73, "elapsed_time": "2:24:39", "remaining_time": "2:51:40"} -{"current_steps": 847, "total_steps": 1850, "loss": 0.3187, "lr": 2.8343811326240944e-06, "epoch": 4.578378378378378, "percentage": 45.78, "elapsed_time": "2:24:44", "remaining_time": "2:51:23"} -{"current_steps": 848, "total_steps": 1850, "loss": 0.1315, "lr": 2.830173402874091e-06, "epoch": 4.583783783783784, "percentage": 45.84, "elapsed_time": "2:24:51", "remaining_time": "2:51:10"} -{"current_steps": 849, "total_steps": 1850, "loss": 0.301, "lr": 2.8259647209896573e-06, "epoch": 4.589189189189189, "percentage": 45.89, "elapsed_time": "2:24:54", "remaining_time": "2:50:51"} -{"current_steps": 850, "total_steps": 1850, "loss": 0.1478, "lr": 2.821755099107541e-06, "epoch": 4.594594594594595, "percentage": 45.95, "elapsed_time": "2:24:56", "remaining_time": "2:50:31"} -{"current_steps": 851, "total_steps": 1850, "loss": 0.2029, "lr": 2.817544549367197e-06, "epoch": 4.6, "percentage": 46.0, "elapsed_time": "2:25:00", "remaining_time": "2:50:13"} -{"current_steps": 852, "total_steps": 1850, "loss": 0.0549, "lr": 2.813333083910761e-06, "epoch": 4.605405405405405, "percentage": 46.05, "elapsed_time": "2:25:03", "remaining_time": "2:49:55"} -{"current_steps": 853, "total_steps": 1850, "loss": 0.1508, "lr": 2.8091207148830046e-06, "epoch": 4.610810810810811, "percentage": 46.11, "elapsed_time": "2:25:08", "remaining_time": "2:49:38"} -{"current_steps": 854, "total_steps": 1850, "loss": 0.1094, "lr": 2.8049074544313094e-06, "epoch": 4.616216216216216, "percentage": 46.16, "elapsed_time": "2:25:11", "remaining_time": "2:49:19"} -{"current_steps": 855, "total_steps": 1850, "loss": 0.0799, "lr": 2.8006933147056236e-06, "epoch": 4.621621621621622, "percentage": 46.22, "elapsed_time": "2:25:14", "remaining_time": "2:49:01"} -{"current_steps": 856, "total_steps": 1850, "loss": 0.123, "lr": 2.7964783078584336e-06, "epoch": 4.627027027027027, "percentage": 46.27, "elapsed_time": "2:25:17", "remaining_time": "2:48:42"} -{"current_steps": 857, "total_steps": 1850, "loss": 0.0692, "lr": 2.792262446044725e-06, "epoch": 4.632432432432433, "percentage": 46.32, "elapsed_time": "2:25:21", "remaining_time": "2:48:25"} -{"current_steps": 858, "total_steps": 1850, "loss": 0.1596, "lr": 2.788045741421949e-06, "epoch": 4.6378378378378375, "percentage": 46.38, "elapsed_time": "2:25:26", "remaining_time": "2:48:09"} -{"current_steps": 859, "total_steps": 1850, "loss": 0.047, "lr": 2.78382820614999e-06, "epoch": 4.643243243243243, "percentage": 46.43, "elapsed_time": "2:25:28", "remaining_time": "2:47:49"} -{"current_steps": 860, "total_steps": 1850, "loss": 0.1561, "lr": 2.779609852391123e-06, "epoch": 4.648648648648649, "percentage": 46.49, "elapsed_time": "2:25:31", "remaining_time": "2:47:31"} -{"current_steps": 861, "total_steps": 1850, "loss": 0.2157, "lr": 2.775390692309987e-06, "epoch": 4.654054054054054, "percentage": 46.54, "elapsed_time": "2:25:35", "remaining_time": "2:47:13"} -{"current_steps": 862, "total_steps": 1850, "loss": 0.0782, "lr": 2.7711707380735443e-06, "epoch": 4.65945945945946, "percentage": 46.59, "elapsed_time": "2:25:37", "remaining_time": "2:46:54"} -{"current_steps": 863, "total_steps": 1850, "loss": 0.2994, "lr": 2.766950001851049e-06, "epoch": 4.664864864864865, "percentage": 46.65, "elapsed_time": "2:25:43", "remaining_time": "2:46:39"} -{"current_steps": 864, "total_steps": 1850, "loss": 0.109, "lr": 2.7627284958140084e-06, "epoch": 4.6702702702702705, "percentage": 46.7, "elapsed_time": "2:25:48", "remaining_time": "2:46:24"} -{"current_steps": 865, "total_steps": 1850, "loss": 0.2557, "lr": 2.7585062321361517e-06, "epoch": 4.675675675675675, "percentage": 46.76, "elapsed_time": "2:25:52", "remaining_time": "2:46:06"} -{"current_steps": 866, "total_steps": 1850, "loss": 0.0413, "lr": 2.75428322299339e-06, "epoch": 4.681081081081081, "percentage": 46.81, "elapsed_time": "2:25:53", "remaining_time": "2:45:45"} -{"current_steps": 867, "total_steps": 1850, "loss": 0.0402, "lr": 2.7500594805637882e-06, "epoch": 4.686486486486486, "percentage": 46.86, "elapsed_time": "2:25:54", "remaining_time": "2:45:25"} -{"current_steps": 868, "total_steps": 1850, "loss": 0.1481, "lr": 2.745835017027522e-06, "epoch": 4.691891891891892, "percentage": 46.92, "elapsed_time": "2:26:00", "remaining_time": "2:45:11"} -{"current_steps": 869, "total_steps": 1850, "loss": 0.2242, "lr": 2.74160984456685e-06, "epoch": 4.697297297297297, "percentage": 46.97, "elapsed_time": "2:26:03", "remaining_time": "2:44:53"} -{"current_steps": 870, "total_steps": 1850, "loss": 0.4693, "lr": 2.737383975366071e-06, "epoch": 4.702702702702703, "percentage": 47.03, "elapsed_time": "2:26:05", "remaining_time": "2:44:33"} -{"current_steps": 871, "total_steps": 1850, "loss": 0.1353, "lr": 2.7331574216114963e-06, "epoch": 4.708108108108108, "percentage": 47.08, "elapsed_time": "2:26:11", "remaining_time": "2:44:19"} -{"current_steps": 872, "total_steps": 1850, "loss": 0.157, "lr": 2.728930195491411e-06, "epoch": 4.713513513513513, "percentage": 47.14, "elapsed_time": "2:26:15", "remaining_time": "2:44:01"} -{"current_steps": 873, "total_steps": 1850, "loss": 0.1863, "lr": 2.724702309196038e-06, "epoch": 4.718918918918919, "percentage": 47.19, "elapsed_time": "2:26:17", "remaining_time": "2:43:42"} -{"current_steps": 874, "total_steps": 1850, "loss": 0.2874, "lr": 2.720473774917505e-06, "epoch": 4.724324324324324, "percentage": 47.24, "elapsed_time": "2:26:21", "remaining_time": "2:43:26"} -{"current_steps": 875, "total_steps": 1850, "loss": 0.1021, "lr": 2.716244604849807e-06, "epoch": 4.72972972972973, "percentage": 47.3, "elapsed_time": "2:26:23", "remaining_time": "2:43:07"} -{"current_steps": 876, "total_steps": 1850, "loss": 0.1046, "lr": 2.7120148111887732e-06, "epoch": 4.735135135135135, "percentage": 47.35, "elapsed_time": "2:26:27", "remaining_time": "2:42:51"} -{"current_steps": 877, "total_steps": 1850, "loss": 0.0971, "lr": 2.707784406132032e-06, "epoch": 4.7405405405405405, "percentage": 47.41, "elapsed_time": "2:26:29", "remaining_time": "2:42:31"} -{"current_steps": 878, "total_steps": 1850, "loss": 0.0507, "lr": 2.703553401878972e-06, "epoch": 4.745945945945946, "percentage": 47.46, "elapsed_time": "2:26:31", "remaining_time": "2:42:12"} -{"current_steps": 879, "total_steps": 1850, "loss": 0.0616, "lr": 2.6993218106307146e-06, "epoch": 4.751351351351351, "percentage": 47.51, "elapsed_time": "2:26:33", "remaining_time": "2:41:54"} -{"current_steps": 880, "total_steps": 1850, "loss": 0.0908, "lr": 2.6950896445900685e-06, "epoch": 4.756756756756757, "percentage": 47.57, "elapsed_time": "2:26:35", "remaining_time": "2:41:34"} -{"current_steps": 881, "total_steps": 1850, "loss": 0.2426, "lr": 2.690856915961504e-06, "epoch": 4.762162162162162, "percentage": 47.62, "elapsed_time": "2:26:37", "remaining_time": "2:41:16"} -{"current_steps": 882, "total_steps": 1850, "loss": 0.1881, "lr": 2.686623636951112e-06, "epoch": 4.767567567567568, "percentage": 47.68, "elapsed_time": "2:26:40", "remaining_time": "2:40:58"} -{"current_steps": 883, "total_steps": 1850, "loss": 0.1385, "lr": 2.6823898197665703e-06, "epoch": 4.772972972972973, "percentage": 47.73, "elapsed_time": "2:26:42", "remaining_time": "2:40:39"} -{"current_steps": 884, "total_steps": 1850, "loss": 0.2913, "lr": 2.6781554766171104e-06, "epoch": 4.778378378378378, "percentage": 47.78, "elapsed_time": "2:26:44", "remaining_time": "2:40:21"} -{"current_steps": 885, "total_steps": 1850, "loss": 0.0874, "lr": 2.673920619713478e-06, "epoch": 4.783783783783784, "percentage": 47.84, "elapsed_time": "2:26:51", "remaining_time": "2:40:07"} -{"current_steps": 886, "total_steps": 1850, "loss": 0.2703, "lr": 2.6696852612679024e-06, "epoch": 4.789189189189189, "percentage": 47.89, "elapsed_time": "2:26:54", "remaining_time": "2:39:50"} -{"current_steps": 887, "total_steps": 1850, "loss": 0.121, "lr": 2.6654494134940586e-06, "epoch": 4.794594594594595, "percentage": 47.95, "elapsed_time": "2:26:59", "remaining_time": "2:39:35"} -{"current_steps": 888, "total_steps": 1850, "loss": 0.1853, "lr": 2.6612130886070313e-06, "epoch": 4.8, "percentage": 48.0, "elapsed_time": "2:27:02", "remaining_time": "2:39:17"} -{"current_steps": 889, "total_steps": 1850, "loss": 0.0533, "lr": 2.6569762988232838e-06, "epoch": 4.805405405405406, "percentage": 48.05, "elapsed_time": "2:27:05", "remaining_time": "2:39:00"} -{"current_steps": 890, "total_steps": 1850, "loss": 0.3178, "lr": 2.652739056360618e-06, "epoch": 4.8108108108108105, "percentage": 48.11, "elapsed_time": "2:27:08", "remaining_time": "2:38:42"} -{"current_steps": 891, "total_steps": 1850, "loss": 0.1735, "lr": 2.648501373438142e-06, "epoch": 4.816216216216216, "percentage": 48.16, "elapsed_time": "2:27:10", "remaining_time": "2:38:23"} -{"current_steps": 892, "total_steps": 1850, "loss": 0.062, "lr": 2.644263262276234e-06, "epoch": 4.821621621621622, "percentage": 48.22, "elapsed_time": "2:27:13", "remaining_time": "2:38:07"} -{"current_steps": 893, "total_steps": 1850, "loss": 0.1336, "lr": 2.640024735096507e-06, "epoch": 4.827027027027027, "percentage": 48.27, "elapsed_time": "2:27:14", "remaining_time": "2:37:47"} -{"current_steps": 894, "total_steps": 1850, "loss": 0.1404, "lr": 2.6357858041217733e-06, "epoch": 4.832432432432433, "percentage": 48.32, "elapsed_time": "2:27:19", "remaining_time": "2:37:32"} -{"current_steps": 895, "total_steps": 1850, "loss": 0.0373, "lr": 2.6315464815760104e-06, "epoch": 4.837837837837838, "percentage": 48.38, "elapsed_time": "2:27:20", "remaining_time": "2:37:13"} -{"current_steps": 896, "total_steps": 1850, "loss": 0.3068, "lr": 2.6273067796843242e-06, "epoch": 4.8432432432432435, "percentage": 48.43, "elapsed_time": "2:27:23", "remaining_time": "2:36:55"} -{"current_steps": 897, "total_steps": 1850, "loss": 0.2221, "lr": 2.6230667106729157e-06, "epoch": 4.848648648648648, "percentage": 48.49, "elapsed_time": "2:27:26", "remaining_time": "2:36:38"} -{"current_steps": 898, "total_steps": 1850, "loss": 0.1431, "lr": 2.618826286769043e-06, "epoch": 4.854054054054054, "percentage": 48.54, "elapsed_time": "2:27:33", "remaining_time": "2:36:26"} -{"current_steps": 899, "total_steps": 1850, "loss": 0.196, "lr": 2.614585520200989e-06, "epoch": 4.859459459459459, "percentage": 48.59, "elapsed_time": "2:27:36", "remaining_time": "2:36:08"} -{"current_steps": 900, "total_steps": 1850, "loss": 0.2509, "lr": 2.6103444231980233e-06, "epoch": 4.864864864864865, "percentage": 48.65, "elapsed_time": "2:27:37", "remaining_time": "2:35:49"} -{"current_steps": 901, "total_steps": 1850, "loss": 0.0747, "lr": 2.606103007990371e-06, "epoch": 4.87027027027027, "percentage": 48.7, "elapsed_time": "2:27:40", "remaining_time": "2:35:33"} -{"current_steps": 902, "total_steps": 1850, "loss": 0.0494, "lr": 2.601861286809172e-06, "epoch": 4.875675675675676, "percentage": 48.76, "elapsed_time": "2:27:41", "remaining_time": "2:35:13"} -{"current_steps": 903, "total_steps": 1850, "loss": 0.0901, "lr": 2.5976192718864497e-06, "epoch": 4.881081081081081, "percentage": 48.81, "elapsed_time": "2:27:44", "remaining_time": "2:34:56"} -{"current_steps": 904, "total_steps": 1850, "loss": 0.0465, "lr": 2.593376975455075e-06, "epoch": 4.886486486486486, "percentage": 48.86, "elapsed_time": "2:27:45", "remaining_time": "2:34:37"} -{"current_steps": 905, "total_steps": 1850, "loss": 0.0616, "lr": 2.5891344097487294e-06, "epoch": 4.891891891891892, "percentage": 48.92, "elapsed_time": "2:27:46", "remaining_time": "2:34:18"} -{"current_steps": 906, "total_steps": 1850, "loss": 0.087, "lr": 2.584891587001872e-06, "epoch": 4.897297297297297, "percentage": 48.97, "elapsed_time": "2:27:50", "remaining_time": "2:34:02"} -{"current_steps": 907, "total_steps": 1850, "loss": 0.053, "lr": 2.580648519449704e-06, "epoch": 4.902702702702703, "percentage": 49.03, "elapsed_time": "2:27:52", "remaining_time": "2:33:44"} -{"current_steps": 908, "total_steps": 1850, "loss": 0.2707, "lr": 2.5764052193281287e-06, "epoch": 4.908108108108108, "percentage": 49.08, "elapsed_time": "2:27:55", "remaining_time": "2:33:27"} -{"current_steps": 909, "total_steps": 1850, "loss": 0.3679, "lr": 2.5721616988737254e-06, "epoch": 4.9135135135135135, "percentage": 49.14, "elapsed_time": "2:27:58", "remaining_time": "2:33:10"} -{"current_steps": 910, "total_steps": 1850, "loss": 0.1929, "lr": 2.567917970323704e-06, "epoch": 4.918918918918919, "percentage": 49.19, "elapsed_time": "2:28:00", "remaining_time": "2:32:53"} -{"current_steps": 911, "total_steps": 1850, "loss": 0.2461, "lr": 2.5636740459158776e-06, "epoch": 4.924324324324324, "percentage": 49.24, "elapsed_time": "2:28:04", "remaining_time": "2:32:37"} -{"current_steps": 912, "total_steps": 1850, "loss": 0.2484, "lr": 2.559429937888624e-06, "epoch": 4.92972972972973, "percentage": 49.3, "elapsed_time": "2:28:06", "remaining_time": "2:32:19"} -{"current_steps": 913, "total_steps": 1850, "loss": 0.1886, "lr": 2.5551856584808483e-06, "epoch": 4.935135135135135, "percentage": 49.35, "elapsed_time": "2:28:08", "remaining_time": "2:32:02"} -{"current_steps": 914, "total_steps": 1850, "loss": 0.1789, "lr": 2.5509412199319515e-06, "epoch": 4.940540540540541, "percentage": 49.41, "elapsed_time": "2:28:10", "remaining_time": "2:31:44"} -{"current_steps": 915, "total_steps": 1850, "loss": 0.1072, "lr": 2.5466966344817927e-06, "epoch": 4.945945945945946, "percentage": 49.46, "elapsed_time": "2:28:14", "remaining_time": "2:31:28"} -{"current_steps": 916, "total_steps": 1850, "loss": 0.2624, "lr": 2.542451914370656e-06, "epoch": 4.951351351351351, "percentage": 49.51, "elapsed_time": "2:28:17", "remaining_time": "2:31:12"} -{"current_steps": 917, "total_steps": 1850, "loss": 0.0639, "lr": 2.538207071839213e-06, "epoch": 4.956756756756757, "percentage": 49.57, "elapsed_time": "2:28:18", "remaining_time": "2:30:53"} -{"current_steps": 918, "total_steps": 1850, "loss": 0.1281, "lr": 2.533962119128487e-06, "epoch": 4.962162162162162, "percentage": 49.62, "elapsed_time": "2:28:22", "remaining_time": "2:30:37"} -{"current_steps": 919, "total_steps": 1850, "loss": 0.1771, "lr": 2.529717068479821e-06, "epoch": 4.967567567567568, "percentage": 49.68, "elapsed_time": "2:28:26", "remaining_time": "2:30:22"} -{"current_steps": 920, "total_steps": 1850, "loss": 0.2582, "lr": 2.5254719321348392e-06, "epoch": 4.972972972972973, "percentage": 49.73, "elapsed_time": "2:28:33", "remaining_time": "2:30:10"} -{"current_steps": 921, "total_steps": 1850, "loss": 0.3016, "lr": 2.5212267223354143e-06, "epoch": 4.978378378378379, "percentage": 49.78, "elapsed_time": "2:28:37", "remaining_time": "2:29:55"} -{"current_steps": 922, "total_steps": 1850, "loss": 0.2775, "lr": 2.5169814513236296e-06, "epoch": 4.9837837837837835, "percentage": 49.84, "elapsed_time": "2:28:42", "remaining_time": "2:29:40"} -{"current_steps": 923, "total_steps": 1850, "loss": 0.1246, "lr": 2.5127361313417447e-06, "epoch": 4.989189189189189, "percentage": 49.89, "elapsed_time": "2:28:44", "remaining_time": "2:29:22"} -{"current_steps": 924, "total_steps": 1850, "loss": 0.1732, "lr": 2.508490774632162e-06, "epoch": 4.994594594594595, "percentage": 49.95, "elapsed_time": "2:28:45", "remaining_time": "2:29:04"} -{"current_steps": 925, "total_steps": 1850, "loss": 0.1107, "lr": 2.5042453934373874e-06, "epoch": 5.0, "percentage": 50.0, "elapsed_time": "2:28:47", "remaining_time": "2:28:47"} -{"current_steps": 926, "total_steps": 1850, "loss": 0.1074, "lr": 2.5e-06, "epoch": 5.005405405405406, "percentage": 50.05, "elapsed_time": "2:34:50", "remaining_time": "2:34:30"} -{"current_steps": 927, "total_steps": 1850, "loss": 0.0752, "lr": 2.4957546065626134e-06, "epoch": 5.010810810810811, "percentage": 50.11, "elapsed_time": "2:34:53", "remaining_time": "2:34:13"} -{"current_steps": 928, "total_steps": 1850, "loss": 0.0313, "lr": 2.491509225367839e-06, "epoch": 5.0162162162162165, "percentage": 50.16, "elapsed_time": "2:34:57", "remaining_time": "2:33:57"} -{"current_steps": 929, "total_steps": 1850, "loss": 0.0851, "lr": 2.487263868658256e-06, "epoch": 5.021621621621621, "percentage": 50.22, "elapsed_time": "2:35:04", "remaining_time": "2:33:44"} -{"current_steps": 930, "total_steps": 1850, "loss": 0.0443, "lr": 2.483018548676371e-06, "epoch": 5.027027027027027, "percentage": 50.27, "elapsed_time": "2:35:07", "remaining_time": "2:33:27"} -{"current_steps": 931, "total_steps": 1850, "loss": 0.056, "lr": 2.478773277664587e-06, "epoch": 5.032432432432432, "percentage": 50.32, "elapsed_time": "2:35:09", "remaining_time": "2:33:09"} -{"current_steps": 932, "total_steps": 1850, "loss": 0.1668, "lr": 2.4745280678651616e-06, "epoch": 5.037837837837838, "percentage": 50.38, "elapsed_time": "2:35:13", "remaining_time": "2:32:53"} -{"current_steps": 933, "total_steps": 1850, "loss": 0.0502, "lr": 2.47028293152018e-06, "epoch": 5.043243243243243, "percentage": 50.43, "elapsed_time": "2:35:16", "remaining_time": "2:32:36"} -{"current_steps": 934, "total_steps": 1850, "loss": 0.023, "lr": 2.4660378808715147e-06, "epoch": 5.048648648648649, "percentage": 50.49, "elapsed_time": "2:35:17", "remaining_time": "2:32:18"} -{"current_steps": 935, "total_steps": 1850, "loss": 0.1418, "lr": 2.4617929281607885e-06, "epoch": 5.054054054054054, "percentage": 50.54, "elapsed_time": "2:35:21", "remaining_time": "2:32:01"} -{"current_steps": 936, "total_steps": 1850, "loss": 0.1167, "lr": 2.457548085629345e-06, "epoch": 5.059459459459459, "percentage": 50.59, "elapsed_time": "2:35:23", "remaining_time": "2:31:44"} -{"current_steps": 937, "total_steps": 1850, "loss": 0.0781, "lr": 2.4533033655182072e-06, "epoch": 5.064864864864865, "percentage": 50.65, "elapsed_time": "2:35:24", "remaining_time": "2:31:25"} -{"current_steps": 938, "total_steps": 1850, "loss": 0.0799, "lr": 2.449058780068049e-06, "epoch": 5.07027027027027, "percentage": 50.7, "elapsed_time": "2:35:26", "remaining_time": "2:31:07"} -{"current_steps": 939, "total_steps": 1850, "loss": 0.0548, "lr": 2.444814341519152e-06, "epoch": 5.075675675675676, "percentage": 50.76, "elapsed_time": "2:35:30", "remaining_time": "2:30:52"} -{"current_steps": 940, "total_steps": 1850, "loss": 0.1218, "lr": 2.440570062111376e-06, "epoch": 5.081081081081081, "percentage": 50.81, "elapsed_time": "2:35:34", "remaining_time": "2:30:36"} -{"current_steps": 941, "total_steps": 1850, "loss": 0.0182, "lr": 2.436325954084122e-06, "epoch": 5.0864864864864865, "percentage": 50.86, "elapsed_time": "2:35:34", "remaining_time": "2:30:17"} -{"current_steps": 942, "total_steps": 1850, "loss": 0.0337, "lr": 2.4320820296762964e-06, "epoch": 5.091891891891892, "percentage": 50.92, "elapsed_time": "2:35:38", "remaining_time": "2:30:01"} -{"current_steps": 943, "total_steps": 1850, "loss": 0.0226, "lr": 2.4278383011262755e-06, "epoch": 5.097297297297297, "percentage": 50.97, "elapsed_time": "2:35:41", "remaining_time": "2:29:44"} -{"current_steps": 944, "total_steps": 1850, "loss": 0.0207, "lr": 2.4235947806718717e-06, "epoch": 5.102702702702703, "percentage": 51.03, "elapsed_time": "2:35:43", "remaining_time": "2:29:27"} -{"current_steps": 945, "total_steps": 1850, "loss": 0.1561, "lr": 2.4193514805502972e-06, "epoch": 5.108108108108108, "percentage": 51.08, "elapsed_time": "2:35:47", "remaining_time": "2:29:11"} -{"current_steps": 946, "total_steps": 1850, "loss": 0.1727, "lr": 2.4151084129981284e-06, "epoch": 5.113513513513514, "percentage": 51.14, "elapsed_time": "2:35:50", "remaining_time": "2:28:55"} -{"current_steps": 947, "total_steps": 1850, "loss": 0.1246, "lr": 2.4108655902512715e-06, "epoch": 5.118918918918919, "percentage": 51.19, "elapsed_time": "2:35:54", "remaining_time": "2:28:39"} -{"current_steps": 948, "total_steps": 1850, "loss": 0.1429, "lr": 2.406623024544926e-06, "epoch": 5.124324324324324, "percentage": 51.24, "elapsed_time": "2:35:56", "remaining_time": "2:28:22"} -{"current_steps": 949, "total_steps": 1850, "loss": 0.1569, "lr": 2.402380728113551e-06, "epoch": 5.12972972972973, "percentage": 51.3, "elapsed_time": "2:35:59", "remaining_time": "2:28:06"} -{"current_steps": 950, "total_steps": 1850, "loss": 0.1105, "lr": 2.3981387131908286e-06, "epoch": 5.135135135135135, "percentage": 51.35, "elapsed_time": "2:36:02", "remaining_time": "2:27:50"} -{"current_steps": 951, "total_steps": 1850, "loss": 0.3786, "lr": 2.39389699200963e-06, "epoch": 5.140540540540541, "percentage": 51.41, "elapsed_time": "2:36:06", "remaining_time": "2:27:34"} -{"current_steps": 952, "total_steps": 1850, "loss": 0.0826, "lr": 2.389655576801977e-06, "epoch": 5.145945945945946, "percentage": 51.46, "elapsed_time": "2:36:07", "remaining_time": "2:27:16"} -{"current_steps": 953, "total_steps": 1850, "loss": 0.0684, "lr": 2.3854144797990123e-06, "epoch": 5.151351351351352, "percentage": 51.51, "elapsed_time": "2:36:09", "remaining_time": "2:26:59"} -{"current_steps": 954, "total_steps": 1850, "loss": 0.0452, "lr": 2.3811737132309584e-06, "epoch": 5.1567567567567565, "percentage": 51.57, "elapsed_time": "2:36:11", "remaining_time": "2:26:41"} -{"current_steps": 955, "total_steps": 1850, "loss": 0.0465, "lr": 2.3769332893270856e-06, "epoch": 5.162162162162162, "percentage": 51.62, "elapsed_time": "2:36:14", "remaining_time": "2:26:25"} -{"current_steps": 956, "total_steps": 1850, "loss": 0.0551, "lr": 2.372693220315677e-06, "epoch": 5.167567567567567, "percentage": 51.68, "elapsed_time": "2:36:19", "remaining_time": "2:26:11"} -{"current_steps": 957, "total_steps": 1850, "loss": 0.0896, "lr": 2.36845351842399e-06, "epoch": 5.172972972972973, "percentage": 51.73, "elapsed_time": "2:36:26", "remaining_time": "2:25:59"} -{"current_steps": 958, "total_steps": 1850, "loss": 0.0565, "lr": 2.3642141958782267e-06, "epoch": 5.178378378378379, "percentage": 51.78, "elapsed_time": "2:36:32", "remaining_time": "2:25:45"} -{"current_steps": 959, "total_steps": 1850, "loss": 0.1563, "lr": 2.3599752649034935e-06, "epoch": 5.183783783783784, "percentage": 51.84, "elapsed_time": "2:36:34", "remaining_time": "2:25:28"} -{"current_steps": 960, "total_steps": 1850, "loss": 0.0236, "lr": 2.3557367377237663e-06, "epoch": 5.1891891891891895, "percentage": 51.89, "elapsed_time": "2:36:37", "remaining_time": "2:25:12"} -{"current_steps": 961, "total_steps": 1850, "loss": 0.0506, "lr": 2.351498626561858e-06, "epoch": 5.194594594594594, "percentage": 51.95, "elapsed_time": "2:36:42", "remaining_time": "2:24:58"} -{"current_steps": 962, "total_steps": 1850, "loss": 0.1001, "lr": 2.3472609436393827e-06, "epoch": 5.2, "percentage": 52.0, "elapsed_time": "2:36:44", "remaining_time": "2:24:41"} -{"current_steps": 963, "total_steps": 1850, "loss": 0.0951, "lr": 2.3430237011767166e-06, "epoch": 5.205405405405405, "percentage": 52.05, "elapsed_time": "2:36:48", "remaining_time": "2:24:25"} -{"current_steps": 964, "total_steps": 1850, "loss": 0.0824, "lr": 2.3387869113929695e-06, "epoch": 5.210810810810811, "percentage": 52.11, "elapsed_time": "2:36:51", "remaining_time": "2:24:09"} -{"current_steps": 965, "total_steps": 1850, "loss": 0.0485, "lr": 2.3345505865059427e-06, "epoch": 5.216216216216216, "percentage": 52.16, "elapsed_time": "2:36:54", "remaining_time": "2:23:53"} -{"current_steps": 966, "total_steps": 1850, "loss": 0.1516, "lr": 2.3303147387320985e-06, "epoch": 5.221621621621622, "percentage": 52.22, "elapsed_time": "2:36:57", "remaining_time": "2:23:37"} -{"current_steps": 967, "total_steps": 1850, "loss": 0.1664, "lr": 2.3260793802865227e-06, "epoch": 5.227027027027027, "percentage": 52.27, "elapsed_time": "2:37:00", "remaining_time": "2:23:22"} -{"current_steps": 968, "total_steps": 1850, "loss": 0.1127, "lr": 2.3218445233828904e-06, "epoch": 5.232432432432432, "percentage": 52.32, "elapsed_time": "2:37:03", "remaining_time": "2:23:06"} -{"current_steps": 969, "total_steps": 1850, "loss": 0.0445, "lr": 2.31761018023343e-06, "epoch": 5.237837837837838, "percentage": 52.38, "elapsed_time": "2:37:05", "remaining_time": "2:22:49"} -{"current_steps": 970, "total_steps": 1850, "loss": 0.1402, "lr": 2.3133763630488883e-06, "epoch": 5.243243243243243, "percentage": 52.43, "elapsed_time": "2:37:06", "remaining_time": "2:22:31"} -{"current_steps": 971, "total_steps": 1850, "loss": 0.0332, "lr": 2.3091430840384964e-06, "epoch": 5.248648648648649, "percentage": 52.49, "elapsed_time": "2:37:09", "remaining_time": "2:22:16"} -{"current_steps": 972, "total_steps": 1850, "loss": 0.1266, "lr": 2.304910355409932e-06, "epoch": 5.254054054054054, "percentage": 52.54, "elapsed_time": "2:37:12", "remaining_time": "2:22:00"} -{"current_steps": 973, "total_steps": 1850, "loss": 0.0281, "lr": 2.3006781893692863e-06, "epoch": 5.2594594594594595, "percentage": 52.59, "elapsed_time": "2:37:14", "remaining_time": "2:21:43"} -{"current_steps": 974, "total_steps": 1850, "loss": 0.0238, "lr": 2.2964465981210283e-06, "epoch": 5.264864864864865, "percentage": 52.65, "elapsed_time": "2:37:15", "remaining_time": "2:21:26"} -{"current_steps": 975, "total_steps": 1850, "loss": 0.0828, "lr": 2.2922155938679695e-06, "epoch": 5.27027027027027, "percentage": 52.7, "elapsed_time": "2:37:18", "remaining_time": "2:21:10"} -{"current_steps": 976, "total_steps": 1850, "loss": 0.1874, "lr": 2.287985188811228e-06, "epoch": 5.275675675675676, "percentage": 52.76, "elapsed_time": "2:37:20", "remaining_time": "2:20:53"} -{"current_steps": 977, "total_steps": 1850, "loss": 0.0413, "lr": 2.2837553951501935e-06, "epoch": 5.281081081081081, "percentage": 52.81, "elapsed_time": "2:37:24", "remaining_time": "2:20:39"} -{"current_steps": 978, "total_steps": 1850, "loss": 0.0909, "lr": 2.279526225082495e-06, "epoch": 5.286486486486487, "percentage": 52.86, "elapsed_time": "2:37:30", "remaining_time": "2:20:26"} -{"current_steps": 979, "total_steps": 1850, "loss": 0.0798, "lr": 2.275297690803962e-06, "epoch": 5.291891891891892, "percentage": 52.92, "elapsed_time": "2:37:31", "remaining_time": "2:20:08"} -{"current_steps": 980, "total_steps": 1850, "loss": 0.1456, "lr": 2.271069804508589e-06, "epoch": 5.297297297297297, "percentage": 52.97, "elapsed_time": "2:37:35", "remaining_time": "2:19:54"} -{"current_steps": 981, "total_steps": 1850, "loss": 0.085, "lr": 2.266842578388504e-06, "epoch": 5.302702702702703, "percentage": 53.03, "elapsed_time": "2:37:42", "remaining_time": "2:19:42"} -{"current_steps": 982, "total_steps": 1850, "loss": 0.0885, "lr": 2.2626160246339303e-06, "epoch": 5.308108108108108, "percentage": 53.08, "elapsed_time": "2:37:49", "remaining_time": "2:19:29"} -{"current_steps": 983, "total_steps": 1850, "loss": 0.1543, "lr": 2.2583901554331513e-06, "epoch": 5.313513513513514, "percentage": 53.14, "elapsed_time": "2:37:54", "remaining_time": "2:19:16"} -{"current_steps": 984, "total_steps": 1850, "loss": 0.06, "lr": 2.2541649829724783e-06, "epoch": 5.318918918918919, "percentage": 53.19, "elapsed_time": "2:37:57", "remaining_time": "2:19:00"} -{"current_steps": 985, "total_steps": 1850, "loss": 0.0518, "lr": 2.249940519436212e-06, "epoch": 5.324324324324325, "percentage": 53.24, "elapsed_time": "2:37:59", "remaining_time": "2:18:44"} -{"current_steps": 986, "total_steps": 1850, "loss": 0.1542, "lr": 2.2457167770066104e-06, "epoch": 5.3297297297297295, "percentage": 53.3, "elapsed_time": "2:38:03", "remaining_time": "2:18:30"} -{"current_steps": 987, "total_steps": 1850, "loss": 0.0338, "lr": 2.2414937678638495e-06, "epoch": 5.335135135135135, "percentage": 53.35, "elapsed_time": "2:38:07", "remaining_time": "2:18:15"} -{"current_steps": 988, "total_steps": 1850, "loss": 0.0204, "lr": 2.2372715041859925e-06, "epoch": 5.34054054054054, "percentage": 53.41, "elapsed_time": "2:38:08", "remaining_time": "2:17:58"} -{"current_steps": 989, "total_steps": 1850, "loss": 0.129, "lr": 2.2330499981489524e-06, "epoch": 5.345945945945946, "percentage": 53.46, "elapsed_time": "2:38:12", "remaining_time": "2:17:43"} -{"current_steps": 990, "total_steps": 1850, "loss": 0.0307, "lr": 2.2288292619264566e-06, "epoch": 5.351351351351352, "percentage": 53.51, "elapsed_time": "2:38:14", "remaining_time": "2:17:27"} -{"current_steps": 991, "total_steps": 1850, "loss": 0.0374, "lr": 2.2246093076900145e-06, "epoch": 5.356756756756757, "percentage": 53.57, "elapsed_time": "2:38:18", "remaining_time": "2:17:13"} -{"current_steps": 992, "total_steps": 1850, "loss": 0.0265, "lr": 2.220390147608878e-06, "epoch": 5.3621621621621625, "percentage": 53.62, "elapsed_time": "2:38:21", "remaining_time": "2:16:58"} -{"current_steps": 993, "total_steps": 1850, "loss": 0.0468, "lr": 2.2161717938500112e-06, "epoch": 5.367567567567567, "percentage": 53.68, "elapsed_time": "2:38:24", "remaining_time": "2:16:43"} -{"current_steps": 994, "total_steps": 1850, "loss": 0.1118, "lr": 2.2119542585780513e-06, "epoch": 5.372972972972973, "percentage": 53.73, "elapsed_time": "2:38:31", "remaining_time": "2:16:30"} -{"current_steps": 995, "total_steps": 1850, "loss": 0.2056, "lr": 2.2077375539552764e-06, "epoch": 5.378378378378378, "percentage": 53.78, "elapsed_time": "2:38:35", "remaining_time": "2:16:16"} -{"current_steps": 996, "total_steps": 1850, "loss": 0.0437, "lr": 2.203521692141568e-06, "epoch": 5.383783783783784, "percentage": 53.84, "elapsed_time": "2:38:37", "remaining_time": "2:16:00"} -{"current_steps": 997, "total_steps": 1850, "loss": 0.1981, "lr": 2.199306685294377e-06, "epoch": 5.389189189189189, "percentage": 53.89, "elapsed_time": "2:38:40", "remaining_time": "2:15:45"} -{"current_steps": 998, "total_steps": 1850, "loss": 0.0756, "lr": 2.1950925455686906e-06, "epoch": 5.394594594594595, "percentage": 53.95, "elapsed_time": "2:38:42", "remaining_time": "2:15:29"} -{"current_steps": 999, "total_steps": 1850, "loss": 0.0998, "lr": 2.1908792851169954e-06, "epoch": 5.4, "percentage": 54.0, "elapsed_time": "2:38:48", "remaining_time": "2:15:16"} -{"current_steps": 1000, "total_steps": 1850, "loss": 0.0223, "lr": 2.186666916089239e-06, "epoch": 5.405405405405405, "percentage": 54.05, "elapsed_time": "2:38:50", "remaining_time": "2:15:00"} -{"current_steps": 1001, "total_steps": 1850, "loss": 0.0489, "lr": 2.1824554506328033e-06, "epoch": 5.410810810810811, "percentage": 54.11, "elapsed_time": "2:38:55", "remaining_time": "2:14:47"} -{"current_steps": 1002, "total_steps": 1850, "loss": 0.0321, "lr": 2.17824490089246e-06, "epoch": 5.416216216216216, "percentage": 54.16, "elapsed_time": "2:38:58", "remaining_time": "2:14:32"} -{"current_steps": 1003, "total_steps": 1850, "loss": 0.0167, "lr": 2.174035279010343e-06, "epoch": 5.421621621621622, "percentage": 54.22, "elapsed_time": "2:39:00", "remaining_time": "2:14:16"} -{"current_steps": 1004, "total_steps": 1850, "loss": 0.0588, "lr": 2.1698265971259104e-06, "epoch": 5.427027027027027, "percentage": 54.27, "elapsed_time": "2:39:05", "remaining_time": "2:14:03"} -{"current_steps": 1005, "total_steps": 1850, "loss": 0.0868, "lr": 2.1656188673759065e-06, "epoch": 5.4324324324324325, "percentage": 54.32, "elapsed_time": "2:39:13", "remaining_time": "2:13:52"} -{"current_steps": 1006, "total_steps": 1850, "loss": 0.1131, "lr": 2.1614121018943346e-06, "epoch": 5.437837837837838, "percentage": 54.38, "elapsed_time": "2:39:16", "remaining_time": "2:13:37"} -{"current_steps": 1007, "total_steps": 1850, "loss": 0.0285, "lr": 2.1572063128124133e-06, "epoch": 5.443243243243243, "percentage": 54.43, "elapsed_time": "2:39:20", "remaining_time": "2:13:23"} -{"current_steps": 1008, "total_steps": 1850, "loss": 0.0303, "lr": 2.153001512258548e-06, "epoch": 5.448648648648649, "percentage": 54.49, "elapsed_time": "2:39:21", "remaining_time": "2:13:07"} -{"current_steps": 1009, "total_steps": 1850, "loss": 0.3278, "lr": 2.1487977123582922e-06, "epoch": 5.454054054054054, "percentage": 54.54, "elapsed_time": "2:39:25", "remaining_time": "2:12:52"} -{"current_steps": 1010, "total_steps": 1850, "loss": 0.0346, "lr": 2.144594925234314e-06, "epoch": 5.45945945945946, "percentage": 54.59, "elapsed_time": "2:39:28", "remaining_time": "2:12:37"} -{"current_steps": 1011, "total_steps": 1850, "loss": 0.0874, "lr": 2.140393163006362e-06, "epoch": 5.464864864864865, "percentage": 54.65, "elapsed_time": "2:39:34", "remaining_time": "2:12:25"} -{"current_steps": 1012, "total_steps": 1850, "loss": 0.0194, "lr": 2.1361924377912266e-06, "epoch": 5.47027027027027, "percentage": 54.7, "elapsed_time": "2:39:35", "remaining_time": "2:12:09"} -{"current_steps": 1013, "total_steps": 1850, "loss": 0.1193, "lr": 2.1319927617027112e-06, "epoch": 5.475675675675676, "percentage": 54.76, "elapsed_time": "2:39:37", "remaining_time": "2:11:53"} -{"current_steps": 1014, "total_steps": 1850, "loss": 0.0331, "lr": 2.1277941468515908e-06, "epoch": 5.481081081081081, "percentage": 54.81, "elapsed_time": "2:39:40", "remaining_time": "2:11:38"} -{"current_steps": 1015, "total_steps": 1850, "loss": 0.0723, "lr": 2.123596605345582e-06, "epoch": 5.486486486486487, "percentage": 54.86, "elapsed_time": "2:39:42", "remaining_time": "2:11:22"} -{"current_steps": 1016, "total_steps": 1850, "loss": 0.0751, "lr": 2.119400149289305e-06, "epoch": 5.491891891891892, "percentage": 54.92, "elapsed_time": "2:39:44", "remaining_time": "2:11:07"} -{"current_steps": 1017, "total_steps": 1850, "loss": 0.0265, "lr": 2.11520479078425e-06, "epoch": 5.4972972972972975, "percentage": 54.97, "elapsed_time": "2:39:46", "remaining_time": "2:10:52"} -{"current_steps": 1018, "total_steps": 1850, "loss": 0.1023, "lr": 2.111010541928743e-06, "epoch": 5.5027027027027025, "percentage": 55.03, "elapsed_time": "2:39:48", "remaining_time": "2:10:36"} -{"current_steps": 1019, "total_steps": 1850, "loss": 0.0831, "lr": 2.10681741481791e-06, "epoch": 5.508108108108108, "percentage": 55.08, "elapsed_time": "2:39:52", "remaining_time": "2:10:22"} -{"current_steps": 1020, "total_steps": 1850, "loss": 0.1258, "lr": 2.1026254215436408e-06, "epoch": 5.513513513513513, "percentage": 55.14, "elapsed_time": "2:39:55", "remaining_time": "2:10:08"} -{"current_steps": 1021, "total_steps": 1850, "loss": 0.0926, "lr": 2.098434574194557e-06, "epoch": 5.518918918918919, "percentage": 55.19, "elapsed_time": "2:39:58", "remaining_time": "2:09:53"} -{"current_steps": 1022, "total_steps": 1850, "loss": 0.0306, "lr": 2.094244884855974e-06, "epoch": 5.524324324324324, "percentage": 55.24, "elapsed_time": "2:40:01", "remaining_time": "2:09:38"} -{"current_steps": 1023, "total_steps": 1850, "loss": 0.1374, "lr": 2.0900563656098706e-06, "epoch": 5.52972972972973, "percentage": 55.3, "elapsed_time": "2:40:04", "remaining_time": "2:09:24"} -{"current_steps": 1024, "total_steps": 1850, "loss": 0.1173, "lr": 2.085869028534848e-06, "epoch": 5.535135135135135, "percentage": 55.35, "elapsed_time": "2:40:07", "remaining_time": "2:09:09"} -{"current_steps": 1025, "total_steps": 1850, "loss": 0.146, "lr": 2.0816828857061e-06, "epoch": 5.54054054054054, "percentage": 55.41, "elapsed_time": "2:40:12", "remaining_time": "2:08:57"} -{"current_steps": 1026, "total_steps": 1850, "loss": 0.1542, "lr": 2.077497949195378e-06, "epoch": 5.545945945945946, "percentage": 55.46, "elapsed_time": "2:40:14", "remaining_time": "2:08:41"} -{"current_steps": 1027, "total_steps": 1850, "loss": 0.0699, "lr": 2.073314231070951e-06, "epoch": 5.551351351351351, "percentage": 55.51, "elapsed_time": "2:40:20", "remaining_time": "2:08:29"} -{"current_steps": 1028, "total_steps": 1850, "loss": 0.1429, "lr": 2.069131743397578e-06, "epoch": 5.556756756756757, "percentage": 55.57, "elapsed_time": "2:40:21", "remaining_time": "2:08:13"} -{"current_steps": 1029, "total_steps": 1850, "loss": 0.1203, "lr": 2.0649504982364674e-06, "epoch": 5.562162162162162, "percentage": 55.62, "elapsed_time": "2:40:25", "remaining_time": "2:07:59"} -{"current_steps": 1030, "total_steps": 1850, "loss": 0.1078, "lr": 2.0607705076452465e-06, "epoch": 5.5675675675675675, "percentage": 55.68, "elapsed_time": "2:40:28", "remaining_time": "2:07:45"} -{"current_steps": 1031, "total_steps": 1850, "loss": 0.0881, "lr": 2.056591783677923e-06, "epoch": 5.572972972972973, "percentage": 55.73, "elapsed_time": "2:40:33", "remaining_time": "2:07:32"} -{"current_steps": 1032, "total_steps": 1850, "loss": 0.0586, "lr": 2.0524143383848525e-06, "epoch": 5.578378378378378, "percentage": 55.78, "elapsed_time": "2:40:35", "remaining_time": "2:07:17"} -{"current_steps": 1033, "total_steps": 1850, "loss": 0.3671, "lr": 2.048238183812704e-06, "epoch": 5.583783783783784, "percentage": 55.84, "elapsed_time": "2:40:37", "remaining_time": "2:07:02"} -{"current_steps": 1034, "total_steps": 1850, "loss": 0.048, "lr": 2.0440633320044224e-06, "epoch": 5.589189189189189, "percentage": 55.89, "elapsed_time": "2:40:40", "remaining_time": "2:06:47"} -{"current_steps": 1035, "total_steps": 1850, "loss": 0.2091, "lr": 2.0398897949991992e-06, "epoch": 5.594594594594595, "percentage": 55.95, "elapsed_time": "2:40:43", "remaining_time": "2:06:33"} -{"current_steps": 1036, "total_steps": 1850, "loss": 0.1295, "lr": 2.0357175848324306e-06, "epoch": 5.6, "percentage": 56.0, "elapsed_time": "2:40:45", "remaining_time": "2:06:18"} -{"current_steps": 1037, "total_steps": 1850, "loss": 0.0504, "lr": 2.031546713535688e-06, "epoch": 5.605405405405405, "percentage": 56.05, "elapsed_time": "2:40:46", "remaining_time": "2:06:02"} -{"current_steps": 1038, "total_steps": 1850, "loss": 0.1816, "lr": 2.027377193136684e-06, "epoch": 5.610810810810811, "percentage": 56.11, "elapsed_time": "2:40:50", "remaining_time": "2:05:49"} -{"current_steps": 1039, "total_steps": 1850, "loss": 0.0392, "lr": 2.0232090356592333e-06, "epoch": 5.616216216216216, "percentage": 56.16, "elapsed_time": "2:40:54", "remaining_time": "2:05:36"} -{"current_steps": 1040, "total_steps": 1850, "loss": 0.0273, "lr": 2.0190422531232186e-06, "epoch": 5.621621621621622, "percentage": 56.22, "elapsed_time": "2:40:56", "remaining_time": "2:05:20"} -{"current_steps": 1041, "total_steps": 1850, "loss": 0.0672, "lr": 2.014876857544562e-06, "epoch": 5.627027027027027, "percentage": 56.27, "elapsed_time": "2:40:59", "remaining_time": "2:05:06"} -{"current_steps": 1042, "total_steps": 1850, "loss": 0.0749, "lr": 2.0107128609351817e-06, "epoch": 5.632432432432433, "percentage": 56.32, "elapsed_time": "2:41:01", "remaining_time": "2:04:51"} -{"current_steps": 1043, "total_steps": 1850, "loss": 0.0713, "lr": 2.006550275302965e-06, "epoch": 5.6378378378378375, "percentage": 56.38, "elapsed_time": "2:41:04", "remaining_time": "2:04:37"} -{"current_steps": 1044, "total_steps": 1850, "loss": 0.0547, "lr": 2.002389112651728e-06, "epoch": 5.643243243243243, "percentage": 56.43, "elapsed_time": "2:41:07", "remaining_time": "2:04:23"} -{"current_steps": 1045, "total_steps": 1850, "loss": 0.0304, "lr": 1.9982293849811852e-06, "epoch": 5.648648648648649, "percentage": 56.49, "elapsed_time": "2:41:11", "remaining_time": "2:04:10"} -{"current_steps": 1046, "total_steps": 1850, "loss": 0.0227, "lr": 1.994071104286911e-06, "epoch": 5.654054054054054, "percentage": 56.54, "elapsed_time": "2:41:13", "remaining_time": "2:03:55"} -{"current_steps": 1047, "total_steps": 1850, "loss": 0.0811, "lr": 1.9899142825603078e-06, "epoch": 5.65945945945946, "percentage": 56.59, "elapsed_time": "2:41:16", "remaining_time": "2:03:41"} -{"current_steps": 1048, "total_steps": 1850, "loss": 0.0292, "lr": 1.9857589317885727e-06, "epoch": 5.664864864864865, "percentage": 56.65, "elapsed_time": "2:41:18", "remaining_time": "2:03:26"} -{"current_steps": 1049, "total_steps": 1850, "loss": 0.0386, "lr": 1.9816050639546566e-06, "epoch": 5.6702702702702705, "percentage": 56.7, "elapsed_time": "2:41:21", "remaining_time": "2:03:12"} -{"current_steps": 1050, "total_steps": 1850, "loss": 0.1448, "lr": 1.977452691037239e-06, "epoch": 5.675675675675675, "percentage": 56.76, "elapsed_time": "2:41:25", "remaining_time": "2:02:59"} -{"current_steps": 1051, "total_steps": 1850, "loss": 0.0451, "lr": 1.973301825010685e-06, "epoch": 5.681081081081081, "percentage": 56.81, "elapsed_time": "2:41:27", "remaining_time": "2:02:44"} -{"current_steps": 1052, "total_steps": 1850, "loss": 0.0708, "lr": 1.9691524778450145e-06, "epoch": 5.686486486486486, "percentage": 56.86, "elapsed_time": "2:41:31", "remaining_time": "2:02:31"} -{"current_steps": 1053, "total_steps": 1850, "loss": 0.0311, "lr": 1.96500466150587e-06, "epoch": 5.691891891891892, "percentage": 56.92, "elapsed_time": "2:41:34", "remaining_time": "2:02:17"} -{"current_steps": 1054, "total_steps": 1850, "loss": 0.0728, "lr": 1.960858387954476e-06, "epoch": 5.697297297297297, "percentage": 56.97, "elapsed_time": "2:41:36", "remaining_time": "2:02:03"} -{"current_steps": 1055, "total_steps": 1850, "loss": 0.1429, "lr": 1.956713669147611e-06, "epoch": 5.702702702702703, "percentage": 57.03, "elapsed_time": "2:41:43", "remaining_time": "2:01:51"} -{"current_steps": 1056, "total_steps": 1850, "loss": 0.0702, "lr": 1.9525705170375674e-06, "epoch": 5.708108108108108, "percentage": 57.08, "elapsed_time": "2:41:46", "remaining_time": "2:01:38"} -{"current_steps": 1057, "total_steps": 1850, "loss": 0.0934, "lr": 1.948428943572121e-06, "epoch": 5.713513513513513, "percentage": 57.14, "elapsed_time": "2:41:52", "remaining_time": "2:01:26"} -{"current_steps": 1058, "total_steps": 1850, "loss": 0.0327, "lr": 1.944288960694497e-06, "epoch": 5.718918918918919, "percentage": 57.19, "elapsed_time": "2:41:54", "remaining_time": "2:01:12"} -{"current_steps": 1059, "total_steps": 1850, "loss": 0.1025, "lr": 1.9401505803433308e-06, "epoch": 5.724324324324324, "percentage": 57.24, "elapsed_time": "2:41:59", "remaining_time": "2:00:59"} -{"current_steps": 1060, "total_steps": 1850, "loss": 0.0825, "lr": 1.9360138144526363e-06, "epoch": 5.72972972972973, "percentage": 57.3, "elapsed_time": "2:42:02", "remaining_time": "2:00:46"} -{"current_steps": 1061, "total_steps": 1850, "loss": 0.164, "lr": 1.9318786749517754e-06, "epoch": 5.735135135135135, "percentage": 57.35, "elapsed_time": "2:42:06", "remaining_time": "2:00:32"} -{"current_steps": 1062, "total_steps": 1850, "loss": 0.0574, "lr": 1.9277451737654154e-06, "epoch": 5.7405405405405405, "percentage": 57.41, "elapsed_time": "2:42:08", "remaining_time": "2:00:18"} -{"current_steps": 1063, "total_steps": 1850, "loss": 0.2916, "lr": 1.923613322813503e-06, "epoch": 5.745945945945946, "percentage": 57.46, "elapsed_time": "2:42:11", "remaining_time": "2:00:04"} -{"current_steps": 1064, "total_steps": 1850, "loss": 0.0626, "lr": 1.9194831340112228e-06, "epoch": 5.751351351351351, "percentage": 57.51, "elapsed_time": "2:42:15", "remaining_time": "1:59:51"} -{"current_steps": 1065, "total_steps": 1850, "loss": 0.0544, "lr": 1.915354619268969e-06, "epoch": 5.756756756756757, "percentage": 57.57, "elapsed_time": "2:42:19", "remaining_time": "1:59:39"} -{"current_steps": 1066, "total_steps": 1850, "loss": 0.0145, "lr": 1.9112277904923064e-06, "epoch": 5.762162162162162, "percentage": 57.62, "elapsed_time": "2:42:20", "remaining_time": "1:59:24"} -{"current_steps": 1067, "total_steps": 1850, "loss": 0.0335, "lr": 1.9071026595819387e-06, "epoch": 5.767567567567568, "percentage": 57.68, "elapsed_time": "2:42:22", "remaining_time": "1:59:09"} -{"current_steps": 1068, "total_steps": 1850, "loss": 0.1385, "lr": 1.902979238433673e-06, "epoch": 5.772972972972973, "percentage": 57.73, "elapsed_time": "2:42:25", "remaining_time": "1:58:55"} -{"current_steps": 1069, "total_steps": 1850, "loss": 0.0523, "lr": 1.8988575389383853e-06, "epoch": 5.778378378378378, "percentage": 57.78, "elapsed_time": "2:42:28", "remaining_time": "1:58:42"} -{"current_steps": 1070, "total_steps": 1850, "loss": 0.171, "lr": 1.8947375729819894e-06, "epoch": 5.783783783783784, "percentage": 57.84, "elapsed_time": "2:42:32", "remaining_time": "1:58:29"} -{"current_steps": 1071, "total_steps": 1850, "loss": 0.0431, "lr": 1.8906193524453964e-06, "epoch": 5.789189189189189, "percentage": 57.89, "elapsed_time": "2:42:35", "remaining_time": "1:58:15"} -{"current_steps": 1072, "total_steps": 1850, "loss": 0.0157, "lr": 1.886502889204487e-06, "epoch": 5.794594594594595, "percentage": 57.95, "elapsed_time": "2:42:37", "remaining_time": "1:58:01"} -{"current_steps": 1073, "total_steps": 1850, "loss": 0.0892, "lr": 1.882388195130073e-06, "epoch": 5.8, "percentage": 58.0, "elapsed_time": "2:42:39", "remaining_time": "1:57:46"} -{"current_steps": 1074, "total_steps": 1850, "loss": 0.0376, "lr": 1.8782752820878636e-06, "epoch": 5.805405405405406, "percentage": 58.05, "elapsed_time": "2:42:43", "remaining_time": "1:57:34"} -{"current_steps": 1075, "total_steps": 1850, "loss": 0.1174, "lr": 1.8741641619384343e-06, "epoch": 5.8108108108108105, "percentage": 58.11, "elapsed_time": "2:42:44", "remaining_time": "1:57:19"} -{"current_steps": 1076, "total_steps": 1850, "loss": 0.0191, "lr": 1.8700548465371877e-06, "epoch": 5.816216216216216, "percentage": 58.16, "elapsed_time": "2:42:46", "remaining_time": "1:57:05"} -{"current_steps": 1077, "total_steps": 1850, "loss": 0.1243, "lr": 1.8659473477343233e-06, "epoch": 5.821621621621622, "percentage": 58.22, "elapsed_time": "2:42:51", "remaining_time": "1:56:52"} -{"current_steps": 1078, "total_steps": 1850, "loss": 0.1457, "lr": 1.8618416773748032e-06, "epoch": 5.827027027027027, "percentage": 58.27, "elapsed_time": "2:42:53", "remaining_time": "1:56:39"} -{"current_steps": 1079, "total_steps": 1850, "loss": 0.0366, "lr": 1.8577378472983148e-06, "epoch": 5.832432432432433, "percentage": 58.32, "elapsed_time": "2:42:55", "remaining_time": "1:56:25"} -{"current_steps": 1080, "total_steps": 1850, "loss": 0.065, "lr": 1.8536358693392398e-06, "epoch": 5.837837837837838, "percentage": 58.38, "elapsed_time": "2:42:58", "remaining_time": "1:56:11"} -{"current_steps": 1081, "total_steps": 1850, "loss": 0.1902, "lr": 1.8495357553266176e-06, "epoch": 5.8432432432432435, "percentage": 58.43, "elapsed_time": "2:43:01", "remaining_time": "1:55:58"} -{"current_steps": 1082, "total_steps": 1850, "loss": 0.0372, "lr": 1.8454375170841133e-06, "epoch": 5.848648648648648, "percentage": 58.49, "elapsed_time": "2:43:05", "remaining_time": "1:55:45"} -{"current_steps": 1083, "total_steps": 1850, "loss": 0.0942, "lr": 1.841341166429983e-06, "epoch": 5.854054054054054, "percentage": 58.54, "elapsed_time": "2:43:07", "remaining_time": "1:55:31"} -{"current_steps": 1084, "total_steps": 1850, "loss": 0.2317, "lr": 1.8372467151770391e-06, "epoch": 5.859459459459459, "percentage": 58.59, "elapsed_time": "2:43:10", "remaining_time": "1:55:18"} -{"current_steps": 1085, "total_steps": 1850, "loss": 0.1935, "lr": 1.8331541751326168e-06, "epoch": 5.864864864864865, "percentage": 58.65, "elapsed_time": "2:43:14", "remaining_time": "1:55:05"} -{"current_steps": 1086, "total_steps": 1850, "loss": 0.0905, "lr": 1.8290635580985395e-06, "epoch": 5.87027027027027, "percentage": 58.7, "elapsed_time": "2:43:16", "remaining_time": "1:54:51"} -{"current_steps": 1087, "total_steps": 1850, "loss": 0.0931, "lr": 1.8249748758710856e-06, "epoch": 5.875675675675676, "percentage": 58.76, "elapsed_time": "2:43:22", "remaining_time": "1:54:40"} -{"current_steps": 1088, "total_steps": 1850, "loss": 0.0878, "lr": 1.8208881402409542e-06, "epoch": 5.881081081081081, "percentage": 58.81, "elapsed_time": "2:43:29", "remaining_time": "1:54:30"} -{"current_steps": 1089, "total_steps": 1850, "loss": 0.1317, "lr": 1.8168033629932296e-06, "epoch": 5.886486486486486, "percentage": 58.86, "elapsed_time": "2:43:32", "remaining_time": "1:54:16"} -{"current_steps": 1090, "total_steps": 1850, "loss": 0.027, "lr": 1.8127205559073507e-06, "epoch": 5.891891891891892, "percentage": 58.92, "elapsed_time": "2:43:34", "remaining_time": "1:54:03"} -{"current_steps": 1091, "total_steps": 1850, "loss": 0.0872, "lr": 1.8086397307570724e-06, "epoch": 5.897297297297297, "percentage": 58.97, "elapsed_time": "2:43:37", "remaining_time": "1:53:49"} -{"current_steps": 1092, "total_steps": 1850, "loss": 0.0821, "lr": 1.8045608993104373e-06, "epoch": 5.902702702702703, "percentage": 59.03, "elapsed_time": "2:43:39", "remaining_time": "1:53:36"} -{"current_steps": 1093, "total_steps": 1850, "loss": 0.0327, "lr": 1.8004840733297365e-06, "epoch": 5.908108108108108, "percentage": 59.08, "elapsed_time": "2:43:41", "remaining_time": "1:53:22"} -{"current_steps": 1094, "total_steps": 1850, "loss": 0.0497, "lr": 1.7964092645714777e-06, "epoch": 5.9135135135135135, "percentage": 59.14, "elapsed_time": "2:43:44", "remaining_time": "1:53:09"} -{"current_steps": 1095, "total_steps": 1850, "loss": 0.0307, "lr": 1.7923364847863527e-06, "epoch": 5.918918918918919, "percentage": 59.19, "elapsed_time": "2:43:46", "remaining_time": "1:52:55"} -{"current_steps": 1096, "total_steps": 1850, "loss": 0.0897, "lr": 1.7882657457192015e-06, "epoch": 5.924324324324324, "percentage": 59.24, "elapsed_time": "2:43:47", "remaining_time": "1:52:41"} -{"current_steps": 1097, "total_steps": 1850, "loss": 0.1545, "lr": 1.784197059108979e-06, "epoch": 5.92972972972973, "percentage": 59.3, "elapsed_time": "2:43:49", "remaining_time": "1:52:27"} -{"current_steps": 1098, "total_steps": 1850, "loss": 0.0509, "lr": 1.7801304366887235e-06, "epoch": 5.935135135135135, "percentage": 59.35, "elapsed_time": "2:43:52", "remaining_time": "1:52:14"} -{"current_steps": 1099, "total_steps": 1850, "loss": 0.0821, "lr": 1.776065890185517e-06, "epoch": 5.940540540540541, "percentage": 59.41, "elapsed_time": "2:43:53", "remaining_time": "1:51:59"} -{"current_steps": 1100, "total_steps": 1850, "loss": 0.0182, "lr": 1.7720034313204582e-06, "epoch": 5.945945945945946, "percentage": 59.46, "elapsed_time": "2:43:56", "remaining_time": "1:51:46"} -{"current_steps": 1101, "total_steps": 1850, "loss": 0.1027, "lr": 1.7679430718086244e-06, "epoch": 5.951351351351351, "percentage": 59.51, "elapsed_time": "2:44:01", "remaining_time": "1:51:35"} -{"current_steps": 1102, "total_steps": 1850, "loss": 0.0413, "lr": 1.763884823359038e-06, "epoch": 5.956756756756757, "percentage": 59.57, "elapsed_time": "2:44:04", "remaining_time": "1:51:21"} -{"current_steps": 1103, "total_steps": 1850, "loss": 0.1079, "lr": 1.759828697674636e-06, "epoch": 5.962162162162162, "percentage": 59.62, "elapsed_time": "2:44:06", "remaining_time": "1:51:08"} -{"current_steps": 1104, "total_steps": 1850, "loss": 0.0952, "lr": 1.7557747064522312e-06, "epoch": 5.967567567567568, "percentage": 59.68, "elapsed_time": "2:44:08", "remaining_time": "1:50:55"} -{"current_steps": 1105, "total_steps": 1850, "loss": 0.3393, "lr": 1.7517228613824836e-06, "epoch": 5.972972972972973, "percentage": 59.73, "elapsed_time": "2:44:11", "remaining_time": "1:50:41"} -{"current_steps": 1106, "total_steps": 1850, "loss": 0.0207, "lr": 1.747673174149862e-06, "epoch": 5.978378378378379, "percentage": 59.78, "elapsed_time": "2:44:15", "remaining_time": "1:50:29"} -{"current_steps": 1107, "total_steps": 1850, "loss": 0.1708, "lr": 1.743625656432615e-06, "epoch": 5.9837837837837835, "percentage": 59.84, "elapsed_time": "2:44:20", "remaining_time": "1:50:18"} -{"current_steps": 1108, "total_steps": 1850, "loss": 0.0569, "lr": 1.7395803199027325e-06, "epoch": 5.989189189189189, "percentage": 59.89, "elapsed_time": "2:44:25", "remaining_time": "1:50:06"} -{"current_steps": 1109, "total_steps": 1850, "loss": 0.0861, "lr": 1.7355371762259155e-06, "epoch": 5.994594594594595, "percentage": 59.95, "elapsed_time": "2:44:27", "remaining_time": "1:49:52"} -{"current_steps": 1110, "total_steps": 1850, "loss": 0.0571, "lr": 1.7314962370615423e-06, "epoch": 6.0, "percentage": 60.0, "elapsed_time": "2:44:34", "remaining_time": "1:49:42"} -{"current_steps": 1111, "total_steps": 1850, "loss": 0.0215, "lr": 1.7274575140626318e-06, "epoch": 6.005405405405406, "percentage": 60.05, "elapsed_time": "2:49:13", "remaining_time": "1:52:33"} -{"current_steps": 1112, "total_steps": 1850, "loss": 0.074, "lr": 1.7234210188758144e-06, "epoch": 6.010810810810811, "percentage": 60.11, "elapsed_time": "2:49:18", "remaining_time": "1:52:22"} -{"current_steps": 1113, "total_steps": 1850, "loss": 0.0481, "lr": 1.7193867631412942e-06, "epoch": 6.0162162162162165, "percentage": 60.16, "elapsed_time": "2:49:20", "remaining_time": "1:52:07"} -{"current_steps": 1114, "total_steps": 1850, "loss": 0.0253, "lr": 1.7153547584928185e-06, "epoch": 6.021621621621621, "percentage": 60.22, "elapsed_time": "2:49:22", "remaining_time": "1:51:54"} -{"current_steps": 1115, "total_steps": 1850, "loss": 0.0231, "lr": 1.7113250165576422e-06, "epoch": 6.027027027027027, "percentage": 60.27, "elapsed_time": "2:49:24", "remaining_time": "1:51:40"} -{"current_steps": 1116, "total_steps": 1850, "loss": 0.0517, "lr": 1.7072975489564958e-06, "epoch": 6.032432432432432, "percentage": 60.32, "elapsed_time": "2:49:27", "remaining_time": "1:51:26"} -{"current_steps": 1117, "total_steps": 1850, "loss": 0.0099, "lr": 1.703272367303551e-06, "epoch": 6.037837837837838, "percentage": 60.38, "elapsed_time": "2:49:30", "remaining_time": "1:51:13"} -{"current_steps": 1118, "total_steps": 1850, "loss": 0.0403, "lr": 1.6992494832063861e-06, "epoch": 6.043243243243243, "percentage": 60.43, "elapsed_time": "2:49:34", "remaining_time": "1:51:01"} -{"current_steps": 1119, "total_steps": 1850, "loss": 0.1234, "lr": 1.6952289082659568e-06, "epoch": 6.048648648648649, "percentage": 60.49, "elapsed_time": "2:49:36", "remaining_time": "1:50:47"} -{"current_steps": 1120, "total_steps": 1850, "loss": 0.0496, "lr": 1.6912106540765583e-06, "epoch": 6.054054054054054, "percentage": 60.54, "elapsed_time": "2:49:39", "remaining_time": "1:50:34"} -{"current_steps": 1121, "total_steps": 1850, "loss": 0.0725, "lr": 1.6871947322257915e-06, "epoch": 6.059459459459459, "percentage": 60.59, "elapsed_time": "2:49:42", "remaining_time": "1:50:21"} -{"current_steps": 1122, "total_steps": 1850, "loss": 0.0051, "lr": 1.6831811542945342e-06, "epoch": 6.064864864864865, "percentage": 60.65, "elapsed_time": "2:49:46", "remaining_time": "1:50:09"} -{"current_steps": 1123, "total_steps": 1850, "loss": 0.0135, "lr": 1.6791699318569039e-06, "epoch": 6.07027027027027, "percentage": 60.7, "elapsed_time": "2:49:49", "remaining_time": "1:49:56"} -{"current_steps": 1124, "total_steps": 1850, "loss": 0.031, "lr": 1.6751610764802246e-06, "epoch": 6.075675675675676, "percentage": 60.76, "elapsed_time": "2:49:51", "remaining_time": "1:49:43"} -{"current_steps": 1125, "total_steps": 1850, "loss": 0.0257, "lr": 1.6711545997249955e-06, "epoch": 6.081081081081081, "percentage": 60.81, "elapsed_time": "2:49:57", "remaining_time": "1:49:31"} -{"current_steps": 1126, "total_steps": 1850, "loss": 0.0733, "lr": 1.6671505131448562e-06, "epoch": 6.0864864864864865, "percentage": 60.86, "elapsed_time": "2:50:03", "remaining_time": "1:49:20"} -{"current_steps": 1127, "total_steps": 1850, "loss": 0.0113, "lr": 1.6631488282865537e-06, "epoch": 6.091891891891892, "percentage": 60.92, "elapsed_time": "2:50:05", "remaining_time": "1:49:07"} -{"current_steps": 1128, "total_steps": 1850, "loss": 0.0387, "lr": 1.6591495566899084e-06, "epoch": 6.097297297297297, "percentage": 60.97, "elapsed_time": "2:50:11", "remaining_time": "1:48:55"} -{"current_steps": 1129, "total_steps": 1850, "loss": 0.064, "lr": 1.6551527098877824e-06, "epoch": 6.102702702702703, "percentage": 61.03, "elapsed_time": "2:50:14", "remaining_time": "1:48:43"} -{"current_steps": 1130, "total_steps": 1850, "loss": 0.0676, "lr": 1.6511582994060443e-06, "epoch": 6.108108108108108, "percentage": 61.08, "elapsed_time": "2:50:17", "remaining_time": "1:48:30"} -{"current_steps": 1131, "total_steps": 1850, "loss": 0.1557, "lr": 1.6471663367635383e-06, "epoch": 6.113513513513514, "percentage": 61.14, "elapsed_time": "2:50:20", "remaining_time": "1:48:17"} -{"current_steps": 1132, "total_steps": 1850, "loss": 0.0117, "lr": 1.6431768334720486e-06, "epoch": 6.118918918918919, "percentage": 61.19, "elapsed_time": "2:50:24", "remaining_time": "1:48:04"} -{"current_steps": 1133, "total_steps": 1850, "loss": 0.0108, "lr": 1.6391898010362673e-06, "epoch": 6.124324324324324, "percentage": 61.24, "elapsed_time": "2:50:25", "remaining_time": "1:47:50"} -{"current_steps": 1134, "total_steps": 1850, "loss": 0.0165, "lr": 1.6352052509537623e-06, "epoch": 6.12972972972973, "percentage": 61.3, "elapsed_time": "2:50:28", "remaining_time": "1:47:38"} -{"current_steps": 1135, "total_steps": 1850, "loss": 0.0312, "lr": 1.6312231947149416e-06, "epoch": 6.135135135135135, "percentage": 61.35, "elapsed_time": "2:50:33", "remaining_time": "1:47:26"} -{"current_steps": 1136, "total_steps": 1850, "loss": 0.1021, "lr": 1.627243643803022e-06, "epoch": 6.140540540540541, "percentage": 61.41, "elapsed_time": "2:50:37", "remaining_time": "1:47:14"} -{"current_steps": 1137, "total_steps": 1850, "loss": 0.0387, "lr": 1.623266609693997e-06, "epoch": 6.145945945945946, "percentage": 61.46, "elapsed_time": "2:50:38", "remaining_time": "1:47:00"} -{"current_steps": 1138, "total_steps": 1850, "loss": 0.0973, "lr": 1.6192921038565993e-06, "epoch": 6.151351351351352, "percentage": 61.51, "elapsed_time": "2:50:41", "remaining_time": "1:46:47"} -{"current_steps": 1139, "total_steps": 1850, "loss": 0.0175, "lr": 1.615320137752274e-06, "epoch": 6.1567567567567565, "percentage": 61.57, "elapsed_time": "2:50:47", "remaining_time": "1:46:36"} -{"current_steps": 1140, "total_steps": 1850, "loss": 0.0072, "lr": 1.6113507228351411e-06, "epoch": 6.162162162162162, "percentage": 61.62, "elapsed_time": "2:50:49", "remaining_time": "1:46:23"} -{"current_steps": 1141, "total_steps": 1850, "loss": 0.0279, "lr": 1.6073838705519618e-06, "epoch": 6.167567567567567, "percentage": 61.68, "elapsed_time": "2:50:52", "remaining_time": "1:46:10"} -{"current_steps": 1142, "total_steps": 1850, "loss": 0.0171, "lr": 1.6034195923421106e-06, "epoch": 6.172972972972973, "percentage": 61.73, "elapsed_time": "2:50:56", "remaining_time": "1:45:58"} -{"current_steps": 1143, "total_steps": 1850, "loss": 0.0394, "lr": 1.5994578996375365e-06, "epoch": 6.178378378378379, "percentage": 61.78, "elapsed_time": "2:50:59", "remaining_time": "1:45:45"} -{"current_steps": 1144, "total_steps": 1850, "loss": 0.0218, "lr": 1.5954988038627327e-06, "epoch": 6.183783783783784, "percentage": 61.84, "elapsed_time": "2:51:01", "remaining_time": "1:45:32"} -{"current_steps": 1145, "total_steps": 1850, "loss": 0.0119, "lr": 1.5915423164347055e-06, "epoch": 6.1891891891891895, "percentage": 61.89, "elapsed_time": "2:51:03", "remaining_time": "1:45:19"} -{"current_steps": 1146, "total_steps": 1850, "loss": 0.0224, "lr": 1.5875884487629373e-06, "epoch": 6.194594594594594, "percentage": 61.95, "elapsed_time": "2:51:06", "remaining_time": "1:45:06"} -{"current_steps": 1147, "total_steps": 1850, "loss": 0.0156, "lr": 1.583637212249357e-06, "epoch": 6.2, "percentage": 62.0, "elapsed_time": "2:51:08", "remaining_time": "1:44:53"} -{"current_steps": 1148, "total_steps": 1850, "loss": 0.0761, "lr": 1.5796886182883053e-06, "epoch": 6.205405405405405, "percentage": 62.05, "elapsed_time": "2:51:11", "remaining_time": "1:44:41"} -{"current_steps": 1149, "total_steps": 1850, "loss": 0.1198, "lr": 1.575742678266503e-06, "epoch": 6.210810810810811, "percentage": 62.11, "elapsed_time": "2:51:15", "remaining_time": "1:44:29"} -{"current_steps": 1150, "total_steps": 1850, "loss": 0.0242, "lr": 1.5717994035630175e-06, "epoch": 6.216216216216216, "percentage": 62.16, "elapsed_time": "2:51:18", "remaining_time": "1:44:16"} -{"current_steps": 1151, "total_steps": 1850, "loss": 0.032, "lr": 1.5678588055492289e-06, "epoch": 6.221621621621622, "percentage": 62.22, "elapsed_time": "2:51:24", "remaining_time": "1:44:05"} -{"current_steps": 1152, "total_steps": 1850, "loss": 0.0376, "lr": 1.5639208955888008e-06, "epoch": 6.227027027027027, "percentage": 62.27, "elapsed_time": "2:51:30", "remaining_time": "1:43:54"} -{"current_steps": 1153, "total_steps": 1850, "loss": 0.0203, "lr": 1.5599856850376427e-06, "epoch": 6.232432432432432, "percentage": 62.32, "elapsed_time": "2:51:33", "remaining_time": "1:43:42"} -{"current_steps": 1154, "total_steps": 1850, "loss": 0.0316, "lr": 1.556053185243882e-06, "epoch": 6.237837837837838, "percentage": 62.38, "elapsed_time": "2:51:35", "remaining_time": "1:43:29"} -{"current_steps": 1155, "total_steps": 1850, "loss": 0.0068, "lr": 1.5521234075478264e-06, "epoch": 6.243243243243243, "percentage": 62.43, "elapsed_time": "2:51:38", "remaining_time": "1:43:16"} -{"current_steps": 1156, "total_steps": 1850, "loss": 0.0104, "lr": 1.548196363281937e-06, "epoch": 6.248648648648649, "percentage": 62.49, "elapsed_time": "2:51:39", "remaining_time": "1:43:03"} -{"current_steps": 1157, "total_steps": 1850, "loss": 0.0146, "lr": 1.5442720637707891e-06, "epoch": 6.254054054054054, "percentage": 62.54, "elapsed_time": "2:51:41", "remaining_time": "1:42:50"} -{"current_steps": 1158, "total_steps": 1850, "loss": 0.0298, "lr": 1.5403505203310442e-06, "epoch": 6.2594594594594595, "percentage": 62.59, "elapsed_time": "2:51:43", "remaining_time": "1:42:37"} -{"current_steps": 1159, "total_steps": 1850, "loss": 0.0238, "lr": 1.536431744271417e-06, "epoch": 6.264864864864865, "percentage": 62.65, "elapsed_time": "2:51:48", "remaining_time": "1:42:25"} -{"current_steps": 1160, "total_steps": 1850, "loss": 0.0125, "lr": 1.5325157468926415e-06, "epoch": 6.27027027027027, "percentage": 62.7, "elapsed_time": "2:51:50", "remaining_time": "1:42:13"} -{"current_steps": 1161, "total_steps": 1850, "loss": 0.0557, "lr": 1.5286025394874366e-06, "epoch": 6.275675675675676, "percentage": 62.76, "elapsed_time": "2:51:54", "remaining_time": "1:42:00"} -{"current_steps": 1162, "total_steps": 1850, "loss": 0.0271, "lr": 1.5246921333404786e-06, "epoch": 6.281081081081081, "percentage": 62.81, "elapsed_time": "2:51:56", "remaining_time": "1:41:47"} -{"current_steps": 1163, "total_steps": 1850, "loss": 0.0209, "lr": 1.520784539728363e-06, "epoch": 6.286486486486487, "percentage": 62.86, "elapsed_time": "2:52:03", "remaining_time": "1:41:38"} -{"current_steps": 1164, "total_steps": 1850, "loss": 0.0683, "lr": 1.5168797699195765e-06, "epoch": 6.291891891891892, "percentage": 62.92, "elapsed_time": "2:52:06", "remaining_time": "1:41:25"} -{"current_steps": 1165, "total_steps": 1850, "loss": 0.0881, "lr": 1.5129778351744622e-06, "epoch": 6.297297297297297, "percentage": 62.97, "elapsed_time": "2:52:09", "remaining_time": "1:41:13"} -{"current_steps": 1166, "total_steps": 1850, "loss": 0.0093, "lr": 1.5090787467451873e-06, "epoch": 6.302702702702703, "percentage": 63.03, "elapsed_time": "2:52:10", "remaining_time": "1:40:59"} -{"current_steps": 1167, "total_steps": 1850, "loss": 0.054, "lr": 1.5051825158757116e-06, "epoch": 6.308108108108108, "percentage": 63.08, "elapsed_time": "2:52:12", "remaining_time": "1:40:47"} -{"current_steps": 1168, "total_steps": 1850, "loss": 0.0498, "lr": 1.5012891538017538e-06, "epoch": 6.313513513513514, "percentage": 63.14, "elapsed_time": "2:52:18", "remaining_time": "1:40:36"} -{"current_steps": 1169, "total_steps": 1850, "loss": 0.0601, "lr": 1.49739867175076e-06, "epoch": 6.318918918918919, "percentage": 63.19, "elapsed_time": "2:52:21", "remaining_time": "1:40:24"} -{"current_steps": 1170, "total_steps": 1850, "loss": 0.0119, "lr": 1.4935110809418713e-06, "epoch": 6.324324324324325, "percentage": 63.24, "elapsed_time": "2:52:25", "remaining_time": "1:40:12"} -{"current_steps": 1171, "total_steps": 1850, "loss": 0.1361, "lr": 1.4896263925858903e-06, "epoch": 6.3297297297297295, "percentage": 63.3, "elapsed_time": "2:52:27", "remaining_time": "1:40:00"} -{"current_steps": 1172, "total_steps": 1850, "loss": 0.0738, "lr": 1.485744617885251e-06, "epoch": 6.335135135135135, "percentage": 63.35, "elapsed_time": "2:52:30", "remaining_time": "1:39:47"} -{"current_steps": 1173, "total_steps": 1850, "loss": 0.0392, "lr": 1.481865768033984e-06, "epoch": 6.34054054054054, "percentage": 63.41, "elapsed_time": "2:52:32", "remaining_time": "1:39:34"} -{"current_steps": 1174, "total_steps": 1850, "loss": 0.0158, "lr": 1.4779898542176865e-06, "epoch": 6.345945945945946, "percentage": 63.46, "elapsed_time": "2:52:36", "remaining_time": "1:39:23"} -{"current_steps": 1175, "total_steps": 1850, "loss": 0.0552, "lr": 1.4741168876134875e-06, "epoch": 6.351351351351352, "percentage": 63.51, "elapsed_time": "2:52:39", "remaining_time": "1:39:11"} -{"current_steps": 1176, "total_steps": 1850, "loss": 0.039, "lr": 1.4702468793900187e-06, "epoch": 6.356756756756757, "percentage": 63.57, "elapsed_time": "2:52:40", "remaining_time": "1:38:57"} -{"current_steps": 1177, "total_steps": 1850, "loss": 0.008, "lr": 1.4663798407073799e-06, "epoch": 6.3621621621621625, "percentage": 63.62, "elapsed_time": "2:52:42", "remaining_time": "1:38:45"} -{"current_steps": 1178, "total_steps": 1850, "loss": 0.0105, "lr": 1.4625157827171056e-06, "epoch": 6.367567567567567, "percentage": 63.68, "elapsed_time": "2:52:43", "remaining_time": "1:38:32"} -{"current_steps": 1179, "total_steps": 1850, "loss": 0.0161, "lr": 1.4586547165621385e-06, "epoch": 6.372972972972973, "percentage": 63.73, "elapsed_time": "2:52:45", "remaining_time": "1:38:19"} -{"current_steps": 1180, "total_steps": 1850, "loss": 0.0319, "lr": 1.4547966533767904e-06, "epoch": 6.378378378378378, "percentage": 63.78, "elapsed_time": "2:52:48", "remaining_time": "1:38:07"} -{"current_steps": 1181, "total_steps": 1850, "loss": 0.0389, "lr": 1.450941604286715e-06, "epoch": 6.383783783783784, "percentage": 63.84, "elapsed_time": "2:52:55", "remaining_time": "1:37:57"} -{"current_steps": 1182, "total_steps": 1850, "loss": 0.0585, "lr": 1.4470895804088736e-06, "epoch": 6.389189189189189, "percentage": 63.89, "elapsed_time": "2:52:58", "remaining_time": "1:37:45"} -{"current_steps": 1183, "total_steps": 1850, "loss": 0.011, "lr": 1.443240592851505e-06, "epoch": 6.394594594594595, "percentage": 63.95, "elapsed_time": "2:53:02", "remaining_time": "1:37:33"} -{"current_steps": 1184, "total_steps": 1850, "loss": 0.0237, "lr": 1.4393946527140884e-06, "epoch": 6.4, "percentage": 64.0, "elapsed_time": "2:53:03", "remaining_time": "1:37:20"} -{"current_steps": 1185, "total_steps": 1850, "loss": 0.047, "lr": 1.4355517710873184e-06, "epoch": 6.405405405405405, "percentage": 64.05, "elapsed_time": "2:53:06", "remaining_time": "1:37:08"} -{"current_steps": 1186, "total_steps": 1850, "loss": 0.0315, "lr": 1.4317119590530692e-06, "epoch": 6.410810810810811, "percentage": 64.11, "elapsed_time": "2:53:11", "remaining_time": "1:36:57"} -{"current_steps": 1187, "total_steps": 1850, "loss": 0.0301, "lr": 1.427875227684361e-06, "epoch": 6.416216216216216, "percentage": 64.16, "elapsed_time": "2:53:13", "remaining_time": "1:36:45"} -{"current_steps": 1188, "total_steps": 1850, "loss": 0.0357, "lr": 1.4240415880453327e-06, "epoch": 6.421621621621622, "percentage": 64.22, "elapsed_time": "2:53:16", "remaining_time": "1:36:33"} -{"current_steps": 1189, "total_steps": 1850, "loss": 0.0397, "lr": 1.420211051191206e-06, "epoch": 6.427027027027027, "percentage": 64.27, "elapsed_time": "2:53:18", "remaining_time": "1:36:20"} -{"current_steps": 1190, "total_steps": 1850, "loss": 0.0092, "lr": 1.4163836281682563e-06, "epoch": 6.4324324324324325, "percentage": 64.32, "elapsed_time": "2:53:20", "remaining_time": "1:36:08"} -{"current_steps": 1191, "total_steps": 1850, "loss": 0.106, "lr": 1.4125593300137767e-06, "epoch": 6.437837837837838, "percentage": 64.38, "elapsed_time": "2:53:23", "remaining_time": "1:35:56"} -{"current_steps": 1192, "total_steps": 1850, "loss": 0.0087, "lr": 1.4087381677560519e-06, "epoch": 6.443243243243243, "percentage": 64.43, "elapsed_time": "2:53:24", "remaining_time": "1:35:43"} -{"current_steps": 1193, "total_steps": 1850, "loss": 0.016, "lr": 1.4049201524143236e-06, "epoch": 6.448648648648649, "percentage": 64.49, "elapsed_time": "2:53:27", "remaining_time": "1:35:31"} -{"current_steps": 1194, "total_steps": 1850, "loss": 0.0065, "lr": 1.401105294998755e-06, "epoch": 6.454054054054054, "percentage": 64.54, "elapsed_time": "2:53:29", "remaining_time": "1:35:19"} -{"current_steps": 1195, "total_steps": 1850, "loss": 0.0248, "lr": 1.3972936065104064e-06, "epoch": 6.45945945945946, "percentage": 64.59, "elapsed_time": "2:53:34", "remaining_time": "1:35:08"} -{"current_steps": 1196, "total_steps": 1850, "loss": 0.0436, "lr": 1.393485097941199e-06, "epoch": 6.464864864864865, "percentage": 64.65, "elapsed_time": "2:53:36", "remaining_time": "1:34:56"} -{"current_steps": 1197, "total_steps": 1850, "loss": 0.0755, "lr": 1.3896797802738832e-06, "epoch": 6.47027027027027, "percentage": 64.7, "elapsed_time": "2:53:38", "remaining_time": "1:34:43"} -{"current_steps": 1198, "total_steps": 1850, "loss": 0.0194, "lr": 1.385877664482006e-06, "epoch": 6.475675675675676, "percentage": 64.76, "elapsed_time": "2:53:39", "remaining_time": "1:34:30"} -{"current_steps": 1199, "total_steps": 1850, "loss": 0.0416, "lr": 1.382078761529886e-06, "epoch": 6.481081081081081, "percentage": 64.81, "elapsed_time": "2:53:44", "remaining_time": "1:34:19"} -{"current_steps": 1200, "total_steps": 1850, "loss": 0.0678, "lr": 1.3782830823725713e-06, "epoch": 6.486486486486487, "percentage": 64.86, "elapsed_time": "2:53:47", "remaining_time": "1:34:08"} -{"current_steps": 1201, "total_steps": 1850, "loss": 0.0073, "lr": 1.3744906379558165e-06, "epoch": 6.491891891891892, "percentage": 64.92, "elapsed_time": "2:53:50", "remaining_time": "1:33:56"} -{"current_steps": 1202, "total_steps": 1850, "loss": 0.0436, "lr": 1.3707014392160477e-06, "epoch": 6.4972972972972975, "percentage": 64.97, "elapsed_time": "2:53:53", "remaining_time": "1:33:44"} -{"current_steps": 1203, "total_steps": 1850, "loss": 0.0087, "lr": 1.3669154970803313e-06, "epoch": 6.5027027027027025, "percentage": 65.03, "elapsed_time": "2:53:55", "remaining_time": "1:33:32"} -{"current_steps": 1204, "total_steps": 1850, "loss": 0.0483, "lr": 1.363132822466341e-06, "epoch": 6.508108108108108, "percentage": 65.08, "elapsed_time": "2:53:57", "remaining_time": "1:33:20"} -{"current_steps": 1205, "total_steps": 1850, "loss": 0.0147, "lr": 1.3593534262823289e-06, "epoch": 6.513513513513513, "percentage": 65.14, "elapsed_time": "2:53:59", "remaining_time": "1:33:08"} -{"current_steps": 1206, "total_steps": 1850, "loss": 0.0107, "lr": 1.355577319427095e-06, "epoch": 6.518918918918919, "percentage": 65.19, "elapsed_time": "2:54:01", "remaining_time": "1:32:55"} -{"current_steps": 1207, "total_steps": 1850, "loss": 0.0442, "lr": 1.3518045127899493e-06, "epoch": 6.524324324324324, "percentage": 65.24, "elapsed_time": "2:54:04", "remaining_time": "1:32:44"} -{"current_steps": 1208, "total_steps": 1850, "loss": 0.047, "lr": 1.3480350172506884e-06, "epoch": 6.52972972972973, "percentage": 65.3, "elapsed_time": "2:54:08", "remaining_time": "1:32:32"} -{"current_steps": 1209, "total_steps": 1850, "loss": 0.0341, "lr": 1.3442688436795592e-06, "epoch": 6.535135135135135, "percentage": 65.35, "elapsed_time": "2:54:10", "remaining_time": "1:32:20"} -{"current_steps": 1210, "total_steps": 1850, "loss": 0.0082, "lr": 1.3405060029372308e-06, "epoch": 6.54054054054054, "percentage": 65.41, "elapsed_time": "2:54:12", "remaining_time": "1:32:08"} -{"current_steps": 1211, "total_steps": 1850, "loss": 0.0223, "lr": 1.3367465058747566e-06, "epoch": 6.545945945945946, "percentage": 65.46, "elapsed_time": "2:54:13", "remaining_time": "1:31:55"} -{"current_steps": 1212, "total_steps": 1850, "loss": 0.011, "lr": 1.3329903633335528e-06, "epoch": 6.551351351351351, "percentage": 65.51, "elapsed_time": "2:54:15", "remaining_time": "1:31:43"} -{"current_steps": 1213, "total_steps": 1850, "loss": 0.0208, "lr": 1.3292375861453598e-06, "epoch": 6.556756756756757, "percentage": 65.57, "elapsed_time": "2:54:20", "remaining_time": "1:31:33"} -{"current_steps": 1214, "total_steps": 1850, "loss": 0.023, "lr": 1.3254881851322126e-06, "epoch": 6.562162162162162, "percentage": 65.62, "elapsed_time": "2:54:21", "remaining_time": "1:31:20"} -{"current_steps": 1215, "total_steps": 1850, "loss": 0.019, "lr": 1.3217421711064112e-06, "epoch": 6.5675675675675675, "percentage": 65.68, "elapsed_time": "2:54:24", "remaining_time": "1:31:09"} -{"current_steps": 1216, "total_steps": 1850, "loss": 0.0803, "lr": 1.3179995548704883e-06, "epoch": 6.572972972972973, "percentage": 65.73, "elapsed_time": "2:54:30", "remaining_time": "1:30:59"} -{"current_steps": 1217, "total_steps": 1850, "loss": 0.0941, "lr": 1.314260347217179e-06, "epoch": 6.578378378378378, "percentage": 65.78, "elapsed_time": "2:54:32", "remaining_time": "1:30:47"} -{"current_steps": 1218, "total_steps": 1850, "loss": 0.0516, "lr": 1.3105245589293852e-06, "epoch": 6.583783783783784, "percentage": 65.84, "elapsed_time": "2:54:35", "remaining_time": "1:30:35"} -{"current_steps": 1219, "total_steps": 1850, "loss": 0.0603, "lr": 1.3067922007801548e-06, "epoch": 6.589189189189189, "percentage": 65.89, "elapsed_time": "2:54:41", "remaining_time": "1:30:25"} -{"current_steps": 1220, "total_steps": 1850, "loss": 0.0797, "lr": 1.3030632835326378e-06, "epoch": 6.594594594594595, "percentage": 65.95, "elapsed_time": "2:54:45", "remaining_time": "1:30:14"} -{"current_steps": 1221, "total_steps": 1850, "loss": 0.0316, "lr": 1.2993378179400645e-06, "epoch": 6.6, "percentage": 66.0, "elapsed_time": "2:54:50", "remaining_time": "1:30:04"} -{"current_steps": 1222, "total_steps": 1850, "loss": 0.0553, "lr": 1.2956158147457116e-06, "epoch": 6.605405405405405, "percentage": 66.05, "elapsed_time": "2:54:54", "remaining_time": "1:29:53"} -{"current_steps": 1223, "total_steps": 1850, "loss": 0.0438, "lr": 1.2918972846828711e-06, "epoch": 6.610810810810811, "percentage": 66.11, "elapsed_time": "2:54:59", "remaining_time": "1:29:42"} -{"current_steps": 1224, "total_steps": 1850, "loss": 0.0781, "lr": 1.2881822384748176e-06, "epoch": 6.616216216216216, "percentage": 66.16, "elapsed_time": "2:55:01", "remaining_time": "1:29:30"} -{"current_steps": 1225, "total_steps": 1850, "loss": 0.3345, "lr": 1.2844706868347812e-06, "epoch": 6.621621621621622, "percentage": 66.22, "elapsed_time": "2:55:05", "remaining_time": "1:29:19"} -{"current_steps": 1226, "total_steps": 1850, "loss": 0.0915, "lr": 1.2807626404659144e-06, "epoch": 6.627027027027027, "percentage": 66.27, "elapsed_time": "2:55:08", "remaining_time": "1:29:08"} -{"current_steps": 1227, "total_steps": 1850, "loss": 0.0882, "lr": 1.2770581100612594e-06, "epoch": 6.632432432432433, "percentage": 66.32, "elapsed_time": "2:55:10", "remaining_time": "1:28:56"} -{"current_steps": 1228, "total_steps": 1850, "loss": 0.0297, "lr": 1.2733571063037214e-06, "epoch": 6.6378378378378375, "percentage": 66.38, "elapsed_time": "2:55:15", "remaining_time": "1:28:46"} -{"current_steps": 1229, "total_steps": 1850, "loss": 0.0506, "lr": 1.2696596398660358e-06, "epoch": 6.643243243243243, "percentage": 66.43, "elapsed_time": "2:55:16", "remaining_time": "1:28:33"} -{"current_steps": 1230, "total_steps": 1850, "loss": 0.0794, "lr": 1.2659657214107365e-06, "epoch": 6.648648648648649, "percentage": 66.49, "elapsed_time": "2:55:18", "remaining_time": "1:28:21"} -{"current_steps": 1231, "total_steps": 1850, "loss": 0.0145, "lr": 1.2622753615901245e-06, "epoch": 6.654054054054054, "percentage": 66.54, "elapsed_time": "2:55:22", "remaining_time": "1:28:11"} -{"current_steps": 1232, "total_steps": 1850, "loss": 0.0064, "lr": 1.2585885710462409e-06, "epoch": 6.65945945945946, "percentage": 66.59, "elapsed_time": "2:55:23", "remaining_time": "1:27:58"} -{"current_steps": 1233, "total_steps": 1850, "loss": 0.0765, "lr": 1.254905360410834e-06, "epoch": 6.664864864864865, "percentage": 66.65, "elapsed_time": "2:55:26", "remaining_time": "1:27:47"} -{"current_steps": 1234, "total_steps": 1850, "loss": 0.0092, "lr": 1.2512257403053257e-06, "epoch": 6.6702702702702705, "percentage": 66.7, "elapsed_time": "2:55:29", "remaining_time": "1:27:36"} -{"current_steps": 1235, "total_steps": 1850, "loss": 0.0183, "lr": 1.247549721340787e-06, "epoch": 6.675675675675675, "percentage": 66.76, "elapsed_time": "2:55:31", "remaining_time": "1:27:24"} -{"current_steps": 1236, "total_steps": 1850, "loss": 0.018, "lr": 1.2438773141179025e-06, "epoch": 6.681081081081081, "percentage": 66.81, "elapsed_time": "2:55:33", "remaining_time": "1:27:12"} -{"current_steps": 1237, "total_steps": 1850, "loss": 0.0123, "lr": 1.2402085292269427e-06, "epoch": 6.686486486486486, "percentage": 66.86, "elapsed_time": "2:55:37", "remaining_time": "1:27:01"} -{"current_steps": 1238, "total_steps": 1850, "loss": 0.0084, "lr": 1.236543377247729e-06, "epoch": 6.691891891891892, "percentage": 66.92, "elapsed_time": "2:55:41", "remaining_time": "1:26:51"} -{"current_steps": 1239, "total_steps": 1850, "loss": 0.009, "lr": 1.232881868749611e-06, "epoch": 6.697297297297297, "percentage": 66.97, "elapsed_time": "2:55:44", "remaining_time": "1:26:39"} -{"current_steps": 1240, "total_steps": 1850, "loss": 0.0087, "lr": 1.22922401429143e-06, "epoch": 6.702702702702703, "percentage": 67.03, "elapsed_time": "2:55:46", "remaining_time": "1:26:28"} -{"current_steps": 1241, "total_steps": 1850, "loss": 0.0331, "lr": 1.2255698244214863e-06, "epoch": 6.708108108108108, "percentage": 67.08, "elapsed_time": "2:55:48", "remaining_time": "1:26:16"} -{"current_steps": 1242, "total_steps": 1850, "loss": 0.0326, "lr": 1.2219193096775173e-06, "epoch": 6.713513513513513, "percentage": 67.14, "elapsed_time": "2:55:55", "remaining_time": "1:26:07"} -{"current_steps": 1243, "total_steps": 1850, "loss": 0.0088, "lr": 1.218272480586661e-06, "epoch": 6.718918918918919, "percentage": 67.19, "elapsed_time": "2:55:59", "remaining_time": "1:25:56"} -{"current_steps": 1244, "total_steps": 1850, "loss": 0.0529, "lr": 1.2146293476654242e-06, "epoch": 6.724324324324324, "percentage": 67.24, "elapsed_time": "2:56:06", "remaining_time": "1:25:47"} -{"current_steps": 1245, "total_steps": 1850, "loss": 0.0589, "lr": 1.2109899214196583e-06, "epoch": 6.72972972972973, "percentage": 67.3, "elapsed_time": "2:56:08", "remaining_time": "1:25:35"} -{"current_steps": 1246, "total_steps": 1850, "loss": 0.0138, "lr": 1.2073542123445239e-06, "epoch": 6.735135135135135, "percentage": 67.35, "elapsed_time": "2:56:10", "remaining_time": "1:25:24"} -{"current_steps": 1247, "total_steps": 1850, "loss": 0.0266, "lr": 1.2037222309244642e-06, "epoch": 6.7405405405405405, "percentage": 67.41, "elapsed_time": "2:56:12", "remaining_time": "1:25:12"} -{"current_steps": 1248, "total_steps": 1850, "loss": 0.0334, "lr": 1.200093987633169e-06, "epoch": 6.745945945945946, "percentage": 67.46, "elapsed_time": "2:56:14", "remaining_time": "1:25:00"} -{"current_steps": 1249, "total_steps": 1850, "loss": 0.0503, "lr": 1.1964694929335518e-06, "epoch": 6.751351351351351, "percentage": 67.51, "elapsed_time": "2:56:17", "remaining_time": "1:24:49"} -{"current_steps": 1250, "total_steps": 1850, "loss": 0.0672, "lr": 1.1928487572777158e-06, "epoch": 6.756756756756757, "percentage": 67.57, "elapsed_time": "2:56:20", "remaining_time": "1:24:38"} -{"current_steps": 1251, "total_steps": 1850, "loss": 0.1029, "lr": 1.1892317911069212e-06, "epoch": 6.762162162162162, "percentage": 67.62, "elapsed_time": "2:56:23", "remaining_time": "1:24:27"} -{"current_steps": 1252, "total_steps": 1850, "loss": 0.0151, "lr": 1.185618604851561e-06, "epoch": 6.767567567567568, "percentage": 67.68, "elapsed_time": "2:56:25", "remaining_time": "1:24:15"} -{"current_steps": 1253, "total_steps": 1850, "loss": 0.0179, "lr": 1.182009208931128e-06, "epoch": 6.772972972972973, "percentage": 67.73, "elapsed_time": "2:56:27", "remaining_time": "1:24:04"} -{"current_steps": 1254, "total_steps": 1850, "loss": 0.0079, "lr": 1.178403613754182e-06, "epoch": 6.778378378378378, "percentage": 67.78, "elapsed_time": "2:56:27", "remaining_time": "1:23:52"} -{"current_steps": 1255, "total_steps": 1850, "loss": 0.0194, "lr": 1.1748018297183239e-06, "epoch": 6.783783783783784, "percentage": 67.84, "elapsed_time": "2:56:31", "remaining_time": "1:23:41"} -{"current_steps": 1256, "total_steps": 1850, "loss": 0.024, "lr": 1.1712038672101654e-06, "epoch": 6.789189189189189, "percentage": 67.89, "elapsed_time": "2:56:37", "remaining_time": "1:23:31"} -{"current_steps": 1257, "total_steps": 1850, "loss": 0.1671, "lr": 1.1676097366052974e-06, "epoch": 6.794594594594595, "percentage": 67.95, "elapsed_time": "2:56:39", "remaining_time": "1:23:20"} -{"current_steps": 1258, "total_steps": 1850, "loss": 0.0644, "lr": 1.1640194482682573e-06, "epoch": 6.8, "percentage": 68.0, "elapsed_time": "2:56:42", "remaining_time": "1:23:09"} -{"current_steps": 1259, "total_steps": 1850, "loss": 0.0589, "lr": 1.160433012552508e-06, "epoch": 6.805405405405406, "percentage": 68.05, "elapsed_time": "2:56:45", "remaining_time": "1:22:58"} -{"current_steps": 1260, "total_steps": 1850, "loss": 0.161, "lr": 1.1568504398003995e-06, "epoch": 6.8108108108108105, "percentage": 68.11, "elapsed_time": "2:56:50", "remaining_time": "1:22:48"} -{"current_steps": 1261, "total_steps": 1850, "loss": 0.0429, "lr": 1.1532717403431405e-06, "epoch": 6.816216216216216, "percentage": 68.16, "elapsed_time": "2:56:55", "remaining_time": "1:22:38"} -{"current_steps": 1262, "total_steps": 1850, "loss": 0.0568, "lr": 1.1496969245007723e-06, "epoch": 6.821621621621622, "percentage": 68.22, "elapsed_time": "2:57:02", "remaining_time": "1:22:29"} -{"current_steps": 1263, "total_steps": 1850, "loss": 0.1732, "lr": 1.1461260025821374e-06, "epoch": 6.827027027027027, "percentage": 68.27, "elapsed_time": "2:57:05", "remaining_time": "1:22:18"} -{"current_steps": 1264, "total_steps": 1850, "loss": 0.0105, "lr": 1.1425589848848464e-06, "epoch": 6.832432432432433, "percentage": 68.32, "elapsed_time": "2:57:07", "remaining_time": "1:22:06"} -{"current_steps": 1265, "total_steps": 1850, "loss": 0.0379, "lr": 1.1389958816952538e-06, "epoch": 6.837837837837838, "percentage": 68.38, "elapsed_time": "2:57:09", "remaining_time": "1:21:55"} -{"current_steps": 1266, "total_steps": 1850, "loss": 0.0409, "lr": 1.1354367032884245e-06, "epoch": 6.8432432432432435, "percentage": 68.43, "elapsed_time": "2:57:12", "remaining_time": "1:21:44"} -{"current_steps": 1267, "total_steps": 1850, "loss": 0.0282, "lr": 1.131881459928107e-06, "epoch": 6.848648648648648, "percentage": 68.49, "elapsed_time": "2:57:15", "remaining_time": "1:21:34"} -{"current_steps": 1268, "total_steps": 1850, "loss": 0.0177, "lr": 1.128330161866698e-06, "epoch": 6.854054054054054, "percentage": 68.54, "elapsed_time": "2:57:18", "remaining_time": "1:21:23"} -{"current_steps": 1269, "total_steps": 1850, "loss": 0.0112, "lr": 1.1247828193452215e-06, "epoch": 6.859459459459459, "percentage": 68.59, "elapsed_time": "2:57:22", "remaining_time": "1:21:12"} -{"current_steps": 1270, "total_steps": 1850, "loss": 0.0496, "lr": 1.1212394425932937e-06, "epoch": 6.864864864864865, "percentage": 68.65, "elapsed_time": "2:57:24", "remaining_time": "1:21:01"} -{"current_steps": 1271, "total_steps": 1850, "loss": 0.0737, "lr": 1.1177000418290917e-06, "epoch": 6.87027027027027, "percentage": 68.7, "elapsed_time": "2:57:29", "remaining_time": "1:20:51"} -{"current_steps": 1272, "total_steps": 1850, "loss": 0.0204, "lr": 1.1141646272593303e-06, "epoch": 6.875675675675676, "percentage": 68.76, "elapsed_time": "2:57:32", "remaining_time": "1:20:40"} -{"current_steps": 1273, "total_steps": 1850, "loss": 0.0126, "lr": 1.1106332090792273e-06, "epoch": 6.881081081081081, "percentage": 68.81, "elapsed_time": "2:57:33", "remaining_time": "1:20:28"} -{"current_steps": 1274, "total_steps": 1850, "loss": 0.0451, "lr": 1.1071057974724783e-06, "epoch": 6.886486486486486, "percentage": 68.86, "elapsed_time": "2:57:37", "remaining_time": "1:20:18"} -{"current_steps": 1275, "total_steps": 1850, "loss": 0.0876, "lr": 1.1035824026112205e-06, "epoch": 6.891891891891892, "percentage": 68.92, "elapsed_time": "2:57:43", "remaining_time": "1:20:09"} -{"current_steps": 1276, "total_steps": 1850, "loss": 0.017, "lr": 1.1000630346560118e-06, "epoch": 6.897297297297297, "percentage": 68.97, "elapsed_time": "2:57:46", "remaining_time": "1:19:58"} -{"current_steps": 1277, "total_steps": 1850, "loss": 0.0199, "lr": 1.0965477037557973e-06, "epoch": 6.902702702702703, "percentage": 69.03, "elapsed_time": "2:57:52", "remaining_time": "1:19:48"} -{"current_steps": 1278, "total_steps": 1850, "loss": 0.0154, "lr": 1.093036420047876e-06, "epoch": 6.908108108108108, "percentage": 69.08, "elapsed_time": "2:57:54", "remaining_time": "1:19:37"} -{"current_steps": 1279, "total_steps": 1850, "loss": 0.0101, "lr": 1.0895291936578825e-06, "epoch": 6.9135135135135135, "percentage": 69.14, "elapsed_time": "2:57:58", "remaining_time": "1:19:27"} -{"current_steps": 1280, "total_steps": 1850, "loss": 0.0145, "lr": 1.0860260346997475e-06, "epoch": 6.918918918918919, "percentage": 69.19, "elapsed_time": "2:58:03", "remaining_time": "1:19:17"} -{"current_steps": 1281, "total_steps": 1850, "loss": 0.0616, "lr": 1.0825269532756707e-06, "epoch": 6.924324324324324, "percentage": 69.24, "elapsed_time": "2:58:05", "remaining_time": "1:19:06"} -{"current_steps": 1282, "total_steps": 1850, "loss": 0.0563, "lr": 1.079031959476096e-06, "epoch": 6.92972972972973, "percentage": 69.3, "elapsed_time": "2:58:12", "remaining_time": "1:18:57"} -{"current_steps": 1283, "total_steps": 1850, "loss": 0.0091, "lr": 1.0755410633796799e-06, "epoch": 6.935135135135135, "percentage": 69.35, "elapsed_time": "2:58:12", "remaining_time": "1:18:45"} -{"current_steps": 1284, "total_steps": 1850, "loss": 0.0241, "lr": 1.0720542750532584e-06, "epoch": 6.940540540540541, "percentage": 69.41, "elapsed_time": "2:58:13", "remaining_time": "1:18:34"} -{"current_steps": 1285, "total_steps": 1850, "loss": 0.0289, "lr": 1.0685716045518262e-06, "epoch": 6.945945945945946, "percentage": 69.46, "elapsed_time": "2:58:15", "remaining_time": "1:18:22"} -{"current_steps": 1286, "total_steps": 1850, "loss": 0.0827, "lr": 1.065093061918501e-06, "epoch": 6.951351351351351, "percentage": 69.51, "elapsed_time": "2:58:20", "remaining_time": "1:18:12"} -{"current_steps": 1287, "total_steps": 1850, "loss": 0.0307, "lr": 1.0616186571844983e-06, "epoch": 6.956756756756757, "percentage": 69.57, "elapsed_time": "2:58:21", "remaining_time": "1:18:01"} -{"current_steps": 1288, "total_steps": 1850, "loss": 0.1038, "lr": 1.058148400369098e-06, "epoch": 6.962162162162162, "percentage": 69.62, "elapsed_time": "2:58:24", "remaining_time": "1:17:50"} -{"current_steps": 1289, "total_steps": 1850, "loss": 0.0097, "lr": 1.0546823014796215e-06, "epoch": 6.967567567567568, "percentage": 69.68, "elapsed_time": "2:58:27", "remaining_time": "1:17:40"} -{"current_steps": 1290, "total_steps": 1850, "loss": 0.01, "lr": 1.051220370511399e-06, "epoch": 6.972972972972973, "percentage": 69.73, "elapsed_time": "2:58:30", "remaining_time": "1:17:29"}